-
Notifications
You must be signed in to change notification settings - Fork 41
add exit code dependent retry policy #9276
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
4c94acf
e70893e
595b71b
3614084
b76d758
69afe55
5b92334
9bd8224
9fca35e
8b3db7e
1e980e2
f492af6
d308f89
12183e9
30b6246
b288bb8
ef5ba93
1306f2b
3342bdd
b6ebe51
7f7fa6a
d117922
b2f179d
3bdc41d
359bc85
97d83ec
6cc2f4f
c76f844
d8051c8
2434ef3
e90b1bf
f23a770
72ca8cc
8f596dd
fff0317
034d93b
ab29d86
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,7 @@ | |
| import shutil | ||
| import subprocess | ||
| import socket | ||
| import time | ||
| from collections import namedtuple | ||
|
|
||
| from ServerUtilities import executeCommand, getLock | ||
|
|
@@ -15,6 +16,40 @@ | |
|
|
||
| JOB_RETURN_CODES = namedtuple('JobReturnCodes', 'OK RECOVERABLE_ERROR FATAL_ERROR')(0, 1, 2) | ||
|
|
||
| # ---------------------------------------------------------------------- | ||
| # Exit-code dependent retry policy | ||
| # ---------------------------------------------------------------------- | ||
|
|
||
# Retry policy lookup table, keyed by the integer job wrapper exit code; the
# string key "default" is the fallback used by apply_retry_policy() for any
# exit code that has no entry of its own.
# Fields of each entry, as they are used by apply_retry_policy():
#   "type"        - "recoverable" entries are subject to the retry limit and
#                   delay; the "neutral" entry makes apply_retry_policy() return
#                   without doing anything.
#   "max_retries" - compared against self.crab_retry; once crab_retry reaches
#                   this value a FatalError is raised instead of retrying.
#   "delay"       - passed to store_retry_delay(), which adds it to time.time()
#                   (i.e. it is a number of seconds).
#   "msg"         - text used for the RecoverableError / FatalError message.
EXIT_RETRY_POLICY = {
    1: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Job failed to bootstrap CMSSW; likely a worker node issue."},
    50513: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Job did not find functioning CMSSW on worker node."},
    81: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Job did not find functioning CMSSW on worker node."},
    50115: {"type": "recoverable", "max_retries": 9, "delay": 900, "msg": "Job did not produce a FJR; will retry."},
    195: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Job did not produce a FJR; will retry."},
    137: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "SIGKILL; likely an unrelated batch system kill."},
    10034: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Required application version not found at the site."},
    50: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Required application version not found at the site."},
    10040: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Site Error: failed to generate cmsRun cfg file at runtime."},
    60403: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Timeout during attempted file stageout."},
    243: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Timeout during attempted file stageout."},
    60307: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Error during attempted file stageout."},
    147: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Error during attempted file stageout."},
    60311: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Error during attempted file stageout."},
    151: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Error during attempted file stageout."},
    8028: {"type": "recoverable", "max_retries": 9, "delay": 900, "msg": "Job failed to open local and fallback files."},
    8021: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "FileReadError (May be a site error)."},
    8020: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "FileOpenError (Likely a site error)."},
    8022: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "FatalRootError."},
    84: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Some required file not found; check logs for name of missing file."},
    85: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Job failed to open local and fallback files."},
    86: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Job failed to open local and fallback files."},
    92: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Job failed to open local and fallback files."},
    134: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Abort (ANSI) or IOT trap (4.2 BSD) (most likely user application crashed)."},
    8001: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "Other CMS Exception."},
    65: {"type": "recoverable", "max_retries": 2, "delay": 900, "msg": "End of job from user application (CMSSW)."},
    "default": {"type": "neutral", "max_retries": 2, "delay": 900, "msg": "Taking default exit code retry policy route."}
}
|
|
||
| # strings in fatal root exception text which indicate code problem, not corrupted file | ||
| # a small "knowledge data base" | ||
| NOT_FILE_RELATED_FATAL_ROOT_ERRORS = [ | ||
|
|
@@ -98,6 +133,51 @@ def get_job_ad_from_condor_q(self): | |
|
|
||
| # = = = = = RetryJob = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | ||
|
|
||
def store_retry_delay(self, delay):
    """
    Record the time before which the next retry of this job should not start.

    Loads the dictionary kept in ``resubmit_info/job.<job_id>.txt`` (a Python
    literal, as produced by ``str(dict)``), sets
    ``retry_info[str(self.crab_retry)]["retry_delay_until"]`` to
    ``time.time() + delay`` and writes the dictionary back through a
    temporary file, so the target file is never left half written.

    A missing, unreadable or malformed file is treated as an empty
    dictionary (best effort); the problem is logged, not raised.

    :param delay: value added to the current ``time.time()``, i.e. seconds.
    :raises OSError: if the updated file cannot be written or moved in place.
    """
    # Function-scope import: ast is not among the imports visible at the top
    # of this file.
    import ast

    retry_info_file = f"resubmit_info/job.{self.job_id}.txt"
    retry_info = {}
    if os.path.exists(retry_info_file):
        try:
            with open(retry_info_file, "r", encoding="utf-8") as fd:
                # literal_eval only accepts Python literals, so a corrupted or
                # tampered file cannot run code the way eval() would.
                retry_info = ast.literal_eval(fd.read())
        except (OSError, ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as ex:
            self.logger.warning("Ignoring unreadable retry info in %s: %s", retry_info_file, ex)
            retry_info = {}
    if not isinstance(retry_info, dict):
        # Anything but a dictionary cannot hold per-retry entries; start over.
        retry_info = {}

    key = str(self.crab_retry)
    entry = retry_info.get(key)
    if not isinstance(entry, dict):
        entry = {}
        retry_info[key] = entry
    entry["retry_delay_until"] = time.time() + delay

    # Write to a sibling file first, then move it over the target in one step.
    tmp_file = retry_info_file + ".tmp"
    with open(tmp_file, "w", encoding="utf-8") as fd:
        fd.write(str(retry_info))
    os.replace(tmp_file, retry_info_file)
|
|
||
| # = = = = = RetryJob = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | ||
|
|
||
|
|
||
def apply_retry_policy(self, exitCode):
    """
    Enforce the exit-code dependent retry limit and retry delay.

    The policy for ``exitCode`` is looked up in EXIT_RETRY_POLICY, falling
    back to its "default" entry. Only policies of type "recoverable" have
    any effect; for every other type this method simply returns.

    For a recoverable policy the delay is recorded via store_retry_delay()
    (this method itself does not sleep) and then either RecoverableError is
    raised or, for a fixed set of exit codes, control returns to the caller.

    :param exitCode: integer exit code of the job wrapper.
    :raises FatalError: if self.crab_retry has reached the policy's max_retries.
    :raises RecoverableError: for recoverable exit codes which are not in
        the pass-through set below.
    """
    policy = EXIT_RETRY_POLICY.get(exitCode, EXIT_RETRY_POLICY["default"])

    if policy["type"] != "recoverable":
        return

    if self.crab_retry >= policy["max_retries"]:
        raise FatalError(f"Retry limit reached for exit {exitCode}: {policy['msg']}")

    delay = policy.get("delay", 900)
    # Nothing sleeps here: the delay is only written to the resubmit info file.
    self.logger.info("Requesting a delay of %s seconds before next retry (exit code %s)", delay, exitCode)
    self.store_retry_delay(delay)

    # These codes are not turned into RecoverableError here; presumably the
    # caller goes on with its own, more detailed analysis of them
    # (NOTE(review): confirm against check_exit_code).
    if exitCode in (8020, 8021, 8022, 8028, 84, 85, 86, 92, 134, 8001, 65):
        return
    raise RecoverableError(policy["msg"])
|
|
||
| # = = = = = RetryJob = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = | ||
|
|
||
| def get_job_ad_from_file(self): | ||
| """ | ||
| Need a doc string here | ||
|
|
@@ -307,6 +387,8 @@ def check_exit_code(self): | |
| return 1 | ||
| try: | ||
| exitCode = int(self.report['exitCode']) | ||
| # Apply retry policy | ||
| self.apply_retry_policy(exitCode) | ||
| except ValueError: | ||
| msg = "Unable to extract job's wrapper exit code from job report." | ||
| self.logger.warning(msg) | ||
|
|
@@ -319,7 +401,6 @@ def check_exit_code(self): | |
| return 0 | ||
|
|
||
| msg = "Job or stageout wrapper finished with exit code %d." % (exitCode) | ||
| msg += " Trying to determine the meaning of the exit code and if it is a recoverable or fatal error." | ||
| self.logger.info(msg) | ||
|
|
||
| # Wrapper script sometimes returns the posix return code (8 bits). | ||
|
|
@@ -335,17 +416,6 @@ def check_exit_code(self): | |
| self.create_fake_fjr(exitMsg, 8022, 8022, fatalError=False) # retry the job | ||
| raise RecoverableError("Job failed to open local and fallback files.") | ||
|
|
||
| if exitCode == 1: | ||
| raise RecoverableError("Job failed to bootstrap CMSSW; likely a worker node issue.") | ||
|
|
||
| if exitCode == 50513 or exitCode == 81: | ||
| raise RecoverableError("Job did not find functioning CMSSW on worker node.") | ||
|
|
||
| # This is a difficult one -- right now CMSRunAnalysis.py will turn things like | ||
| # segfaults into an invalid FJR. Will revisit this decision later. | ||
| if exitCode == 50115 or exitCode == 195: | ||
| raise RecoverableError("Job did not produce a FJR; will retry.") | ||
|
|
||
| if exitCode == 134: | ||
| recoverable_signal = False | ||
| try: | ||
|
|
@@ -379,28 +449,6 @@ def check_exit_code(self): | |
| if cvmfs_issue: | ||
| raise RecoverableError("CVMFS issue detected.") | ||
|
|
||
| # Another difficult case -- so far, SIGKILL has mostly been observed at T2_CH_CERN, and it has nothing to do | ||
| # with an issue of the job itself. Typically, this isn't the user code's fault | ||
| # it was often a site or pilot misconfiguration that led to the pilot exhausting its allocated runtime. | ||
| # We should revisit this issue if we see SIGKILL happening for other cases that are the users' fault. | ||
| if exitCode == 137: | ||
| raise RecoverableError("SIGKILL; likely an unrelated batch system kill.") | ||
|
|
||
| if exitCode == 10034 or exitCode == 50: | ||
| raise RecoverableError("Required application version not found at the site.") | ||
|
|
||
| if exitCode == 10040: | ||
| raise RecoverableError("Site Error: failed to generate cmsRun cfg file at runtime.") | ||
|
|
||
| if exitCode == 60403 or exitCode == 243: | ||
| raise RecoverableError("Timeout during attempted file stageout.") | ||
|
|
||
| if exitCode == 60307 or exitCode == 147: | ||
| raise RecoverableError("Error during attempted file stageout.") | ||
|
|
||
| if exitCode == 60311 or exitCode == 151: | ||
| raise RecoverableError("Error during attempted file stageout.") | ||
|
|
||
| if exitCode: | ||
| raise FatalError("Job wrapper finished with exit code %d.\nExit message:\n %s" % (exitCode, exitMsg.replace('\n', '\n '))) | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.