-
Notifications
You must be signed in to change notification settings - Fork 7.2k
almost fully using new sdk for job stuff #60825
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
6af37ec
722bbe2
f62b7b0
6d39d04
3862945
93e1491
c58c37c
7090e68
78990a6
4c70959
4332427
8e4ba44
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,8 +18,6 @@ | |
| JobBrokenError, | ||
| JobNoLogsError, | ||
| JobOutOfRetriesError, | ||
| JobTerminatedBeforeStartError, | ||
| JobTerminatedError, | ||
| LogsError, | ||
| PrepareCommandError, | ||
| PrepareCommandTimeout, | ||
|
|
@@ -158,26 +156,15 @@ def wait_for_nodes(self, num_nodes: int, timeout: float = 900): | |
| f"python wait_cluster.py {num_nodes} {timeout}", timeout=timeout + 30 | ||
| ) | ||
|
|
||
| def _handle_command_output( | ||
| self, job_status_code: int, error: str, raise_on_timeout: bool = True | ||
| ): | ||
| if job_status_code == -2: | ||
| raise JobBrokenError(f"Job state is 'BROKEN' with error:\n{error}\n") | ||
|
|
||
| if job_status_code == -3: | ||
| raise JobTerminatedError( | ||
| "Job entered 'TERMINATED' state (it was terminated " | ||
| "manually or Ray was stopped):" | ||
| f"\n{error}\n" | ||
| def _handle_command_output(self, job_state: int, raise_on_timeout: bool = True): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The refactoring to use the new SDK has removed the detailed error messages from the exceptions raised on job failure. For example, Please consider re-introducing these error details. The new Anyscale SDK's |
||
| if job_state == -1: | ||
| raise JobOutOfRetriesError( | ||
| "Job returned non-success state: 'FAILED' " | ||
| "(command has not been ran or no logs could have been obtained)." | ||
| ) | ||
|
Comment on lines
+160
to
164
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The Please add a check for if job_state == -4:
raise JobTerminatedBeforeStartError(
"Job entered 'TERMINATED' state before it started "
"(most likely due to inability to provision required nodes; "
"otherwise it was terminated manually or Ray was stopped)."
)
if job_state == -1:
raise JobOutOfRetriesError(
"Job returned non-success state: 'FAILED' "
"(command has not been ran or no logs could have been obtained)."
) |
||
|
|
||
| if job_status_code == -4: | ||
| raise JobTerminatedBeforeStartError( | ||
| "Job entered 'TERMINATED' state before it started " | ||
| "(most likely due to inability to provision required nodes; " | ||
| "otherwise it was terminated manually or Ray was stopped):" | ||
| f"\n{error}\n" | ||
| ) | ||
| if job_state == -2: | ||
| raise JobBrokenError("Job state is 'UNKNOWN'.") | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Return code -4 unhandled in command output handlerHigh Severity
Additional Locations (1) |
||
|
|
||
| # First try to obtain the output.json from S3. | ||
| # If that fails, try logs. | ||
|
|
@@ -214,8 +201,7 @@ def _handle_command_output( | |
| ) | ||
| raise PrepareCommandError( | ||
| f"Prepare command '{self.prepare_commands[-1]}' returned " | ||
| f"non-success status: {prepare_return_codes[-1]} with error:" | ||
| f"\n{error}\n" | ||
| f"non-success status: {prepare_return_codes[-1]}." | ||
| ) | ||
| else: | ||
| raise JobNoLogsError("Could not obtain logs for the job.") | ||
|
|
@@ -231,15 +217,7 @@ def _handle_command_output( | |
|
|
||
| if workload_status_code is not None and workload_status_code != 0: | ||
| raise TestCommandError( | ||
| f"Command returned non-success status: {workload_status_code} with " | ||
| f"error:\n{error}\n" | ||
| ) | ||
|
|
||
| if job_status_code == -1: | ||
| raise JobOutOfRetriesError( | ||
| "Job returned non-success state: 'OUT_OF_RETRIES' " | ||
| "(command has not been ran or no logs could have been obtained) " | ||
| f"with error:\n{error}\n" | ||
| f"Command returned non-success status: {workload_status_code}." | ||
| ) | ||
|
|
||
| def _get_full_command_env(self, env: Optional[Dict[str, str]] = None): | ||
|
|
@@ -348,18 +326,14 @@ def run_command( | |
| working_dir = azure_file_path | ||
| logger.info(f"Working dir uploaded to {working_dir}") | ||
|
|
||
| job_status_code, time_taken = self.job_manager.run_and_wait( | ||
| job_state, time_taken = self.job_manager.run_and_wait( | ||
| full_command, | ||
| full_env, | ||
| working_dir=working_dir, | ||
| upload_path=self.upload_path, | ||
| timeout=int(timeout), | ||
| ) | ||
| error_message = self.job_manager.job_error_message() | ||
|
|
||
| self._handle_command_output( | ||
| job_status_code, error_message, raise_on_timeout=raise_on_timeout | ||
| ) | ||
| self._handle_command_output(job_state, raise_on_timeout=raise_on_timeout) | ||
|
|
||
| return time_taken | ||
|
|
||
|
|
||


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Early FAILED check skips output parsing for running jobs
Medium Severity
When
job_state == -1(aFAILEDjob that was already running),_handle_command_outputimmediately raisesJobOutOfRetriesErrorwithout attempting to parse output. The old code checked this condition last, after parsing output, so it could raise more specific errors likeTestCommandErrororPrepareCommandError. The error message also incorrectly says "command has not been ran" even though the command did run (since-1is only returned whenjob_runningwasTrue).