almost fully using new sdk for job stuff #60825
base: master
Changes from 11 commits
6af37ec
722bbe2
f62b7b0
6d39d04
3862945
93e1491
c58c37c
7090e68
78990a6
4c70959
4332427
8e4ba44
```diff
@@ -18,8 +18,6 @@
     JobBrokenError,
     JobNoLogsError,
     JobOutOfRetriesError,
-    JobTerminatedBeforeStartError,
-    JobTerminatedError,
     LogsError,
     PrepareCommandError,
     PrepareCommandTimeout,
```
```diff
@@ -158,26 +156,15 @@ def wait_for_nodes(self, num_nodes: int, timeout: float = 900):
             f"python wait_cluster.py {num_nodes} {timeout}", timeout=timeout + 30
         )

-    def _handle_command_output(
-        self, job_status_code: int, error: str, raise_on_timeout: bool = True
-    ):
-        if job_status_code == -2:
-            raise JobBrokenError(f"Job state is 'BROKEN' with error:\n{error}\n")
-
-        if job_status_code == -3:
-            raise JobTerminatedError(
-                "Job entered 'TERMINATED' state (it was terminated "
-                "manually or Ray was stopped):"
-                f"\n{error}\n"
-            )
+    def _handle_command_output(self, job_state: int, raise_on_timeout: bool = True):
```
Contributor

The refactoring to use the new SDK has removed the detailed error messages from the exceptions raised on job failure. For example, […]. Please consider re-introducing these error details. The new Anyscale SDK's […]
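A minimal sketch of one way the lost context could be restored without the removed `error` argument, using only values the runner still has after the refactor (the terminal state and the job URL from `JobManager.job_url()`); the helper `_job_failure_message` is hypothetical and not part of this PR:

```python
from typing import Optional


def _job_failure_message(state_name: str, job_url: Optional[str]) -> str:
    """Build a more descriptive message for job-failure exceptions.

    `state_name` is the terminal job state and `job_url` comes from
    JobManager.job_url(); both are still available after the refactor.
    """
    msg = f"Job finished in terminal state '{state_name}'."
    if job_url:
        msg += f" See {job_url} for the full job log and error details."
    return msg


# Hypothetical usage inside _handle_command_output:
# raise JobOutOfRetriesError(
#     "Job returned non-success state: 'FAILED'. "
#     + _job_failure_message("FAILED", self.job_manager.job_url())
# )
```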
```diff
+        if job_state == -1:
+            raise JobOutOfRetriesError(
+                "Job returned non-success state: 'FAILED' "
+                "(command has not been ran or no logs could have been obtained)."
+            )
```
Comment on lines +160 to 164
Contributor

The […] Please add a check for `job_state == -4`:

```python
if job_state == -4:
    raise JobTerminatedBeforeStartError(
        "Job entered 'TERMINATED' state before it started "
        "(most likely due to inability to provision required nodes; "
        "otherwise it was terminated manually or Ray was stopped)."
    )
if job_state == -1:
    raise JobOutOfRetriesError(
        "Job returned non-success state: 'FAILED' "
        "(command has not been ran or no logs could have been obtained)."
    )
```
```diff
-        if job_status_code == -4:
-            raise JobTerminatedBeforeStartError(
-                "Job entered 'TERMINATED' state before it started "
-                "(most likely due to inability to provision required nodes; "
-                "otherwise it was terminated manually or Ray was stopped):"
-                f"\n{error}\n"
-            )
+        if job_state == -2:
+            raise JobBrokenError("Job state is 'UNKNOWN'.")
```
Return code -4 unhandled in command output handler (High Severity)

Additional Locations (1)
```diff

         # First try to obtain the output.json from S3.
         # If that fails, try logs.
```
```diff
@@ -214,8 +201,7 @@ def _handle_command_output(
                 )
                 raise PrepareCommandError(
                     f"Prepare command '{self.prepare_commands[-1]}' returned "
-                    f"non-success status: {prepare_return_codes[-1]} with error:"
-                    f"\n{error}\n"
+                    f"non-success status: {prepare_return_codes[-1]}."
                 )
             else:
                 raise JobNoLogsError("Could not obtain logs for the job.")
```
```diff
@@ -231,15 +217,7 @@ def _handle_command_output(

         if workload_status_code is not None and workload_status_code != 0:
             raise TestCommandError(
-                f"Command returned non-success status: {workload_status_code} with "
-                f"error:\n{error}\n"
-            )
-
-        if job_status_code == -1:
-            raise JobOutOfRetriesError(
-                "Job returned non-success state: 'OUT_OF_RETRIES' "
-                "(command has not been ran or no logs could have been obtained) "
-                f"with error:\n{error}\n"
+                f"Command returned non-success status: {workload_status_code}."
             )

     def _get_full_command_env(self, env: Optional[Dict[str, str]] = None):
```
```diff
@@ -348,18 +326,14 @@ def run_command(
             working_dir = azure_file_path
             logger.info(f"Working dir uploaded to {working_dir}")

-        job_status_code, time_taken = self.job_manager.run_and_wait(
+        job_state, time_taken = self.job_manager.run_and_wait(
            full_command,
            full_env,
            working_dir=working_dir,
            upload_path=self.upload_path,
            timeout=int(timeout),
        )
-        error_message = self.job_manager.job_error_message()
-
-        self._handle_command_output(
-            job_status_code, error_message, raise_on_timeout=raise_on_timeout
-        )
+        self._handle_command_output(job_state, raise_on_timeout=raise_on_timeout)

         return time_taken
```
Second changed file (the Anyscale job manager):
```diff
@@ -3,11 +3,7 @@
 from typing import Any, Dict, Optional, Tuple

 import anyscale
-from anyscale.sdk.anyscale_client.models import (
-    CreateProductionJob,
-    CreateProductionJobConfig,
-    HaJobStates,
-)
+from anyscale.job.models import JobConfig, JobState

 from ray_release.anyscale_util import LAST_LOGS_LENGTH
 from ray_release.cluster_manager.cluster_manager import ClusterManager
```
```diff
@@ -25,10 +21,9 @@
 )

 job_status_to_return_code = {
-    HaJobStates.SUCCESS: 0,
-    HaJobStates.OUT_OF_RETRIES: -1,
-    HaJobStates.BROKEN: -2,
-    HaJobStates.TERMINATED: -3,
+    JobState.SUCCEEDED: 0,
+    JobState.FAILED: -1,
+    JobState.UNKNOWN: -2,
 }
 terminal_state = set(job_status_to_return_code.keys())
```
```diff
@@ -38,7 +33,6 @@ def __init__(self, cluster_manager: ClusterManager):
         self.start_time = None
         self.counter = 0
         self.cluster_manager = cluster_manager
-        self._sdk = cluster_manager.sdk
         self._last_job_result = None
         self._job_id: Optional[str] = None
         self._last_logs = None
```
```diff
@@ -61,44 +55,35 @@ def _run_job(
             f"Executing {cmd_to_run} with {env_vars_for_job} via Anyscale job submit"
         )

-        runtime_env = {
-            "env_vars": env_vars_for_job,
-        }
-        if working_dir:
-            runtime_env["working_dir"] = working_dir
-        if upload_path:
-            runtime_env["upload_path"] = upload_path
-
         try:
-            job_request = CreateProductionJob(
+            job_config = JobConfig(
                 name=self.cluster_manager.cluster_name,
-                description=f"Smoke test: {self.cluster_manager.smoke_test}",
-                project_id=self.cluster_manager.project_id,
-                config=CreateProductionJobConfig(
-                    entrypoint=cmd_to_run,
-                    runtime_env=runtime_env,
-                    build_id=self.cluster_manager.cluster_env_build_id,
-                    compute_config_id=self.cluster_manager.cluster_compute_id,
-                    max_retries=0,
-                ),
+                entrypoint=cmd_to_run,
+                image_uri=self.cluster_manager.test.get_anyscale_byod_image(),
+                compute_config=self.cluster_manager.cluster_compute_id,
+                project=self.cluster_manager.project_name,
+                env_vars=env_vars_for_job,
+                working_dir=working_dir if working_dir else None,
+                max_retries=0,
             )
-            job_response = self._sdk.create_job(job_request)
+            self._job_id = anyscale.job.submit(config=job_config)
         except Exception as e:
             raise JobStartupFailed(
                 "Error starting job with name "
                 f"{self.cluster_manager.cluster_name}: "
                 f"{e}"
             ) from e

-        self.save_last_job_result(job_response.result)
+        self._last_job_result = None
         self.start_time = time.time()

         logger.info(f"Link to job: " f"{format_link(self.job_url())}")
         return

-    def save_last_job_result(self, value):
-        self._last_job_result = value
-        self._job_id = value.id if value else None
+    def save_last_job_status(self, status):
+        if status and hasattr(status, "id") and status.id != self._job_id:
+            logger.warning(f"Job ID mismatch: expected {self._job_id}, got {status.id}")
+        self._last_job_result = status

     def job_id(self) -> Optional[str]:
         return self._job_id
```
```diff
@@ -108,15 +93,10 @@ def job_url(self) -> Optional[str]:
             return None
         return anyscale_job_url(self._job_id)

-    def _last_job_status(self) -> Optional[HaJobStates]:
+    def _last_job_status(self) -> Optional["JobState"]:
         if not self._last_job_result:
             return None
-        return self._last_job_result.state.current_state
-
-    def job_error_message(self) -> str:
-        if self._last_job_result is None:
-            return ""
-        return self._last_job_result.state.error
+        return self._last_job_result.state

     def _in_progress(self) -> bool:
         if not self._last_job_result:
```
```diff
@@ -125,18 +105,20 @@ def _in_progress(self) -> bool:

     def _get_job_status_with_retry(self):
         return exponential_backoff_retry(
-            lambda: self._sdk.get_production_job(self._job_id),
+            lambda: anyscale.job.status(id=self._job_id),
             retry_exceptions=Exception,
             initial_retry_delay_s=1,
             max_retries=3,
-        ).result
+        )

     def _terminate_job(self, raise_exceptions: bool = False):
-        if not self._in_progress():
+        if not self._job_id:
             return
+        if self._last_job_result is not None and not self._in_progress():
+            return
         logger.info(f"Terminating job {self._job_id}...")
         try:
-            self._sdk.terminate_job(self._job_id)
+            anyscale.job.terminate(id=self._job_id)
             logger.info(f"Job {self._job_id} terminated!")
         except Exception:
             msg = f"Couldn't terminate job {self._job_id}!"
```
```diff
@@ -202,12 +184,12 @@ def _wait_job(self, timeout: int):
                 next_status += 30

             result = self._get_job_status_with_retry()
-            self.save_last_job_result(result)
+            self.save_last_job_status(result)
             status = self._last_job_status()

             if not job_running and status in {
-                HaJobStates.RUNNING,
-                HaJobStates.ERRORED,
+                JobState.STARTING,
+                JobState.RUNNING,
             }:
                 logger.info(
                     f"... job started ...({int(now - start_time)} seconds) ..."
```
```diff
@@ -223,10 +205,10 @@ def _wait_job(self, timeout: int):
             time.sleep(1)

         result = self._get_job_status_with_retry()
-        self.save_last_job_result(result)
+        self.save_last_job_status(result)
         status = self._last_job_status()
         assert status in terminal_state
-        if status == HaJobStates.TERMINATED and not job_running:
+        if status == JobState.FAILED and not job_running:
```
Return code -4 produced but never handled (High Severity)

Additional Locations (1)
```diff
             # Soft infra error
             retcode = -4
         else:
```
Early FAILED check skips output parsing for running jobs (Medium Severity)

When `job_state == -1` (a `FAILED` job that was already running), `_handle_command_output` immediately raises `JobOutOfRetriesError` without attempting to parse output. The old code checked this condition last, after parsing output, so it could raise more specific errors like `TestCommandError` or `PrepareCommandError`. The error message also incorrectly says "command has not been ran" even though the command did run (since `-1` is only returned when `job_running` was `True`).
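A hedged sketch of the reordering this comment suggests (not code from the PR): parse the workload output first, and only fall back to the generic `JobOutOfRetriesError` when nothing more specific can be raised. The helper `_parse_workload_status_code` stands in for the existing output.json/log parsing and is hypothetical.

```python
def _handle_command_output(self, job_state: int, raise_on_timeout: bool = True):
    # Terminal states that can never have produced output are handled first.
    if job_state == -2:
        raise JobBrokenError("Job state is 'UNKNOWN'.")

    # Try to recover the workload's own exit code (output.json from S3,
    # then logs) before deciding what to raise for a FAILED job.
    workload_status_code = self._parse_workload_status_code()  # hypothetical helper

    if workload_status_code is not None and workload_status_code != 0:
        raise TestCommandError(
            f"Command returned non-success status: {workload_status_code}."
        )

    # Only now treat a FAILED job with no usable output as "out of retries";
    # the message no longer claims the command never ran.
    if job_state == -1:
        raise JobOutOfRetriesError(
            "Job returned non-success state: 'FAILED' "
            "(no workload output or logs could be obtained)."
        )
```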