-
Notifications
You must be signed in to change notification settings - Fork 96
Fix data store missing info for submit-failed jobs #6926
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
92175e2
78605c1
8f6891b
226850f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Fixed info missing from UI for submit-failed tasks. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -67,7 +67,6 @@ | |
| Set, | ||
| TYPE_CHECKING, | ||
| Tuple, | ||
| Union, | ||
| ) | ||
| import zlib | ||
|
|
||
|
|
@@ -1612,37 +1611,27 @@ def _apply_broadcasts_to_runtime(self, tokens, rtconfig): | |
|
|
||
| def insert_job( | ||
| self, | ||
| name: str, | ||
| cycle_point: Union['PointBase', str], | ||
| itask: 'TaskProxy', | ||
| status: str, | ||
| job_conf: dict, | ||
| ): | ||
| ) -> None: | ||
| """Insert job into data-store. | ||
|
|
||
| Args: | ||
| name: Corresponding task name. | ||
| cycle_point: Cycle point string | ||
| status: The task's state. | ||
| job_conf: | ||
| Dictionary of job configuration used to generate | ||
| the job script. | ||
| (see TaskJobManager._prep_submit_task_job_impl) | ||
|
|
||
| Returns: | ||
|
|
||
| None | ||
|
|
||
| """ | ||
| if status not in JOB_STATUS_SET: | ||
| # Ignore task-only states e.g. preparing | ||
| # https://github.com/cylc/cylc-flow/issues/4994 | ||
| return | ||
|
|
||
| sub_num = job_conf['submit_num'] | ||
| tp_tokens = self.id_.duplicate( | ||
| cycle=str(cycle_point), | ||
| task=name, | ||
| ) | ||
| tp_tokens = self.id_.duplicate(itask.tokens) | ||
| tproxy: Optional[PbTaskProxy] | ||
| tp_id, tproxy = self.store_node_fetcher(tp_tokens) | ||
| if not tproxy: | ||
|
|
@@ -1665,6 +1654,7 @@ def insert_job( | |
| execution_time_limit=job_conf.get('execution_time_limit'), | ||
| platform=job_conf['platform']['name'], | ||
| job_runner_name=job_conf.get('job_runner_name'), | ||
| job_id=itask.summary.get('submit_method_id'), | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note: there is no job ID for a submission failure caused by a platform lookup error, because no job submission was made.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| ) | ||
| # Not all fields are populated with some submit-failures, | ||
| # so use task cfg as base. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -794,13 +794,7 @@ def process_message( | |
| self._process_message_submitted(itask, event_time, forced) | ||
| self.spawn_children(itask, TASK_OUTPUT_SUBMITTED, forced) | ||
|
|
||
| # ... but either way update the job ID in the job proxy (it only | ||
| # comes in via the submission message). | ||
|
Comment on lines
-797
to
-798
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is now done by `insert_job()`, which sets `job_id` from `itask.summary.get('submit_method_id')`. |
||
| if itask.run_mode != RunMode.SIMULATION: | ||
| self.data_store_mgr.delta_job_attr( | ||
| itask, 'job_id', itask.summary['submit_method_id'] | ||
| ) | ||
| else: | ||
| if itask.run_mode == RunMode.SIMULATION: | ||
| # In simulation mode submitted implies started: | ||
| self.spawn_children(itask, TASK_OUTPUT_STARTED, forced) | ||
|
|
||
|
|
@@ -1465,7 +1459,6 @@ def _process_message_submit_failed( | |
| "time_submit_exit": event_time, | ||
| "submit_status": 1, | ||
| }) | ||
| itask.summary['submit_method_id'] = None | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't know why the job ID was being wiped on submit-failure.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't know either, but there may be a reason; if this is not part of the bugfix, please bump it.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, is this the job ID or the job submit method?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is the job ID in the job runner; it's part 3 of the bugfix. |
||
| LOG.error(f"[{itask}] {self.EVENT_SUBMIT_FAILED}") | ||
| if ( | ||
| forced | ||
|
|
@@ -1503,6 +1496,7 @@ def _process_message_submit_failed( | |
| self._insert_task_job( | ||
| itask, event_time, self.JOB_SUBMIT_FAIL_FLAG, forced=forced) | ||
| self.data_store_mgr.delta_job_state(itask, TASK_STATUS_SUBMIT_FAILED) | ||
| self.data_store_mgr.delta_job_time(itask, 'submitted', event_time) | ||
| self._reset_job_timers(itask) | ||
|
|
||
| return no_retries | ||
|
|
@@ -1589,11 +1583,9 @@ def _insert_task_job( | |
| except IndexError: | ||
| # we do not have access to the job config (e.g. Scheduler | ||
| # crashed) - https://github.com/cylc/cylc-flow/pull/6326 | ||
| job_id = itask.tokens.duplicate( | ||
| job=itask.submit_num | ||
| ).relative_id | ||
| LOG.warning( | ||
| f'Could not find the job configuration for "{job_id}".' | ||
| 'Could not find the job configuration for ' | ||
| f'"{itask.job_tokens.relative_id}".' | ||
| ) | ||
| itask.jobs.append({"submit_num": itask.submit_num}) | ||
| job_conf = itask.jobs[-1] | ||
|
|
@@ -1610,8 +1602,7 @@ def _insert_task_job( | |
|
|
||
| # insert job into data store | ||
| self.data_store_mgr.insert_job( | ||
| itask.tdef.name, | ||
| itask.point, | ||
| itask, | ||
| job_status, | ||
| { | ||
| **job_conf, | ||
|
|
@@ -1634,7 +1625,7 @@ def _insert_task_job( | |
| # preparation started due to intelligent host (and or | ||
| # platform) selection | ||
| 'platform_name': itask.platform['name'], | ||
| } | ||
| }, | ||
| ) | ||
|
|
||
| def _setup_job_logs_retrieval(self, itask, event) -> None: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please try to avoid passing itask objects to the data store where possible.
We have had to do this in a couple of places, but we don't need to update the remaining interfaces to match.
In theory, we are supposed to be able to populate the data store from the database (without the Scheduler or its runtime objects, e.g. TaskProxy) so we can provide offline data.
In truth, that isn't possible right now, but we should try to reduce the pain of refactoring when the time comes.