Skip to content

Commit 9c65e94

Browse files
authored
Check if job is complete before raising (#285)
* patch out unspecified error * change the job id to correct format * add check against job status * add changelog * Update date for version 2.2.1 in CHANGELOG
1 parent 8847d1b commit 9c65e94

File tree

2 files changed

+24
-7
lines changed

2 files changed

+24
-7
lines changed

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
**2.2.1 - 01/06/26**
2+
3+
- Fix noisy psimulate output error
4+
15
**2.2.0 - 01/02/26**
26

37
- Add output data file splitting

src/vivarium_cluster_tools/psimulate/cluster/interface.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,19 +72,32 @@ def submit_worker_jobs(
7272
jt.nativeSpecification = native_specification.to_cli_args()
7373

7474
job_ids = s.runBulkJobs(jt, 1, num_workers, 1)
75-
array_job_id = job_ids[0].split(".")[0]
75+
array_job_id = job_ids[0].split("_")[0]
7676

7777
def kill_jobs() -> None:
7878
try:
7979
s.control(array_job_id, drmaa.JobControlAction.TERMINATE)
80-
# FIXME: Hack around issue where drmaa.errors sometimes doesn't
81-
# exist.
8280
except Exception as e:
83-
if "already completing" in str(e) or "Invalid job" in str(e):
84-
# This is the case where all our workers have already shut down
85-
# on their own, which isn't actually an error.
81+
# Check if the job already finished - if so, this error is expected
82+
try:
83+
status = s.jobStatus(array_job_id)
84+
if status in (drmaa.JobState.DONE, drmaa.JobState.FAILED):
85+
return # Job already finished, nothing to do
86+
except Exception:
87+
# If we can't get status, fall back to string matching
8688
pass
87-
else:
89+
90+
# FIXME: Hack around issue where drmaa.errors sometimes doesn't
91+
# exist.
92+
error_msg = str(e)
93+
# These errors occur when workers have already shut down on their own,
94+
# which isn't actually an error. "Unspecified error" is slurm-drmaa's
95+
# poor translation of ESLURM_ALREADY_DONE (errno 2021).
96+
expected_errors = [
97+
"already completing",
98+
"Invalid job",
99+
]
100+
if not any(err in error_msg for err in expected_errors):
88101
raise
89102

90103
atexit.register(kill_jobs)

0 commit comments

Comments
 (0)