Skip to content

Commit 0ee1b47

Browse files
authored
fix: job status method (#6)
1 parent f721f0d commit 0ee1b47

File tree

1 file changed

+107
-66
lines changed

1 file changed

+107
-66
lines changed

Diff for: snakemake_executor_plugin_htcondor/__init__.py

+107-66
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ def __post_init__(self):
7272
# access executor specific settings
7373
self.workflow.executor_settings
7474

75+
# jobDir: Directory where the job will store log, output and error files.
7576
self.jobDir = self.workflow.executor_settings.jobdir
7677

7778
def run_job(self, job: JobExecutorInterface):
@@ -156,6 +157,13 @@ def run_job(self, job: JobExecutorInterface):
156157
except Exception as e:
157158
raise WorkflowError(f"Failed to submit HTCondor job: {e}")
158159

160+
self.logger.info(
161+
f"Job {job.jobid} submitted to "
162+
"HTCondor Cluster ID {submit_result.cluster()}\n"
163+
f"The logs of the HTCondor job are stored "
164+
f"in {self.jobDir}/{submit_result.cluster()}.log"
165+
)
166+
159167
self.report_job_submission(
160168
SubmittedJobInfo(job=job, external_jobid=submit_result.cluster())
161169
)
@@ -167,79 +175,112 @@ async def check_active_jobs(
167175

168176
for current_job in active_jobs:
169177
async with self.status_rate_limiter:
170-
171-
# Event types that report an error
172-
error_event_type = [
173-
htcondor.JobEventType.JOB_ABORTED,
174-
htcondor.JobEventType.JOB_HELD,
175-
htcondor.JobEventType.EXECUTABLE_ERROR,
176-
htcondor.JobEventType.REMOTE_ERROR,
177-
]
178-
179-
# Event types that report a success
180-
success_event_type = [htcondor.JobEventType.JOB_TERMINATED]
181-
182-
# Event types that report the job is running/other event is happening
183-
running_event_type = [
184-
htcondor.JobEventType.SUBMIT,
185-
htcondor.JobEventType.EXECUTE,
186-
htcondor.JobEventType.IMAGE_SIZE,
187-
]
188-
189-
warning_event_type = [
190-
htcondor.JobEventType.JOB_EVICTED,
191-
htcondor.JobEventType.JOB_SUSPENDED,
192-
]
193-
194-
# Look in the log file to check the status of the job
195-
logFileName = join(self.jobDir, f"{current_job.external_jobid}.log")
196-
jel = htcondor.JobEventLog(logFileName)
197-
198-
# Get the latest event from the iterator
199-
for event in jel.events(stop_after=0):
200-
latest_event = event
201-
202-
if latest_event.type in error_event_type:
203-
# Job has an error
204-
self.logger.debug(
205-
f"HTCondor job {current_job.external_jobid} has "
206-
"JobEventType {latest_event.type}."
207-
)
208-
self.report_job_error(
209-
current_job,
210-
msg=f"HTCondor job {current_job.external_jobid} has "
211-
"JobEventType {latest_event.type}. ",
212-
)
213-
break
214-
elif latest_event.type in running_event_type:
215-
# Job is still running/idle
216-
self.logger.debug(
217-
f"HTCondor job {current_job.external_jobid} has "
218-
"JobEventType {latest_event.type}."
178+
# Get the status of the job from HTCondor
179+
try:
180+
schedd = htcondor.Schedd()
181+
job_status = schedd.query(
182+
constraint=f"ClusterId == {current_job.external_jobid}",
183+
projection=[
184+
"ExitBySignal",
185+
"ExitCode",
186+
"ExitSignal",
187+
"JobStatus",
188+
],
219189
)
190+
# Job is not running anymore, look it up in the schedd history
191+
if not job_status:
192+
job_status = schedd.history(
193+
constraint=f"ClusterId == {current_job.external_jobid}",
194+
projection=[
195+
"ExitBySignal",
196+
"ExitCode",
197+
"ExitSignal",
198+
"JobStatus",
199+
],
200+
)
201+
# Storing the one event from HistoryIterator to list
202+
job_status = [next(job_status)]
203+
except Exception as e:
204+
self.logger.warning(f"Failed to retrieve HTCondor job status: {e}")
205+
# Assuming the job is still running and retry next time
220206
yield current_job
221-
elif latest_event.type in warning_event_type:
222-
# Job has a warning
223-
self.logger.warning(
224-
f"HTCondor job {current_job.external_jobid} has "
225-
"obEventType {latest_event.type}."
226-
)
207+
self.logger.debug(
208+
f"Job {current_job.job.jobid} with HTCondor Cluster ID "
209+
f"{current_job.external_jobid} has status: {job_status}"
210+
)
211+
212+
# Overview of HTCondor job status:
213+
status_dict = {
214+
"1": "Idle",
215+
"2": "Running",
216+
"3": "Removed",
217+
"4": "Completed",
218+
"5": "Held",
219+
"6": "Transferring Output",
220+
"7": "Suspended",
221+
}
222+
223+
# Running/idle jobs
224+
if job_status[0]["JobStatus"] in [1, 2, 6, 7]:
225+
if job_status[0]["JobStatus"] in [7]:
226+
self.logger.warning(
227+
f"Job {current_job.job.jobid} with "
228+
"HTCondor Cluster ID "
229+
f"{current_job.external_jobid} is suspended."
230+
)
227231
yield current_job
228-
elif latest_event.type in success_event_type:
229-
# Job is terminated
232+
# Completed jobs
233+
elif job_status[0]["JobStatus"] in [4]:
230234
self.logger.debug(
231-
f"HTCondor job {current_job.external_jobid} has "
232-
"JobEventType {latest_event.type}."
235+
f"Check whether Job {current_job.job.jobid} with "
236+
"HTCondor Cluster ID "
237+
f"{current_job.external_jobid} was successful."
238+
)
239+
# Check ExitCode
240+
if job_status[0]["ExitCode"] == 0:
241+
# Job was successful
242+
self.logger.debug(
243+
f"Report Job {current_job.job.jobid} with "
244+
"HTCondor Cluster ID "
245+
f"{current_job.external_jobid} success"
246+
)
247+
self.logger.info(
248+
f"Job {current_job.job.jobid} with "
249+
"HTCondor Cluster ID "
250+
f"{current_job.external_jobid} was successful."
251+
)
252+
self.report_job_success(current_job)
253+
else:
254+
self.logger.debug(
255+
f"Report Job {current_job.job.jobid} with "
256+
"HTCondor Cluster ID "
257+
f"{current_job.external_jobid} error"
258+
)
259+
self.report_job_error(
260+
current_job,
261+
msg=f"Job {current_job.job.jobid} with "
262+
"HTCondor Cluster ID "
263+
f"{current_job.external_jobid} has "
264+
f" status {status_dict[str(job_status[0]['JobStatus'])]}, "
265+
"but failed with"
266+
f"ExitCode {job_status[0]['ExitCode']}.",
267+
)
268+
# Errored jobs
269+
elif job_status[0]["JobStatus"] in [3, 5]:
270+
self.report_job_error(
271+
current_job,
272+
msg=f"Job {current_job.job.jobid} with "
273+
"HTCondor Cluster ID "
274+
f"{current_job.external_jobid} has "
275+
f"status {status_dict[str(job_status[0]['JobStatus'])]}.",
233276
)
234-
self.report_job_success(current_job)
235277
else:
236-
# Unsupported event type
237-
self.logger.debug(
238-
f"HTCondor job {current_job.external_jobid} has "
239-
"JobEventType {latest_event.type}."
240-
"This event type is not supported."
278+
raise WorkflowError(
279+
f"Job {current_job.job.jobid} with "
280+
"HTCondor Cluster ID "
281+
f"{current_job.external_jobid} has "
282+
f"unknown HTCondor job status: {job_status[0]['JobStatus']}"
241283
)
242-
yield current_job
243284

244285
def cancel_jobs(self, active_jobs: List[SubmittedJobInfo]):
245286
# Cancel all active jobs.

0 commit comments

Comments
 (0)