@@ -72,6 +72,7 @@ def __post_init__(self):
72
72
# access executor specific settings
73
73
self .workflow .executor_settings
74
74
75
+ # jobDir: Directory where the job will store log, output and error files.
75
76
self .jobDir = self .workflow .executor_settings .jobdir
76
77
77
78
def run_job (self , job : JobExecutorInterface ):
@@ -156,6 +157,13 @@ def run_job(self, job: JobExecutorInterface):
156
157
except Exception as e :
157
158
raise WorkflowError (f"Failed to submit HTCondor job: { e } " )
158
159
160
+ self .logger .info (
161
+ f"Job { job .jobid } submitted to "
162
+ "HTCondor Cluster ID {submit_result.cluster()}\n "
163
+ f"The logs of the HTCondor job are stored "
164
+ f"in { self .jobDir } /{ submit_result .cluster ()} .log"
165
+ )
166
+
159
167
self .report_job_submission (
160
168
SubmittedJobInfo (job = job , external_jobid = submit_result .cluster ())
161
169
)
@@ -167,79 +175,112 @@ async def check_active_jobs(
167
175
168
176
for current_job in active_jobs :
169
177
async with self .status_rate_limiter :
170
-
171
- # Event types that report an error
172
- error_event_type = [
173
- htcondor .JobEventType .JOB_ABORTED ,
174
- htcondor .JobEventType .JOB_HELD ,
175
- htcondor .JobEventType .EXECUTABLE_ERROR ,
176
- htcondor .JobEventType .REMOTE_ERROR ,
177
- ]
178
-
179
- # Event types that report a success
180
- success_event_type = [htcondor .JobEventType .JOB_TERMINATED ]
181
-
182
- # Event types that report the job is running/other event is happening
183
- running_event_type = [
184
- htcondor .JobEventType .SUBMIT ,
185
- htcondor .JobEventType .EXECUTE ,
186
- htcondor .JobEventType .IMAGE_SIZE ,
187
- ]
188
-
189
- warning_event_type = [
190
- htcondor .JobEventType .JOB_EVICTED ,
191
- htcondor .JobEventType .JOB_SUSPENDED ,
192
- ]
193
-
194
- # Look in the log file to check the status of the job
195
- logFileName = join (self .jobDir , f"{ current_job .external_jobid } .log" )
196
- jel = htcondor .JobEventLog (logFileName )
197
-
198
- # Get the latest event from the iterator
199
- for event in jel .events (stop_after = 0 ):
200
- latest_event = event
201
-
202
- if latest_event .type in error_event_type :
203
- # Job has an error
204
- self .logger .debug (
205
- f"HTCondor job { current_job .external_jobid } has "
206
- "JobEventType {latest_event.type}."
207
- )
208
- self .report_job_error (
209
- current_job ,
210
- msg = f"HTCondor job { current_job .external_jobid } has "
211
- "JobEventType {latest_event.type}. " ,
212
- )
213
- break
214
- elif latest_event .type in running_event_type :
215
- # Job is still running/idle
216
- self .logger .debug (
217
- f"HTCondor job { current_job .external_jobid } has "
218
- "JobEventType {latest_event.type}."
178
+ # Get the status of the job from HTCondor
179
+ try :
180
+ schedd = htcondor .Schedd ()
181
+ job_status = schedd .query (
182
+ constraint = f"ClusterId == { current_job .external_jobid } " ,
183
+ projection = [
184
+ "ExitBySignal" ,
185
+ "ExitCode" ,
186
+ "ExitSignal" ,
187
+ "JobStatus" ,
188
+ ],
219
189
)
190
+ # Job is not running anymore, look
191
+ if not job_status :
192
+ job_status = schedd .history (
193
+ constraint = f"ClusterId == { current_job .external_jobid } " ,
194
+ projection = [
195
+ "ExitBySignal" ,
196
+ "ExitCode" ,
197
+ "ExitSignal" ,
198
+ "JobStatus" ,
199
+ ],
200
+ )
201
+ # Storing the one event from HistoryIterator to list
202
+ job_status = [next (job_status )]
203
+ except Exception as e :
204
+ self .logger .warning (f"Failed to retrieve HTCondor job status: { e } " )
205
+ # Assuming the job is still running and retry next time
220
206
yield current_job
221
- elif latest_event .type in warning_event_type :
222
- # Job has a warning
223
- self .logger .warning (
224
- f"HTCondor job { current_job .external_jobid } has "
225
- "obEventType {latest_event.type}."
226
- )
207
+ self .logger .debug (
208
+ f"Job { current_job .job .jobid } with HTCondor Cluster ID "
209
+ f"{ current_job .external_jobid } has status: { job_status } "
210
+ )
211
+
212
+ # Overview of HTCondor job status:
213
+ status_dict = {
214
+ "1" : "Idle" ,
215
+ "2" : "Running" ,
216
+ "3" : "Removed" ,
217
+ "4" : "Completed" ,
218
+ "5" : "Held" ,
219
+ "6" : "Transferring Output" ,
220
+ "7" : "Suspended" ,
221
+ }
222
+
223
+ # Running/idle jobs
224
+ if job_status [0 ]["JobStatus" ] in [1 , 2 , 6 , 7 ]:
225
+ if job_status [0 ]["JobStatus" ] in [7 ]:
226
+ self .logger .warning (
227
+ f"Job { current_job .job .jobid } with "
228
+ "HTCondor Cluster ID "
229
+ f"{ current_job .external_jobid } is suspended."
230
+ )
227
231
yield current_job
228
- elif latest_event . type in success_event_type :
229
- # Job is terminated
232
+ # Completed jobs
233
+ elif job_status [ 0 ][ "JobStatus" ] in [ 4 ]:
230
234
self .logger .debug (
231
- f"HTCondor job { current_job .external_jobid } has "
232
- "JobEventType {latest_event.type}."
235
+ f"Check whether Job { current_job .job .jobid } with "
236
+ "HTCondor Cluster ID "
237
+ f"{ current_job .external_jobid } was successful."
238
+ )
239
+ # Check ExitCode
240
+ if job_status [0 ]["ExitCode" ] == 0 :
241
+ # Job was successful
242
+ self .logger .debug (
243
+ f"Report Job { current_job .job .jobid } with "
244
+ "HTCondor Cluster ID "
245
+ f"{ current_job .external_jobid } success"
246
+ )
247
+ self .logger .info (
248
+ f"Job { current_job .job .jobid } with "
249
+ "HTCondor Cluster ID "
250
+ f"{ current_job .external_jobid } was successful."
251
+ )
252
+ self .report_job_success (current_job )
253
+ else :
254
+ self .logger .debug (
255
+ f"Report Job { current_job .job .jobid } with "
256
+ "HTCondor Cluster ID "
257
+ f"{ current_job .external_jobid } error"
258
+ )
259
+ self .report_job_error (
260
+ current_job ,
261
+ msg = f"Job { current_job .job .jobid } with "
262
+ "HTCondor Cluster ID "
263
+ f"{ current_job .external_jobid } has "
264
+ f" status { status_dict [str (job_status [0 ]['JobStatus' ])]} , "
265
+ "but failed with"
266
+ f"ExitCode { job_status [0 ]['ExitCode' ]} ." ,
267
+ )
268
+ # Errored jobs
269
+ elif job_status [0 ]["JobStatus" ] in [3 , 5 ]:
270
+ self .report_job_error (
271
+ current_job ,
272
+ msg = f"Job { current_job .job .jobid } with "
273
+ "HTCondor Cluster ID "
274
+ f"{ current_job .external_jobid } has "
275
+ f"status { status_dict [str (job_status [0 ]['JobStatus' ])]} ." ,
233
276
)
234
- self .report_job_success (current_job )
235
277
else :
236
- # Unsupported event type
237
- self . logger . debug (
238
- f "HTCondor job { current_job . external_jobid } has "
239
- "JobEventType {latest_event.type}. "
240
- "This event type is not supported. "
278
+ raise WorkflowError (
279
+ f"Job { current_job . job . jobid } with "
280
+ "HTCondor Cluster ID "
281
+ f" { current_job . external_jobid } has "
282
+ f"unknown HTCondor job status: { job_status [ 0 ][ 'JobStatus' ] } "
241
283
)
242
- yield current_job
243
284
244
285
def cancel_jobs (self , active_jobs : List [SubmittedJobInfo ]):
245
286
# Cancel all active jobs.
0 commit comments