2
2
import logging
3
3
import multiprocessing
4
4
import os
5
+ import shutil
5
6
import signal
6
7
import sys
8
+ import tempfile
7
9
import threading
8
10
import traceback
9
11
from enum import Enum
@@ -41,14 +43,14 @@ class IPC(Enum):
41
43
BATCH_END = 'BATCH-END'
42
44
GRADING_BEGIN = 'GRADING-BEGIN'
43
45
GRADING_END = 'GRADING-END'
44
- GRADING_ABORTED = 'GRADING-ABORTED'
45
46
UNHANDLED_EXCEPTION = 'UNHANDLED-EXCEPTION'
46
- REQUEST_ABORT = 'REQUEST-ABORT'
47
+
48
+
49
+ class JudgeWorkerAborted (Exception ):
50
+ pass
47
51
48
52
49
53
# This needs to be at least as large as the timeout for the largest compiler time limit, but we don't enforce that here.
50
- # (Otherwise, aborting during a compilation that exceeds this time limit would result in a `TimeoutError` IE instead of
51
- # a `CompileError`.)
52
54
IPC_TIMEOUT = 60 # seconds
53
55
54
56
@@ -128,8 +130,6 @@ def begin_grading(self, submission: Submission, report=logger.info, blocking=Fal
128
130
)
129
131
)
130
132
131
- # FIXME(tbrindus): what if we receive an abort from the judge before IPC handshake completes? We'll send
132
- # an abort request down the pipe, possibly messing up the handshake.
133
133
self .current_judge_worker = JudgeWorker (submission )
134
134
135
135
ipc_ready_signal = threading .Event ()
@@ -147,13 +147,19 @@ def _grading_thread_main(self, ipc_ready_signal: threading.Event, report) -> Non
147
147
assert self .current_judge_worker is not None
148
148
149
149
try :
150
+ worker_tempdir = None
151
+
152
+ def _ipc_hello (_report , tempdir : str ):
153
+ nonlocal worker_tempdir
154
+ ipc_ready_signal .set ()
155
+ worker_tempdir = tempdir
156
+
150
157
ipc_handler_dispatch : Dict [IPC , Callable ] = {
151
- IPC .HELLO : lambda _report : ipc_ready_signal . set () ,
158
+ IPC .HELLO : _ipc_hello ,
152
159
IPC .COMPILE_ERROR : self ._ipc_compile_error ,
153
160
IPC .COMPILE_MESSAGE : self ._ipc_compile_message ,
154
161
IPC .GRADING_BEGIN : self ._ipc_grading_begin ,
155
162
IPC .GRADING_END : self ._ipc_grading_end ,
156
- IPC .GRADING_ABORTED : self ._ipc_grading_aborted ,
157
163
IPC .BATCH_BEGIN : self ._ipc_batch_begin ,
158
164
IPC .BATCH_END : self ._ipc_batch_end ,
159
165
IPC .RESULT : self ._ipc_result ,
@@ -176,12 +182,17 @@ def _grading_thread_main(self, ipc_ready_signal: threading.Event, report) -> Non
176
182
% (self .current_submission .problem_id , self .current_submission .id )
177
183
)
178
184
)
185
+ except JudgeWorkerAborted :
186
+ self .packet_manager .submission_aborted_packet ()
179
187
except Exception : # noqa: E722, we want to catch everything
180
188
self .log_internal_error ()
181
189
finally :
182
190
self .current_judge_worker .wait_with_timeout ()
183
191
self .current_judge_worker = None
184
192
193
+ if worker_tempdir :
194
+ shutil .rmtree (worker_tempdir )
195
+
185
196
# Might not have been set if an exception was encountered before HELLO message, so signal here to keep the
186
197
# other side from waiting forever.
187
198
ipc_ready_signal .set ()
@@ -232,10 +243,6 @@ def _ipc_batch_begin(self, report, batch_number: int) -> None:
232
243
def _ipc_batch_end (self , _report , _batch_number : int ) -> None :
233
244
self .packet_manager .batch_end_packet ()
234
245
235
- def _ipc_grading_aborted (self , report ) -> None :
236
- self .packet_manager .submission_aborted_packet ()
237
- report (ansi_style ('#ansi[Forcefully terminating grading. Temporary files may not be deleted.](red|bold)' ))
238
-
239
246
def _ipc_unhandled_exception (self , _report , message : str ) -> None :
240
247
logger .error ('Unhandled exception in worker process' )
241
248
self .log_internal_error (message = message )
@@ -254,10 +261,9 @@ def abort_grading(self, submission_id: Optional[int] = None) -> None:
254
261
'Received abortion request for %d, but %d is currently running' , submission_id , worker .submission .id
255
262
)
256
263
else :
257
- logger .info ('Received abortion request for %d' , worker .submission .id )
258
- # These calls are idempotent, so it doesn't matter if we raced and the worker has exited already.
259
- worker .request_abort_grading ()
260
- worker .wait_with_timeout ()
264
+ logger .info ('Received abortion request for %d, killing worker' , worker .submission .id )
265
+ # This call is idempotent, so it doesn't matter if we raced and the worker has exited already.
266
+ worker .abort_grading__kill_worker ()
261
267
262
268
def listen (self ) -> None :
263
269
"""
@@ -270,7 +276,8 @@ def murder(self) -> None:
270
276
"""
271
277
End any submission currently executing, and exit the judge.
272
278
"""
273
- self .abort_grading ()
279
+ if self .current_judge_worker :
280
+ self .current_judge_worker .abort_grading__kill_worker ()
274
281
self .updater_exit = True
275
282
self .updater_signal .set ()
276
283
if self .packet_manager :
@@ -304,8 +311,8 @@ def log_internal_error(self, exc: Optional[BaseException] = None, message: Optio
304
311
class JudgeWorker :
305
312
def __init__ (self , submission : Submission ) -> None :
306
313
self .submission = submission
307
- self ._abort_requested = False
308
- self ._sent_sigkill_to_worker_process = False
314
+ self ._aborted = False
315
+ self ._timed_out = False
309
316
# FIXME(tbrindus): marked Any pending grader cleanups.
310
317
self .grader : Any = None
311
318
@@ -331,8 +338,12 @@ def communicate(self) -> Generator[Tuple[IPC, tuple], None, None]:
331
338
self .worker_process .kill ()
332
339
raise
333
340
except EOFError :
334
- if self ._sent_sigkill_to_worker_process :
335
- raise TimeoutError ('worker did not exit in %d seconds, so it was killed' % IPC_TIMEOUT )
341
+ if self ._aborted :
342
+ raise JudgeWorkerAborted () from None
343
+
344
+ if self ._timed_out :
345
+ raise TimeoutError ('worker did not exit in %d seconds, so it was killed' % IPC_TIMEOUT ) from None
346
+
336
347
raise
337
348
except Exception :
338
349
logger .error ('Failed to read IPC message from worker!' )
@@ -354,16 +365,14 @@ def wait_with_timeout(self) -> None:
354
365
finally :
355
366
if self .worker_process .is_alive ():
356
367
logger .error ('Worker is still alive, sending SIGKILL!' )
357
- self ._sent_sigkill_to_worker_process = True
368
+ self ._timed_out = True
358
369
self .worker_process .kill ()
359
370
360
- def request_abort_grading (self ) -> None :
361
- assert self .worker_process_conn
362
-
363
- try :
364
- self .worker_process_conn .send ((IPC .REQUEST_ABORT , ()))
365
- except Exception :
366
- logger .exception ('Failed to send abort request to worker, did it race?' )
371
+ def abort_grading__kill_worker (self ) -> None :
372
+ if self .worker_process and self .worker_process .is_alive ():
373
+ self ._aborted = True
374
+ self .worker_process .kill ()
375
+ self .worker_process .join (timeout = 1 )
367
376
368
377
def _worker_process_main (
369
378
self ,
@@ -384,15 +393,12 @@ def _ipc_recv_thread_main() -> None:
384
393
while True :
385
394
try :
386
395
ipc_type , data = judge_process_conn .recv ()
387
- except : # noqa: E722, whatever happened, we have to abort now.
396
+ except : # noqa: E722, whatever happened, we have to exit now.
388
397
logger .exception ('Judge unexpectedly hung up!' )
389
- self ._do_abort ()
390
398
return
391
399
392
400
if ipc_type == IPC .BYE :
393
401
return
394
- elif ipc_type == IPC .REQUEST_ABORT :
395
- self ._do_abort ()
396
402
else :
397
403
raise RuntimeError ('worker got unexpected IPC message from judge: %s' % ((ipc_type , data ),))
398
404
@@ -402,9 +408,12 @@ def _report_unhandled_exception() -> None:
402
408
judge_process_conn .send ((IPC .UNHANDLED_EXCEPTION , (message ,)))
403
409
judge_process_conn .send ((IPC .BYE , ()))
404
410
411
+ tempdir = tempfile .mkdtemp ('dmoj-judge-worker' )
412
+ tempfile .tempdir = tempdir
413
+
405
414
ipc_recv_thread = None
406
415
try :
407
- judge_process_conn .send ((IPC .HELLO , ()))
416
+ judge_process_conn .send ((IPC .HELLO , (tempdir , )))
408
417
409
418
ipc_recv_thread = threading .Thread (target = _ipc_recv_thread_main , daemon = True )
410
419
ipc_recv_thread .start ()
@@ -439,15 +448,6 @@ def _report_unhandled_exception() -> None:
439
448
if ipc_recv_thread .is_alive ():
440
449
logger .error ('Judge IPC recv thread is still alive after timeout, shutting worker down anyway!' )
441
450
442
- # FIXME(tbrindus): we need to do this because cleaning up temporary directories happens on __del__, which
443
- # won't get called if we exit the process right now (so we'd leak all files created by the grader). This
444
- # should be refactored to have an explicit `cleanup()` or similar, rather than relying on refcounting
445
- # working out.
446
- from dmoj .executors .compiled_executor import _CompiledExecutorMeta
447
-
448
- for cached_executor in _CompiledExecutorMeta .compiled_binary_cache .values ():
449
- cached_executor .is_cached = False
450
- cached_executor .cleanup ()
451
451
self .grader = None
452
452
453
453
def _grade_cases (self ) -> Generator [Tuple [IPC , tuple ], None , None ]:
@@ -503,11 +503,6 @@ def _grade_cases(self) -> Generator[Tuple[IPC, tuple], None, None]:
503
503
else :
504
504
result = self .grader .grade (case )
505
505
506
- # If the submission was killed due to a user-initiated abort, any result is meaningless.
507
- if self ._abort_requested :
508
- yield IPC .GRADING_ABORTED , ()
509
- return
510
-
511
506
if result .result_flag & Result .WA :
512
507
# If we failed a 0-point case, we will short-circuit every case after this.
513
508
is_short_circuiting_enabled |= not case .points
@@ -532,11 +527,6 @@ def _grade_cases(self) -> Generator[Tuple[IPC, tuple], None, None]:
532
527
533
528
yield IPC .GRADING_END , ()
534
529
535
- def _do_abort (self ) -> None :
536
- self ._abort_requested = True
537
- if self .grader :
538
- self .grader .abort_grading ()
539
-
540
530
541
531
class ClassicJudge (Judge ):
542
532
def __init__ (self , host , port , ** kwargs ) -> None :
0 commit comments