@@ -46,7 +46,10 @@ class IPC(Enum):
4646 REQUEST_ABORT = 'REQUEST-ABORT'
4747
4848
49- IPC_TEARDOWN_TIMEOUT = 5 # seconds
49+ # This needs to be at least as large as the largest compiler time limit, but we don't enforce that here.
50+ # (Otherwise, aborting during a compilation that runs longer than this timeout would result in a `TimeoutError`
51+ # internal error (IE) instead of a `CompileError`.)
52+ IPC_TIMEOUT = 60 # seconds
5053
5154
5255logger = logging .getLogger (__name__ )
@@ -302,6 +305,7 @@ class JudgeWorker:
302305 def __init__ (self , submission : Submission ) -> None :
303306 self .submission = submission
304307 self ._abort_requested = False
308+ self ._sent_sigkill_to_worker_process = False
305309 # FIXME(tbrindus): marked Any pending grader cleanups.
306310 self .grader : Any = None
307311
@@ -326,6 +330,10 @@ def communicate(self) -> Generator[Tuple[IPC, tuple], None, None]:
326330 logger .error ('Worker has not sent a message in %d seconds, assuming dead and killing.' , recv_timeout )
327331 self .worker_process .kill ()
328332 raise
333+ except EOFError :
334+ if self ._sent_sigkill_to_worker_process :
335+ raise TimeoutError ('worker did not exit in %d seconds, so it was killed' % IPC_TIMEOUT )
336+ raise
329337 except Exception :
330338 logger .error ('Failed to read IPC message from worker!' )
331339 raise
@@ -336,16 +344,17 @@ def communicate(self) -> Generator[Tuple[IPC, tuple], None, None]:
336344 else :
337345 yield ipc_type , data
338346
339- def wait_with_timeout (self , timeout = IPC_TEARDOWN_TIMEOUT ) -> None :
347+ def wait_with_timeout (self ) -> None :
340348 if self .worker_process and self .worker_process .is_alive ():
341349 # Might be None if run was never called, or failed.
342350 try :
343- self .worker_process .join (timeout = timeout )
351+ self .worker_process .join (timeout = IPC_TIMEOUT )
344352 except OSError :
345353 logger .exception ('Exception while waiting for worker to shut down, ignoring...' )
346354 finally :
347355 if self .worker_process .is_alive ():
348356 logger .error ('Worker is still alive, sending SIGKILL!' )
357+ self ._sent_sigkill_to_worker_process = True
349358 self .worker_process .kill ()
350359
351360 def request_abort_grading (self ) -> None :
@@ -426,7 +435,7 @@ def _report_unhandled_exception() -> None:
426435 # We may have failed before sending the IPC.BYE down the connection, in which case the judge will never
427436 # close its side of the connection -- so `ipc_recv_thread` will never exit. But we can't wait forever in
428437 # this case, since we're blocking the main judge from proceeding.
429- ipc_recv_thread .join (timeout = IPC_TEARDOWN_TIMEOUT )
438+ ipc_recv_thread .join (timeout = IPC_TIMEOUT )
430439 if ipc_recv_thread .is_alive ():
431440 logger .error ('Judge IPC recv thread is still alive after timeout, shutting worker down anyway!' )
432441
0 commit comments