Skip to content

Commit 5eb5f59

Browse files
committed
judge: be more graceful around worker timeouts
- Increase the timeout to 60s to avoid killing the worker when a user has aborted a long-running compilation.
- Rather than display `EOFError` when we kill the worker, display a more informative `TimeoutError` instead.
1 parent e44898f commit 5eb5f59

File tree

1 file changed

+13
-4
lines changed

1 file changed

+13
-4
lines changed

dmoj/judge.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,10 @@ class IPC(Enum):
4646
REQUEST_ABORT = 'REQUEST-ABORT'
4747

4848

49-
IPC_TEARDOWN_TIMEOUT = 5 # seconds
49+
# This needs to be at least as large as the timeout for the largest compiler time limit, but we don't enforce that here.
50+
# (Otherwise, aborting during a compilation that exceeds this time limit would result in a `TimeoutError` IE instead of
51+
# a `CompileError`.)
52+
IPC_TIMEOUT = 60 # seconds
5053

5154

5255
logger = logging.getLogger(__name__)
@@ -302,6 +305,7 @@ class JudgeWorker:
302305
def __init__(self, submission: Submission) -> None:
303306
self.submission = submission
304307
self._abort_requested = False
308+
self._sent_sigkill_to_worker_process = False
305309
# FIXME(tbrindus): marked Any pending grader cleanups.
306310
self.grader: Any = None
307311

@@ -326,6 +330,10 @@ def communicate(self) -> Generator[Tuple[IPC, tuple], None, None]:
326330
logger.error('Worker has not sent a message in %d seconds, assuming dead and killing.', recv_timeout)
327331
self.worker_process.kill()
328332
raise
333+
except EOFError:
334+
if self._sent_sigkill_to_worker_process:
335+
raise TimeoutError('worker did not exit in %d seconds, so it was killed' % IPC_TIMEOUT)
336+
raise
329337
except Exception:
330338
logger.error('Failed to read IPC message from worker!')
331339
raise
@@ -336,16 +344,17 @@ def communicate(self) -> Generator[Tuple[IPC, tuple], None, None]:
336344
else:
337345
yield ipc_type, data
338346

339-
def wait_with_timeout(self, timeout=IPC_TEARDOWN_TIMEOUT) -> None:
347+
def wait_with_timeout(self) -> None:
340348
if self.worker_process and self.worker_process.is_alive():
341349
# Might be None if run was never called, or failed.
342350
try:
343-
self.worker_process.join(timeout=timeout)
351+
self.worker_process.join(timeout=IPC_TIMEOUT)
344352
except OSError:
345353
logger.exception('Exception while waiting for worker to shut down, ignoring...')
346354
finally:
347355
if self.worker_process.is_alive():
348356
logger.error('Worker is still alive, sending SIGKILL!')
357+
self._sent_sigkill_to_worker_process = True
349358
self.worker_process.kill()
350359

351360
def request_abort_grading(self) -> None:
@@ -426,7 +435,7 @@ def _report_unhandled_exception() -> None:
426435
# We may have failed before sending the IPC.BYE down the connection, in which case the judge will never
427436
# close its side of the connection -- so `ipc_recv_thread` will never exit. But we can't wait forever in
428437
# this case, since we're blocking the main judge from proceeding.
429-
ipc_recv_thread.join(timeout=IPC_TEARDOWN_TIMEOUT)
438+
ipc_recv_thread.join(timeout=IPC_TIMEOUT)
430439
if ipc_recv_thread.is_alive():
431440
logger.error('Judge IPC recv thread is still alive after timeout, shutting worker down anyway!')
432441

0 commit comments

Comments
 (0)