Skip to content

Commit 3a65827

Browse files
authored
Merge pull request #254 from lexming/deadstart
poll job status while waiting for single-user server to be reachable
2 parents 5f84c9c + 451cf0a commit 3a65827

File tree

2 files changed

+47
-24
lines changed

2 files changed

+47
-24
lines changed

batchspawner/batchspawner.py

+7
Original file line number | Diff line number | Diff line change
@@ -450,6 +450,13 @@ async def start(self):
450450
# don't actually run the single-user server yet.
451451
if hasattr(self, "mock_port"):
452452
self.port = self.mock_port
453+
# Check if job is still running
454+
status = await self.poll()
455+
if status:
456+
raise RuntimeError(
457+
"The Jupyter batch job started"
458+
" but died before launching the single-user server."
459+
)
453460

454461
self.db.commit()
455462
self.log.info(

batchspawner/tests/test_spawners.py

+40-24
Original file line number | Diff line number | Diff line change
@@ -349,20 +349,19 @@ async def test_torque(db, event_loop):
349349
re.compile(r"ppn=5"),
350350
re.compile(r"^#PBS some_option_asdf", re.M),
351351
]
352+
poll_running = (
353+
re.compile(r"sudo.*qstat"),
354+
f"<job_state>R</job_state><exec_host>{testhost}/1</exec_host>",
355+
)
352356
script = [
353357
(re.compile(r"sudo.*qsub"), str(testjob)),
354358
(
355359
re.compile(r"sudo.*qstat"),
356360
"<job_state>Q</job_state><exec_host></exec_host>",
357361
), # pending
358-
(
359-
re.compile(r"sudo.*qstat"),
360-
f"<job_state>R</job_state><exec_host>{testhost}/1</exec_host>",
361-
), # running
362-
(
363-
re.compile(r"sudo.*qstat"),
364-
f"<job_state>R</job_state><exec_host>{testhost}/1</exec_host>",
365-
), # running
362+
poll_running,
363+
poll_running,
364+
poll_running,
366365
(re.compile(r"sudo.*qdel"), "STOP"),
367366
(re.compile(r"sudo.*qstat"), ""),
368367
]
@@ -394,17 +393,16 @@ async def test_moab(db, event_loop):
394393
re.compile(r"ppn=5"),
395394
re.compile(r"^#PBS some_option_asdf", re.M),
396395
]
396+
poll_running = (
397+
re.compile(r"sudo.*mdiag"),
398+
f'State="Running" AllocNodeList="{testhost}"',
399+
)
397400
script = [
398401
(re.compile(r"sudo.*msub"), str(testjob)),
399402
(re.compile(r"sudo.*mdiag"), 'State="Idle"'), # pending
400-
(
401-
re.compile(r"sudo.*mdiag"),
402-
f'State="Running" AllocNodeList="{testhost}"',
403-
), # running
404-
(
405-
re.compile(r"sudo.*mdiag"),
406-
f'State="Running" AllocNodeList="{testhost}"',
407-
), # running
403+
poll_running,
404+
poll_running,
405+
poll_running,
408406
(re.compile(r"sudo.*mjobctl.*-c"), "STOP"),
409407
(re.compile(r"sudo.*mdiag"), ""),
410408
]
@@ -436,17 +434,16 @@ async def test_pbs(db, event_loop):
436434
re.compile(r"@some_pbs_admin_node"),
437435
re.compile(r"^#PBS some_option_asdf", re.M),
438436
]
437+
poll_running = (
438+
re.compile(r"sudo.*qstat"),
439+
f"job_state = R\nexec_host = {testhost}/2*1",
440+
)
439441
script = [
440442
(re.compile(r"sudo.*qsub"), str(testjob)),
441443
(re.compile(r"sudo.*qstat"), "job_state = Q"), # pending
442-
(
443-
re.compile(r"sudo.*qstat"),
444-
f"job_state = R\nexec_host = {testhost}/2*1",
445-
), # running
446-
(
447-
re.compile(r"sudo.*qstat"),
448-
f"job_state = R\nexec_host = {testhost}/2*1",
449-
), # running
444+
poll_running,
445+
poll_running,
446+
poll_running,
450447
(re.compile(r"sudo.*qdel"), "STOP"),
451448
(re.compile(r"sudo.*qstat"), ""),
452449
]
@@ -504,6 +501,7 @@ async def test_slurm(db, event_loop):
504501
), # unknown
505502
(re.compile(r"sudo.*squeue"), "RUNNING " + testhost), # running
506503
(re.compile(r"sudo.*squeue"), "RUNNING " + testhost),
504+
(re.compile(r"sudo.*squeue"), "RUNNING " + testhost),
507505
(re.compile(r"sudo.*scancel"), "STOP"),
508506
(re.compile(r"sudo.*squeue"), ""),
509507
]
@@ -573,6 +571,7 @@ async def test_condor(db, event_loop):
573571
(re.compile(r"sudo.*condor_q"), "1,"), # pending
574572
(re.compile(r"sudo.*condor_q"), f"2, @{testhost}"), # running
575573
(re.compile(r"sudo.*condor_q"), f"2, @{testhost}"),
574+
(re.compile(r"sudo.*condor_q"), f"2, @{testhost}"),
576575
(re.compile(r"sudo.*condor_rm"), "STOP"),
577576
(re.compile(r"sudo.*condor_q"), ""),
578577
]
@@ -611,6 +610,7 @@ async def test_lfs(db, event_loop):
611610
(re.compile(r"sudo.*bjobs"), "PEND "), # pending
612611
(re.compile(r"sudo.*bjobs"), f"RUN {testhost}"), # running
613612
(re.compile(r"sudo.*bjobs"), f"RUN {testhost}"),
613+
(re.compile(r"sudo.*bjobs"), f"RUN {testhost}"),
614614
(re.compile(r"sudo.*bkill"), "STOP"),
615615
(re.compile(r"sudo.*bjobs"), ""),
616616
]
@@ -652,3 +652,19 @@ async def test_keepvars(db, event_loop):
652652
spawner_kwargs=spawner_kwargs,
653653
batch_script_re_list=batch_script_re_list,
654654
)
655+
656+
657+
async def test_early_stop(db, event_loop):
658+
script = [
659+
(re.compile(r"sudo.*sbatch"), str(testjob)),
660+
(re.compile(r"sudo.*squeue"), "PENDING "), # pending
661+
(
662+
re.compile(r"sudo.*squeue"),
663+
"slurm_load_jobs error: Unable to contact slurm controller",
664+
), # unknown
665+
# job exits early during start
666+
(re.compile(r"sudo.*squeue"), ""),
667+
(re.compile(r"sudo.*scancel"), "STOP"),
668+
]
669+
with pytest.raises(RuntimeError, match="job has disappeared"):
670+
await run_spawner_script(db, SlurmSpawner, script)

0 commit comments

Comments
 (0)