Skip to content

Commit abd2a7d

Browse files
committed
added comments and added RESCHEDULED to detect_killed
1 parent 967ee55 commit abd2a7d

File tree

2 files changed

+7
-1
lines changed

2 files changed

+7
-1
lines changed

src/seml/commands/manage.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ def cancel_empty_pending_jobs(db_collection_name: str, *sacred_ids: int):
9393
{'execution'},
9494
)
9595
)
96+
97+
# We exclude all SLURM jobs that are needed for continuing a RESCHEDULED experiment
9698
rescheduled_ids = []
9799
for exp in rescheduled_exps:
98100
execution = exp['execution']
@@ -672,7 +674,7 @@ def detect_killed(db_collection_name: str, print_detected: bool = True):
672674
cluster = get_cluster_name()
673675
exps = collection.find(
674676
{
675-
'status': {'$in': [*States.PENDING, *States.RUNNING]},
677+
'status': {'$in': [*States.PENDING, *States.RUNNING, *States.RESCHEDULED]},
676678
'execution.cluster': cluster, # only check experiments that are running on the current cluster
677679
# Previously we only checked for started experiments by including the following line:
678680
# 'host': {'$exists': True}, # only check experiments that have been started

src/seml/commands/start.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1133,6 +1133,8 @@ def claim_experiment(db_collection_name: str, exp_ids: Sequence[int]):
11331133
'execution.task_id': task_id,
11341134
'execution.cluster': cluster_name,
11351135
}
1136+
# First, we check whether this SLURM job is responsible for a RESCHEDULED experiment
1137+
# If so, we claim that experiment first.
11361138
exp = collection.find_one_and_update(
11371139
{
11381140
'_id': {'$in': list(exp_ids)},
@@ -1144,6 +1146,8 @@ def claim_experiment(db_collection_name: str, exp_ids: Sequence[int]):
11441146
{'$set': {'status': States.RUNNING[0], **update}},
11451147
{'_id': 1, 'slurm': 1},
11461148
)
1149+
# Only after we have checked that this job is not responsible for a RESCHEDULED experiment
1150+
# can we pick up a pending one.
11471151
if exp is None:
11481152
exp = collection.find_one_and_update(
11491153
{

0 commit comments

Comments
 (0)