File tree Expand file tree Collapse file tree 2 files changed +7
-1
lines changed
Expand file tree Collapse file tree 2 files changed +7
-1
lines changed Original file line number Diff line number Diff line change @@ -93,6 +93,8 @@ def cancel_empty_pending_jobs(db_collection_name: str, *sacred_ids: int):
9393 {'execution' },
9494 )
9595 )
96+
97+ # We exclude all SLURM jobs that are needed for continuing a RESCHEDULED experiment
9698 rescheduled_ids = []
9799 for exp in rescheduled_exps :
98100 execution = exp ['execution' ]
@@ -672,7 +674,7 @@ def detect_killed(db_collection_name: str, print_detected: bool = True):
672674 cluster = get_cluster_name ()
673675 exps = collection .find (
674676 {
675- 'status' : {'$in' : [* States .PENDING , * States .RUNNING ]},
677+ 'status' : {'$in' : [* States .PENDING , * States .RUNNING , * States . RESCHEDULED ]},
676678 'execution.cluster' : cluster , # only check experiments that are running on the current cluster
677679 # Previously we only checked for started experiments by including the following line:
678680 # 'host': {'$exists': True}, # only check experiments that have been started
Original file line number Diff line number Diff line change @@ -1133,6 +1133,8 @@ def claim_experiment(db_collection_name: str, exp_ids: Sequence[int]):
11331133 'execution.task_id' : task_id ,
11341134 'execution.cluster' : cluster_name ,
11351135 }
1136+ # First, we check whether this SLURM job is responsible for a RESCHEDULED experiment.
1137+ # If so, we claim that experiment first.
11361138 exp = collection .find_one_and_update (
11371139 {
11381140 '_id' : {'$in' : list (exp_ids )},
@@ -1144,6 +1146,8 @@ def claim_experiment(db_collection_name: str, exp_ids: Sequence[int]):
11441146 {'$set' : {'status' : States .RUNNING [0 ], ** update }},
11451147 {'_id' : 1 , 'slurm' : 1 },
11461148 )
1149+ # Only after we have checked that this job is not responsible for a RESCHEDULED experiment
1150+ # can we pick up a pending one.
11471151 if exp is None :
11481152 exp = collection .find_one_and_update (
11491153 {
You can’t perform that action at this time.
0 commit comments