Skip to content

Commit 5641999

Browse files
authored
Tigru/fix/experiments killing each other (#159)
* Reclaiming rescheduled experiments is now preferred; rescheduled experiments can no longer be claimed by other jobs. * Added a filter on rescheduled experiments in cancel_empty_pending_jobs. * Added comments and added RESCHEDULED to detect_killed.
1 parent 22a9ee8 commit 5641999

File tree

2 files changed

+34
-2
lines changed

2 files changed

+34
-2
lines changed

src/seml/commands/manage.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,12 +87,27 @@ def cancel_empty_pending_jobs(db_collection_name: str, *sacred_ids: int):
8787
# There are still pending experiments, we don't want to cancel the jobs.
8888
return
8989
pending_exps = list(collection.find({'_id': {'$in': sacred_ids}}, {'slurm'}))
90+
rescheduled_exps = list(
91+
collection.find(
92+
{'_id': {'$in': sacred_ids}, 'status': {'$in': States.RESCHEDULED}},
93+
{'execution'},
94+
)
95+
)
96+
97+
# We exclude all SLURM jobs that are needed for continuing a RESCHEDULED experiment
98+
rescheduled_ids = []
99+
for exp in rescheduled_exps:
100+
execution = exp['execution']
101+
if 'array_id' in execution:
102+
rescheduled_ids.append(execution['array_id'])
103+
90104
array_ids = {
91105
conf['array_id']
92106
for exp in pending_exps
93107
for conf in exp['slurm']
94108
if 'array_id' in conf
95109
}
110+
array_ids = array_ids.difference(rescheduled_ids)
96111
# Only cancel the pending jobs
97112
cancel_slurm_jobs(*array_ids, state=SETTINGS.SLURM_STATES.PENDING[0])
98113

@@ -659,7 +674,7 @@ def detect_killed(db_collection_name: str, print_detected: bool = True):
659674
cluster = get_cluster_name()
660675
exps = collection.find(
661676
{
662-
'status': {'$in': [*States.PENDING, *States.RUNNING]},
677+
'status': {'$in': [*States.PENDING, *States.RUNNING, *States.RESCHEDULED]},
663678
'execution.cluster': cluster, # only check experiments that are running on the current cluster
664679
# Previously we only checked for started experiments by including the following line:
665680
# 'host': {'$exists': True}, # only check experiments that have been started

src/seml/commands/start.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1118,6 +1118,7 @@ def claim_experiment(db_collection_name: str, exp_ids: Sequence[int]):
11181118
"""
11191119
collection = get_collection(db_collection_name)
11201120
array_id, task_id = get_current_slurm_array_id()
1121+
exp = None
11211122
if array_id is not None and task_id is not None:
11221123
# We are running in slurm
11231124
array_id, task_id = int(array_id), int(task_id)
@@ -1127,14 +1128,30 @@ def claim_experiment(db_collection_name: str, exp_ids: Sequence[int]):
11271128
'execution.task_id': task_id,
11281129
'execution.cluster': cluster_name,
11291130
}
1131+
# First, we check whether this SLURM job is responsible for a RESCHEDULED experiment
1132+
# If so, we claim this first
11301133
exp = collection.find_one_and_update(
11311134
{
11321135
'_id': {'$in': list(exp_ids)},
1133-
'status': {'$in': States.PENDING + States.RESCHEDULED},
1136+
'status': {'$in': States.RESCHEDULED},
1137+
'execution.array_id': array_id,
1138+
'execution.task_id': task_id,
1139+
'execution.cluster': cluster_name,
11341140
},
11351141
{'$set': {'status': States.RUNNING[0], **update}},
11361142
{'_id': 1, 'slurm': 1},
11371143
)
1144+
# Only after we have checked that this job is not responsible for a RESCHEDULED experiment
1145+
# can we pick up a pending one.
1146+
if exp is None:
1147+
exp = collection.find_one_and_update(
1148+
{
1149+
'_id': {'$in': list(exp_ids)},
1150+
'status': {'$in': States.PENDING},
1151+
},
1152+
{'$set': {'status': States.RUNNING[0], **update}},
1153+
{'_id': 1, 'slurm': 1},
1154+
)
11381155
if exp is None:
11391156
exit(3)
11401157
# Set slurm output file

0 commit comments

Comments
 (0)