Skip to content

Commit 8ecde98

Browse files
committed
Addresses the remaining comments
1 parent 79b7ec1 commit 8ecde98

File tree

2 files changed: 35 additions & 21 deletions
  • community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts

2 files changed: 35 additions & 21 deletions

community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/repair.py

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -43,12 +43,20 @@ def _get_operations():
4343
return {}
4444

4545
def _write_all_operations(operations):
46-
"""Store the operations to the file."""
46+
"""Store the operations to the file safely."""
4747
try:
48-
with open(REPAIR_FILE, 'w', encoding='utf-8') as f:
49-
fcntl.lockf(f, fcntl.LOCK_EX)
48+
with open(REPAIR_FILE, 'a', encoding='utf-8') as f:
5049
try:
50+
fcntl.lockf(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
51+
except (IOError, BlockingIOError):
52+
log.warning(f"Could not acquire lock on {REPAIR_FILE}. Another process may be running.")
53+
return False
54+
55+
try:
56+
f.seek(0)
57+
f.truncate()
5158
json.dump(operations, f, indent=4)
59+
f.flush()
5260
return True
5361
finally:
5462
fcntl.lockf(f, fcntl.LOCK_UN)

community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py

Lines changed: 24 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -251,33 +251,31 @@ def _find_tpu_node_action(nodename, state) -> NodeAction:
251251
return NodeActionUnchanged()
252252

253253
def get_node_reason(nodename: str) -> Optional[str]:
254-
"""Get the reason for a node's state."""
254+
"""Get the reason for a node's state using JSON output."""
255255
try:
256-
result = run(f"{lookup().scontrol} show node {nodename}")
257-
for line in result.stdout.splitlines():
258-
if "Reason=" in line:
259-
reason = line.split("Reason=")[1].strip()
260-
if "[" in reason:
261-
return reason.split("[")[0].strip()
262-
return reason
256+
# Use --json to get structured data
257+
result = run(f"{lookup().scontrol} show node {nodename} --json")
258+
data = json.loads(result.stdout)
259+
260+
# Access the reason field directly from the JSON structure
261+
nodes = data.get('nodes', [])
262+
if nodes:
263+
reason = nodes[0].get('reason')
264+
# Handle the specific formatting logic for brackets if needed
265+
if reason and "[" in reason:
266+
return reason.split("[")[0].strip()
267+
return reason
268+
except (subprocess.CalledProcessError, json.JSONDecodeError) as e:
269+
log.error(f"Failed to execute scontrol or parse its JSON output for node {nodename}: {e}")
263270
except Exception as e:
264-
log.error(f"Failed to get reason for node {nodename}: {e}")
271+
log.error(f"An unexpected error occurred while getting reason for node {nodename}: {e}")
265272
return None
266273

267274

268275
def get_node_action(nodename: str) -> NodeAction:
269276
"""Determine node/instance status that requires action"""
270277
lkp = lookup()
271278
state = lkp.node_state(nodename)
272-
if state is not None and "DRAIN" in state.flags:
273-
reason = get_node_reason(nodename)
274-
if reason in repair.REPAIR_REASONS:
275-
if repair.is_node_being_repaired(nodename):
276-
return NodeActionUnchanged()
277-
inst = lkp.instance(nodename.split(".")[0])
278-
if inst:
279-
return NodeActionRepair(reason=reason)
280-
281279
if lkp.node_is_gke(nodename):
282280
return NodeActionUnchanged()
283281

@@ -299,6 +297,14 @@ def get_node_action(nodename: str) -> NodeAction:
299297
("POWER_DOWN", "POWERING_UP", "POWERING_DOWN", "POWERED_DOWN")
300298
) & (state.flags if state is not None else set())
301299

300+
if state is not None and "DRAIN" in state.flags:
301+
reason = get_node_reason(nodename)
302+
if reason in repair.REPAIR_REASONS:
303+
if repair.is_node_being_repaired(nodename):
304+
return NodeActionUnchanged()
305+
if inst:
306+
return NodeActionRepair(reason=reason)
307+
302308
if (state is None) and (inst is None):
303309
# Should never happen
304310
return NodeActionUnknown(None, None)

0 commit comments

Comments
 (0)