@@ -251,33 +251,31 @@ def _find_tpu_node_action(nodename, state) -> NodeAction:
251251 return NodeActionUnchanged ()
252252
def get_node_reason(nodename: str) -> Optional[str]:
    """Return the reason recorded for a node's state, or None if there is none.

    Runs ``scontrol show node <nodename> --json`` and reads the ``reason``
    field of the first entry under the top-level ``nodes`` key.

    Args:
        nodename: Name of the node to query.

    Returns:
        The reason text with surrounding whitespace removed and anything from
        the first ``[`` onward dropped. None is returned when the command
        fails, its output is not valid JSON, no node entry is present, or the
        reason is missing, not a string, or empty after trimming.
    """
    try:
        result = run(f"{lookup().scontrol} show node {nodename} --json")
        data = json.loads(result.stdout)

        # A missing or null "nodes" entry is treated the same as an empty list.
        nodes = data.get("nodes") or []
        if not nodes:
            return None

        reason = nodes[0].get("reason")
        if not isinstance(reason, str):
            return None

        # The earlier plain-text parser stripped a trailing "[...]" suffix from
        # the reason; the same trimming is kept here so callers comparing the
        # result against known reasons see the same value either way.
        # NOTE(review): presumably the JSON field never carries that suffix --
        # confirm against the scontrol version in use.
        trimmed = reason.split("[", 1)[0].strip()

        # Report "no reason" uniformly as None rather than as an empty string.
        return trimmed or None
    except (subprocess.CalledProcessError, json.JSONDecodeError) as e:
        log.error(f"Failed to execute scontrol or parse its JSON output for node {nodename}: {e}")
    except Exception as e:
        # Best-effort lookup: any other failure is logged and reported as
        # "no reason" instead of propagating to the caller.
        log.error(f"An unexpected error occurred while getting reason for node {nodename}: {e}")
    return None
266273
267274
268275def get_node_action (nodename : str ) -> NodeAction :
269276 """Determine node/instance status that requires action"""
270277 lkp = lookup ()
271278 state = lkp .node_state (nodename )
272- if state is not None and "DRAIN" in state .flags :
273- reason = get_node_reason (nodename )
274- if reason in repair .REPAIR_REASONS :
275- if repair .is_node_being_repaired (nodename ):
276- return NodeActionUnchanged ()
277- inst = lkp .instance (nodename .split ("." )[0 ])
278- if inst :
279- return NodeActionRepair (reason = reason )
280-
281279 if lkp .node_is_gke (nodename ):
282280 return NodeActionUnchanged ()
283281
@@ -299,6 +297,14 @@ def get_node_action(nodename: str) -> NodeAction:
299297 ("POWER_DOWN" , "POWERING_UP" , "POWERING_DOWN" , "POWERED_DOWN" )
300298 ) & (state .flags if state is not None else set ())
301299
300+ if state is not None and "DRAIN" in state .flags :
301+ reason = get_node_reason (nodename )
302+ if reason in repair .REPAIR_REASONS :
303+ if repair .is_node_being_repaired (nodename ):
304+ return NodeActionUnchanged ()
305+ if inst :
306+ return NodeActionRepair (reason = reason )
307+
302308 if (state is None ) and (inst is None ):
303309 # Should never happen
304310 return NodeActionUnknown (None , None )
0 commit comments