@@ -196,10 +196,59 @@ def update(self) -> None:
196196 commands, and correlating this information to determine the state of each node and the user running jobs on
197197 each node.
198198 """
199- squeue_output , _ = self .fetch_command_output ("squeue -o '%N|%u' --noheader" )
200- sinfo_output , _ = self .fetch_command_output ("sinfo" )
201- node_user_map = self .parse_squeue_output (squeue_output )
202- self .parse_sinfo_output (sinfo_output , node_user_map )
199+ all_nodes = self .nodes_from_sinfo ()
200+ self .update_nodes_state_and_user (all_nodes , insert_new = True )
201+ self .update_nodes_state_and_user (self .nodes_from_squeue ())
202+
def nodes_from_sinfo(self) -> list[SlurmNode]:
    """
    Build the list of nodes reported by the 'sinfo' command.

    'sinfo' is queried with a pipe-separated 'partition|state|user|nodelist' format. Every node list is expanded
    with parse_node_list and one SlurmNode is created per node name.

    Returns:
        list[SlurmNode]: One entry per node name found on each reported line, carrying the partition name (without
            a trailing '*'), the state converted by convert_state_to_enum, and the user field from the same line.
    """
    # --noheader keeps the "PARTITION|STATE|USER|NODELIST" title row out of the data, so the literal text "STATE"
    # never reaches convert_state_to_enum. The squeue query of this class uses the same flag.
    sinfo_output, _ = self.fetch_command_output("sinfo --noheader -o '%P|%t|%u|%N'")
    nodes: list[SlurmNode] = []
    for line in sinfo_output.splitlines():
        parts = [part.strip() for part in line.split("|")]
        if len(parts) < 4:
            continue  # Blank or malformed line
        partition, state, user, nodelist = parts[:4]
        if (partition, state) == ("PARTITION", "STATE"):
            continue  # Title row, in case the output was produced without --noheader
        partition = partition.rstrip("*")  # sinfo appends '*' to the name of the default partition
        node_names = parse_node_list(nodelist)
        logging.debug(
            "partition=%r, state=%r, nodelist=%r, node_names=%r", partition, state, nodelist, node_names
        )
        nodes.extend(
            SlurmNode(name=node_name, partition=partition, state=self.convert_state_to_enum(state), user=user)
            for node_name in node_names
        )
    return nodes
221+
def nodes_from_squeue(self) -> list[SlurmNode]:
    """
    Build the list of nodes that 'squeue' reports as assigned to jobs.

    'squeue' is queried for running and pending jobs with a pipe-separated 'partition|state|nodelist|user'
    format. Every node list is expanded with parse_node_list and each resulting node is reported as ALLOCATED to
    the user of that job.

    Returns:
        list[SlurmNode]: One entry per node name per job line, with state SlurmNodeState.ALLOCATED and the job's
            partition and user.
    """
    squeue_output, _ = self.fetch_command_output("squeue --states=running,pending --noheader -o '%P|%T|%N|%u'")
    nodes: list[SlurmNode] = []
    for line in squeue_output.splitlines():
        # Strip every field so stray whitespace around a value cannot leak into partition or user names
        parts = [part.strip() for part in line.split("|")]
        if len(parts) < 4:
            continue  # Blank or malformed line
        partition, _job_state, nodelist, user = parts[:4]
        if not nodelist:
            continue  # The row names no nodes (e.g. a job that has not been given nodes yet)
        nodes.extend(
            SlurmNode(name=node_name, partition=partition, state=SlurmNodeState.ALLOCATED, user=user)
            for node_name in parse_node_list(nodelist)
        )
    return nodes
234+
def update_nodes_state_and_user(self, nodes: list[SlurmNode], insert_new: bool = False) -> None:
    """
    Copy state and user from the given nodes onto the matching nodes of the known partitions.

    For every given node, each partition in self.partitions whose name equals the node's partition is searched for
    a node with the same name. The first such node gets the state and user of the given node.

    Args:
        nodes (list[SlurmNode]): Nodes carrying the state and user to apply.
        insert_new (bool): When True, a given node that has no same-named node in a matching partition is
            appended to that partition's slurm_nodes. When False, such nodes are ignored.
    """
    for incoming in nodes:
        same_partition = (part for part in self.partitions if part.name == incoming.partition)
        for part in same_partition:
            # Only the first node with this name is updated
            known = next((candidate for candidate in part.slurm_nodes if candidate.name == incoming.name), None)
            if known is not None:
                known.state = incoming.state
                known.user = incoming.user
            elif insert_new:
                part.slurm_nodes.append(incoming)
203252
204253 def is_job_running (self , job : BaseJob , retry_threshold : int = 3 ) -> bool :
205254 """
@@ -580,79 +629,6 @@ def fetch_command_output(self, command: str) -> Tuple[str, str]:
580629 logging .error (f"Error executing command '{ command } ': { stderr } " )
581630 return stdout , stderr
582631
def parse_squeue_output(self, squeue_output: str) -> Dict[str, str]:
    """
    Map every node named in 'squeue' output to the user reported on the same line.

    Each non-blank line is expected to look like 'node_spec|user'. The text before the first '|' is expanded into
    node names with parse_node_list; everything after the first '|' is taken as the user.

    Args:
        squeue_output (str): The raw output from the squeue command.

    Returns:
        Dict[str, str]: Node name to username. When a node appears on several lines, the last line wins.
    """
    node_to_user: Dict[str, str] = {}
    for raw_line in squeue_output.split("\n"):
        if not raw_line.strip():
            continue
        node_spec, separator, user = raw_line.partition("|")
        if not separator:
            continue  # No '|' on the line: malformed, skip it
        owner = user.strip()
        for node_name in parse_node_list(node_spec):
            node_to_user[node_name] = owner
    return node_to_user
610-
def parse_sinfo_output(self, sinfo_output: str, node_user_map: Dict[str, str]) -> None:
    """
    Refresh the nodes of the known partitions from whitespace-separated 'sinfo' output.

    The first line is treated as the header and ignored. On every other line the first field is the partition
    name, the fifth the state, and the sixth the node list. Nodes already present in a matching partition get
    their state and user updated; unknown nodes are appended to that partition.

    Args:
        sinfo_output (str): The output from the sinfo command.
        node_user_map (dict): A dictionary mapping node names to users; nodes without an entry get "N/A".
    """
    data_rows = sinfo_output.split("\n")[1:]  # Drop the header line
    for row in data_rows:
        fields = row.split()
        # A blank row yields no fields, so this guard covers both empty and malformed rows
        if len(fields) < 6:
            continue
        partition_name = fields[0].rstrip("*")
        node_names = parse_node_list(fields[5])

        # Convert state to enum, handling states with suffixes
        state_enum = self.convert_state_to_enum(fields[4])

        for node_name in node_names:
            for part in self.partitions:
                if part.name != partition_name:
                    continue

                known = next((candidate for candidate in part.slurm_nodes if candidate.name == node_name), None)
                if known is None:
                    part.slurm_nodes.append(
                        SlurmNode(
                            name=node_name,
                            partition=partition_name,
                            state=state_enum,
                            user=node_user_map.get(node_name, "N/A"),
                        )
                    )
                else:
                    known.state = state_enum
                    known.user = node_user_map.get(node_name, "N/A")
655-
656632 def convert_state_to_enum (self , state_str : str ) -> SlurmNodeState :
657633 """
658634 Convert a Slurm node state string to its corresponding enum member.
@@ -768,7 +744,7 @@ def get_nodes_by_spec(self, num_nodes: int, nodes: list[str]) -> Tuple[int, list
768744 if parsed_nodes :
769745 num_nodes = len (parsed_nodes )
770746 node_list = parsed_nodes
771- return num_nodes , node_list
747+ return num_nodes , sorted ( node_list )
772748
def system_installables(self) -> list[Installable]:
    """
    List the installables that this system needs.

    Returns:
        list[Installable]: A single File for 'slurm-metadata.sh', located in the directory of this module.
    """
    module_dir = Path(__file__).parent.absolute()
    metadata_script = File(module_dir / "slurm-metadata.sh")
    return [metadata_script]
0 commit comments