Commit 82ee1dd

Merge pull request #577 from NVIDIA/am/group-nodes-alloc

Re-work slurm node status update

2 parents: 9079e9c + f296db0

File tree: 4 files changed, +255 −192 lines

4 files changed

+255
-192
lines changed

src/cloudai/systems/slurm/slurm_node.py (+4 −0)

@@ -140,6 +140,10 @@ def allocatable(self, free_only: bool = True) -> bool:
             SlurmNodeState.RESERVED,
         ]
 
+    def __hash__(self) -> int:
+        """Provide a hash of the Slurm node based on its name, partition, state, and user."""
+        return hash((self.name, self.partition, self.state, self.user))
+
     def __repr__(self) -> str:
         """
         Provide a structured string representation of the Slurm node, including its name, state, and partition.
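The new tests rely on value-based set arithmetic over nodes (`set(all_nodes) - set(taken_nodes)`), which only works once `SlurmNode` instances hash by content. A minimal sketch of the idea, using a hypothetical frozen-dataclass stand-in rather than the real `SlurmNode`:

# Sketch only: Node is a simplified, hypothetical stand-in for SlurmNode.
from dataclasses import dataclass

@dataclass(frozen=True)
class Node:
    name: str
    partition: str
    state: str
    user: str = "N/A"

all_nodes = [Node(f"node0{i}", "main", "allocated" if i <= 2 else "idle") for i in range(1, 6)]
taken = [Node("node01", "main", "allocated"), Node("node02", "main", "allocated")]

# frozen=True gives value-based __hash__/__eq__, so set difference drops the taken nodes.
free = sorted(n.name for n in set(all_nodes) - set(taken))
print(free)  # ['node03', 'node04', 'node05']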

src/cloudai/systems/slurm/slurm_system.py (+54 −78)
@@ -196,10 +196,59 @@ def update(self) -> None:
         commands, and correlating this information to determine the state of each node and the user running jobs on
         each node.
         """
-        squeue_output, _ = self.fetch_command_output("squeue -o '%N|%u' --noheader")
-        sinfo_output, _ = self.fetch_command_output("sinfo")
-        node_user_map = self.parse_squeue_output(squeue_output)
-        self.parse_sinfo_output(sinfo_output, node_user_map)
+        all_nodes = self.nodes_from_sinfo()
+        self.update_nodes_state_and_user(all_nodes, insert_new=True)
+        self.update_nodes_state_and_user(self.nodes_from_squeue())
+
+    def nodes_from_sinfo(self) -> list[SlurmNode]:
+        sinfo_output, _ = self.fetch_command_output("sinfo -o '%P|%t|%u|%N'")
+        nodes: list[SlurmNode] = []
+        for line in sinfo_output.split("\n"):
+            if not line.strip():
+                continue
+            parts = line.split("|")
+            if len(parts) < 4:
+                continue
+            partition, state, user, nodelist = parts[:4]
+            partition = partition.rstrip("*").strip()
+            node_names = parse_node_list(nodelist)
+            logging.debug(f"{partition=}, {state=}, {nodelist=}, {node_names=}")
+            for node_name in node_names:
+                nodes.append(
+                    SlurmNode(name=node_name, partition=partition, state=self.convert_state_to_enum(state), user=user)
+                )
+        return nodes
+
+    def nodes_from_squeue(self) -> list[SlurmNode]:
+        squeue_output, _ = self.fetch_command_output("squeue --states=running,pending --noheader -o '%P|%T|%N|%u'")
+        nodes: list[SlurmNode] = []
+        for line in squeue_output.split("\n"):
+            parts = line.split("|")
+            if len(parts) < 4:
+                continue
+            partition, _, nodelist, user = parts[:4]
+            node_names = parse_node_list(nodelist)
+            for node in node_names:
+                nodes.append(SlurmNode(name=node, partition=partition, state=SlurmNodeState.ALLOCATED, user=user))
+        return nodes
+
+    def update_nodes_state_and_user(self, nodes: list[SlurmNode], insert_new: bool = False) -> None:
+        for node in nodes:
+            for part in self.partitions:
+                if part.name != node.partition:
+                    continue
+
+                found = False
+                for pnode in part.slurm_nodes:
+                    if pnode.name != node.name:
+                        continue
+                    pnode.state = node.state
+                    pnode.user = node.user
+                    found = True
+                    break
+
+                if not found and insert_new:
+                    part.slurm_nodes.append(node)
 
     def is_job_running(self, job: BaseJob, retry_threshold: int = 3) -> bool:
         """
@@ -580,79 +629,6 @@ def fetch_command_output(self, command: str) -> Tuple[str, str]:
             logging.error(f"Error executing command '{command}': {stderr}")
         return stdout, stderr
 
-    def parse_squeue_output(self, squeue_output: str) -> Dict[str, str]:
-        """
-        Parse the output from the 'squeue' command to map nodes to users.
-
-        The expected format of squeue_output is lines of 'node_spec|user', where node_spec can include comma-separated
-        node names or ranges.
-
-        Args:
-            squeue_output (str): The raw output from the squeue command.
-
-        Returns:
-            Dict[str, str]: A dictionary mapping node names to usernames.
-        """
-        node_user_map = {}
-        for line in squeue_output.split("\n"):
-            if line.strip():
-                # Split the line into node list and user, handling only the first '|'
-                parts = line.split("|")
-                if len(parts) < 2:
-                    continue  # Skip malformed lines
-
-                node_list_part, user = parts[0], "|".join(parts[1:])
-                # Handle cases where multiple node groups or ranges are specified
-                for node in parse_node_list(node_list_part):
-                    node_user_map[node] = user.strip()
-
-        return node_user_map
-
-    def parse_sinfo_output(self, sinfo_output: str, node_user_map: Dict[str, str]) -> None:
-        """
-        Parse the output from the 'sinfo' command to update node states.
-
-        Args:
-            sinfo_output (str): The output from the sinfo command.
-            node_user_map (dict): A dictionary mapping node names to users.
-        """
-        for line in sinfo_output.split("\n")[1:]:  # Skip the header line
-            if not line.strip():
-                continue
-            parts = line.split()
-            if len(parts) < 6:
-                continue
-            partition, _, _, _, state, nodelist = parts[:6]
-            partition = partition.rstrip("*")
-            node_names = parse_node_list(nodelist)
-
-            # Convert state to enum, handling states with suffixes
-            state_enum = self.convert_state_to_enum(state)
-
-            for node_name in node_names:
-                # Find the partition and node to update the state
-                for part in self.partitions:
-                    if part.name != partition:
-                        continue
-
-                    found = False
-                    for node in part.slurm_nodes:
-                        if node.name == node_name:
-                            found = True
-                            node.state = state_enum
-                            node.user = node_user_map.get(node_name, "N/A")
-                            break
-
-                    if not found:
-                        part.slurm_nodes.append(
-                            SlurmNode(
-                                name=node_name,
-                                partition=partition,
-                                state=state_enum,
-                                user=node_user_map.get(node_name, "N/A"),
-                            )
-                        )
-
     def convert_state_to_enum(self, state_str: str) -> SlurmNodeState:
         """
         Convert a Slurm node state string to its corresponding enum member.
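Compared with the removed `parse_sinfo_output`, the new flow decouples parsing from merging: `update_nodes_state_and_user` is effectively an upsert keyed on (partition, node name), where the sinfo pass may insert previously unknown nodes (`insert_new=True`) and the squeue pass only overwrites state and user on nodes already tracked. A toy model of that merge, using simplified stand-ins for the real cloudai classes:

# Toy model of the upsert in update_nodes_state_and_user; Partition/Node
# are hypothetical, simplified stand-ins for the real cloudai classes.
from dataclasses import dataclass, field

@dataclass
class Node:
    name: str
    partition: str
    state: str
    user: str = "N/A"

@dataclass
class Partition:
    name: str
    nodes: list[Node] = field(default_factory=list)

def upsert(partitions: list[Partition], incoming: list[Node], insert_new: bool = False) -> None:
    for node in incoming:
        for part in partitions:
            if part.name != node.partition:
                continue
            for existing in part.nodes:
                if existing.name == node.name:
                    # Known node: update state/user in place.
                    existing.state, existing.user = node.state, node.user
                    break
            else:
                # Unknown node: only the sinfo pass may add it.
                if insert_new:
                    part.nodes.append(node)

parts = [Partition("main")]
upsert(parts, [Node("node01", "main", "idle")], insert_new=True)  # inserted
upsert(parts, [Node("node01", "main", "alloc", "alice")])         # updated in place
upsert(parts, [Node("node09", "main", "alloc", "bob")])           # ignored: insert_new=False
print(parts[0].nodes)  # [Node(name='node01', partition='main', state='alloc', user='alice')]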
@@ -768,7 +744,7 @@ def get_nodes_by_spec(self, num_nodes: int, nodes: list[str]) -> Tuple[int, list
         if parsed_nodes:
             num_nodes = len(parsed_nodes)
             node_list = parsed_nodes
-        return num_nodes, node_list
+        return num_nodes, sorted(node_list)
 
     def system_installables(self) -> list[Installable]:
        return [File(Path(__file__).parent.absolute() / "slurm-metadata.sh")]
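The only behavioral change in `get_nodes_by_spec` is the `sorted()` on the return value: node names collected via set operations upstream come back in arbitrary order, so sorting makes the returned allocation list deterministic and lets the new tests compare against `sorted(...)` directly. A trivial illustration:

# Illustration only: iteration order of a set is arbitrary; sorting stabilizes it.
node_list = list({"node03", "node01", "node02"})  # order varies between runs
print(sorted(node_list))  # always ['node01', 'node02', 'node03']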

tests/test_slurm_allocation.py (+76 −0)

@@ -0,0 +1,76 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from cloudai.systems.slurm import SlurmGroup, SlurmNode, SlurmNodeState, SlurmPartition, SlurmSystem, parse_node_list
+
+
+class TestGroupAllocation:
+    def prepare(
+        self, slurm_system: SlurmSystem, taken_node_names: list[str], monkeypatch: pytest.MonkeyPatch
+    ) -> tuple[SlurmSystem, list[SlurmNode], list[SlurmNode]]:
+        slurm_system.partitions = [
+            SlurmPartition(name="main", groups=[SlurmGroup(name="group1", nodes=["node0[1-5]"])])
+        ]
+        all_nodes = [
+            SlurmNode(name=name, partition="main", state=SlurmNodeState.IDLE)
+            for name in parse_node_list(slurm_system.partitions[0].groups[0].nodes[0])
+        ]
+        taken_nodes = [
+            SlurmNode(name=node.name, partition="main", state=SlurmNodeState.ALLOCATED)
+            for node in all_nodes
+            if node.name in taken_node_names
+        ]
+
+        mod_path = "cloudai.systems.slurm.slurm_system.SlurmSystem"
+        monkeypatch.setattr(f"{mod_path}.nodes_from_sinfo", lambda *args, **kwargs: all_nodes)
+        monkeypatch.setattr(f"{mod_path}.nodes_from_squeue", lambda *args, **kwargs: taken_nodes)
+        return slurm_system, all_nodes, taken_nodes
+
+    def test_all_nodes_in_group_are_idle(self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch):
+        system, *_ = self.prepare(slurm_system, [], monkeypatch)
+        nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:5"])
+        assert nodes_list == parse_node_list(slurm_system.partitions[0].groups[0].nodes[0])
+        assert nnodes == len(nodes_list)
+
+    def test_enough_free_nodes_for_allocation(self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch):
+        system, all_nodes, taken_nodes = self.prepare(slurm_system, ["node01", "node02"], monkeypatch)
+        nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:3"])
+        assert nnodes == 3
+        assert nodes_list == sorted([n.name for n in set(all_nodes) - set(taken_nodes)])
+
+    def test_not_enough_nodes_for_allocation(self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch):
+        """In this scenario we still return the required number of nodes so the job can be queued."""
+        system, all_nodes, _ = self.prepare(slurm_system, ["node01", "node02"], monkeypatch)
+        nnodes, nodes_list = system.get_nodes_by_spec(1, ["main:group1:5"])
+        assert nnodes == 5
+        assert nodes_list == sorted([n.name for n in all_nodes])
+
+    @pytest.mark.xfail(reason="This is a bug in the code, RM4471870")
+    def test_two_cases_one_group(self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch):
+        # system has 5 nodes in the group
+        system, *_ = self.prepare(slurm_system, [], monkeypatch)
+
+        # first case asks for 2 nodes
+        nnodes, nodes_list1 = system.get_nodes_by_spec(1, ["main:group1:2"])
+        assert nnodes == 2
+
+        # second case asks for another 2 nodes
+        nnodes, nodes_list2 = system.get_nodes_by_spec(1, ["main:group1:2"])
+        assert nnodes == 2
+
+        assert nodes_list1 != nodes_list2, "Same nodes were allocated for two different requests"
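Because `prepare` monkeypatches `nodes_from_sinfo` and `nodes_from_squeue` directly, these tests exercise the new update and allocation path without shelling out to Slurm; assuming a standard pytest setup, they can be run in isolation with `pytest tests/test_slurm_allocation.py`. The `xfail` case documents a known issue (tracked as RM4471870) where two back-to-back requests against the same group can be handed the same nodes.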
