Commit bc73d05

Merge pull request #586 from NVIDIA/am/group-nodes-alloc2

Fix nodes allocation from the same group

2 parents: 19ba946 + 17307ad

File tree

4 files changed: +66 −6 lines

  src/cloudai/systems/slurm/slurm_command_gen_strategy.py
  src/cloudai/systems/slurm/slurm_runner.py
  src/cloudai/systems/slurm/slurm_system.py
  tests/test_slurm_allocation.py

src/cloudai/systems/slurm/slurm_command_gen_strategy.py

Lines changed: 1 addition & 1 deletion

@@ -483,7 +483,7 @@ def get_cached_nodes_spec(self, tr: TestRun) -> tuple[int, list[str]]:
         It is needed to avoid multiple calls to the system.get_nodes_by_spec method which in turn queries the Slurm API.
         For a single test run it is not required, we can get actual nodes status only once.
         """
-        cache_key = f"{tr.current_iteration}:{tr.step}:{tr.num_nodes}:{','.join(tr.nodes)}"
+        cache_key = f"{tr.name}:{tr.current_iteration}:{tr.step}:{tr.num_nodes}:{','.join(tr.nodes)}"
 
         if cache_key in self._node_spec_cache:
             logging.debug(f"Using cached node allocation for {cache_key}: {self._node_spec_cache[cache_key]}")

src/cloudai/systems/slurm/slurm_runner.py

Lines changed: 2 additions & 0 deletions

@@ -39,6 +39,7 @@ class SlurmRunner(BaseRunner):
 
     def __init__(self, mode: str, system: System, test_scenario: TestScenario, output_path: Path) -> None:
         super().__init__(mode, system, test_scenario, output_path)
+        self.system = cast(SlurmSystem, system)
         self.cmd_shell = CommandShell()
 
     def _submit_test(self, tr: TestRun) -> SlurmJob:
@@ -62,6 +63,7 @@ def _submit_test(self, tr: TestRun) -> SlurmJob:
 
     def on_job_completion(self, job: BaseJob) -> None:
         logging.debug(f"Job completion callback for job {job.id}")
+        self.system.complete_job(cast(SlurmJob, job))
         self.store_job_metadata(cast(SlurmJob, job))
 
     def _mock_job_metadata(self) -> SlurmStepMetadata:
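The runner now narrows the injected System to a SlurmSystem in __init__ and, on job completion, notifies the system so its allocation bookkeeping can be cleared. A simplified sketch of this delegation pattern, using hypothetical stand-ins (MiniJob, MiniSystem, MiniRunner) rather than cloudai's classes:

# Hypothetical sketch of the completion-callback wiring; names are stand-ins, not cloudai's API.
from dataclasses import dataclass, field
from typing import List, Set

@dataclass
class MiniJob:
    id: int
    nodes: List[str]

@dataclass
class MiniSystem:
    group_allocated: Set[str] = field(default_factory=set)

    def complete_job(self, job: MiniJob) -> None:
        # Release whatever this job was holding.
        self.group_allocated.difference_update(job.nodes)

class MiniRunner:
    def __init__(self, system: MiniSystem) -> None:
        self.system = system

    def on_job_completion(self, job: MiniJob) -> None:
        # Forward completion to the system so allocation state stays consistent.
        self.system.complete_job(job)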

src/cloudai/systems/slurm/slurm_system.py

Lines changed: 19 additions & 2 deletions

@@ -18,15 +18,17 @@
 
 import logging
 import re
+from copy import copy
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator
 
 from cloudai.core import BaseJob, File, Installable, System
 from cloudai.models.scenario import ReportConfig, parse_reports_spec
 from cloudai.util import CommandShell
 
+from .slurm_job import SlurmJob
 from .slurm_metadata import SlurmStepMetadata
 from .slurm_node import SlurmNode, SlurmNodeState
 
@@ -137,6 +139,8 @@ class SlurmSystem(BaseModel, System):
     data_repository: Optional[DataRepositoryConfig] = None
     reports: Optional[dict[str, ReportConfig]] = None
 
+    group_allocated: set[SlurmNode] = Field(default_factory=set, exclude=True)
+
     @field_validator("reports", mode="before")
     @classmethod
     def parse_reports(cls, value: dict[str, Any] | None) -> dict[str, ReportConfig] | None:
@@ -199,6 +203,7 @@ def update(self) -> None:
         all_nodes = self.nodes_from_sinfo()
         self.update_nodes_state_and_user(all_nodes, insert_new=True)
         self.update_nodes_state_and_user(self.nodes_from_squeue())
+        self.update_nodes_state_and_user(self.group_allocated)
 
     def nodes_from_sinfo(self) -> list[SlurmNode]:
         sinfo_output, _ = self.fetch_command_output("sinfo -o '%P|%t|%u|%N'")
@@ -232,7 +237,7 @@ def nodes_from_squeue(self) -> list[SlurmNode]:
             nodes.append(SlurmNode(name=node, partition=partition, state=SlurmNodeState.ALLOCATED, user=user))
         return nodes
 
-    def update_nodes_state_and_user(self, nodes: list[SlurmNode], insert_new: bool = False) -> None:
+    def update_nodes_state_and_user(self, nodes: Iterable[SlurmNode], insert_new: bool = False) -> None:
         for node in nodes:
             for part in self.partitions:
                 if part.name != node.partition:
@@ -595,13 +600,18 @@ def allocate_nodes(
                 f"and ensure there are enough resources to meet the requested node count. Additionally, "
                 f"verify that the system can accommodate the number of nodes required by the test scenario."
             )
+
         else:
             raise ValueError(
                 f"The 'number_of_nodes' argument must be either an integer specifying the number of nodes to allocate,"
                 f" or 'max_avail' to allocate all available nodes. Received: '{number_of_nodes}'. "
                 "Please correct the input."
             )
 
+        for node in allocated_nodes:
+            node.state = SlurmNodeState.ALLOCATED
+        self.group_allocated.update(copy(node) for node in allocated_nodes)
+
         return allocated_nodes
 
     def scancel(self, job_id: int) -> None:
@@ -748,3 +758,10 @@ def get_nodes_by_spec(self, num_nodes: int, nodes: list[str]) -> Tuple[int, list
 
     def system_installables(self) -> list[Installable]:
         return [File(Path(__file__).parent.absolute() / "slurm-metadata.sh")]
+
+    def complete_job(self, job: SlurmJob) -> None:
+        out, _ = self.fetch_command_output(f"sacct -j {job.id} -p --noheader -X --format=NodeList")
+        spec = out.splitlines()[0] if out.splitlines() else out
+        nodelist = set(parse_node_list(spec.strip().replace("|", "")))
+        to_unlock = [node for node in self.group_allocated if node.name in nodelist]
+        self.group_allocated.difference_update(to_unlock)
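Taken together, these changes keep an in-memory record of nodes handed out by group-based allocation: allocate_nodes marks copies of the nodes as ALLOCATED and stores them in group_allocated, update() re-asserts that state even when sinfo still reports the nodes as idle, and complete_job releases the entries matching the finished job's node list. A condensed, hypothetical model of that lifecycle (Node and MiniSlurmSystem are illustrative stand-ins, and the sacct/nodelist parsing is simplified away):

# Hypothetical, condensed model of the group_allocated lifecycle; not cloudai's real classes.
from copy import copy
from dataclasses import dataclass
from typing import Dict, List, Set

@dataclass(eq=False)  # identity-based hashing so Node copies can live in a set
class Node:
    name: str
    state: str = "IDLE"

class MiniSlurmSystem:
    def __init__(self, nodes: List[Node]) -> None:
        self.nodes: Dict[str, Node] = {n.name: n for n in nodes}
        self.group_allocated: Set[Node] = set()

    def allocate(self, names: List[str]) -> List[Node]:
        allocated = [self.nodes[n] for n in names]
        for node in allocated:
            node.state = "ALLOCATED"
        # Keep copies so a later refresh cannot silently drop the allocation.
        self.group_allocated.update(copy(n) for n in allocated)
        return allocated

    def update(self, reported_states: Dict[str, str]) -> None:
        # Refresh from the scheduler, then re-assert group-allocated state on top,
        # mirroring how update() also feeds group_allocated to update_nodes_state_and_user.
        for name, state in reported_states.items():
            self.nodes[name].state = state
        for held in self.group_allocated:
            self.nodes[held.name].state = held.state

    def complete_job(self, job_nodes: List[str]) -> None:
        # Release only the entries whose names appear in the finished job's node list.
        released = {n for n in self.group_allocated if n.name in set(job_nodes)}
        self.group_allocated.difference_update(released)

# Usage of the sketch: an allocation survives a scheduler refresh that lags behind,
# and is only dropped once the job completes.
system = MiniSlurmSystem([Node("node01"), Node("node02")])
system.allocate(["node01"])
system.update({"node01": "IDLE", "node02": "IDLE"})  # sinfo still says idle
assert system.nodes["node01"].state == "ALLOCATED"   # allocation is preserved
system.complete_job(["node01"])                      # now the node is released
assert not system.group_allocated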

tests/test_slurm_allocation.py

Lines changed: 44 additions & 3 deletions

@@ -14,9 +14,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from unittest.mock import Mock, patch
+
 import pytest
 
-from cloudai.systems.slurm import SlurmGroup, SlurmNode, SlurmNodeState, SlurmPartition, SlurmSystem, parse_node_list
+from cloudai.systems.slurm import (
+    SlurmGroup,
+    SlurmJob,
+    SlurmNode,
+    SlurmNodeState,
+    SlurmPartition,
+    SlurmSystem,
+    parse_node_list,
+)
 
 
 class TestGroupAllocation:
@@ -60,7 +70,6 @@ def test_not_enough_nodes_for_allocation(self, slurm_system: SlurmSystem, monkey
         assert nnodes == 5
         assert nodes_list == sorted([n.name for n in all_nodes])
 
-    @pytest.mark.xfail(reason="This is a bug in the code, RM4471870")
     def test_two_cases_one_group(self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch):
         # system has 5 nodes in the group
         system, *_ = self.prepare(slurm_system, [], monkeypatch)
@@ -73,4 +82,36 @@ def test_two_cases_one_group(self, slurm_system: SlurmSystem, monkeypatch: pytes
         nnodes, nodes_list2 = system.get_nodes_by_spec(1, ["main:group1:2"])
         assert nnodes == 2
 
-        assert nodes_list1 != nodes_list2, "Same nodes we allocated for two different requests"
+        assert nodes_list1 != nodes_list2, "Same nodes were allocated for two different requests"
+
+    def test_completion_clears_group_allocation_state(self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch):
+        system, all_nodes, taken_nodes = self.prepare(slurm_system, ["node01", "node02"], monkeypatch)
+        system.group_allocated.clear()
+        _, nodes_list = system.get_nodes_by_spec(1, ["main:group1:3"])
+        assert system.group_allocated == set(all_nodes) - set(taken_nodes)
+        assert all(node.state == SlurmNodeState.ALLOCATED for node in system.group_allocated)
+
+        with patch(
+            "cloudai.systems.slurm.slurm_system.SlurmSystem.fetch_command_output",
+            return_value=(f"{','.join(nodes_list)}|", ""),
+        ):
+            system.complete_job(SlurmJob(id=1, test_run=Mock()))
+
+        assert len(system.group_allocated) == 0
+
+    def test_group_allocation_is_preserved_on_updated(self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch):
+        system, all_nodes, _ = self.prepare(slurm_system, [], monkeypatch)
+        system.group_allocated.clear()
+        _ = system.get_nodes_by_spec(1, ["main:group1:5"])
+        assert system.group_allocated == set(all_nodes)
+        assert all(node.state == SlurmNodeState.ALLOCATED for node in system.group_allocated)
+
+        # Simulate scenario when sinfo still reports group allocated nodes as idle
+        with patch(
+            "cloudai.systems.slurm.slurm_system.SlurmSystem.nodes_from_sinfo",
+            return_value=[
+                SlurmNode(name=node.name, partition=node.partition, state=SlurmNodeState.IDLE) for node in all_nodes
+            ],
+        ):
+            system.update()
+        assert all(node.state == SlurmNodeState.ALLOCATED for node in system.group_allocated)
