Commit bc73d05

Merge pull request #586 from NVIDIA/am/group-nodes-alloc2

Fix nodes allocation from the same group

2 parents: 19ba946 + 17307ad

File tree

4 files changed: +66 −6 lines

  src/cloudai/systems/slurm/slurm_command_gen_strategy.py
  src/cloudai/systems/slurm/slurm_runner.py
  src/cloudai/systems/slurm/slurm_system.py
  tests/test_slurm_allocation.py

src/cloudai/systems/slurm/slurm_command_gen_strategy.py

Lines changed: 1 addition & 1 deletion

@@ -483,7 +483,7 @@ def get_cached_nodes_spec(self, tr: TestRun) -> tuple[int, list[str]]:
         It is needed to avoid multiple calls to the system.get_nodes_by_spec method which in turn queries the Slurm API.
         For a single test run it is not required, we can get actual nodes status only once.
         """
-        cache_key = f"{tr.current_iteration}:{tr.step}:{tr.num_nodes}:{','.join(tr.nodes)}"
+        cache_key = f"{tr.name}:{tr.current_iteration}:{tr.step}:{tr.num_nodes}:{','.join(tr.nodes)}"
 
         if cache_key in self._node_spec_cache:
             logging.debug(f"Using cached node allocation for {cache_key}: {self._node_spec_cache[cache_key]}")

src/cloudai/systems/slurm/slurm_runner.py

Lines changed: 2 additions & 0 deletions

@@ -39,6 +39,7 @@ class SlurmRunner(BaseRunner):
 
     def __init__(self, mode: str, system: System, test_scenario: TestScenario, output_path: Path) -> None:
         super().__init__(mode, system, test_scenario, output_path)
+        self.system = cast(SlurmSystem, system)
         self.cmd_shell = CommandShell()
 
     def _submit_test(self, tr: TestRun) -> SlurmJob:
@@ -62,6 +63,7 @@ def _submit_test(self, tr: TestRun) -> SlurmJob:
 
     def on_job_completion(self, job: BaseJob) -> None:
         logging.debug(f"Job completion callback for job {job.id}")
+        self.system.complete_job(cast(SlurmJob, job))
         self.store_job_metadata(cast(SlurmJob, job))
 
     def _mock_job_metadata(self) -> SlurmStepMetadata:
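The runner now narrows the injected System to a SlurmSystem in __init__ and, on job completion, notifies the system so its allocation bookkeeping can be cleared. A simplified sketch of this delegation pattern, using hypothetical stand-ins (MiniJob, MiniSystem, MiniRunner) rather than cloudai's classes:

# Hypothetical sketch of the completion-callback wiring; names are stand-ins, not cloudai's API.
from dataclasses import dataclass, field
from typing import List, Set

@dataclass
class MiniJob:
    id: int
    nodes: List[str]

@dataclass
class MiniSystem:
    group_allocated: Set[str] = field(default_factory=set)

    def complete_job(self, job: MiniJob) -> None:
        # Release whatever this job was holding.
        self.group_allocated.difference_update(job.nodes)

class MiniRunner:
    def __init__(self, system: MiniSystem) -> None:
        self.system = system

    def on_job_completion(self, job: MiniJob) -> None:
        # Forward completion to the system so allocation state stays consistent.
        self.system.complete_job(job)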

src/cloudai/systems/slurm/slurm_system.py

Lines changed: 19 additions & 2 deletions

@@ -18,15 +18,17 @@
 
 import logging
 import re
+from copy import copy
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator
 
 from cloudai.core import BaseJob, File, Installable, System
 from cloudai.models.scenario import ReportConfig, parse_reports_spec
 from cloudai.util import CommandShell
 
+from .slurm_job import SlurmJob
 from .slurm_metadata import SlurmStepMetadata
 from .slurm_node import SlurmNode, SlurmNodeState
 
@@ -137,6 +139,8 @@ class SlurmSystem(BaseModel, System):
     data_repository: Optional[DataRepositoryConfig] = None
     reports: Optional[dict[str, ReportConfig]] = None
 
+    group_allocated: set[SlurmNode] = Field(default_factory=set, exclude=True)
+
     @field_validator("reports", mode="before")
     @classmethod
     def parse_reports(cls, value: dict[str, Any] | None) -> dict[str, ReportConfig] | None:
@@ -199,6 +203,7 @@ def update(self) -> None:
         all_nodes = self.nodes_from_sinfo()
         self.update_nodes_state_and_user(all_nodes, insert_new=True)
         self.update_nodes_state_and_user(self.nodes_from_squeue())
+        self.update_nodes_state_and_user(self.group_allocated)
 
     def nodes_from_sinfo(self) -> list[SlurmNode]:
         sinfo_output, _ = self.fetch_command_output("sinfo -o '%P|%t|%u|%N'")
@@ -232,7 +237,7 @@ def nodes_from_squeue(self) -> list[SlurmNode]:
             nodes.append(SlurmNode(name=node, partition=partition, state=SlurmNodeState.ALLOCATED, user=user))
         return nodes
 
-    def update_nodes_state_and_user(self, nodes: list[SlurmNode], insert_new: bool = False) -> None:
+    def update_nodes_state_and_user(self, nodes: Iterable[SlurmNode], insert_new: bool = False) -> None:
         for node in nodes:
             for part in self.partitions:
                 if part.name != node.partition:
@@ -595,13 +600,18 @@ def allocate_nodes(
                 f"and ensure there are enough resources to meet the requested node count. Additionally, "
                 f"verify that the system can accommodate the number of nodes required by the test scenario."
             )
+
         else:
             raise ValueError(
                 f"The 'number_of_nodes' argument must be either an integer specifying the number of nodes to allocate,"
                 f" or 'max_avail' to allocate all available nodes. Received: '{number_of_nodes}'. "
                 "Please correct the input."
             )
 
+        for node in allocated_nodes:
+            node.state = SlurmNodeState.ALLOCATED
+        self.group_allocated.update(copy(node) for node in allocated_nodes)
+
         return allocated_nodes
 
     def scancel(self, job_id: int) -> None:
@@ -748,3 +758,10 @@ def get_nodes_by_spec(self, num_nodes: int, nodes: list[str]) -> Tuple[int, list
 
     def system_installables(self) -> list[Installable]:
         return [File(Path(__file__).parent.absolute() / "slurm-metadata.sh")]
+
+    def complete_job(self, job: SlurmJob) -> None:
+        out, _ = self.fetch_command_output(f"sacct -j {job.id} -p --noheader -X --format=NodeList")
+        spec = out.splitlines()[0] if out.splitlines() else out
+        nodelist = set(parse_node_list(spec.strip().replace("|", "")))
+        to_unlock = [node for node in self.group_allocated if node.name in nodelist]
+        self.group_allocated.difference_update(to_unlock)
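Taken together, these changes keep an in-memory record of nodes handed out by group-based allocation: allocate_nodes marks copies of the nodes as ALLOCATED and stores them in group_allocated, update() re-asserts that state even when sinfo still reports the nodes as idle, and complete_job releases the entries matching the finished job's node list. A condensed, hypothetical model of that lifecycle (Node and MiniSlurmSystem are illustrative stand-ins, and the sacct/nodelist parsing is simplified away):

# Hypothetical, condensed model of the group_allocated lifecycle; not cloudai's real classes.
from copy import copy
from dataclasses import dataclass
from typing import Dict, List, Set

@dataclass(eq=False)  # identity-based hashing so Node copies can live in a set
class Node:
    name: str
    state: str = "IDLE"

class MiniSlurmSystem:
    def __init__(self, nodes: List[Node]) -> None:
        self.nodes: Dict[str, Node] = {n.name: n for n in nodes}
        self.group_allocated: Set[Node] = set()

    def allocate(self, names: List[str]) -> List[Node]:
        allocated = [self.nodes[n] for n in names]
        for node in allocated:
            node.state = "ALLOCATED"
        # Keep copies so a later refresh cannot silently drop the allocation.
        self.group_allocated.update(copy(n) for n in allocated)
        return allocated

    def update(self, reported_states: Dict[str, str]) -> None:
        # Refresh from the scheduler, then re-assert group-allocated state on top,
        # mirroring how update() also feeds group_allocated to update_nodes_state_and_user.
        for name, state in reported_states.items():
            self.nodes[name].state = state
        for held in self.group_allocated:
            self.nodes[held.name].state = held.state

    def complete_job(self, job_nodes: List[str]) -> None:
        # Release only the entries whose names appear in the finished job's node list.
        released = {n for n in self.group_allocated if n.name in set(job_nodes)}
        self.group_allocated.difference_update(released)

# Usage of the sketch: an allocation survives a scheduler refresh that lags behind,
# and is only dropped once the job completes.
system = MiniSlurmSystem([Node("node01"), Node("node02")])
system.allocate(["node01"])
system.update({"node01": "IDLE", "node02": "IDLE"})  # sinfo still says idle
assert system.nodes["node01"].state == "ALLOCATED"   # allocation is preserved
system.complete_job(["node01"])                      # now the node is released
assert not system.group_allocated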

tests/test_slurm_allocation.py

Lines changed: 44 additions & 3 deletions

@@ -14,9 +14,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from unittest.mock import Mock, patch
+
 import pytest
 
-from cloudai.systems.slurm import SlurmGroup, SlurmNode, SlurmNodeState, SlurmPartition, SlurmSystem, parse_node_list
+from cloudai.systems.slurm import (
+    SlurmGroup,
+    SlurmJob,
+    SlurmNode,
+    SlurmNodeState,
+    SlurmPartition,
+    SlurmSystem,
+    parse_node_list,
+)
 
 
 class TestGroupAllocation:
@@ -60,7 +70,6 @@ def test_not_enough_nodes_for_allocation(self, slurm_system: SlurmSystem, monkey
         assert nnodes == 5
         assert nodes_list == sorted([n.name for n in all_nodes])
 
-    @pytest.mark.xfail(reason="This is a bug in the code, RM4471870")
     def test_two_cases_one_group(self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch):
         # system has 5 nodes in the group
         system, *_ = self.prepare(slurm_system, [], monkeypatch)
@@ -73,4 +82,36 @@ def test_two_cases_one_group(self, slurm_system: SlurmSystem, monkeypatch: pytes
         nnodes, nodes_list2 = system.get_nodes_by_spec(1, ["main:group1:2"])
         assert nnodes == 2
 
-        assert nodes_list1 != nodes_list2, "Same nodes we allocated for two different requests"
+        assert nodes_list1 != nodes_list2, "Same nodes were allocated for two different requests"
+
+    def test_completion_clears_group_allocation_state(self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch):
+        system, all_nodes, taken_nodes = self.prepare(slurm_system, ["node01", "node02"], monkeypatch)
+        system.group_allocated.clear()
+        _, nodes_list = system.get_nodes_by_spec(1, ["main:group1:3"])
+        assert system.group_allocated == set(all_nodes) - set(taken_nodes)
+        assert all(node.state == SlurmNodeState.ALLOCATED for node in system.group_allocated)
+
+        with patch(
+            "cloudai.systems.slurm.slurm_system.SlurmSystem.fetch_command_output",
+            return_value=(f"{','.join(nodes_list)}|", ""),
+        ):
+            system.complete_job(SlurmJob(id=1, test_run=Mock()))
+
+        assert len(system.group_allocated) == 0
+
+    def test_group_allocation_is_preserved_on_updated(self, slurm_system: SlurmSystem, monkeypatch: pytest.MonkeyPatch):
+        system, all_nodes, _ = self.prepare(slurm_system, [], monkeypatch)
+        system.group_allocated.clear()
+        _ = system.get_nodes_by_spec(1, ["main:group1:5"])
+        assert system.group_allocated == set(all_nodes)
+        assert all(node.state == SlurmNodeState.ALLOCATED for node in system.group_allocated)
+
+        # Simulate scenario when sinfo still reports group allocated nodes as idle
+        with patch(
+            "cloudai.systems.slurm.slurm_system.SlurmSystem.nodes_from_sinfo",
+            return_value=[
+                SlurmNode(name=node.name, partition=node.partition, state=SlurmNodeState.IDLE) for node in all_nodes
+            ],
+        ):
+            system.update()
+        assert all(node.state == SlurmNodeState.ALLOCATED for node in system.group_allocated)
