Skip to content

Commit 21290b1

Browse files
authored
Merge pull request #746 from NVIDIA/am/nodes-vs-nodelist
Do not set -N/--nodes if nodelist is specified
2 parents f95506e + 0b513cc commit 21290b1

File tree

3 files changed

+46
-18
lines changed

3 files changed

+46
-18
lines changed

src/cloudai/systems/slurm/slurm_command_gen_strategy.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,11 @@ def __init__(self, system: System, test_run: TestRun) -> None:
5252

5353
self._node_spec_cache: dict[str, tuple[int, list[str]]] = {}
5454

55+
@property
56+
def nodelist_in_use(self) -> bool:
57+
_, nodes = self.get_cached_nodes_spec()
58+
return len(nodes) > 0
59+
5560
@abstractmethod
5661
def _container_mounts(self) -> list[str]:
5762
"""Return CommandGenStrategy specific container mounts for the test run."""
@@ -240,7 +245,7 @@ def image_path(self) -> Optional[str]:
240245
def gen_srun_prefix(self, use_pretest_extras: bool = False, with_num_nodes: bool = True) -> List[str]:
241246
num_nodes, _ = self.get_cached_nodes_spec()
242247
srun_command_parts = ["srun", "--export=ALL", f"--mpi={self.system.mpi}"]
243-
if with_num_nodes:
248+
if with_num_nodes and not self.nodelist_in_use:
244249
srun_command_parts.append(f"-N{num_nodes}")
245250
if use_pretest_extras and self.test_run.pre_test:
246251
for pre_tr in self.test_run.pre_test.test_runs:

tests/slurm_command_gen_strategy/test_common_slurm_command_gen_strategy.py

Lines changed: 38 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def testrun_fixture(tmp_path: Path) -> TestRun:
4040
extra_env_vars={"TEST_VAR": "VALUE"},
4141
)
4242

43-
return TestRun(name="test_job", test=tdef, output_path=tmp_path, num_nodes=2, nodes=["node1", "node2"])
43+
return TestRun(name="test_job", test=tdef, output_path=tmp_path, num_nodes=2, nodes=[])
4444

4545

4646
@pytest.fixture
@@ -49,30 +49,33 @@ def strategy_fixture(slurm_system: SlurmSystem, testrun_fixture: TestRun) -> Slu
4949
return strategy
5050

5151

52-
def test_filename_generation(strategy_fixture: SlurmCommandGenStrategy):
53-
srun_command = strategy_fixture._gen_srun_command()
52+
def test_filename_generation(slurm_system: SlurmSystem, testrun_fixture: TestRun):
53+
testrun_fixture.nodes = ["node1", "node2"]
54+
strategy = MySlurmCommandGenStrategy(slurm_system, testrun_fixture)
55+
srun_command = strategy._gen_srun_command()
5456

55-
sbatch_command = strategy_fixture._write_sbatch_script(srun_command)
57+
sbatch_command = strategy._write_sbatch_script(srun_command)
5658
filepath_from_command = sbatch_command.split()[-1]
5759

58-
assert strategy_fixture.test_run.output_path.joinpath("cloudai_sbatch_script.sh").exists()
60+
assert strategy.test_run.output_path.joinpath("cloudai_sbatch_script.sh").exists()
5961

6062
with open(filepath_from_command, "r") as file:
6163
file_contents = file.read()
6264

6365
assert "node1,node2" in file_contents
6466
assert "srun" in file_contents
65-
assert f"--mpi={strategy_fixture.system.mpi}" in file_contents
67+
assert f"--mpi={strategy.system.mpi}" in file_contents
6668

6769

68-
def test_num_nodes_and_nodes(strategy_fixture: SlurmCommandGenStrategy):
69-
tr = make_test_run(name="test_job", output_dir=strategy_fixture.system.output_path)
70+
def test_num_nodes_and_nodes(slurm_system: SlurmSystem):
71+
tr = make_test_run(name="test_job", output_dir=slurm_system.output_path)
7072
tr.nodes = ["node1", "node2"]
7173
tr.num_nodes = 3
7274
tr.output_path.mkdir(parents=True, exist_ok=True)
75+
strategy = MySlurmCommandGenStrategy(slurm_system, tr)
7376

7477
lines: list[str] = []
75-
strategy_fixture._append_sbatch_directives(lines)
78+
strategy._append_sbatch_directives(lines)
7679

7780
assert f"#SBATCH -N {tr.num_nodes}" not in lines
7881
assert f"#SBATCH --nodelist={','.join(tr.nodes)}" in lines
@@ -88,14 +91,16 @@ def test_only_num_nodes(strategy_fixture: SlurmCommandGenStrategy):
8891
assert f"#SBATCH --nodelist={','.join(strategy_fixture.test_run.nodes)}" not in lines
8992

9093

91-
def test_only_nodes(strategy_fixture: SlurmCommandGenStrategy):
92-
tr = make_test_run(name="test_job", output_dir=strategy_fixture.system.output_path)
94+
def test_only_nodes(slurm_system: SlurmSystem):
95+
tr = make_test_run(name="test_job", output_dir=slurm_system.output_path)
9396
tr.num_nodes = 0
9497
tr.nodes = ["node1", "node2"]
9598
tr.output_path.mkdir(parents=True, exist_ok=True)
9699

100+
strategy = MySlurmCommandGenStrategy(slurm_system, tr)
101+
97102
lines: list[str] = []
98-
strategy_fixture._append_sbatch_directives(lines)
103+
strategy._append_sbatch_directives(lines)
99104

100105
assert f"#SBATCH --nodelist={','.join(tr.nodes)}" in lines
101106
assert f"#SBATCH -N {tr.num_nodes}" not in lines
@@ -329,16 +334,18 @@ def test_gen_srun_prefix_tr_extra_srun_args(strategy_fixture: SlurmCommandGenStr
329334
assert "--arg val --flag" in srun_prefix # added as a single element
330335

331336

332-
def test_append_distribution_and_hostfile_with_nodes(strategy_fixture: SlurmCommandGenStrategy) -> None:
333-
strategy_fixture.system.distribution = "block"
334-
strategy_fixture.system.ntasks_per_node = 2
337+
def test_append_distribution_and_hostfile_with_nodes(slurm_system: SlurmSystem, testrun_fixture: TestRun) -> None:
338+
slurm_system.distribution = "block"
339+
slurm_system.ntasks_per_node = 2
340+
testrun_fixture.nodes = ["node1", "node2"]
341+
strategy = MySlurmCommandGenStrategy(slurm_system, testrun_fixture)
335342
content: List[str] = []
336-
strategy_fixture._append_nodes_related_directives(content)
343+
strategy._append_nodes_related_directives(content)
337344

338345
assert "#SBATCH --distribution=arbitrary" in content
339346
assert "#SBATCH --nodelist=node1,node2" in content
340347

341-
hostfile_path = strategy_fixture.test_run.output_path / "hostfile.txt"
348+
hostfile_path = strategy.test_run.output_path / "hostfile.txt"
342349
assert hostfile_path.exists()
343350
lines: List[str] = hostfile_path.read_text().splitlines()
344351
assert lines == ["node1", "node1", "node2", "node2"]
@@ -352,3 +359,17 @@ def test_distribution_fallback_when_no_nodes(strategy_fixture: SlurmCommandGenSt
352359

353360
assert "#SBATCH --distribution=cyclic" in content
354361
assert "#SBATCH --nodelist=" not in content
362+
363+
364+
def test_nodelist_over_num_nodes(slurm_system: SlurmSystem, testrun_fixture: TestRun) -> None:
365+
testrun_fixture.nodes = ["nodeA", "nodeB", "nodeC"]
366+
testrun_fixture.num_nodes = 5
367+
strategy = MySlurmCommandGenStrategy(slurm_system, testrun_fixture)
368+
content: list[str] = []
369+
370+
strategy._append_sbatch_directives(content)
371+
assert "#SBATCH --nodelist=nodeA,nodeB,nodeC" in content
372+
assert "#SBATCH -N" not in content
373+
374+
cmd = strategy.gen_srun_prefix(with_num_nodes=True)
375+
assert " -N" not in " ".join(cmd)

tests/test_slurm_system.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,8 @@ def test_per_test_isolation(self, mock_get_nodes: Mock, slurm_system: SlurmSyste
437437
ConcreteSlurmStrategy(slurm_system, test_run),
438438
)
439439

440+
assert mock_get_nodes.call_count == 0
441+
440442
res = strategy1.get_cached_nodes_spec()
441443
assert mock_get_nodes.call_count == 1
442444
assert res == (2, ["node01", "node02"])

0 commit comments

Comments
 (0)