Skip to content

Commit d947e02

Browse files
authored
Merge pull request #540 from NVIDIA/am/nixlbench
Add NIXL bench workload
2 parents 889a44e + 106e1a6 commit d947e02

File tree

8 files changed

+324
-1
lines changed

8 files changed

+324
-1
lines changed

src/cloudai/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@
113113
NeMoRunSlurmCommandGenStrategy,
114114
NeMoRunTestDefinition,
115115
)
116+
from .workloads.nixl_bench import NIXLBenchSlurmCommandGenStrategy, NIXLBenchTestDefinition
116117
from .workloads.sleep import (
117118
SleepGradingStrategy,
118119
SleepKubernetesJsonGenStrategy,
@@ -168,6 +169,7 @@
168169
CommandGenStrategy, [SlurmSystem], [NeMoLauncherTestDefinition], NeMoLauncherSlurmCommandGenStrategy
169170
)
170171
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoRunTestDefinition], NeMoRunSlurmCommandGenStrategy)
172+
Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NIXLBenchTestDefinition], NIXLBenchSlurmCommandGenStrategy)
171173

172174
Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncherTestDefinition], NeMoLauncherGradingStrategy)
173175
Registry().add_strategy(
@@ -199,6 +201,7 @@
199201
SlurmContainerTestDefinition,
200202
MegatronRunTestDefinition,
201203
TritonInferenceTestDefinition,
204+
NIXLBenchTestDefinition,
202205
],
203206
SlurmJobIdRetrievalStrategy,
204207
)
@@ -242,6 +245,7 @@
242245
SlurmContainerTestDefinition,
243246
MegatronRunTestDefinition,
244247
TritonInferenceTestDefinition,
248+
NIXLBenchTestDefinition,
245249
],
246250
DefaultJobStatusRetrievalStrategy,
247251
)
@@ -295,6 +299,7 @@
295299
Registry().add_test_definition("SlurmContainer", SlurmContainerTestDefinition)
296300
Registry().add_test_definition("MegatronRun", MegatronRunTestDefinition)
297301
Registry().add_test_definition("TritonInference", TritonInferenceTestDefinition)
302+
Registry().add_test_definition("NIXLBench", NIXLBenchTestDefinition)
298303

299304
Registry().add_agent("grid_search", GridSearchAgent)
300305

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from .nixl_bench import NIXLBenchCmdArgs, NIXLBenchTestDefinition
18+
from .slurm_command_gen_strategy import NIXLBenchSlurmCommandGenStrategy
19+
20+
__all__ = [
21+
"NIXLBenchCmdArgs",
22+
"NIXLBenchSlurmCommandGenStrategy",
23+
"NIXLBenchTestDefinition",
24+
]
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from typing import Optional
18+
19+
from cloudai import DockerImage, Installable
20+
21+
from ...models.workload import CmdArgs, TestDefinition
22+
23+
24+
class NIXLBenchCmdArgs(CmdArgs):
    """Arguments accepted by a NIXL Bench workload."""

    # URL of the container image used for the benchmark itself.
    docker_image_url: str
    # Endpoint string handed to the benchmark to locate the etcd server.
    etcd_endpoint: str
29+
30+
31+
class NIXLBenchTestDefinition(TestDefinition):
    """Describe a NIXL Bench test: its arguments plus the two container images it needs."""

    cmd_args: NIXLBenchCmdArgs
    etcd_image_url: str
    # Lazily created image objects; populated on first property access.
    _nixl_image: Optional[DockerImage] = None
    _etcd_image: Optional[DockerImage] = None

    @property
    def docker_image(self) -> DockerImage:
        """Return the benchmark image, creating it from ``cmd_args.docker_image_url`` on first use."""
        image = self._nixl_image
        if not image:
            image = DockerImage(url=self.cmd_args.docker_image_url)
            self._nixl_image = image
        return image

    @property
    def etcd_image(self) -> DockerImage:
        """Return the etcd image, creating it from ``etcd_image_url`` on first use."""
        image = self._etcd_image
        if not image:
            image = DockerImage(url=self.etcd_image_url)
            self._etcd_image = image
        return image

    @property
    def installables(self) -> list[Installable]:
        """Return everything to install: benchmark image, git repos, then the etcd image."""
        items: list[Installable] = [self.docker_image]
        items.extend(self.git_repos)
        items.append(self.etcd_image)
        return items
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from typing import Any, cast
18+
19+
from cloudai import TestRun
20+
from cloudai.systems.slurm.slurm_system import SlurmSystem
21+
from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy
22+
23+
from .nixl_bench import NIXLBenchTestDefinition
24+
25+
26+
class NIXLBenchSlurmCommandGenStrategy(SlurmCommandGenStrategy):
    """
    Command generation strategy for NIXL Bench tests.

    Two ``srun`` steps are produced: one that launches etcd in the background and
    one that runs ``nixlbench``. Each step uses a different container image, so the
    image reported by :meth:`image_path` is switched temporarily while the
    corresponding ``srun`` prefix is being generated.
    """

    def __init__(self, system: SlurmSystem, cmd_args: dict[str, Any]) -> None:
        """Initialise the strategy with no container image selected."""
        super().__init__(system, cmd_args)

        # Image for the srun step currently being generated; None outside of generation.
        self._current_image_url: str | None = None

    def image_path(self, tr: TestRun) -> str | None:
        """Return the image selected for the step being generated, or None if there is none."""
        return self._current_image_url

    def _container_mounts(self, tr: TestRun) -> list[str]:
        """Return the workload-specific container mounts (none for NIXL Bench)."""
        return []

    def _gen_srun_command(
        self, env_vars: dict[str, str | list[str]], cmd_args: dict[str, str | list[str]], tr: TestRun
    ) -> str:
        """
        Build the script fragment that starts etcd, waits, then runs the benchmark.

        ``env_vars`` and ``cmd_args`` are accepted for interface compatibility and are
        not used here; all values come from the test definition attached to ``tr``.

        Returns:
            The etcd ``srun`` line, a ``sleep 5`` line and the nixlbench ``srun`` line,
            separated by newlines.
        """
        etcd_command: list[str] = self.gen_etcd_srun_command(tr)
        nixl_command: list[str] = self.gen_nixl_srun_command(tr)
        # The fixed pause gives the backgrounded etcd step time to come up.
        return " ".join(etcd_command) + "\nsleep 5\n" + " ".join(nixl_command)

    def gen_etcd_srun_command(self, tr: TestRun) -> list[str]:
        """
        Build the ``srun`` command that launches etcd in the background.

        The step is restricted to a single task on ``$SLURM_JOB_MASTER_NODE`` and ends
        with ``&`` so the batch script continues while etcd keeps running.
        """
        tdef: NIXLBenchTestDefinition = cast(NIXLBenchTestDefinition, tr.test.test_definition)
        etcd_cmd = [
            "/usr/local/bin/etcd",
            "--listen-client-urls",
            "http://0.0.0.0:2379",
            "--advertise-client-urls",
            "http://$(hostname -I | awk '{print $1}'):2379",
        ]
        etcd_line = " ".join(etcd_cmd)

        self._current_image_url = str(tdef.etcd_image.installed_path)
        try:
            return [
                *self.gen_srun_prefix(tr),
                "--overlap",
                "--ntasks-per-node=1",
                "--ntasks=1",
                "--nodelist=$SLURM_JOB_MASTER_NODE",
                "-N1",
                "bash",
                "-c",
                f'"{etcd_line}" &',
            ]
        finally:
            # Always clear the override, even if prefix generation raised, so a later
            # image_path() call never reports a stale image.
            self._current_image_url = None

    def gen_nixlbench_command(self, tr: TestRun) -> list[str]:
        """
        Build the ``nixlbench`` invocation as a list of ``--option value`` strings.

        ``etcd_endpoint`` is emitted as ``--etcd-endpoints``; ``docker_image_url`` is
        omitted; every other dumped cmd_args field is emitted as ``--<name> <value>``
        with the field name used verbatim.
        """
        tdef: NIXLBenchTestDefinition = cast(NIXLBenchTestDefinition, tr.test.test_definition)
        cmd = ["./nixlbench", f"--etcd-endpoints {tdef.cmd_args.etcd_endpoint}"]

        other_args = tdef.cmd_args.model_dump(exclude={"docker_image_url", "etcd_endpoint"})
        cmd.extend(f"--{k} {v}" for k, v in other_args.items())

        return cmd

    def gen_nixl_srun_command(self, tr: TestRun) -> list[str]:
        """
        Build the ``srun`` command that runs nixlbench with one task per node.

        The node count comes from ``get_cached_nodes_spec`` and is used for both
        ``--ntasks`` and ``-N``.
        """
        tdef: NIXLBenchTestDefinition = cast(NIXLBenchTestDefinition, tr.test.test_definition)

        self._current_image_url = str(tdef.docker_image.installed_path)
        try:
            nnodes, _ = self.get_cached_nodes_spec(tr)
            bench_line = " ".join(self.gen_nixlbench_command(tr))
            return [
                *self.gen_srun_prefix(tr),
                "--overlap",
                "--ntasks-per-node=1",
                f"--ntasks={nnodes}",
                f"-N{nnodes}",
                "bash",
                "-c",
                f'"{bench_line}"',
            ]
        finally:
            # Always clear the override, even on failure, so no stale image leaks out.
            self._current_image_url = None

tests/ref_data/nixl_bench.sbatch

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash
2+
# generated by CloudAI@__CLOUDAI_VERSION__
3+
#SBATCH --job-name=__JOB_NAME__
4+
#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt
5+
#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt
6+
#SBATCH --partition=main
7+
#SBATCH -N 2
8+
9+
export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
10+
11+
srun --export=ALL --mpi=pmix --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
12+
13+
srun --export=ALL --mpi=pmix --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash __INSTALL_DIR__/slurm-metadata.sh
14+
15+
srun --export=ALL --mpi=pmix --container-image=url.com/docker:1 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install --overlap --ntasks-per-node=1 --ntasks=1 --nodelist=$SLURM_JOB_MASTER_NODE -N1 bash -c "/usr/local/bin/etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://$(hostname -I | awk '{print $1}'):2379" &
16+
sleep 5
17+
srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install --overlap --ntasks-per-node=1 --ntasks=2 -N2 bash -c "./nixlbench --etcd-endpoints http://$SLURM_JOB_MASTER_NODE:2379"
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from typing import cast
18+
19+
import pytest
20+
21+
from cloudai._core.test import Test
22+
from cloudai._core.test_scenario import TestRun
23+
from cloudai._core.test_template import TestTemplate
24+
from cloudai.systems.slurm.slurm_system import SlurmSystem
25+
from cloudai.workloads.nixl_bench.nixl_bench import NIXLBenchCmdArgs, NIXLBenchTestDefinition
26+
from cloudai.workloads.nixl_bench.slurm_command_gen_strategy import NIXLBenchSlurmCommandGenStrategy
27+
28+
29+
@pytest.fixture
def nixl_bench_tr(slurm_system: SlurmSystem):
    """Provide a two-node NIXL Bench test run bound to the ``slurm_system`` fixture."""
    template = TestTemplate(slurm_system)
    bench_args = NIXLBenchCmdArgs(
        docker_image_url="docker.io/library/ubuntu:22.04",
        etcd_endpoint="http://127.0.0.1:2379",
    )
    definition = NIXLBenchTestDefinition(
        etcd_image_url="docker.io/library/etcd:3.5.1",
        cmd_args=bench_args,
        name="nixl-bench",
        description="NIXL Bench",
        test_template_name="NIXLBench",
    )
    test = Test(test_template=template, test_definition=definition)
    return TestRun(name="nixl-bench", num_nodes=2, nodes=[], test=test)
48+
49+
50+
class TestNIXLBenchCommand:
    """Checks for the bare ``nixlbench`` command line produced by the strategy."""

    def test_default(self, nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
        """Only the etcd endpoint is emitted when no extra arguments are set."""
        generated = NIXLBenchSlurmCommandGenStrategy(slurm_system, {}).gen_nixlbench_command(nixl_bench_tr)
        assert generated == ["./nixlbench", "--etcd-endpoints http://127.0.0.1:2379"]

    def test_can_set_any_cmd_arg(self, nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
        """Arbitrary extra arguments are forwarded as ``--name value`` with the name unchanged."""
        in_args = {"backend": "MPI", "dashed-opt": "DRAM", "under_score_opt": "VRAM"}
        required = {"docker_image_url": "docker.io/library/ubuntu:22.04", "etcd_endpoint": "http://127.0.0.1:2379"}
        validated = NIXLBenchCmdArgs.model_validate({**required, **in_args})
        strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, {})
        nixl_bench_tr.test.test_definition.cmd_args = validated

        rendered = " ".join(strategy.gen_nixlbench_command(nixl_bench_tr))

        for name, value in in_args.items():
            assert f"--{name} {value}" in rendered
68+
69+
70+
def test_gen_etcd_srun_command(nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
    """The etcd srun step carries the etcd invocation, its image and the single-node flags."""
    strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, {})
    cmd = " ".join(strategy.gen_etcd_srun_command(nixl_bench_tr))

    etcd_invocation = (
        "/usr/local/bin/etcd --listen-client-urls http://0.0.0.0:2379 "
        "--advertise-client-urls http://$(hostname -I | awk '{print $1}'):2379"
    )
    assert etcd_invocation in cmd

    tdef: NIXLBenchTestDefinition = cast(NIXLBenchTestDefinition, nixl_bench_tr.test.test_definition)
    expected_fragments = (
        f"--container-image={tdef.etcd_image.installed_path}",
        "--container-mounts",
        "--overlap",
        "--ntasks-per-node=1",
        "--ntasks=1",
        "--nodelist=$SLURM_JOB_MASTER_NODE",
        "-N1",
    )
    for fragment in expected_fragments:
        assert fragment in cmd
86+
87+
88+
def test_gen_nixl_srun_command(nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
    """The nixlbench srun step uses the benchmark image and spans every node of the run."""
    strategy = NIXLBenchSlurmCommandGenStrategy(slurm_system, {})
    cmd = " ".join(strategy.gen_nixl_srun_command(nixl_bench_tr))

    tdef: NIXLBenchTestDefinition = cast(NIXLBenchTestDefinition, nixl_bench_tr.test.test_definition)
    expected_fragments = (
        f"--container-image={tdef.docker_image.installed_path}",
        "--overlap",
        "--ntasks-per-node=1",
        f"--ntasks={nixl_bench_tr.num_nodes}",
        f"-N{nixl_bench_tr.num_nodes}",
    )
    for fragment in expected_fragments:
        assert fragment in cmd
97+
98+
99+
def test_gen_srun_command(nixl_bench_tr: TestRun, slurm_system: SlurmSystem):
    """The combined script fragment contains the pause between the two srun steps."""
    script = NIXLBenchSlurmCommandGenStrategy(slurm_system, {})._gen_srun_command({}, {}, nixl_bench_tr)
    assert "sleep 5" in script

tests/test_acceptance.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
NeMoLauncherTestDefinition,
4949
)
5050
from cloudai.workloads.nemo_run import NeMoRunCmdArgs, NeMoRunSlurmCommandGenStrategy, NeMoRunTestDefinition
51+
from cloudai.workloads.nixl_bench import NIXLBenchCmdArgs, NIXLBenchSlurmCommandGenStrategy, NIXLBenchTestDefinition
5152
from cloudai.workloads.sleep import SleepCmdArgs, SleepSlurmCommandGenStrategy, SleepTestDefinition
5253
from cloudai.workloads.slurm_container import (
5354
SlurmContainerCmdArgs,
@@ -268,6 +269,7 @@ def build_special_test_run(
268269
"slurm_container",
269270
"megatron-run",
270271
"triton-inference",
272+
"nixl_bench",
271273
]
272274
)
273275
def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> Tuple[TestRun, str, Optional[str]]:
@@ -357,6 +359,21 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
357359
),
358360
TritonInferenceSlurmCommandGenStrategy,
359361
),
362+
"nixl_bench": lambda: create_test_run(
363+
partial_tr,
364+
slurm_system,
365+
"nixl_bench",
366+
NIXLBenchTestDefinition(
367+
name="nixl_bench",
368+
description="nixl_bench",
369+
test_template_name="nixl_bench",
370+
etcd_image_url="url.com/docker:1",
371+
cmd_args=NIXLBenchCmdArgs(
372+
docker_image_url="url.com/docker:2", etcd_endpoint="http://$SLURM_JOB_MASTER_NODE:2379"
373+
),
374+
),
375+
NIXLBenchSlurmCommandGenStrategy,
376+
),
360377
}
361378

362379
if request.param.startswith(("gpt-", "grok-", "nemo-run-", "nemo-launcher")):
@@ -373,6 +390,8 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
373390
tr.num_nodes = 3
374391
tr.test.test_definition.extra_env_vars["NIM_MODEL_NAME"] = str(tr.output_path)
375392
tr.test.test_definition.extra_env_vars["NIM_CACHE_PATH"] = str(tr.output_path)
393+
if request.param == "nixl_bench":
394+
tr.num_nodes = 2
376395
return tr, f"{request.param}.sbatch", None
377396

378397
raise ValueError(f"Unknown test: {request.param}")
@@ -388,6 +407,7 @@ def test_sbatch_generation(slurm_system: SlurmSystem, test_req: tuple[TestRun, s
388407
ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path.parent))
389408
.replace("__JOB_NAME__", "job_name")
390409
.replace("__CLOUDAI_DIR__", str(Path(__file__).parent.parent))
410+
.replace("__INSTALL_DIR__", str(slurm_system.install_path.absolute()))
391411
)
392412
ref = ref.replace("__CLOUDAI_VERSION__", version("cloudai"))
393413

0 commit comments

Comments
 (0)