Skip to content

Commit caa903c

Browse files
authored
Merge pull request #742 from NVIDIA/ako/osu-benchmark
Add workload for OSU Micro Benchmark
2 parents b3b9b23 + 921b9af commit caa903c

File tree

10 files changed

+394
-1
lines changed

10 files changed

+394
-1
lines changed

conf/common/test/osu_test.toml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "osu_test"
18+
test_template_name = "OSUBench"
19+
description = "OSU Benchmark example"
20+
21+
[cmd_args]
22+
"docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3"
23+
"benchmarks_dir" = "/opt/hpcx/ompi/tests/osu-micro-benchmarks"
24+
"benchmark" = "osu_allreduce"
25+
"iterations" = 10
26+
"message_size" = "1024"
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "osu_test_scenario"
18+
job_status_check = true
19+
20+
[[Tests]]
21+
id = "Tests.1"
22+
test_name = "osu_test"
23+
num_nodes = "2"
24+
time_limit = "00:20:00"

doc/workloads/osu.rst

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
OSU
2+
===
3+
4+
This workload (``test_template_name`` is ``OSUBench``) allows you to execute OSU Micro Benchmarks
5+
within the CloudAI framework.
6+
7+
Usage example
8+
-------------
9+
10+
Test example:
11+
12+
.. code-block:: toml
13+
14+
name = "osu_example"
15+
test_template_name = "OSUBench"
16+
description = "OSU Benchmark example"
17+
18+
[cmd_args]
19+
"docker_image_url" = "docker-image-with-osu-benchmark:latest"
20+
"benchmarks_dir" = "/directory/with/osu/binaries/in/container"
21+
"benchmark" = ["osu_allreduce", "osu_allgather"]
22+
"iterations" = 10
23+
"message_size" = "1024"
24+
25+
Test Scenario example:
26+
27+
.. code-block:: toml
28+
29+
name = "osu_example"
30+
31+
[[Tests]]
32+
id = "Tests.1"
33+
test_name = "osu_example"
34+
num_nodes = "2"
35+
time_limit = "00:20:00"
36+
37+
Test-in-Scenario example:
38+
39+
.. code-block:: toml
40+
41+
name = "osu-test"
42+
43+
[[Tests]]
44+
id = "Tests.osu_allreduce"
45+
num_nodes = 2
46+
time_limit = "00:05:00"
47+
48+
name = "osu_example"
49+
description = "OSU allreduce 1KB"
50+
test_template_name = "OSUBench"
51+
52+
[Tests.cmd_args]
53+
docker_image_url = "docker-image-with-osu-benchmark:latest"
54+
benchmarks_dir = "/directory/with/osu/binaries/in/container"
55+
benchmark = "osu_allreduce"
56+
iterations = 10
57+
message_size = "1024"
58+
59+
API Documentation
60+
-----------------
61+
62+
Command Arguments
63+
~~~~~~~~~~~~~~~~~
64+
65+
.. autoclass:: cloudai.workloads.osu_bench.osu_bench.OSUBenchCmdArgs
66+
:members:
67+
:show-inheritance:
68+
69+
Test Definition
70+
~~~~~~~~~~~~~~~
71+
72+
.. autoclass:: cloudai.workloads.osu_bench.osu_bench.OSUBenchTestDefinition
73+
:members:
74+
:show-inheritance:

src/cloudai/registration.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,10 @@ def register_all():
130130
NixlPerftestSlurmCommandGenStrategy,
131131
NixlPerftestTestDefinition,
132132
)
133+
from cloudai.workloads.osu_bench import (
134+
OSUBenchSlurmCommandGenStrategy,
135+
OSUBenchTestDefinition,
136+
)
133137
from cloudai.workloads.sleep import (
134138
SleepGradingStrategy,
135139
SleepKubernetesJsonGenStrategy,
@@ -203,6 +207,7 @@ def register_all():
203207
Registry().add_command_gen_strategy(SlurmSystem, AIDynamoTestDefinition, AIDynamoSlurmCommandGenStrategy)
204208
Registry().add_command_gen_strategy(SlurmSystem, BashCmdTestDefinition, BashCmdCommandGenStrategy)
205209
Registry().add_command_gen_strategy(SlurmSystem, NIXLKVBenchTestDefinition, NIXLKVBenchSlurmCommandGenStrategy)
210+
Registry().add_command_gen_strategy(SlurmSystem, OSUBenchTestDefinition, OSUBenchSlurmCommandGenStrategy)
206211

207212
Registry().add_installer("slurm", SlurmInstaller)
208213
Registry().add_installer("standalone", StandaloneInstaller)
@@ -236,6 +241,7 @@ def register_all():
236241
Registry().add_test_definition("NixlPerftest", NixlPerftestTestDefinition)
237242
Registry().add_test_definition("NIXLKVBench", NIXLKVBenchTestDefinition)
238243
Registry().add_test_definition("Aiconfigurator", AiconfiguratorTestDefinition)
244+
Registry().add_test_definition("OSUBench", OSUBenchTestDefinition)
239245

240246
Registry().add_agent("grid_search", GridSearchAgent)
241247

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from .osu_bench import OSUBenchCmdArgs, OSUBenchTestDefinition
18+
from .slurm_command_gen_strategy import OSUBenchSlurmCommandGenStrategy
19+
20+
__all__ = [
21+
"OSUBenchCmdArgs",
22+
"OSUBenchSlurmCommandGenStrategy",
23+
"OSUBenchTestDefinition",
24+
]
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from __future__ import annotations
18+
19+
from typing import Any, List, Optional, Union
20+
21+
from pydantic import Field
22+
23+
from cloudai.core import DockerImage, Installable, JobStatusResult, TestRun
24+
from cloudai.models.workload import CmdArgs, TestDefinition
25+
26+
27+
class OSUBenchCmdArgs(CmdArgs):
    """Command line arguments for an OSU Benchmark test.

    Field order is significant: it is the order in which the options are dumped
    and therefore the order in which they appear on the generated command line.
    """

    docker_image_url: str
    """URL of the Docker image the test runs in."""

    benchmarks_dir: str
    """Directory, inside the container, that holds the OSU Benchmark binaries."""

    benchmark: Union[str, List[str]]
    """Name of the benchmark to run, for example ``osu_allreduce``."""

    message_size: Union[str, List[str], None] = Field(default=None)
    """Message size for the benchmark.

    Examples::

        128     // min = default, max = 128
        2:128   // min = 2, max = 128
        2:      // min = 2, max = default
    """

    iterations: Optional[int] = Field(default=None)
    """Number of iterations the benchmark performs."""

    warmup: Optional[int] = Field(default=None)
    """Number of warmup iterations skipped before timing starts."""

    mem_limit: Optional[int] = Field(default=None)
    """Maximum memory consumption per process, in bytes."""

    full: bool = Field(default=True)
    """Print the full format listing of results."""
60+
61+
62+
class OSUBenchTestDefinition(TestDefinition):
    """Test definition for OSU Benchmark test.

    Bundles the command arguments, the Docker image to install and the check
    that decides whether a finished run produced valid benchmark output.
    """

    cmd_args: OSUBenchCmdArgs
    _osu_image: DockerImage | None = None

    @property
    def docker_image(self) -> DockerImage:
        """Return the Docker image for this test, creating it on first access."""
        image = self._osu_image
        if not image:
            image = DockerImage(url=self.cmd_args.docker_image_url)
            self._osu_image = image

        return image

    @property
    def installables(self) -> list[Installable]:
        """Return everything that must be installed for this test (the Docker image only)."""
        image = self.docker_image
        return [image]

    @property
    def cmd_args_dict(self) -> dict[str, Any]:
        """Return the command arguments as a dict, without the fields that are not benchmark options."""
        not_benchmark_options = {"docker_image_url", "benchmarks_dir", "benchmark"}
        return self.cmd_args.model_dump(exclude=not_benchmark_options)

    def was_run_successful(self, tr: TestRun) -> JobStatusResult:
        """Decide whether the run succeeded by inspecting ``stdout.txt`` in the run's output directory.

        The run is reported as failed when ``stdout.txt`` is missing, is empty
        (whitespace only), or does not contain the ``# Size`` marker.

        Args:
            tr: Test run whose ``output_path`` holds the job output files.

        Returns:
            A successful ``JobStatusResult``, or a failed one carrying an error message.
        """
        stdout_path = tr.output_path / "stdout.txt"
        stderr_path = tr.output_path / "stderr.txt"

        failure = None
        if not stdout_path.is_file():
            failure = (
                f"stdout.txt file not found in the specified output directory {tr.output_path}. "
                "This file is expected to be created as a result of the OSU Benchmark test run."
            )
        else:
            content = stdout_path.read_text()
            if not content.strip():
                failure = (
                    f"stdout.txt file is empty in the specified output directory {tr.output_path}. "
                    f"Please check for fatal errors in {stderr_path}"
                )
            elif "# Size" not in content:
                # Basic OSU benchmark output format check: the results header marker must be present.
                failure = (
                    f"Expected OSU benchmark output marker not found in stdout.txt in {tr.output_path}. "
                    "Check for errors in the execution or for a different output format."
                )

        if failure is not None:
            return JobStatusResult(is_successful=False, error_message=failure)

        # Additional validation could be added here to verify specific benchmark types
        # based on the full header format once benchmark-specific validation is needed

        return JobStatusResult(is_successful=True)
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from typing import List, cast
18+
19+
from cloudai.systems.slurm import SlurmCommandGenStrategy
20+
21+
from .osu_bench import OSUBenchCmdArgs, OSUBenchTestDefinition
22+
23+
# Benchmark names for which the ``--full`` option is left off the generated
# command line: ``generate_test_command`` skips the ``full`` argument whenever
# the configured benchmark is listed here.
# NOTE(review): presumably these binaries do not accept ``--full``; confirm
# against the OSU Micro-Benchmarks documentation before adding or removing entries.
FULL_FLAG_UNSUPPORTED = [
    "osu_latency",
    "osu_latency_mt",
    "osu_latency_mp",
    "osu_bw",
    "osu_bibw",
    "osu_latency_persistent",
    "osu_bw_persistent",
    "osu_bibw_persistent",
    "osu_multi_lat",
    "osu_mbw_mr",
    "osu_put_latency",
    "osu_get_latency",
    "osu_acc_latency",
    "osu_get_acc_latency",
    "osu_cas_latency",
    "osu_fop_latency",
    "osu_put_bw",
    "osu_get_bw",
    "osu_put_bibw",
    "osu_init",
    "osu_hello",
]
46+
47+
48+
class OSUBenchSlurmCommandGenStrategy(SlurmCommandGenStrategy):
    """Command generation strategy for OSU Benchmark test on Slurm systems."""

    def _container_mounts(self) -> List[str]:
        """Return the workload-specific container mounts; this workload defines none."""
        return []

    def image_path(self) -> str:
        """Return the installed path of the test's Docker image as a string."""
        tdef: OSUBenchTestDefinition = cast(OSUBenchTestDefinition, self.test_run.test)
        return str(tdef.docker_image.installed_path)

    def generate_test_command(self) -> List[str]:
        """Build the benchmark command line from the test's command arguments.

        The first element is the benchmark binary (``<benchmarks_dir>/<benchmark>``).
        Each remaining command argument is turned into a ``--long-option`` (underscores
        become dashes) according to its value:

        * ``None`` - the option is omitted.
        * ``True`` - emitted as a bare switch, e.g. ``--full``.
        * ``False`` - the option is omitted (a disabled switch must not be rendered
          as ``--full False``, which would hand the benchmark a stray ``False`` token).
        * anything else - emitted as ``--option value``.

        ``--full`` is additionally dropped for benchmarks listed in
        ``FULL_FLAG_UNSUPPORTED``. Extra command arguments, when present, are
        appended last.

        Returns:
            The command parts, binary first.
        """
        args: OSUBenchCmdArgs = cast(OSUBenchCmdArgs, self.test_run.test.cmd_args)

        binary = f"{args.benchmarks_dir}/{args.benchmark}"
        srun_command_parts = [binary]

        # Loop-invariant: whether the selected benchmark must not receive ``--full``.
        full_unsupported = args.benchmark in FULL_FLAG_UNSUPPORTED

        for name, value in self.test_run.test.cmd_args_dict.items():
            # Unset options and disabled boolean switches are simply left out.
            if value is None or value is False:
                continue

            if name == "full" and full_unsupported:
                continue

            flag = f"--{name.replace('_', '-')}"
            srun_command_parts.append(flag if value is True else f"{flag} {value}")

        if self.test_run.test.extra_cmd_args:
            srun_command_parts.append(self.test_run.test.extra_args_str)

        return srun_command_parts

tests/ref_data/osu-bench.sbatch

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash
2+
# generated by CloudAI@__CLOUDAI_VERSION__
3+
#SBATCH --job-name=__JOB_NAME__
4+
#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt
5+
#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt
6+
#SBATCH --partition=main
7+
#SBATCH -N 1
8+
#SBATCH --gpus-per-node=8
9+
#SBATCH --gres=gpu:8
10+
11+
export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
12+
13+
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
14+
15+
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
16+
17+
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; /opt/hpcx/ompi/tests/osu-micro-benchmarks/osu_allreduce --message-size 1024 --iterations 10 --full"

0 commit comments

Comments
 (0)