Skip to content

Commit 0f69871

Browse files
authored
Merge pull request #711 from nsarka/nsarka/ddlb-integration
Add DDLB workload
2 parents 8008f95 + 58ebe25 commit 0f69871

File tree

11 files changed

+374
-1
lines changed

11 files changed

+374
-1
lines changed
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "ddlb_test"
18+
description = "DDLB test configuration"
19+
test_template_name = "DDLBTest"
20+
21+
[cmd_args]
22+
docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest"
23+
primitive = "tp_columnwise"
24+
m = [1024, 8192]
25+
n = 128
26+
k = 1024
27+
dtype = "float16"
28+
num_iterations = 50
29+
num_warmups = 10
30+
# Specify exactly one configuration per impl entry; e.g., do not combine alternatives in a single entry such as "order=AG_before,AG_after".
31+
impl = [
32+
"pytorch;backend=nccl;order=AG_before",
33+
"fuser;algorithm=p2p_pipeline;backend=cuda;order=AG_before",
34+
]
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "ddlb-test"
18+
19+
[[Tests]]
20+
id = "Tests.ddlb"
21+
test_name = "ddlb_test"
22+
num_nodes = 1
23+
time_limit = "00:30:00"

doc/workloads/ddlb.rst

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
DDLB
2+
====
3+
4+
This workload (``test_template_name`` is ``DDLBTest``) allows you to execute DDLB (Distributed Deep Learning Benchmarks) within the CloudAI framework. Please find the DDLB README at https://github.com/samnordmann/ddlb.
5+
6+
Usage Example
7+
-------------
8+
9+
Test TOML example:
10+
11+
.. code-block:: toml
12+
13+
name = "my_ddlb_test"
14+
description = "Example DDLB test"
15+
test_template_name = "DDLBTest"
16+
17+
[cmd_args]
18+
docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest"
19+
primitive = "tp_columnwise"
20+
dtype = "float16"
21+
22+
Test Scenario example:
23+
24+
.. code-block:: toml
25+
26+
name = "ddlb-test"
27+
28+
[[Tests]]
29+
id = "ddlb.1"
30+
num_nodes = 1
31+
time_limit = "00:10:00"
32+
33+
test_name = "my_ddlb_test"
34+
35+
Test-in-Scenario example:
36+
37+
.. code-block:: toml
38+
39+
name = "ddlb-test"
40+
41+
[[Tests]]
42+
id = "ddlb.1"
43+
num_nodes = 1
44+
time_limit = "00:10:00"
45+
46+
name = "my_ddlb_test"
47+
description = "Example DDLB test"
48+
test_template_name = "DDLBTest"
49+
50+
[Tests.cmd_args]
51+
docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest"
52+
primitive = "tp_columnwise"
53+
m = 1024
54+
n = 128
55+
k = 1024
56+
dtype = "float16"
57+
num_iterations = 50
58+
num_warmups = 5
59+
impl = "pytorch;backend=nccl;order=AG_before"
60+
61+
API Documentation
62+
---------------------------------
63+
64+
Command Arguments
65+
~~~~~~~~~~~~~~~~~
66+
67+
.. autoclass:: cloudai.workloads.ddlb.ddlb.DDLBCmdArgs
68+
:members:
69+
:show-inheritance:
70+
71+
Test Definition
72+
~~~~~~~~~~~~~~~
73+
74+
.. autoclass:: cloudai.workloads.ddlb.ddlb.DDLBTestDefinition
75+
:members:
76+
:show-inheritance:
77+

doc/workloads/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ ai_dynamo
1212
bash_cmd
1313
chakra_replay
1414
nccl
15+
ddlb
1516
nemo_run
1617
nixl_bench
1718
nixl_kvbench

src/cloudai/registration.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ def register_all():
6969
ChakraReplaySlurmCommandGenStrategy,
7070
ChakraReplayTestDefinition,
7171
)
72+
from cloudai.workloads.ddlb import (
73+
DDLBTestDefinition,
74+
DDLBTestSlurmCommandGenStrategy,
75+
)
7276
from cloudai.workloads.jax_toolbox import (
7377
GPTTestDefinition,
7478
GrokTestDefinition,
@@ -163,6 +167,7 @@ def register_all():
163167

164168
Registry().add_command_gen_strategy(SlurmSystem, MegatronRunTestDefinition, MegatronRunSlurmCommandGenStrategy)
165169
Registry().add_command_gen_strategy(SlurmSystem, NCCLTestDefinition, NcclTestSlurmCommandGenStrategy)
170+
Registry().add_command_gen_strategy(SlurmSystem, DDLBTestDefinition, DDLBTestSlurmCommandGenStrategy)
166171

167172
Registry().add_command_gen_strategy(SlurmSystem, NeMoLauncherTestDefinition, NeMoLauncherSlurmCommandGenStrategy)
168173
Registry().add_command_gen_strategy(SlurmSystem, NeMoRunTestDefinition, NeMoRunSlurmCommandGenStrategy)
@@ -199,6 +204,7 @@ def register_all():
199204

200205
Registry().add_test_definition("UCCTest", UCCTestDefinition)
201206
Registry().add_test_definition("NcclTest", NCCLTestDefinition)
207+
Registry().add_test_definition("DDLBTest", DDLBTestDefinition)
202208
Registry().add_test_definition("ChakraReplay", ChakraReplayTestDefinition)
203209
Registry().add_test_definition("Sleep", SleepTestDefinition)
204210
Registry().add_test_definition("NeMoLauncher", NeMoLauncherTestDefinition)
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from .ddlb import DDLBCmdArgs, DDLBTestDefinition
18+
from .slurm_command_gen_strategy import DDLBTestSlurmCommandGenStrategy
19+
20+
__all__ = [
21+
"DDLBCmdArgs",
22+
"DDLBTestDefinition",
23+
"DDLBTestSlurmCommandGenStrategy",
24+
]

src/cloudai/workloads/ddlb/ddlb.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from typing import Optional, Union
18+
19+
from cloudai.core import DockerImage, Installable, JobStatusResult, TestRun
20+
from cloudai.models.workload import CmdArgs, TestDefinition
21+
22+
23+
class DDLBCmdArgs(CmdArgs):
    """
    DDLB test command arguments.

    The Slurm command generation strategy walks these fields via ``model_dump()`` and turns each one
    (except ``docker_image_url``) into a flag of ``ddlb/cli/benchmark.py``, so the declaration order
    below is the order in which flags appear on the generated command line.
    """

    # Container image URL; used to build the DockerImage installable and never passed to the benchmark CLI.
    docker_image_url: str
    # Emitted as ``--primitive`` (the shipped example config uses "tp_columnwise").
    primitive: str
    # Emitted as ``-m`` / ``-n`` / ``-k``.
    # NOTE(review): presumably problem dimensions, with a list requesting several values — confirm.
    m: Union[int, list[int]] = 1024
    n: Union[int, list[int]] = 128
    k: Union[int, list[int]] = 1024
    # Emitted as ``--dtype`` (the shipped example config uses "float16").
    dtype: str
    # Emitted as ``--num-iterations`` (underscore replaced by a dash).
    num_iterations: int = 50
    # Emitted as ``--num-warmups`` (underscore replaced by a dash).
    num_warmups: int = 5
    # Emitted as ``--impl``; the examples use the form "<name>;key=value;key=value".
    impl: Union[str, list[str]] = "pytorch;backend=nccl;order=AG_before"
35+
36+
37+
class DDLBTestDefinition(TestDefinition):
    """
    Test definition for the DDLB workload.

    Holds the DDLB command arguments, exposes the container image as an installable, and decides
    whether a finished run succeeded by inspecting its ``stdout.txt``.
    """

    cmd_args: DDLBCmdArgs
    _docker_image: Optional[DockerImage] = None

    @property
    def extra_args_str(self) -> str:
        """
        Render ``extra_cmd_args`` as one space-separated string.

        An entry with a truthy value becomes ``"<key> <value>"``; an entry with a falsy value
        becomes just ``"<key>"``.
        """
        return " ".join(f"{name} {val}" if val else name for name, val in self.extra_cmd_args.items())

    @property
    def docker_image(self) -> DockerImage:
        """Return the DockerImage for ``cmd_args.docker_image_url``, creating it on first access."""
        if self._docker_image:
            return self._docker_image

        self._docker_image = DockerImage(url=self.cmd_args.docker_image_url)
        return self._docker_image

    @property
    def installables(self) -> list[Installable]:
        """Return the items to install for this test: only the container image."""
        image = self.docker_image
        return [image]

    def was_run_successful(self, tr: TestRun) -> JobStatusResult:
        """
        Judge a DDLB run from the contents of ``stdout.txt`` in the run's output directory.

        Args:
            tr: The test run whose ``output_path`` is inspected.

        Returns:
            A failed result when ``stdout.txt`` is missing, contains the substring ``Error``, or
            lacks the substring ``Benchmark Results``; a successful result otherwise.
        """
        stdout_path = tr.output_path / "stdout.txt"

        # No output file at all: nothing to inspect.
        if not stdout_path.is_file():
            missing_file_message = (
                f"stdout.txt file not found in the specified output directory {tr.output_path}. "
                "This file is expected to be created as a result of the DDLB test run. "
                "Please ensure the DDLB test was executed properly and that stdout.txt is generated. "
                f"You can run the generated DDLB test command manually and verify the creation of {stdout_path}. "
                "If the issue persists, contact the system administrator."
            )
            return JobStatusResult(is_successful=False, error_message=missing_file_message)

        content = stdout_path.read_text()

        # An explicit error marker takes precedence over the success marker.
        if "Error" in content:
            failure_message = (
                f"DDLB test failure detected in {stdout_path}. "
                "Possible reasons include network errors or remote process exits. "
                "Please review the DDLB test output and errors in the file first. "
                "If the issue persists, contact the system administrator."
            )
            return JobStatusResult(is_successful=False, error_message=failure_message)

        # The success marker is present: the run is considered successful.
        if "Benchmark Results" in content:
            return JobStatusResult(is_successful=True)

        error_message = (
            f"Missing success indicators in {stdout_path}: 'Benchmark Results'. "
            "These keywords are expected to be present in stdout.txt, usually towards the end of the file. "
            "Please review the DDLB test output and errors in the file. "
            "Ensure the DDLB test ran to completion. You can run the generated sbatch script manually "
            f"and check if {stdout_path} is created and contains the expected keywords. "
            "If the issue persists, contact the system administrator."
        )
        return JobStatusResult(is_successful=False, error_message=error_message)
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from typing import List, cast
18+
19+
from cloudai.systems.slurm import SlurmCommandGenStrategy
20+
21+
from .ddlb import DDLBTestDefinition
22+
23+
24+
class DDLBTestSlurmCommandGenStrategy(SlurmCommandGenStrategy):
25+
"""Command generation strategy for DDLB tests on Slurm systems."""
26+
27+
def _container_mounts(self) -> List[str]:
28+
return []
29+
30+
def image_path(self) -> str | None:
31+
tdef: DDLBTestDefinition = cast(DDLBTestDefinition, self.test_run.test)
32+
return str(tdef.docker_image.installed_path)
33+
34+
def generate_test_command(self) -> List[str]:
35+
tdef: DDLBTestDefinition = cast(DDLBTestDefinition, self.test_run.test)
36+
srun_command_parts = ["python ddlb/cli/benchmark.py"]
37+
ddlb_test_args = tdef.cmd_args.model_dump().keys()
38+
for arg in ddlb_test_args:
39+
if arg == "docker_image_url":
40+
continue
41+
42+
value = getattr(tdef.cmd_args, arg)
43+
if value is None:
44+
continue
45+
46+
match arg:
47+
case "m" | "n" | "k":
48+
srun_command_parts.append(f"-{arg} {value}")
49+
case "num_iterations" | "num_warmups":
50+
srun_command_parts.append(f"--{arg.replace('_', '-')} {value}")
51+
case _:
52+
srun_command_parts.append(f"--{arg} {value}")
53+
54+
if self.test_run.test.extra_cmd_args:
55+
srun_command_parts.append(self.test_run.test.extra_args_str)
56+
57+
return srun_command_parts
58+
59+
def gen_srun_success_check(self) -> str:
60+
output_file = self.test_run.output_path / "stdout.txt"
61+
return f'grep -q "Benchmark Results" {output_file} && echo 1 || echo 0'

tests/ref_data/ddlb.sbatch

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash
2+
# generated by CloudAI@__CLOUDAI_VERSION__
3+
#SBATCH --job-name=__JOB_NAME__
4+
#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt
5+
#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt
6+
#SBATCH --partition=main
7+
#SBATCH -N 1
8+
#SBATCH --gpus-per-node=8
9+
#SBATCH --gres=gpu:8
10+
11+
export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
12+
13+
srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
14+
15+
srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
16+
17+
srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python ddlb/cli/benchmark.py --primitive tp_columnwise -m 1024 -n 128 -k 1024 --dtype float16 --num-iterations 50 --num-warmups 5 --impl pytorch;backend=nccl;order=AG_before"

0 commit comments

Comments
 (0)