Merge pull request #711 from nsarka/nsarka/ddlb-integration

amaslenn · web-flow · commit 0f698718478a · 2025-11-20T23:43:47.000+01:00
Add DDLB workload
diff --git a/conf/experimental/test/ddlb_test.toml b/conf/experimental/test/ddlb_test.toml
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "ddlb_test"
+description = "DDLB test configuration"
+test_template_name = "DDLBTest"
+
+[cmd_args]
+docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest"
+primitive = "tp_columnwise"
+m = [1024, 8192]
+n = 128
+k = 1024
+dtype = "float16"
+num_iterations = 50
+num_warmups = 10
+# Specify exactly one configuration per impl entry, e.g., do not combine options in one entry as in "order=AG_before,AG_after".
+impl = [
+  "pytorch;backend=nccl;order=AG_before",
+  "fuser;algorithm=p2p_pipeline;backend=cuda;order=AG_before",
+]
diff --git a/conf/experimental/test_scenario/ddlb_test.toml b/conf/experimental/test_scenario/ddlb_test.toml
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "ddlb-test"
+
+[[Tests]]
+id = "Tests.ddlb"
+test_name = "ddlb_test"
+num_nodes = 1
+time_limit = "00:30:00"
diff --git a/doc/workloads/ddlb.rst b/doc/workloads/ddlb.rst
@@ -0,0 +1,77 @@
+DDLB
+====
+
+This workload (``test_template_name`` is ``DDLBTest``) allows you to execute DDLB (Distributed Deep Learning Benchmarks) within the CloudAI framework. Please find the DDLB README at https://github.com/samnordmann/ddlb.
+
+Usage Example
+-------------
+
+Test TOML example:
+
+.. code-block:: toml
+
+   name = "my_ddlb_test"
+   description = "Example DDLB test"
+   test_template_name = "DDLBTest"
+
+   [cmd_args]
+   docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest"
+   primitive = "tp_columnwise"
+   dtype = "float16"
+
+Test Scenario example:
+
+.. code-block:: toml
+
+   name = "ddlb-test"
+
+   [[Tests]]
+   id = "ddlb.1"
+   num_nodes = 1
+   time_limit = "00:10:00"
+
+   test_name = "my_ddlb_test"
+
+Test-in-Scenario example:
+
+.. code-block:: toml
+
+   name = "ddlb-test"
+
+   [[Tests]]
+   id = "ddlb.1"
+   num_nodes = 1
+   time_limit = "00:10:00"
+
+   name = "my_ddlb_test"
+   description = "Example DDLB test"
+   test_template_name = "DDLBTest"
+
+     [Tests.cmd_args]
+     docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest"
+     primitive = "tp_columnwise"
+     m = 1024
+     n = 128
+     k = 1024
+     dtype = "float16"
+     num_iterations = 50
+     num_warmups = 5
+     impl = "pytorch;backend=nccl;order=AG_before"
+
+API Documentation
+---------------------------------
+
+Command Arguments
+~~~~~~~~~~~~~~~~~
+
+.. autoclass:: cloudai.workloads.ddlb.ddlb.DDLBCmdArgs
+   :members:
+   :show-inheritance:
+
+Test Definition
+~~~~~~~~~~~~~~~
+
+.. autoclass:: cloudai.workloads.ddlb.ddlb.DDLBTestDefinition
+   :members:
+   :show-inheritance:
+
diff --git a/doc/workloads/index.md b/doc/workloads/index.md
@@ -12,6 +12,7 @@ ai_dynamo
 bash_cmd
 chakra_replay
 nccl
+ddlb
 nemo_run
 nixl_bench
 nixl_kvbench
diff --git a/src/cloudai/registration.py b/src/cloudai/registration.py
@@ -69,6 +69,10 @@ def register_all():
         ChakraReplaySlurmCommandGenStrategy,
         ChakraReplayTestDefinition,
     )
+    from cloudai.workloads.ddlb import (
+        DDLBTestDefinition,
+        DDLBTestSlurmCommandGenStrategy,
+    )
     from cloudai.workloads.jax_toolbox import (
         GPTTestDefinition,
         GrokTestDefinition,
@@ -163,6 +167,7 @@ def register_all():
 
     Registry().add_command_gen_strategy(SlurmSystem, MegatronRunTestDefinition, MegatronRunSlurmCommandGenStrategy)
     Registry().add_command_gen_strategy(SlurmSystem, NCCLTestDefinition, NcclTestSlurmCommandGenStrategy)
+    Registry().add_command_gen_strategy(SlurmSystem, DDLBTestDefinition, DDLBTestSlurmCommandGenStrategy)
 
     Registry().add_command_gen_strategy(SlurmSystem, NeMoLauncherTestDefinition, NeMoLauncherSlurmCommandGenStrategy)
     Registry().add_command_gen_strategy(SlurmSystem, NeMoRunTestDefinition, NeMoRunSlurmCommandGenStrategy)
@@ -199,6 +204,7 @@ def register_all():
 
     Registry().add_test_definition("UCCTest", UCCTestDefinition)
     Registry().add_test_definition("NcclTest", NCCLTestDefinition)
+    Registry().add_test_definition("DDLBTest", DDLBTestDefinition)
     Registry().add_test_definition("ChakraReplay", ChakraReplayTestDefinition)
     Registry().add_test_definition("Sleep", SleepTestDefinition)
     Registry().add_test_definition("NeMoLauncher", NeMoLauncherTestDefinition)
diff --git a/src/cloudai/workloads/ddlb/__init__.py b/src/cloudai/workloads/ddlb/__init__.py
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .ddlb import DDLBCmdArgs, DDLBTestDefinition
+from .slurm_command_gen_strategy import DDLBTestSlurmCommandGenStrategy
+
+__all__ = [
+    "DDLBCmdArgs",
+    "DDLBTestDefinition",
+    "DDLBTestSlurmCommandGenStrategy",
+]
diff --git a/src/cloudai/workloads/ddlb/ddlb.py b/src/cloudai/workloads/ddlb/ddlb.py
@@ -0,0 +1,108 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+from cloudai.core import DockerImage, Installable, JobStatusResult, TestRun
+from cloudai.models.workload import CmdArgs, TestDefinition
+
+
+class DDLBCmdArgs(CmdArgs):
+    """Command arguments for a DDLB benchmark run."""
+
+    docker_image_url: str
+    primitive: str
+    # m, n and k each accept either a single int or a list of ints.
+    m: Union[int, list[int]] = 1024
+    n: Union[int, list[int]] = 128
+    k: Union[int, list[int]] = 1024
+    dtype: str
+    num_iterations: int = 50
+    num_warmups: int = 5
+    # A single implementation spec string, or a list of such strings.
+    impl: Union[str, list[str]] = "pytorch;backend=nccl;order=AG_before"
+
+
+class DDLBTestDefinition(TestDefinition):
+    """DDLB test definition: its arguments, container image and run-success check."""
+
+    cmd_args: DDLBCmdArgs
+    _docker_image: Optional[DockerImage] = None
+
+    @property
+    def extra_args_str(self) -> str:
+        """Join ``extra_cmd_args`` into one string, emitting keys with falsy values as bare flags."""
+        return " ".join(f"{name} {val}" if val else name for name, val in self.extra_cmd_args.items())
+
+    @property
+    def docker_image(self) -> DockerImage:
+        """Return the DockerImage for ``cmd_args.docker_image_url``, creating it on first access."""
+        if self._docker_image:
+            return self._docker_image
+        self._docker_image = DockerImage(url=self.cmd_args.docker_image_url)
+        return self._docker_image
+
+    @property
+    def installables(self) -> list[Installable]:
+        """List what must be installed for this test: only the Docker image."""
+        return [self.docker_image]
+
+    def was_run_successful(self, tr: TestRun) -> JobStatusResult:
+        """Decide whether the run succeeded by inspecting ``stdout.txt`` in the run's output directory."""
+        stdout_path = tr.output_path / "stdout.txt"
+
+        # Nothing to inspect if the job never produced stdout.txt.
+        if not stdout_path.is_file():
+            return JobStatusResult(
+                is_successful=False,
+                error_message=(
+                    f"stdout.txt file not found in the specified output directory {tr.output_path}. "
+                    "This file is expected to be created as a result of the DDLB test run. "
+                    "Please ensure the DDLB test was executed properly and that stdout.txt is generated. "
+                    f"You can run the generated DDLB test command manually and verify the creation of {stdout_path}. "
+                    "If the issue persists, contact the system administrator."
+                ),
+            )
+
+        content = stdout_path.read_text()
+
+        # Any occurrence of "Error" marks the run as failed, even if results were printed.
+        if "Error" in content:
+            return JobStatusResult(
+                is_successful=False,
+                error_message=(
+                    f"DDLB test failure detected in {stdout_path}. "
+                    "Possible reasons include network errors or remote process exits. "
+                    "Please review the DDLB test output and errors in the file first. "
+                    "If the issue persists, contact the system administrator."
+                ),
+            )
+
+        # The run counts as complete only when the results marker is present.
+        if "Benchmark Results" in content:
+            return JobStatusResult(is_successful=True)
+
+        return JobStatusResult(
+            is_successful=False,
+            error_message=(
+                f"Missing success indicators in {stdout_path}: 'Benchmark Results'. "
+                "These keywords are expected to be present in stdout.txt, usually towards the end of the file. "
+                "Please review the DDLB test output and errors in the file. "
+                "Ensure the DDLB test ran to completion. You can run the generated sbatch script manually "
+                f"and check if {stdout_path} is created and contains the expected keywords. "
+                "If the issue persists, contact the system administrator."
+            ),
+        )
diff --git a/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py
@@ -0,0 +1,71 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import shlex
+from typing import List, cast
+
+from cloudai.systems.slurm import SlurmCommandGenStrategy
+
+from .ddlb import DDLBTestDefinition
+
+
+class DDLBTestSlurmCommandGenStrategy(SlurmCommandGenStrategy):
+    """Command generation strategy for DDLB tests on Slurm systems."""
+
+    def _container_mounts(self) -> List[str]:
+        """Return the workload-specific container mounts; DDLB adds none."""
+        return []
+
+    def image_path(self) -> str | None:
+        """Return the installed path of the DDLB Docker image as a string."""
+        tdef: DDLBTestDefinition = cast(DDLBTestDefinition, self.test_run.test)
+        return str(tdef.docker_image.installed_path)
+
+    def generate_test_command(self) -> List[str]:
+        """Build the ``benchmark.py`` invocation from the test's command arguments."""
+        tdef: DDLBTestDefinition = cast(DDLBTestDefinition, self.test_run.test)
+        srun_command_parts = ["python ddlb/cli/benchmark.py"]
+        for arg in tdef.cmd_args.model_dump():
+            # The image URL configures the container; it is not a benchmark option.
+            if arg == "docker_image_url":
+                continue
+
+            value = getattr(tdef.cmd_args, arg)
+            if value is None:
+                continue
+
+            match arg:
+                case "m" | "n" | "k":
+                    flag = f"-{arg}"
+                case "num_iterations" | "num_warmups":
+                    flag = f"--{arg.replace('_', '-')}"
+                case _:
+                    flag = f"--{arg}"
+
+            # Values such as "pytorch;backend=nccl;order=AG_before" contain ";", which the shell treats as a
+            # command separator once the parts end up inside the generated `bash -c "..."` command. Quote every
+            # value so it reaches the benchmark as one argument; shell-safe values are left unchanged.
+            srun_command_parts.append(f"{flag} {shlex.quote(str(value))}")
+
+        if tdef.extra_cmd_args:
+            srun_command_parts.append(tdef.extra_args_str)
+
+        return srun_command_parts
+
+    def gen_srun_success_check(self) -> str:
+        """Return a shell snippet that echoes 1 if stdout.txt contains "Benchmark Results", else 0."""
+        output_file = self.test_run.output_path / "stdout.txt"
+        return f'grep -q "Benchmark Results" {output_file} && echo 1 || echo 0'
diff --git a/tests/ref_data/ddlb.sbatch b/tests/ref_data/ddlb.sbatch
@@ -0,0 +1,17 @@
+#!/bin/bash
+# generated by CloudAI@__CLOUDAI_VERSION__
+#SBATCH --job-name=__JOB_NAME__
+#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt
+#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt
+#SBATCH --partition=main
+#SBATCH -N 1
+#SBATCH --gpus-per-node=8
+#SBATCH --gres=gpu:8
+
+export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
+
+srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
+
+srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
+
+srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python ddlb/cli/benchmark.py --primitive tp_columnwise -m 1024 -n 128 -k 1024 --dtype float16 --num-iterations 50 --num-warmups 5 --impl 'pytorch;backend=nccl;order=AG_before'"
diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py
diff --git a/tests/test_init.py b/tests/test_init.py