Merge pull request #742 from NVIDIA/ako/osu-benchmark

amaslenn · web-flow · commit caa903c2c79f · 2025-12-19T11:29:12.000+01:00
Add workload for OSU Micro Benchmark
diff --git a/conf/common/test/osu_test.toml b/conf/common/test/osu_test.toml
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Test name; test scenarios refer to this test through `test_name`.
+name = "osu_test"
+# Must match the name the workload is registered under (`add_test_definition("OSUBench", ...)`).
+test_template_name = "OSUBench"
+description = "OSU Benchmark example"
+
+[cmd_args]
+"docker_image_url" = "nvcr.io#nvidia/pytorch:25.06-py3"
+# The executed binary is "<benchmarks_dir>/<benchmark>" inside the container.
+"benchmarks_dir" = "/opt/hpcx/ompi/tests/osu-micro-benchmarks"
+"benchmark" = "osu_allreduce"
+# The remaining arguments are passed to the benchmark as `--iterations 10` and `--message-size 1024`.
+"iterations" = 10
+"message_size" = "1024"
diff --git a/conf/common/test_scenario/osu_test.toml b/conf/common/test_scenario/osu_test.toml
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "osu_test_scenario"
+job_status_check = true
+
+[[Tests]]
+id = "Tests.1"
+# Refers to the test named "osu_test" (conf/common/test/osu_test.toml).
+test_name = "osu_test"
+num_nodes = "2"
+# NOTE(review): presumably HH:MM:SS - confirm against the scenario schema.
+time_limit = "00:20:00"
diff --git a/doc/workloads/osu.rst b/doc/workloads/osu.rst
@@ -0,0 +1,74 @@
+OSU
+===
+
+This workload (``test_template_name`` is ``OSUBench``) allows you to execute OSU Micro Benchmarks
+within the CloudAI framework.
+
+Usage example
+-------------
+
+Test example:
+
+.. code-block:: toml
+
+    name = "osu_example"
+    test_template_name = "OSUBench"
+    description = "OSU Benchmark example"
+
+    [cmd_args]
+    "docker_image_url" = "docker-image-with-osu-benchmark:latest"
+    "benchmarks_dir" = "/directory/with/osu/binaries/in/container"
+    "benchmark" = ["osu_allreduce", "osu_allgather"]
+    "iterations" = 10
+    "message_size" = "1024"
+
+Test Scenario example:
+
+.. code-block:: toml
+
+    name = "osu_example"
+
+    [[Tests]]
+    id = "Tests.1"
+    test_name = "osu_example"
+    num_nodes = "2"
+    time_limit = "00:20:00"
+
+Test-in-Scenario example:
+
+.. code-block:: toml
+
+    name = "osu-test"
+
+    [[Tests]]
+    id = "Tests.osu_allreduce"
+    num_nodes = 2
+    time_limit = "00:05:00"
+
+    name = "osu_example"
+    description = "OSU allreduce 1KB"
+    test_template_name = "OSUBench"
+
+        [Tests.cmd_args]
+        docker_image_url = "docker-image-with-osu-benchmark:latest"
+        benchmarks_dir = "/directory/with/osu/binaries/in/container"
+        benchmark = "osu_allreduce"
+        iterations = 10
+        message_size = "1024"
+
+API Documentation
+-----------------
+
+Command Arguments
+~~~~~~~~~~~~~~~~~
+
+.. autoclass:: cloudai.workloads.osu_bench.osu_bench.OSUBenchCmdArgs
+   :members:
+   :show-inheritance:
+
+Test Definition
+~~~~~~~~~~~~~~~
+
+.. autoclass:: cloudai.workloads.osu_bench.osu_bench.OSUBenchTestDefinition
+   :members:
+   :show-inheritance:
diff --git a/src/cloudai/registration.py b/src/cloudai/registration.py
@@ -130,6 +130,10 @@ def register_all():
         NixlPerftestSlurmCommandGenStrategy,
         NixlPerftestTestDefinition,
     )
+    from cloudai.workloads.osu_bench import (
+        OSUBenchSlurmCommandGenStrategy,
+        OSUBenchTestDefinition,
+    )
     from cloudai.workloads.sleep import (
         SleepGradingStrategy,
         SleepKubernetesJsonGenStrategy,
@@ -203,6 +207,7 @@ def register_all():
     Registry().add_command_gen_strategy(SlurmSystem, AIDynamoTestDefinition, AIDynamoSlurmCommandGenStrategy)
     Registry().add_command_gen_strategy(SlurmSystem, BashCmdTestDefinition, BashCmdCommandGenStrategy)
     Registry().add_command_gen_strategy(SlurmSystem, NIXLKVBenchTestDefinition, NIXLKVBenchSlurmCommandGenStrategy)
+    Registry().add_command_gen_strategy(SlurmSystem, OSUBenchTestDefinition, OSUBenchSlurmCommandGenStrategy)
 
     Registry().add_installer("slurm", SlurmInstaller)
     Registry().add_installer("standalone", StandaloneInstaller)
@@ -236,6 +241,7 @@ def register_all():
     Registry().add_test_definition("NixlPerftest", NixlPerftestTestDefinition)
     Registry().add_test_definition("NIXLKVBench", NIXLKVBenchTestDefinition)
     Registry().add_test_definition("Aiconfigurator", AiconfiguratorTestDefinition)
+    Registry().add_test_definition("OSUBench", OSUBenchTestDefinition)
 
     Registry().add_agent("grid_search", GridSearchAgent)
 
diff --git a/src/cloudai/workloads/osu_bench/__init__.py b/src/cloudai/workloads/osu_bench/__init__.py
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .osu_bench import OSUBenchCmdArgs, OSUBenchTestDefinition
+from .slurm_command_gen_strategy import OSUBenchSlurmCommandGenStrategy
+
+__all__ = [
+    "OSUBenchCmdArgs",
+    "OSUBenchSlurmCommandGenStrategy",
+    "OSUBenchTestDefinition",
+]
diff --git a/src/cloudai/workloads/osu_bench/osu_bench.py b/src/cloudai/workloads/osu_bench/osu_bench.py
@@ -0,0 +1,121 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from typing import Any, List, Optional, Union
+
+from pydantic import Field
+
+from cloudai.core import DockerImage, Installable, JobStatusResult, TestRun
+from cloudai.models.workload import CmdArgs, TestDefinition
+
+
+class OSUBenchCmdArgs(CmdArgs):
+    """Command line arguments for an OSU Benchmark test."""
+
+    # NOTE: apart from the first three fields (used to locate the image and the binary), every field below is
+    # turned into a `--<field-name>` flag by the Slurm command generation strategy. Flags appear on the command
+    # line in declaration order (see tests/ref_data/osu-bench.sbatch), so reordering fields changes the output.
+
+    docker_image_url: str
+    """URL of the Docker image to use for the test."""
+
+    benchmarks_dir: str
+    """Directory with the OSU Benchmark binaries inside the container. """
+
+    # The binary that gets executed is "<benchmarks_dir>/<benchmark>".
+    # NOTE(review): the list form is presumably expanded into one run per value before command generation,
+    # since the command generator formats this value directly into the binary path - confirm.
+    benchmark: Union[str, List[str]]
+    """Name of the benchmark to run. """
+
+    message_size: Optional[Union[str, List[str]]] = Field(default=None)
+    """Message size for the benchmark.
+
+    Examples::
+
+        128    // min = default, max = 128
+        2:128  // min = 2, max = 128
+        2:     // min 2, max = default
+    """
+
+    iterations: Optional[int] = Field(default=None)
+    """Number of iterations for the benchmark."""
+
+    warmup: Optional[int] = Field(default=None)
+    """Number of warmup iterations to skip before timing."""
+
+    mem_limit: Optional[int] = Field(default=None)
+    """Per-process maximum memory consumption in bytes."""
+
+    # Enabled by default; the Slurm strategy drops it for benchmarks listed in FULL_FLAG_UNSUPPORTED.
+    full: bool = Field(default=True)
+    """Print full format listing of results."""
+
+
+class OSUBenchTestDefinition(TestDefinition):
+    """Test definition for OSU Benchmark test."""
+
+    cmd_args: OSUBenchCmdArgs
+    _osu_image: DockerImage | None = None
+
+    @property
+    def docker_image(self) -> DockerImage:
+        if not self._osu_image:
+            self._osu_image = DockerImage(url=self.cmd_args.docker_image_url)
+
+        return self._osu_image
+
+    @property
+    def installables(self) -> list[Installable]:
+        return [self.docker_image]
+
+    @property
+    def cmd_args_dict(self) -> dict[str, Any]:
+        return self.cmd_args.model_dump(exclude={"docker_image_url", "benchmarks_dir", "benchmark"})
+
+    def was_run_successful(self, tr: TestRun) -> JobStatusResult:
+        stdout_path = tr.output_path / "stdout.txt"
+        stderr_path = tr.output_path / "stderr.txt"
+
+        if not stdout_path.is_file():
+            return JobStatusResult(
+                is_successful=False,
+                error_message=(
+                    f"stdout.txt file not found in the specified output directory {tr.output_path}. "
+                    "This file is expected to be created as a result of the OSU Benchmark test run."
+                ),
+            )
+
+        with open(stdout_path, "r") as f:
+            content = f.read()
+
+        if not content.strip():
+            return JobStatusResult(
+                is_successful=False,
+                error_message=(
+                    f"stdout.txt file is empty in the specified output directory {tr.output_path}. "
+                    f"Please check for fatal errors in {stderr_path}"
+                ),
+            )
+
+        # Check for basic OSU benchmark output format
+        if "# Size" not in content:
+            return JobStatusResult(
+                is_successful=False,
+                error_message=(
+                    f"Expected OSU benchmark output marker not found in stdout.txt in {tr.output_path}. "
+                    f"Check for errors in the execution or for a different output format."
+                ),
+            )
+
+        # Additional validation could be added here to verify specific benchmark types
+        # based on the full header format once benchmark-specific validation is needed
+
+        return JobStatusResult(is_successful=True)
diff --git a/src/cloudai/workloads/osu_bench/slurm_command_gen_strategy.py b/src/cloudai/workloads/osu_bench/slurm_command_gen_strategy.py
@@ -0,0 +1,80 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, cast
+
+from cloudai.systems.slurm import SlurmCommandGenStrategy
+
+from .osu_bench import OSUBenchCmdArgs, OSUBenchTestDefinition
+
+# Benchmarks for which the command generator never emits the `--full` flag, even when `full` is enabled.
+# NOTE(review): presumably these binaries do not accept `-f/--full` - verify against the OSU documentation
+# when adding or removing entries.
+FULL_FLAG_UNSUPPORTED = [
+    "osu_latency",
+    "osu_latency_mt",
+    "osu_latency_mp",
+    "osu_bw",
+    "osu_bibw",
+    "osu_latency_persistent",
+    "osu_bw_persistent",
+    "osu_bibw_persistent",
+    "osu_multi_lat",
+    "osu_mbw_mr",
+    "osu_put_latency",
+    "osu_get_latency",
+    "osu_acc_latency",
+    "osu_get_acc_latency",
+    "osu_cas_latency",
+    "osu_fop_latency",
+    "osu_put_bw",
+    "osu_get_bw",
+    "osu_put_bibw",
+    "osu_init",
+    "osu_hello",
+]
+
+
+class OSUBenchSlurmCommandGenStrategy(SlurmCommandGenStrategy):
+    """Command generation strategy for OSU Benchmark test on Slurm systems."""
+
+    def _container_mounts(self) -> List[str]:
+        return []
+
+    def image_path(self) -> str:
+        tdef: OSUBenchTestDefinition = cast(OSUBenchTestDefinition, self.test_run.test)
+        return str(tdef.docker_image.installed_path)
+
+    def generate_test_command(self) -> List[str]:
+        args: OSUBenchCmdArgs = cast(OSUBenchCmdArgs, self.test_run.test.cmd_args)
+
+        binary = f"{args.benchmarks_dir}/{args.benchmark}"
+        srun_command_parts = [binary]
+
+        for name, value in self.test_run.test.cmd_args_dict.items():
+            if value is None:
+                continue
+
+            flag = f"--{name.replace('_', '-')}"
+
+            argument = flag if isinstance(value, bool) and value else f"{flag} {value}"
+
+            if name == "full" and args.benchmark in FULL_FLAG_UNSUPPORTED:
+                continue
+
+            srun_command_parts.append(argument)
+
+        if self.test_run.test.extra_cmd_args:
+            srun_command_parts.append(self.test_run.test.extra_args_str)
+
+        return srun_command_parts
diff --git a/tests/ref_data/osu-bench.sbatch b/tests/ref_data/osu-bench.sbatch
@@ -0,0 +1,17 @@
+#!/bin/bash
+# generated by CloudAI@__CLOUDAI_VERSION__
+#SBATCH --job-name=__JOB_NAME__
+#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt
+#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt
+#SBATCH --partition=main
+#SBATCH -N 1
+#SBATCH --gpus-per-node=8
+#SBATCH --gres=gpu:8
+
+export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
+
+srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
+
+srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
+
+srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; /opt/hpcx/ompi/tests/osu-micro-benchmarks/osu_allreduce --message-size 1024 --iterations 10 --full"
diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py
diff --git a/tests/test_init.py b/tests/test_init.py