NVIDIA
diff --git a/‎conf/experimental/test/deepep_low_latency.toml‎
Lines changed: 43 additions & 0 deletions b/‎conf/experimental/test/deepep_low_latency.toml‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎conf/experimental/test/deepep_standard.toml‎
Lines changed: 43 additions & 0 deletions b/‎conf/experimental/test/deepep_standard.toml‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎conf/experimental/test_scenario/deepep.toml‎
Lines changed: 29 additions & 0 deletions b/‎conf/experimental/test_scenario/deepep.toml‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎doc/workloads/deepep.rst‎
Lines changed: 109 additions & 0 deletions b/‎doc/workloads/deepep.rst‎
Lines changed: 109 additions & 0 deletions
diff --git a/‎doc/workloads/index.md‎
Lines changed: 1 addition & 0 deletions b/‎doc/workloads/index.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/cloudai/registration.py‎
Lines changed: 8 additions & 0 deletions b/‎src/cloudai/registration.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/cloudai/workloads/deepep/__init__.py‎
Lines changed: 26 additions & 0 deletions b/‎src/cloudai/workloads/deepep/__init__.py‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎src/cloudai/workloads/deepep/deepep.py‎
Lines changed: 77 additions & 0 deletions b/‎src/cloudai/workloads/deepep/deepep.py‎
Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,43 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "deepep_low_latency"
+description = "DeepEP MoE Benchmark - Low Latency Mode"
+test_template_name = "DeepEP"
+
+[cmd_args]
+# Local .sqsh file:
+# docker_image_url = "/.autodirect/mswg2/E2E/Regression_logs/squash/yoel/dp-benchmark-shuffle.sqsh"
+# Container registry (uses your Docker credentials):
+docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
+
+mode = "low_latency"
+
+tokens = 128
+num_experts = 256
+num_topk = 1
+hidden_size = 7168
+data_type = "bfloat16"
+allow_nvlink_for_low_latency = false
+allow_mnnvl = false
+round_scale = false
+use_ue8m0 = false
+num_warmups = 20
+num_iterations = 50
+shuffle_columns = false
+use_kineto_profiler = false
+config_file_path = "/tmp/config.yaml"
+results_dir = "/workspace/dp-benchmark/results"
@@ -0,0 +1,43 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "deepep_standard"
+description = "DeepEP MoE Benchmark - Standard Mode"
+test_template_name = "DeepEP"
+
+[cmd_args]
+# Local .sqsh file:
+# docker_image_url = "/.autodirect/mswg2/E2E/Regression_logs/squash/yoel/dp-benchmark-shuffle.sqsh"
+# Container registry (uses your Docker credentials):
+docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
+
+mode = "standard"
+
+tokens = 1024
+num_experts = 256
+num_topk = 8
+hidden_size = 7168
+data_type = "bfloat16"
+allow_nvlink_for_low_latency = false
+allow_mnnvl = false
+round_scale = false
+use_ue8m0 = false
+num_warmups = 20
+num_iterations = 50
+shuffle_columns = false
+use_kineto_profiler = false
+config_file_path = "/tmp/config.yaml"
+results_dir = "/workspace/dp-benchmark/results"
@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "deepep-benchmark"
+
+[[Tests]]
+id = "Tests.1"
+test_name = "deepep_standard"
+num_nodes = 2
+time_limit = "00:30:00"
+
+[[Tests]]
+id = "Tests.2"
+test_name = "deepep_low_latency"
+num_nodes = 2
+time_limit = "00:30:00"
@@ -0,0 +1,109 @@
+DeepEP Benchmark
+================
+
+This workload (``test_template_name`` is ``DeepEP``) allows you to execute DeepEP (Deep Expert Parallelism) MoE (Mixture of Experts) benchmarks within the CloudAI framework.
+
+Overview
+--------
+
+The DeepEP benchmark measures the performance of MoE models that use distributed expert parallelism. It supports:
+
+- **Two operation modes**: Standard and Low-Latency
+- **Multiple data types**: ``bfloat16`` and ``fp8``
+- **Flexible network configurations**: With or without NVLink
+- **Configurable model parameters**: Experts, tokens, hidden size, top-k
+- **Performance profiling**: Kineto profiler support
+
+Usage Example
+-------------
+
+Test TOML example (Standard Mode):
+
+.. code-block:: toml
+
+   name = "deepep_standard"
+   description = "DeepEP MoE Benchmark - Standard Mode"
+   test_template_name = "DeepEP"
+
+   [cmd_args]
+   docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
+   mode = "standard"
+   tokens = 1024
+   num_experts = 256
+   num_topk = 8
+   hidden_size = 7168
+   data_type = "bfloat16"
+   num_warmups = 20
+   num_iterations = 50
+
+Test TOML example (Low-Latency Mode):
+
+.. code-block:: toml
+
+   name = "deepep_low_latency"
+   description = "DeepEP MoE Benchmark - Low Latency Mode"
+   test_template_name = "DeepEP"
+
+   [cmd_args]
+   docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
+   mode = "low_latency"
+   tokens = 128
+   num_experts = 256
+   num_topk = 1
+   hidden_size = 7168
+   data_type = "bfloat16"
+   allow_nvlink_for_low_latency = false
+   allow_mnnvl = false
+
+Test Scenario example:
+
+.. code-block:: toml
+
+   name = "deepep-benchmark"
+
+   [[Tests]]
+   id = "Tests.1"
+   test_name = "deepep_standard"
+   num_nodes = 2
+   time_limit = "00:30:00"
+
+Test-in-Scenario example:
+
+.. code-block:: toml
+
+   name = "deepep-benchmark"
+
+   [[Tests]]
+   id = "Tests.1"
+   num_nodes = 2
+   nodes = "GAIA:standard_nodes:2"
+   time_limit = "00:30:00"
+
+   name = "deepep_standard"
+   description = "DeepEP MoE Benchmark"
+   test_template_name = "DeepEP"
+
+     [Tests.cmd_args]
+     docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
+     mode = "standard"
+     tokens = 1024
+     num_experts = 256
+     num_topk = 8
+
+API Documentation
+-----------------
+
+Command Arguments
+~~~~~~~~~~~~~~~~~
+
+.. autoclass:: cloudai.workloads.deepep.deepep.DeepEPCmdArgs
+   :members:
+   :show-inheritance:
+
+Test Definition
+~~~~~~~~~~~~~~~
+
+.. autoclass:: cloudai.workloads.deepep.deepep.DeepEPTestDefinition
+   :members:
+   :show-inheritance:
+
@@ -11,6 +11,7 @@ This section contains automatically generated documentation for all CloudAI work
 ai_dynamo
 bash_cmd
 chakra_replay
+deepep
 nccl
 ddlb
 nemo_run
 
@@ -73,6 +73,11 @@ def register_all():
         DDLBTestDefinition,
         DDLBTestSlurmCommandGenStrategy,
     )
+    from cloudai.workloads.deepep import (
+        DeepEPReportGenerationStrategy,
+        DeepEPSlurmCommandGenStrategy,
+        DeepEPTestDefinition,
+    )
     from cloudai.workloads.jax_toolbox import (
         GPTTestDefinition,
         GrokTestDefinition,
@@ -180,6 +185,7 @@ def register_all():
     Registry().add_command_gen_strategy(SlurmSystem, UCCTestDefinition, UCCTestSlurmCommandGenStrategy)
 
     Registry().add_command_gen_strategy(SlurmSystem, ChakraReplayTestDefinition, ChakraReplaySlurmCommandGenStrategy)
+    Registry().add_command_gen_strategy(SlurmSystem, DeepEPTestDefinition, DeepEPSlurmCommandGenStrategy)
     Registry().add_command_gen_strategy(SlurmSystem, SlurmContainerTestDefinition, SlurmContainerCommandGenStrategy)
     Registry().add_command_gen_strategy(
         SlurmSystem, TritonInferenceTestDefinition, TritonInferenceSlurmCommandGenStrategy
@@ -206,6 +212,7 @@ def register_all():
     Registry().add_test_definition("NcclTest", NCCLTestDefinition)
     Registry().add_test_definition("DDLBTest", DDLBTestDefinition)
     Registry().add_test_definition("ChakraReplay", ChakraReplayTestDefinition)
+    Registry().add_test_definition("DeepEP", DeepEPTestDefinition)
     Registry().add_test_definition("Sleep", SleepTestDefinition)
     Registry().add_test_definition("NeMoLauncher", NeMoLauncherTestDefinition)
     Registry().add_test_definition("NeMoRun", NeMoRunTestDefinition)
@@ -224,6 +231,7 @@ def register_all():
     Registry().add_agent("grid_search", GridSearchAgent)
 
     Registry().add_report(ChakraReplayTestDefinition, ChakraReplayReportGenerationStrategy)
+    Registry().add_report(DeepEPTestDefinition, DeepEPReportGenerationStrategy)
     Registry().add_report(GPTTestDefinition, JaxToolboxReportGenerationStrategy)
     Registry().add_report(GrokTestDefinition, JaxToolboxReportGenerationStrategy)
     Registry().add_report(MegatronRunTestDefinition, CheckpointTimingReportGenerationStrategy)
 
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .deepep import DeepEPCmdArgs, DeepEPTestDefinition
+from .report_generation_strategy import DeepEPReportGenerationStrategy
+from .slurm_command_gen_strategy import DeepEPSlurmCommandGenStrategy
+
+__all__ = [
+    "DeepEPCmdArgs",
+    "DeepEPReportGenerationStrategy",
+    "DeepEPSlurmCommandGenStrategy",
+    "DeepEPTestDefinition",
+]
@@ -0,0 +1,77 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Literal, Optional
+
+from cloudai.core import DockerImage, Installable
+from cloudai.models.workload import CmdArgs, TestDefinition
+
+
+class DeepEPCmdArgs(CmdArgs):
+    """DeepEP benchmark command arguments."""
+
+    docker_image_url: str  # required (no default); DeepEPTestDefinition.docker_image is built from it
+    mode: Literal["standard", "low_latency"] = "standard"  # not included in DeepEPTestDefinition.cmd_args_dict
+    tokens: int = 1024  # fields from here through use_kineto_profiler are kept by cmd_args_dict
+    num_experts: int = 256
+    num_topk: int = 8
+    hidden_size: int = 7168
+    data_type: Literal["bfloat16", "fp8"] = "bfloat16"
+    allow_nvlink_for_low_latency: bool = False
+    allow_mnnvl: bool = False
+    round_scale: bool = False
+    use_ue8m0: bool = False
+    num_warmups: int = 20
+    num_iterations: int = 50
+    shuffle_columns: bool = False
+    use_kineto_profiler: bool = False
+    num_sms: int = 24  # not included in cmd_args_dict
+    num_qps_per_rank: int = 12  # not included in cmd_args_dict
+    config_file_path: str = "/tmp/config.yaml"  # not included in cmd_args_dict
+    results_dir: str = "/workspace/dp-benchmark/results"  # not included in cmd_args_dict
+
+
+class DeepEPTestDefinition(TestDefinition):
+    """Test object for DeepEP MoE benchmark."""
+
+    cmd_args: DeepEPCmdArgs
+    _docker_image: Optional[DockerImage] = None
+
+    @property
+    def docker_image(self) -> DockerImage:
+        if not self._docker_image:
+            if not self.cmd_args.docker_image_url:
+                raise ValueError("docker_image_url is required for DeepEP benchmark")
+            self._docker_image = DockerImage(url=self.cmd_args.docker_image_url)
+        return self._docker_image
+
+    @property
+    def installables(self) -> list[Installable]:
+        return [self.docker_image]
+
+    @property
+    def cmd_args_dict(self) -> dict:
+        """Return command arguments as dict, excluding CloudAI-specific fields."""
+        return self.cmd_args.model_dump(
+            exclude={
+                "docker_image_url",
+                "mode",
+                "num_sms",
+                "num_qps_per_rank",
+                "config_file_path",
+                "results_dir",
+            }
+        )