Skip to content

Commit a48d097

Browse files
authored
Merge pull request #723 from ybenvidia/dp-benchmark
DeepEP benchmark
2 parents c9176b9 + 6456229 commit a48d097

File tree

14 files changed

+654
-2
lines changed

14 files changed

+654
-2
lines changed
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "deepep_low_latency"
18+
description = "DeepEP MoE Benchmark - Low Latency Mode"
19+
test_template_name = "DeepEP"
20+
21+
[cmd_args]
22+
# Local .sqsh file:
23+
# docker_image_url = "/.autodirect/mswg2/E2E/Regression_logs/squash/yoel/dp-benchmark-shuffle.sqsh"
24+
# Container registry (uses your Docker credentials):
25+
docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
26+
27+
mode = "low_latency"
28+
29+
tokens = 128
30+
num_experts = 256
31+
num_topk = 1
32+
hidden_size = 7168
33+
data_type = "bfloat16"
34+
allow_nvlink_for_low_latency = false
35+
allow_mnnvl = false
36+
round_scale = false
37+
use_ue8m0 = false
38+
num_warmups = 20
39+
num_iterations = 50
40+
shuffle_columns = false
41+
use_kineto_profiler = false
42+
config_file_path = "/tmp/config.yaml"
43+
results_dir = "/workspace/dp-benchmark/results"
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "deepep_standard"
18+
description = "DeepEP MoE Benchmark - Standard Mode"
19+
test_template_name = "DeepEP"
20+
21+
[cmd_args]
22+
# Local .sqsh file:
23+
# docker_image_url = "/.autodirect/mswg2/E2E/Regression_logs/squash/yoel/dp-benchmark-shuffle.sqsh"
24+
# Container registry (uses your Docker credentials):
25+
docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
26+
27+
mode = "standard"
28+
29+
tokens = 1024
30+
num_experts = 256
31+
num_topk = 8
32+
hidden_size = 7168
33+
data_type = "bfloat16"
34+
allow_nvlink_for_low_latency = false
35+
allow_mnnvl = false
36+
round_scale = false
37+
use_ue8m0 = false
38+
num_warmups = 20
39+
num_iterations = 50
40+
shuffle_columns = false
41+
use_kineto_profiler = false
42+
config_file_path = "/tmp/config.yaml"
43+
results_dir = "/workspace/dp-benchmark/results"
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "deepep-benchmark"
18+
19+
[[Tests]]
20+
id = "Tests.1"
21+
test_name = "deepep_standard"
22+
num_nodes = 2
23+
time_limit = "00:30:00"
24+
25+
[[Tests]]
26+
id = "Tests.2"
27+
test_name = "deepep_low_latency"
28+
num_nodes = 2
29+
time_limit = "00:30:00"

doc/workloads/deepep.rst

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
DeepEP Benchmark
2+
================
3+
4+
This workload (``test_template_name`` is ``DeepEP``) allows you to execute DeepEP (Deep Expert Parallelism) MoE (Mixture of Experts) benchmarks within the CloudAI framework.
5+
6+
Overview
7+
--------
8+
9+
DeepEP is a benchmark for measuring the performance of MoE models with distributed expert parallelism. It supports:
10+
11+
- **Two operation modes**: Standard and Low-Latency
12+
- **Multiple data types**: bfloat16 and FP8
13+
- **Flexible network configurations**: With or without NVLink
14+
- **Configurable model parameters**: Experts, tokens, hidden size, top-k
15+
- **Performance profiling**: Kineto profiler support
16+
17+
Usage Example
18+
-------------
19+
20+
Test TOML example (Standard Mode):
21+
22+
.. code-block:: toml
23+
24+
name = "deepep_standard"
25+
description = "DeepEP MoE Benchmark - Standard Mode"
26+
test_template_name = "DeepEP"
27+
28+
[cmd_args]
29+
docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
30+
mode = "standard"
31+
tokens = 1024
32+
num_experts = 256
33+
num_topk = 8
34+
hidden_size = 7168
35+
data_type = "bfloat16"
36+
num_warmups = 20
37+
num_iterations = 50
38+
39+
Test TOML example (Low-Latency Mode):
40+
41+
.. code-block:: toml
42+
43+
name = "deepep_low_latency"
44+
description = "DeepEP MoE Benchmark - Low Latency Mode"
45+
test_template_name = "DeepEP"
46+
47+
[cmd_args]
48+
docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
49+
mode = "low_latency"
50+
tokens = 128
51+
num_experts = 256
52+
num_topk = 1
53+
hidden_size = 7168
54+
data_type = "bfloat16"
55+
allow_nvlink_for_low_latency = false
56+
allow_mnnvl = false
57+
58+
Test Scenario example:
59+
60+
.. code-block:: toml
61+
62+
name = "deepep-benchmark"
63+
64+
[[Tests]]
65+
id = "Tests.1"
66+
test_name = "deepep_standard"
67+
num_nodes = 2
68+
time_limit = "00:30:00"
69+
70+
Test-in-Scenario example:
71+
72+
.. code-block:: toml
73+
74+
name = "deepep-benchmark"
75+
76+
[[Tests]]
77+
id = "Tests.1"
78+
num_nodes = 2
79+
nodes = "GAIA:standard_nodes:2"
80+
time_limit = "00:30:00"
81+
82+
name = "deepep_standard"
83+
description = "DeepEP MoE Benchmark"
84+
test_template_name = "DeepEP"
85+
86+
[Tests.cmd_args]
87+
docker_image_url = "gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark"
88+
mode = "standard"
89+
tokens = 1024
90+
num_experts = 256
91+
num_topk = 8
92+
93+
API Documentation
94+
-----------------
95+
96+
Command Arguments
97+
~~~~~~~~~~~~~~~~~
98+
99+
.. autoclass:: cloudai.workloads.deepep.deepep.DeepEPCmdArgs
100+
:members:
101+
:show-inheritance:
102+
103+
Test Definition
104+
~~~~~~~~~~~~~~~
105+
106+
.. autoclass:: cloudai.workloads.deepep.deepep.DeepEPTestDefinition
107+
:members:
108+
:show-inheritance:
109+

doc/workloads/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ This section contains automatically generated documentation for all CloudAI work
1111
ai_dynamo
1212
bash_cmd
1313
chakra_replay
14+
deepep
1415
nccl
1516
ddlb
1617
nemo_run

src/cloudai/registration.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ def register_all():
7373
DDLBTestDefinition,
7474
DDLBTestSlurmCommandGenStrategy,
7575
)
76+
from cloudai.workloads.deepep import (
77+
DeepEPReportGenerationStrategy,
78+
DeepEPSlurmCommandGenStrategy,
79+
DeepEPTestDefinition,
80+
)
7681
from cloudai.workloads.jax_toolbox import (
7782
GPTTestDefinition,
7883
GrokTestDefinition,
@@ -180,6 +185,7 @@ def register_all():
180185
Registry().add_command_gen_strategy(SlurmSystem, UCCTestDefinition, UCCTestSlurmCommandGenStrategy)
181186

182187
Registry().add_command_gen_strategy(SlurmSystem, ChakraReplayTestDefinition, ChakraReplaySlurmCommandGenStrategy)
188+
Registry().add_command_gen_strategy(SlurmSystem, DeepEPTestDefinition, DeepEPSlurmCommandGenStrategy)
183189
Registry().add_command_gen_strategy(SlurmSystem, SlurmContainerTestDefinition, SlurmContainerCommandGenStrategy)
184190
Registry().add_command_gen_strategy(
185191
SlurmSystem, TritonInferenceTestDefinition, TritonInferenceSlurmCommandGenStrategy
@@ -206,6 +212,7 @@ def register_all():
206212
Registry().add_test_definition("NcclTest", NCCLTestDefinition)
207213
Registry().add_test_definition("DDLBTest", DDLBTestDefinition)
208214
Registry().add_test_definition("ChakraReplay", ChakraReplayTestDefinition)
215+
Registry().add_test_definition("DeepEP", DeepEPTestDefinition)
209216
Registry().add_test_definition("Sleep", SleepTestDefinition)
210217
Registry().add_test_definition("NeMoLauncher", NeMoLauncherTestDefinition)
211218
Registry().add_test_definition("NeMoRun", NeMoRunTestDefinition)
@@ -224,6 +231,7 @@ def register_all():
224231
Registry().add_agent("grid_search", GridSearchAgent)
225232

226233
Registry().add_report(ChakraReplayTestDefinition, ChakraReplayReportGenerationStrategy)
234+
Registry().add_report(DeepEPTestDefinition, DeepEPReportGenerationStrategy)
227235
Registry().add_report(GPTTestDefinition, JaxToolboxReportGenerationStrategy)
228236
Registry().add_report(GrokTestDefinition, JaxToolboxReportGenerationStrategy)
229237
Registry().add_report(MegatronRunTestDefinition, CheckpointTimingReportGenerationStrategy)
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from .deepep import DeepEPCmdArgs, DeepEPTestDefinition
18+
from .report_generation_strategy import DeepEPReportGenerationStrategy
19+
from .slurm_command_gen_strategy import DeepEPSlurmCommandGenStrategy
20+
21+
__all__ = [
22+
"DeepEPCmdArgs",
23+
"DeepEPReportGenerationStrategy",
24+
"DeepEPSlurmCommandGenStrategy",
25+
"DeepEPTestDefinition",
26+
]
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from typing import Literal, Optional
18+
19+
from cloudai.core import DockerImage, Installable
20+
from cloudai.models.workload import CmdArgs, TestDefinition
21+
22+
23+
class DeepEPCmdArgs(CmdArgs):
    """Command-line argument model for the DeepEP benchmark.

    Only ``docker_image_url`` is mandatory; every other field carries a
    default, so a test TOML needs to list only the values it overrides.
    """

    # Container image reference; the only field without a default.
    docker_image_url: str

    # Benchmark variant to run.
    mode: Literal["standard", "low_latency"] = "standard"

    # Workload shape.
    tokens: int = 1024
    num_experts: int = 256
    num_topk: int = 8
    hidden_size: int = 7168
    data_type: Literal["bfloat16", "fp8"] = "bfloat16"

    # Boolean switches, all disabled by default.
    # NOTE(review): exact semantics are defined by the benchmark itself, not here.
    allow_nvlink_for_low_latency: bool = False
    allow_mnnvl: bool = False
    round_scale: bool = False
    use_ue8m0: bool = False

    # Iteration counts.
    num_warmups: int = 20
    num_iterations: int = 50

    shuffle_columns: bool = False
    use_kineto_profiler: bool = False

    # The remaining fields (together with docker_image_url and mode) are the
    # ones DeepEPTestDefinition.cmd_args_dict leaves out of its dump.
    num_sms: int = 24
    num_qps_per_rank: int = 12
    config_file_path: str = "/tmp/config.yaml"
    results_dir: str = "/workspace/dp-benchmark/results"
45+
46+
47+
class DeepEPTestDefinition(TestDefinition):
    """Test definition for the DeepEP MoE benchmark workload."""

    cmd_args: DeepEPCmdArgs
    _docker_image: Optional[DockerImage] = None

    @property
    def docker_image(self) -> DockerImage:
        """Return the container image, creating and caching it on first access.

        Raises:
            ValueError: If ``cmd_args.docker_image_url`` is empty.
        """
        # Reuse the cached instance once it has been built.
        if self._docker_image:
            return self._docker_image

        image_url = self.cmd_args.docker_image_url
        if not image_url:
            raise ValueError("docker_image_url is required for DeepEP benchmark")

        self._docker_image = DockerImage(url=image_url)
        return self._docker_image

    @property
    def installables(self) -> list[Installable]:
        """Return the installable items of this test: just the Docker image."""
        image = self.docker_image
        return [image]

    @property
    def cmd_args_dict(self) -> dict:
        """Dump ``cmd_args`` to a dict without the CloudAI-specific fields."""
        excluded_fields = {
            "docker_image_url",
            "mode",
            "num_sms",
            "num_qps_per_rank",
            "config_file_path",
            "results_dir",
        }
        return self.cmd_args.model_dump(exclude=excluded_fields)

0 commit comments

Comments
 (0)