Skip to content

Commit 99f9158

Browse files
Merge pull request #764 from srivatsankrishnan/m-bridge
Megatron Bridge in CloudAI
2 parents: fbf9891 + cfa33b9; commit: 99f9158

File tree

13 files changed: +1221 / -5 lines changed

13 files changed: +1221 / -5 lines changed
Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,35 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "megatron_bridge_qwen_30b"
18+
description = "Megatron-Bridge run via CloudAI SlurmSystem for Qwen3 30B A3B"
19+
test_template_name = "MegatronBridge"
20+
21+
extra_container_mounts = []
22+
23+
[cmd_args]
24+
gpu_type = "gb200"
25+
container_image = "nvcr.io#nvidia/nemo:25.11.01"
26+
27+
model_name = "qwen3"
28+
model_size = "30b_a3b"
29+
gpus_per_node = 4
30+
num_gpus = 8
31+
domain = "llm"
32+
task = "pretrain"
33+
compute_dtype = "fp8_mx"
34+
35+
hf_token = "REPLACE_ME_WITH_HF_TOKEN"
Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,22 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "megatron_bridge_qwen_30b"
18+
19+
[[Tests]]
20+
id = "megatron_bridge_qwen_30b"
21+
test_name = "megatron_bridge_qwen_30b"
22+
num_nodes = "2"

src/cloudai/registration.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -91,6 +91,11 @@ def register_all():
9191
JaxToolboxSlurmCommandGenStrategy,
9292
NemotronTestDefinition,
9393
)
94+
from cloudai.workloads.megatron_bridge import (
95+
MegatronBridgeReportGenerationStrategy,
96+
MegatronBridgeSlurmCommandGenStrategy,
97+
MegatronBridgeTestDefinition,
98+
)
9499
from cloudai.workloads.megatron_run import (
95100
CheckpointTimingReportGenerationStrategy,
96101
MegatronRunSlurmCommandGenStrategy,
@@ -185,6 +190,9 @@ def register_all():
185190
Registry().add_command_gen_strategy(SlurmSystem, MegatronRunTestDefinition, MegatronRunSlurmCommandGenStrategy)
186191
Registry().add_command_gen_strategy(SlurmSystem, NCCLTestDefinition, NcclTestSlurmCommandGenStrategy)
187192
Registry().add_command_gen_strategy(SlurmSystem, DDLBTestDefinition, DDLBTestSlurmCommandGenStrategy)
193+
Registry().add_command_gen_strategy(
194+
SlurmSystem, MegatronBridgeTestDefinition, MegatronBridgeSlurmCommandGenStrategy
195+
)
188196

189197
Registry().add_command_gen_strategy(SlurmSystem, NeMoLauncherTestDefinition, NeMoLauncherSlurmCommandGenStrategy)
190198
Registry().add_command_gen_strategy(SlurmSystem, NeMoRunTestDefinition, NeMoRunSlurmCommandGenStrategy)
@@ -234,6 +242,7 @@ def register_all():
234242
Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition)
235243
Registry().add_test_definition("SlurmContainer", SlurmContainerTestDefinition)
236244
Registry().add_test_definition("MegatronRun", MegatronRunTestDefinition)
245+
Registry().add_test_definition("MegatronBridge", MegatronBridgeTestDefinition)
237246
Registry().add_test_definition("TritonInference", TritonInferenceTestDefinition)
238247
Registry().add_test_definition("NIXLBench", NIXLBenchTestDefinition)
239248
Registry().add_test_definition("AIDynamo", AIDynamoTestDefinition)
@@ -250,6 +259,7 @@ def register_all():
250259
Registry().add_report(GPTTestDefinition, JaxToolboxReportGenerationStrategy)
251260
Registry().add_report(GrokTestDefinition, JaxToolboxReportGenerationStrategy)
252261
Registry().add_report(MegatronRunTestDefinition, CheckpointTimingReportGenerationStrategy)
262+
Registry().add_report(MegatronBridgeTestDefinition, MegatronBridgeReportGenerationStrategy)
253263
Registry().add_report(NCCLTestDefinition, NcclTestPerformanceReportGenerationStrategy)
254264
Registry().add_report(NeMoLauncherTestDefinition, NeMoLauncherReportGenerationStrategy)
255265
Registry().add_report(NeMoRunTestDefinition, NeMoRunReportGenerationStrategy)
Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,26 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from .megatron_bridge import MegatronBridgeCmdArgs, MegatronBridgeTestDefinition
18+
from .report_generation_strategy import MegatronBridgeReportGenerationStrategy
19+
from .slurm_command_gen_strategy import MegatronBridgeSlurmCommandGenStrategy
20+
21+
__all__ = [
22+
"MegatronBridgeCmdArgs",
23+
"MegatronBridgeReportGenerationStrategy",
24+
"MegatronBridgeSlurmCommandGenStrategy",
25+
"MegatronBridgeTestDefinition",
26+
]

0 commit comments

Comments
 (0)