Skip to content

Commit 7b63c79

Browse files
Merge pull request #767 from srivatsankrishnan/m-bridge
M bridge updates
2 parents 6da438c + b122c68 commit 7b63c79

File tree

7 files changed

+203
-34
lines changed

7 files changed

+203
-34
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "megatron_bridge_qwen_30b"
18+
description = "Megatron-Bridge run via CloudAI SlurmSystem for Qwen3 30B A3B"
19+
test_template_name = "MegatronBridge"
20+
21+
extra_container_mounts = []
22+
23+
[[git_repos]]
24+
url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
25+
commit = "r0.2.0"
26+
mount_as = "/opt/Megatron-Bridge"
27+
28+
[cmd_args]
29+
gpu_type = "b200"
30+
container_image = "nvcr.io#nvidia/nemo:25.11.01"
31+
model_name = "qwen3"
32+
model_size = "30b_a3b"
33+
gpus_per_node = 4
34+
num_gpus = 8
35+
domain = "llm"
36+
task = "pretrain"
37+
compute_dtype = "fp8_mx"
38+
hf_token = ""
39+
enable_vboost = true
40+
detach = false

conf/experimental/megatron_bridge/test/megatron_bridge_qwen_30b.toml renamed to conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2-
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# SPDX-License-Identifier: Apache-2.0
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,16 +20,21 @@ test_template_name = "MegatronBridge"
2020

2121
extra_container_mounts = []
2222

23+
[[git_repos]]
24+
url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
25+
commit = "r0.2.0"
26+
mount_as = "/opt/Megatron-Bridge"
27+
2328
[cmd_args]
2429
gpu_type = "gb200"
2530
container_image = "nvcr.io#nvidia/nemo:25.11.01"
26-
2731
model_name = "qwen3"
2832
model_size = "30b_a3b"
2933
gpus_per_node = 4
3034
num_gpus = 8
3135
domain = "llm"
3236
task = "pretrain"
3337
compute_dtype = "fp8_mx"
34-
3538
hf_token = ""
39+
enable_vboost = true
40+
detach = false
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "megatron_bridge_qwen_30b"
18+
description = "Megatron-Bridge run via CloudAI SlurmSystem for Qwen3 30B A3B"
19+
test_template_name = "MegatronBridge"
20+
21+
extra_container_mounts = []
22+
23+
[[git_repos]]
24+
url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
25+
commit = "r0.2.0"
26+
mount_as = "/opt/Megatron-Bridge"
27+
28+
[cmd_args]
29+
gpu_type = "gb300"
30+
container_image = "nvcr.io#nvidia/nemo:25.11.01"
31+
model_name = "qwen3"
32+
model_size = "30b_a3b"
33+
gpus_per_node = 4
34+
num_gpus = 8
35+
domain = "llm"
36+
task = "pretrain"
37+
compute_dtype = "fp8_mx"
38+
hf_token = ""
39+
enable_vboost = true
40+
detach = false
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "megatron_bridge_qwen_30b"
18+
description = "Megatron-Bridge run via CloudAI SlurmSystem for Qwen3 30B A3B"
19+
test_template_name = "MegatronBridge"
20+
21+
extra_container_mounts = []
22+
23+
[[git_repos]]
24+
url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
25+
commit = "r0.2.0"
26+
mount_as = "/opt/Megatron-Bridge"
27+
28+
[cmd_args]
29+
gpu_type = "h100"
30+
container_image = "nvcr.io#nvidia/nemo:25.11.01"
31+
model_name = "qwen3"
32+
model_size = "30b_a3b"
33+
gpus_per_node = 8
34+
num_gpus = 16
35+
domain = "llm"
36+
task = "pretrain"
37+
compute_dtype = "fp8_cs"
38+
hf_token = ""
39+
enable_vboost = true
40+
detach = false

src/cloudai/workloads/megatron_bridge/megatron_bridge.py

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2-
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# SPDX-License-Identifier: Apache-2.0
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -31,8 +31,6 @@ class MegatronBridgeCmdArgs(CmdArgs):
3131
log_dir: str = Field(default="")
3232
time_limit: str = Field(default="00:30:00")
3333
container_image: str = Field(default="")
34-
nemo_container_version: str = Field(default="25.11")
35-
megatron_bridge_ref: Optional[str] = Field(default=None)
3634
num_gpus: int = Field(default=8)
3735
gpus_per_node: int = Field(default=8)
3836
custom_mounts: Optional[str] = Field(default=None)
@@ -96,7 +94,6 @@ class MegatronBridgeTestDefinition(TestDefinition):
9694

9795
cmd_args: MegatronBridgeCmdArgs
9896

99-
# NeMo-Run (provides the `nemo_run` package used by Megatron-Bridge launcher on the submit node)
10097
nemo_run_repo: GitRepo = GitRepo(
10198
url="https://github.com/NVIDIA-NeMo/Run.git",
10299
commit="main",
@@ -106,6 +103,31 @@ class MegatronBridgeTestDefinition(TestDefinition):
106103
_python_executable: Optional[PythonExecutable] = None
107104
_megatron_bridge_repo: Optional[GitRepo] = None
108105

106+
@staticmethod
107+
def _select_megatron_bridge_repo(git_repos: list[GitRepo]) -> GitRepo | None:
108+
"""Return the Megatron-Bridge repo from `git_repos` (normalized to mount_as=/opt/Megatron-Bridge)."""
109+
for repo in git_repos:
110+
if "Megatron-Bridge" in repo.url or (repo.mount_as or "").rstrip("/") == "/opt/Megatron-Bridge":
111+
return repo if repo.mount_as else repo.model_copy(update={"mount_as": "/opt/Megatron-Bridge"})
112+
return None
113+
114+
@field_validator("git_repos", mode="after")
115+
@classmethod
116+
def validate_git_repos_has_megatron_bridge_repo(cls, v: list[GitRepo]) -> list[GitRepo]:
117+
"""MegatronBridge requires users to pin the Megatron-Bridge repo version via `[[git_repos]]`."""
118+
if not v:
119+
raise ValueError(
120+
"MegatronBridge requires the user to pin the Megatron-Bridge repository via `[[git_repos]]` "
121+
"in the test TOML (provide at least url and commit)."
122+
)
123+
124+
if cls._select_megatron_bridge_repo(v) is None:
125+
raise ValueError(
126+
"MegatronBridge requires `[[git_repos]]` to include the Megatron-Bridge repo (url containing "
127+
"'Megatron-Bridge' or mount_as='/opt/Megatron-Bridge')."
128+
)
129+
return v
130+
109131
@property
110132
def docker_image(self) -> DockerImage:
111133
if not self._docker_image:
@@ -121,25 +143,15 @@ def python_executable(self) -> PythonExecutable:
121143
@property
122144
def megatron_bridge_repo(self) -> GitRepo:
123145
if self._megatron_bridge_repo is None:
124-
self._megatron_bridge_repo = GitRepo(
125-
url="https://github.com/NVIDIA-NeMo/Megatron-Bridge.git",
126-
commit=self._infer_megatron_bridge_ref(),
127-
mount_as="/opt/Megatron-Bridge",
128-
)
146+
selected = self._select_megatron_bridge_repo(self.git_repos)
147+
if selected is None:
148+
raise ValueError(
149+
"MegatronBridge requires the user to pin the Megatron-Bridge repository via `[[git_repos]]` "
150+
"in the test TOML (provide at least url and commit)."
151+
)
152+
self._megatron_bridge_repo = selected
129153
return self._megatron_bridge_repo
130154

131-
def _infer_megatron_bridge_ref(self) -> str:
132-
if self.cmd_args.megatron_bridge_ref:
133-
return self.cmd_args.megatron_bridge_ref
134-
135-
return self._map_container_version_to_mbridge_ref(self.cmd_args.nemo_container_version.strip())
136-
137-
def _map_container_version_to_mbridge_ref(self, ver: str) -> str:
138-
version_to_branch = {
139-
"25.11": "r0.2.0",
140-
}
141-
return version_to_branch.get(ver, "main")
142-
143155
@property
144156
def installables(self) -> list[Installable]:
145157
items: list[Installable] = [self.python_executable, self.megatron_bridge_repo]

src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2-
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# SPDX-License-Identifier: Apache-2.0
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -77,7 +77,7 @@ def store_test_run(self) -> None:
7777
toml.dump(trd.model_dump(), f)
7878

7979
def _write_command_to_file(self, command: str, output_path: Path) -> None:
80-
log_file = output_path / "generated_command.sh"
80+
log_file = output_path / "cloudai_generated_command.sh"
8181
log_file.parent.mkdir(parents=True, exist_ok=True)
8282
with log_file.open("w") as f:
8383
f.write(f"{command}\n")
@@ -111,8 +111,8 @@ def _wrap_launcher_for_job_id_and_quiet_output(self, launcher_cmd: str) -> str:
111111
output_dir = self.test_run.output_path.absolute()
112112
output_dir.mkdir(parents=True, exist_ok=True)
113113

114-
wrapper_path = output_dir / "megatron_bridge_submit_and_parse_jobid.sh"
115-
log_path = output_dir / "megatron_bridge_launcher.log"
114+
wrapper_path = output_dir / "cloudai_megatron_bridge_submit_and_parse_jobid.sh"
115+
log_path = output_dir / "cloudai_megatron_bridge_launcher.log"
116116

117117
script_lines = [
118118
"#!/usr/bin/env bash",

tests/slurm_command_gen_strategy/test_megatron_bridge_slurm_command_gen_strategy.py

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2-
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# SPDX-License-Identifier: Apache-2.0
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -51,6 +51,13 @@ def test_run(self, tmp_path: Path) -> TestRun:
5151
test_template_name="MegatronBridge",
5252
cmd_args=args,
5353
extra_container_mounts=[],
54+
git_repos=[
55+
{
56+
"url": "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git",
57+
"commit": "r0.2.0",
58+
"mount_as": "/opt/Megatron-Bridge",
59+
}
60+
], # type: ignore[arg-type]
5461
)
5562

5663
# Fake installed paths for installables so command-gen doesn't depend on real installs.
@@ -80,6 +87,24 @@ def test_hf_token_empty_is_rejected_by_schema(self) -> None:
8087
with pytest.raises(Exception, match=r"hf_token"):
8188
MegatronBridgeCmdArgs.model_validate({"hf_token": ""})
8289

90+
def test_git_repos_can_pin_megatron_bridge_commit(self) -> None:
91+
args = MegatronBridgeCmdArgs(hf_token="dummy_token", model_name="qwen3", model_size="30b_a3b")
92+
tdef = MegatronBridgeTestDefinition(
93+
name="mb",
94+
description="desc",
95+
test_template_name="MegatronBridge",
96+
cmd_args=args,
97+
extra_container_mounts=[],
98+
git_repos=[
99+
{
100+
"url": "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git",
101+
"commit": "abcdef1234567890",
102+
"mount_as": "/opt/Megatron-Bridge",
103+
}
104+
], # type: ignore[arg-type]
105+
)
106+
assert tdef.megatron_bridge_repo.commit == "abcdef1234567890"
107+
83108
def test_defaults_not_emitted_when_not_set_in_toml(self, slurm_system: SlurmSystem, tmp_path: Path) -> None:
84109
sqsh = tmp_path / "img.sqsh"
85110
sqsh.write_text("x")
@@ -98,6 +123,13 @@ def test_defaults_not_emitted_when_not_set_in_toml(self, slurm_system: SlurmSyst
98123
test_template_name="MegatronBridge",
99124
cmd_args=args,
100125
extra_container_mounts=[],
126+
git_repos=[
127+
{
128+
"url": "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git",
129+
"commit": "r0.2.0",
130+
"mount_as": "/opt/Megatron-Bridge",
131+
}
132+
], # type: ignore[arg-type]
101133
)
102134

103135
(tmp_path / "run_repo").mkdir()
@@ -127,15 +159,15 @@ def test_container_image_local_path_passed_verbatim(
127159
assert local_img.exists()
128160

129161
cmd_gen.gen_exec_command()
130-
wrapper = test_run.output_path / "megatron_bridge_submit_and_parse_jobid.sh"
162+
wrapper = test_run.output_path / "cloudai_megatron_bridge_submit_and_parse_jobid.sh"
131163
assert wrapper.exists()
132164
wrapper_content = wrapper.read_text()
133165
assert f"-i {local_img.absolute()}" in wrapper_content
134166
assert str(tdef.docker_image.installed_path) not in wrapper_content
135167

136168
def test_cuda_graph_scope_normalization(self, cmd_gen: MegatronBridgeSlurmCommandGenStrategy) -> None:
137169
cmd_gen.gen_exec_command()
138-
wrapper = cmd_gen.test_run.output_path / "megatron_bridge_submit_and_parse_jobid.sh"
170+
wrapper = cmd_gen.test_run.output_path / "cloudai_megatron_bridge_submit_and_parse_jobid.sh"
139171
wrapper_content = wrapper.read_text()
140172
assert "--cuda_graph_scope moe_router,moe_preprocess" in wrapper_content
141173

@@ -168,7 +200,7 @@ def test_detach_flags(
168200
slurm_system.default_partition = "gb300"
169201
cmd_gen = MegatronBridgeSlurmCommandGenStrategy(slurm_system, test_run)
170202
cmd_gen.gen_exec_command()
171-
wrapper = test_run.output_path / "megatron_bridge_submit_and_parse_jobid.sh"
203+
wrapper = test_run.output_path / "cloudai_megatron_bridge_submit_and_parse_jobid.sh"
172204
wrapper_content = wrapper.read_text()
173205
if detach is None:
174206
assert "--detach" not in wrapper_content
@@ -183,9 +215,9 @@ def test_generated_command_file_written(
183215
) -> None:
184216
cmd = cmd_gen.gen_exec_command()
185217
out_dir = test_run.output_path
186-
gen_file = out_dir / "generated_command.sh"
218+
gen_file = out_dir / "cloudai_generated_command.sh"
187219
assert gen_file.exists()
188220
content = gen_file.read_text()
189221
assert cmd in content
190222
assert content.startswith("bash ")
191-
assert "megatron_bridge_submit_and_parse_jobid.sh" in content
223+
assert "cloudai_megatron_bridge_submit_and_parse_jobid.sh" in content

0 commit comments

Comments
 (0)