Merge pull request #767 from srivatsankrishnan/m-bridge

srivatsankrishnan · web-flow · commit 7b63c79c8f6a · 2026-01-05T15:18:30.000-08:00
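Megatron-Bridge updates: require the Megatron-Bridge repo to be pinned via `[[git_repos]]` (replacing the `nemo_container_version` / `megatron_bridge_ref` cmd args), add b200, gb300 and h100 Qwen3 30B A3B test configs (and update gb200), and prefix generated command/wrapper/log file names with `cloudai_`.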
diff --git a/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/b200/megatron_bridge_qwen_30b.toml
@@ -0,0 +1,40 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "megatron_bridge_qwen_30b"
+description = "Megatron-Bridge run via CloudAI SlurmSystem for Qwen3 30B A3B"
+test_template_name = "MegatronBridge"
+
+extra_container_mounts = []
+
+[[git_repos]]
+url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
+commit = "r0.2.0"
+mount_as = "/opt/Megatron-Bridge"
+
+[cmd_args]
+gpu_type = "b200"
+container_image = "nvcr.io#nvidia/nemo:25.11.01"
+model_name = "qwen3"
+model_size = "30b_a3b"
+gpus_per_node = 4
+num_gpus = 8
+domain = "llm"
+task = "pretrain"
+compute_dtype = "fp8_mx"
+hf_token = ""
+enable_vboost = true
+detach = false
diff --git a/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb200/megatron_bridge_qwen_30b.toml
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,16 +20,21 @@ test_template_name = "MegatronBridge"
 
 extra_container_mounts = []
 
+[[git_repos]]
+url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
+commit = "r0.2.0"
+mount_as = "/opt/Megatron-Bridge"
+
 [cmd_args]
 gpu_type = "gb200"
 container_image = "nvcr.io#nvidia/nemo:25.11.01"
-
 model_name = "qwen3"
 model_size = "30b_a3b"
 gpus_per_node = 4
 num_gpus = 8
 domain = "llm"
 task = "pretrain"
 compute_dtype = "fp8_mx"
-
 hf_token = ""
+enable_vboost = true
+detach = false
diff --git a/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/gb300/megatron_bridge_qwen_30b.toml
@@ -0,0 +1,40 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "megatron_bridge_qwen_30b"
+description = "Megatron-Bridge run via CloudAI SlurmSystem for Qwen3 30B A3B"
+test_template_name = "MegatronBridge"
+
+extra_container_mounts = []
+
+[[git_repos]]
+url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
+commit = "r0.2.0"
+mount_as = "/opt/Megatron-Bridge"
+
+[cmd_args]
+gpu_type = "gb300"
+container_image = "nvcr.io#nvidia/nemo:25.11.01"
+model_name = "qwen3"
+model_size = "30b_a3b"
+gpus_per_node = 4
+num_gpus = 8
+domain = "llm"
+task = "pretrain"
+compute_dtype = "fp8_mx"
+hf_token = ""
+enable_vboost = true
+detach = false
diff --git a/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml b/conf/experimental/megatron_bridge/test/h100/megatron_bridge_qwen_30b.toml
@@ -0,0 +1,40 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "megatron_bridge_qwen_30b"
+description = "Megatron-Bridge run via CloudAI SlurmSystem for Qwen3 30B A3B"
+test_template_name = "MegatronBridge"
+
+extra_container_mounts = []
+
+[[git_repos]]
+url = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git"
+commit = "r0.2.0"
+mount_as = "/opt/Megatron-Bridge"
+
+[cmd_args]
+gpu_type = "h100"
+container_image = "nvcr.io#nvidia/nemo:25.11.01"
+model_name = "qwen3"
+model_size = "30b_a3b"
+gpus_per_node = 8
+num_gpus = 16
+domain = "llm"
+task = "pretrain"
+compute_dtype = "fp8_cs"
+hf_token = ""
+enable_vboost = true
+detach = false
diff --git a/src/cloudai/workloads/megatron_bridge/megatron_bridge.py b/src/cloudai/workloads/megatron_bridge/megatron_bridge.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -31,8 +31,6 @@ class MegatronBridgeCmdArgs(CmdArgs):
     log_dir: str = Field(default="")
     time_limit: str = Field(default="00:30:00")
     container_image: str = Field(default="")
-    nemo_container_version: str = Field(default="25.11")
-    megatron_bridge_ref: Optional[str] = Field(default=None)
     num_gpus: int = Field(default=8)
     gpus_per_node: int = Field(default=8)
     custom_mounts: Optional[str] = Field(default=None)
@@ -96,7 +94,6 @@ class MegatronBridgeTestDefinition(TestDefinition):
 
     cmd_args: MegatronBridgeCmdArgs
 
-    # NeMo-Run (provides the `nemo_run` package used by Megatron-Bridge launcher on the submit node)
     nemo_run_repo: GitRepo = GitRepo(
         url="https://github.com/NVIDIA-NeMo/Run.git",
         commit="main",
@@ -106,6 +103,31 @@ class MegatronBridgeTestDefinition(TestDefinition):
     _python_executable: Optional[PythonExecutable] = None
     _megatron_bridge_repo: Optional[GitRepo] = None
 
+    @staticmethod
+    def _select_megatron_bridge_repo(git_repos: list[GitRepo]) -> GitRepo | None:
+        """Return the Megatron-Bridge repo from `git_repos` (normalized to mount_as=/opt/Megatron-Bridge)."""
+        for repo in git_repos:
+            if "Megatron-Bridge" in repo.url or (repo.mount_as or "").rstrip("/") == "/opt/Megatron-Bridge":
+                return repo if repo.mount_as else repo.model_copy(update={"mount_as": "/opt/Megatron-Bridge"})
+        return None
+
+    @field_validator("git_repos", mode="after")
+    @classmethod
+    def validate_git_repos_has_megatron_bridge_repo(cls, v: list[GitRepo]) -> list[GitRepo]:
+        """MegatronBridge requires users to pin the Megatron-Bridge repo version via `[[git_repos]]`."""
+        if not v:
+            raise ValueError(
+                "MegatronBridge requires the user to pin the Megatron-Bridge repository via `[[git_repos]]` "
+                "in the test TOML (provide at least url and commit)."
+            )
+
+        if cls._select_megatron_bridge_repo(v) is None:
+            raise ValueError(
+                "MegatronBridge requires `[[git_repos]]` to include the Megatron-Bridge repo (url containing "
+                "'Megatron-Bridge' or mount_as='/opt/Megatron-Bridge')."
+            )
+        return v
+
     @property
     def docker_image(self) -> DockerImage:
         if not self._docker_image:
@@ -121,25 +143,15 @@ def python_executable(self) -> PythonExecutable:
     @property
     def megatron_bridge_repo(self) -> GitRepo:
         if self._megatron_bridge_repo is None:
-            self._megatron_bridge_repo = GitRepo(
-                url="https://github.com/NVIDIA-NeMo/Megatron-Bridge.git",
-                commit=self._infer_megatron_bridge_ref(),
-                mount_as="/opt/Megatron-Bridge",
-            )
+            selected = self._select_megatron_bridge_repo(self.git_repos)
+            if selected is None:
+                raise ValueError(
+                    "MegatronBridge requires the user to pin the Megatron-Bridge repository via `[[git_repos]]` "
+                    "in the test TOML (provide at least url and commit)."
+                )
+            self._megatron_bridge_repo = selected
         return self._megatron_bridge_repo
 
-    def _infer_megatron_bridge_ref(self) -> str:
-        if self.cmd_args.megatron_bridge_ref:
-            return self.cmd_args.megatron_bridge_ref
-
-        return self._map_container_version_to_mbridge_ref(self.cmd_args.nemo_container_version.strip())
-
-    def _map_container_version_to_mbridge_ref(self, ver: str) -> str:
-        version_to_branch = {
-            "25.11": "r0.2.0",
-        }
-        return version_to_branch.get(ver, "main")
-
     @property
     def installables(self) -> list[Installable]:
         items: list[Installable] = [self.python_executable, self.megatron_bridge_repo]
diff --git a/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py b/src/cloudai/workloads/megatron_bridge/slurm_command_gen_strategy.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -77,7 +77,7 @@ def store_test_run(self) -> None:
             toml.dump(trd.model_dump(), f)
 
     def _write_command_to_file(self, command: str, output_path: Path) -> None:
-        log_file = output_path / "generated_command.sh"
+        log_file = output_path / "cloudai_generated_command.sh"
         log_file.parent.mkdir(parents=True, exist_ok=True)
         with log_file.open("w") as f:
             f.write(f"{command}\n")
@@ -111,8 +111,8 @@ def _wrap_launcher_for_job_id_and_quiet_output(self, launcher_cmd: str) -> str:
         output_dir = self.test_run.output_path.absolute()
         output_dir.mkdir(parents=True, exist_ok=True)
 
-        wrapper_path = output_dir / "megatron_bridge_submit_and_parse_jobid.sh"
-        log_path = output_dir / "megatron_bridge_launcher.log"
+        wrapper_path = output_dir / "cloudai_megatron_bridge_submit_and_parse_jobid.sh"
+        log_path = output_dir / "cloudai_megatron_bridge_launcher.log"
 
         script_lines = [
             "#!/usr/bin/env bash",
diff --git a/tests/slurm_command_gen_strategy/test_megatron_bridge_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_megatron_bridge_slurm_command_gen_strategy.py
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -51,6 +51,13 @@ def test_run(self, tmp_path: Path) -> TestRun:
             test_template_name="MegatronBridge",
             cmd_args=args,
             extra_container_mounts=[],
+            git_repos=[
+                {
+                    "url": "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git",
+                    "commit": "r0.2.0",
+                    "mount_as": "/opt/Megatron-Bridge",
+                }
+            ],  # type: ignore[arg-type]
         )
 
         # Fake installed paths for installables so command-gen doesn't depend on real installs.
@@ -80,6 +87,24 @@ def test_hf_token_empty_is_rejected_by_schema(self) -> None:
         with pytest.raises(Exception, match=r"hf_token"):
             MegatronBridgeCmdArgs.model_validate({"hf_token": ""})
 
+    def test_git_repos_can_pin_megatron_bridge_commit(self) -> None:
+        args = MegatronBridgeCmdArgs(hf_token="dummy_token", model_name="qwen3", model_size="30b_a3b")
+        tdef = MegatronBridgeTestDefinition(
+            name="mb",
+            description="desc",
+            test_template_name="MegatronBridge",
+            cmd_args=args,
+            extra_container_mounts=[],
+            git_repos=[
+                {
+                    "url": "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git",
+                    "commit": "abcdef1234567890",
+                    "mount_as": "/opt/Megatron-Bridge",
+                }
+            ],  # type: ignore[arg-type]
+        )
+        assert tdef.megatron_bridge_repo.commit == "abcdef1234567890"
+
     def test_defaults_not_emitted_when_not_set_in_toml(self, slurm_system: SlurmSystem, tmp_path: Path) -> None:
         sqsh = tmp_path / "img.sqsh"
         sqsh.write_text("x")
@@ -98,6 +123,13 @@ def test_defaults_not_emitted_when_not_set_in_toml(self, slurm_system: SlurmSyst
             test_template_name="MegatronBridge",
             cmd_args=args,
             extra_container_mounts=[],
+            git_repos=[
+                {
+                    "url": "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git",
+                    "commit": "r0.2.0",
+                    "mount_as": "/opt/Megatron-Bridge",
+                }
+            ],  # type: ignore[arg-type]
         )
 
         (tmp_path / "run_repo").mkdir()
@@ -127,15 +159,15 @@ def test_container_image_local_path_passed_verbatim(
         assert local_img.exists()
 
         cmd_gen.gen_exec_command()
-        wrapper = test_run.output_path / "megatron_bridge_submit_and_parse_jobid.sh"
+        wrapper = test_run.output_path / "cloudai_megatron_bridge_submit_and_parse_jobid.sh"
         assert wrapper.exists()
         wrapper_content = wrapper.read_text()
         assert f"-i {local_img.absolute()}" in wrapper_content
         assert str(tdef.docker_image.installed_path) not in wrapper_content
 
     def test_cuda_graph_scope_normalization(self, cmd_gen: MegatronBridgeSlurmCommandGenStrategy) -> None:
         cmd_gen.gen_exec_command()
-        wrapper = cmd_gen.test_run.output_path / "megatron_bridge_submit_and_parse_jobid.sh"
+        wrapper = cmd_gen.test_run.output_path / "cloudai_megatron_bridge_submit_and_parse_jobid.sh"
         wrapper_content = wrapper.read_text()
         assert "--cuda_graph_scope moe_router,moe_preprocess" in wrapper_content
 
@@ -168,7 +200,7 @@ def test_detach_flags(
         slurm_system.default_partition = "gb300"
         cmd_gen = MegatronBridgeSlurmCommandGenStrategy(slurm_system, test_run)
         cmd_gen.gen_exec_command()
-        wrapper = test_run.output_path / "megatron_bridge_submit_and_parse_jobid.sh"
+        wrapper = test_run.output_path / "cloudai_megatron_bridge_submit_and_parse_jobid.sh"
         wrapper_content = wrapper.read_text()
         if detach is None:
             assert "--detach" not in wrapper_content
@@ -183,9 +215,9 @@ def test_generated_command_file_written(
     ) -> None:
         cmd = cmd_gen.gen_exec_command()
         out_dir = test_run.output_path
-        gen_file = out_dir / "generated_command.sh"
+        gen_file = out_dir / "cloudai_generated_command.sh"
         assert gen_file.exists()
         content = gen_file.read_text()
         assert cmd in content
         assert content.startswith("bash ")
-        assert "megatron_bridge_submit_and_parse_jobid.sh" in content
+        assert "cloudai_megatron_bridge_submit_and_parse_jobid.sh" in content