Skip to content
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
ecf22e8
initial port m-bridge
srivatsankrishnan Dec 18, 2025
1b42e01
m-bridge using slurmcommand strategy
srivatsankrishnan Dec 18, 2025
59a9b14
latest m-bridge changes
srivatsankrishnan Dec 18, 2025
1a3dcc6
tracked config
srivatsankrishnan Dec 22, 2025
6e39fc3
fix installation sequence
srivatsankrishnan Dec 22, 2025
7fba08b
update container url
srivatsankrishnan Dec 22, 2025
d344e58
fix url
srivatsankrishnan Dec 22, 2025
87f03d8
absolute path
srivatsankrishnan Dec 22, 2025
1986de4
fix the caching bug
srivatsankrishnan Dec 22, 2025
dd9ed59
fix the container
srivatsankrishnan Dec 22, 2025
d32a50e
sanitize the flag/api to reflect changes
srivatsankrishnan Dec 22, 2025
7199e4f
cuda_graph scope logic update
srivatsankrishnan Dec 22, 2025
5eadb51
more api name change fixes
srivatsankrishnan Dec 22, 2025
022cbc6
update to use local path as is
srivatsankrishnan Dec 22, 2025
9960611
add unit test and clean up overrides
srivatsankrishnan Dec 22, 2025
462c6b7
don't pass defaults yet (issue with M-bridge overrides).
srivatsankrishnan Dec 22, 2025
c58ccac
keep optional none so they don't get set (M_bridge has override issues)
srivatsankrishnan Dec 22, 2025
db742c1
get job_id and redirect m-bridge to own logs
srivatsankrishnan Dec 22, 2025
52b3dd7
fix job id retrieval logic
srivatsankrishnan Dec 22, 2025
08bb064
make report generation logic fix
srivatsankrishnan Dec 22, 2025
ae12297
fix report parsing logic + unit test
srivatsankrishnan Dec 22, 2025
dc6b76b
Merge branch 'main' into m-bridge
srivatsankrishnan Dec 22, 2025
78b919e
fix unit tests/linting etc
srivatsankrishnan Dec 23, 2025
593556c
simplify log finding logic
srivatsankrishnan Dec 23, 2025
0074a6e
fix silent failures during report generation
srivatsankrishnan Dec 23, 2025
62613cb
simplify by extracting shared log-finding and extraction logic.
srivatsankrishnan Dec 23, 2025
2ab4b4f
f-strings or consistent formatting.
srivatsankrishnan Dec 23, 2025
871f460
fix produces "None" string if installed_path is None
srivatsankrishnan Dec 23, 2025
a054f4a
fix Assigning detach after construction doesn't update model_fields_set
srivatsankrishnan Dec 23, 2025
120a3c5
Incomplete test assertions
srivatsankrishnan Dec 23, 2025
cfa33b9
greptile fixes
srivatsankrishnan Dec 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test definition for a Megatron-Bridge Qwen3 30B A3B pretraining run.
# NOTE(review): `name` is presumably the key that test scenarios reference via `test_name` — confirm.
name = "megatron_bridge_qwen_30b"
description = "Megatron-Bridge run via CloudAI SlurmSystem for Qwen3 30B A3B"
# Selects the test definition registered under the "MegatronBridge" key.
test_template_name = "MegatronBridge"

# Empty list: no extra container mounts are configured here.
extra_container_mounts = []

[cmd_args]
# NOTE(review): presumably the target GPU platform for the workload — confirm the accepted values.
gpu_type = "gb200"
# NOTE(review): the "registry#image:tag" form looks like enroot-style image syntax — confirm.
container_image = "nvcr.io#nvidia/nemo:25.11.01"

model_name = "qwen3"
model_size = "30b_a3b"
# 8 GPUs in total at 4 GPUs per node, i.e. 2 nodes.
gpus_per_node = 4
num_gpus = 8
domain = "llm"
task = "pretrain"
compute_dtype = "fp8_mx"

# Placeholder value: replace with a real Hugging Face token before running.
# Do not commit a real token to version control.
hf_token = "REPLACE_ME_WITH_HF_TOKEN"
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Test scenario containing a single Megatron-Bridge Qwen3 30B test.
name = "megatron_bridge_qwen_30b"

[[Tests]]
id = "megatron_bridge_qwen_30b"
# NOTE(review): presumably resolved against the `name` field of a test definition TOML — confirm.
test_name = "megatron_bridge_qwen_30b"
# NOTE(review): written as a quoted string rather than a TOML integer; presumably coerced
# by the scenario loader — confirm.
num_nodes = "2"
10 changes: 10 additions & 0 deletions src/cloudai/registration.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ def register_all():
JaxToolboxSlurmCommandGenStrategy,
NemotronTestDefinition,
)
from cloudai.workloads.megatron_bridge import (
MegatronBridgeReportGenerationStrategy,
MegatronBridgeSlurmCommandGenStrategy,
MegatronBridgeTestDefinition,
)
from cloudai.workloads.megatron_run import (
CheckpointTimingReportGenerationStrategy,
MegatronRunSlurmCommandGenStrategy,
Expand Down Expand Up @@ -185,6 +190,9 @@ def register_all():
Registry().add_command_gen_strategy(SlurmSystem, MegatronRunTestDefinition, MegatronRunSlurmCommandGenStrategy)
Registry().add_command_gen_strategy(SlurmSystem, NCCLTestDefinition, NcclTestSlurmCommandGenStrategy)
Registry().add_command_gen_strategy(SlurmSystem, DDLBTestDefinition, DDLBTestSlurmCommandGenStrategy)
Registry().add_command_gen_strategy(
SlurmSystem, MegatronBridgeTestDefinition, MegatronBridgeSlurmCommandGenStrategy
)

Registry().add_command_gen_strategy(SlurmSystem, NeMoLauncherTestDefinition, NeMoLauncherSlurmCommandGenStrategy)
Registry().add_command_gen_strategy(SlurmSystem, NeMoRunTestDefinition, NeMoRunSlurmCommandGenStrategy)
Expand Down Expand Up @@ -234,6 +242,7 @@ def register_all():
Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition)
Registry().add_test_definition("SlurmContainer", SlurmContainerTestDefinition)
Registry().add_test_definition("MegatronRun", MegatronRunTestDefinition)
Registry().add_test_definition("MegatronBridge", MegatronBridgeTestDefinition)
Registry().add_test_definition("TritonInference", TritonInferenceTestDefinition)
Registry().add_test_definition("NIXLBench", NIXLBenchTestDefinition)
Registry().add_test_definition("AIDynamo", AIDynamoTestDefinition)
Expand All @@ -250,6 +259,7 @@ def register_all():
Registry().add_report(GPTTestDefinition, JaxToolboxReportGenerationStrategy)
Registry().add_report(GrokTestDefinition, JaxToolboxReportGenerationStrategy)
Registry().add_report(MegatronRunTestDefinition, CheckpointTimingReportGenerationStrategy)
Registry().add_report(MegatronBridgeTestDefinition, MegatronBridgeReportGenerationStrategy)
Registry().add_report(NCCLTestDefinition, NcclTestPerformanceReportGenerationStrategy)
Registry().add_report(NeMoLauncherTestDefinition, NeMoLauncherReportGenerationStrategy)
Registry().add_report(NeMoRunTestDefinition, NeMoRunReportGenerationStrategy)
Expand Down
26 changes: 26 additions & 0 deletions src/cloudai/workloads/megatron_bridge/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .megatron_bridge import MegatronBridgeCmdArgs, MegatronBridgeTestDefinition
from .report_generation_strategy import MegatronBridgeReportGenerationStrategy
from .slurm_command_gen_strategy import MegatronBridgeSlurmCommandGenStrategy

# Public API of the megatron_bridge workload package: the names re-exported
# from the submodule imports above.
__all__ = [
"MegatronBridgeCmdArgs",
"MegatronBridgeReportGenerationStrategy",
"MegatronBridgeSlurmCommandGenStrategy",
"MegatronBridgeTestDefinition",
]
Loading