Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ mount_as = "/opt/Megatron-Bridge"
[cmd_args]
gpu_type = "b200"
container_image = "nvcr.io#nvidia/nemo:25.11.01"
model_name = "qwen3"
model_size = "30b_a3b"
model_family_name = "qwen3"
model_recipe_name = "30b_a3b"
gpus_per_node = 4
num_gpus = 8
domain = "llm"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ mount_as = "/opt/Megatron-Bridge"
[cmd_args]
gpu_type = "gb200"
container_image = "nvcr.io#nvidia/nemo:25.11.01"
model_name = "qwen3"
model_size = "30b_a3b"
model_family_name = "qwen3"
model_recipe_name = "30b_a3b"
gpus_per_node = 4
num_gpus = 8
domain = "llm"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ mount_as = "/opt/Megatron-Bridge"
[cmd_args]
gpu_type = "gb300"
container_image = "nvcr.io#nvidia/nemo:25.11.01"
model_name = "qwen3"
model_size = "30b_a3b"
model_family_name = "qwen3"
model_recipe_name = "30b_a3b"
gpus_per_node = 4
num_gpus = 8
domain = "llm"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ mount_as = "/opt/Megatron-Bridge"
[cmd_args]
gpu_type = "h100"
container_image = "nvcr.io#nvidia/nemo:25.11.01"
model_name = "qwen3"
model_size = "30b_a3b"
model_family_name = "qwen3"
model_recipe_name = "30b_a3b"
gpus_per_node = 8
num_gpus = 16
domain = "llm"
Expand Down
8 changes: 4 additions & 4 deletions doc/workloads/megatron_bridge.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ Test TOML example:
# Container can be an NGC/enroot URL (nvcr.io#...) or a local .sqsh path.
container_image = "nvcr.io#nvidia/nemo:25.11.01"

model_name = "qwen3"
model_size = "30b_a3b"
model_family_name = "qwen3"
model_recipe_name = "30b_a3b"
task = "pretrain"
domain = "llm"
compute_dtype = "fp8_mx"
Expand Down Expand Up @@ -55,8 +55,8 @@ Test-in-Scenario example:

[Tests.cmd_args]
container_image = "nvcr.io#nvidia/nemo:25.11.01"
model_name = "qwen3"
model_size = "30b_a3b"
model_family_name = "qwen3"
model_recipe_name = "30b_a3b"
task = "pretrain"
domain = "llm"
compute_dtype = "fp8_mx"
Expand Down
88 changes: 77 additions & 11 deletions src/cloudai/workloads/megatron_bridge/megatron_bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,22 +40,27 @@ class MegatronBridgeCmdArgs(CmdArgs):
detach: Optional[bool] = Field(default=None)

# Model/task
model_name: str = Field(default="")
model_size: str = Field(default="")
domain: str = Field(default="llm")
model_family_name: str = Field(default="")
model_recipe_name: str = Field(default="")
use_recipes: Optional[bool] = Field(default=None)
task: str = Field(default="pretrain")
compute_dtype: str = Field(default="bf16")
fp8_recipe: Optional[str] = Field(default=None)
hf_token: Optional[str] = Field(default=None)
nemo_home: Optional[str] = Field(default=None)
wandb_key: Optional[str] = Field(default=None)
wandb_prj_name: Optional[str] = Field(default=None)
wandb_exp_name: Optional[str] = Field(default=None)
wandb_project_name: Optional[str] = Field(default=None)
wandb_entity_name: Optional[str] = Field(default=None)
wandb_experiment_name: Optional[str] = Field(default=None)
wandb_save_dir: Optional[str] = Field(default=None)

# Retries
max_retries: Optional[int] = Field(default=None)

# Feature flags (allow sweeps)
use_tokendrop: Optional[Union[bool, List[bool]]] = Field(default=None)
use_megatron_fsdp: Optional[Union[bool, List[bool]]] = Field(default=None)
cuda_graph_impl: Optional[str] = Field(default=None)
cuda_graph_impl: Optional[Union[str, List[str]]] = Field(default=None)
cuda_graph_scope: Optional[Union[str, List[str]]] = Field(default=None)

# Parallelism
Expand All @@ -69,6 +74,43 @@ class MegatronBridgeCmdArgs(CmdArgs):
# Batch sizes
mb: Optional[Union[int, List[int]]] = Field(default=None)
gb: Optional[Union[int, List[int]]] = Field(default=None)
seq_length: Optional[Union[int, List[int]]] = Field(default=None)

# Optimizer
lr: Optional[Union[float, List[float]]] = Field(default=None)
min_lr: Optional[Union[float, List[float]]] = Field(default=None)
warmup_iters: Optional[Union[int, List[int]]] = Field(default=None)

# Checkpointing
pretrained_checkpoint: Optional[str] = Field(default=None)
save_dir: Optional[str] = Field(default=None)
load_dir: Optional[str] = Field(default=None)
save_interval: Optional[int] = Field(default=None)
most_recent_k: Optional[int] = Field(default=None)
save_config_filepath: Optional[str] = Field(default=None)

# Data / Tokenizer
data: Optional[str] = Field(default=None)
dataset_paths: Optional[Union[str, List[str]]] = Field(default=None)
dataset_root: Optional[str] = Field(default=None)
index_mapping_dir: Optional[str] = Field(default=None)
dataset_name: Optional[str] = Field(default=None)
packed_sequence: Optional[bool] = Field(default=None)
head_only: Optional[bool] = Field(default=None)
tokenizer_type: Optional[str] = Field(default=None)
tokenizer_model: Optional[str] = Field(default=None)
vocab_size: Optional[int] = Field(default=None)

# Profiling (performance group in argument_parser.py)
pytorch_profiler: Optional[bool] = Field(default=None)
profiling_start_step: Optional[int] = Field(default=None)
profiling_stop_step: Optional[int] = Field(default=None)
record_memory_history: Optional[bool] = Field(default=None)
profiling_gpu_metrics: Optional[bool] = Field(default=None)
profiling_ranks: Optional[Union[int, List[int]]] = Field(default=None)

# Performance
nccl_ub: Optional[Union[bool, List[bool]]] = Field(default=None)

# Perf/tuning
moe_a2a_overlap: Optional[Union[bool, List[bool]]] = Field(default=None)
Expand Down Expand Up @@ -315,14 +357,16 @@ def _normalize_str_list(val: Optional[Union[str, List[str]]]) -> list[str]:
else:
constraint10 = True

# Constraint 11: CUDA graphs require a2a overlap disabled
# Constraint 11: When cuda_graph_impl is set (not none), a2a overlap must be disabled
# moe_a2a_overlap can only be true when cuda_graph_impl is 'none' or unset
a2a_overlap = _as_bool(self.cmd_args.moe_a2a_overlap)
constraint11 = not (cuda_graphs and a2a_overlap)
cuda_impl_enabled = cgi not in {"", "none", "null"}
constraint11 = not (cuda_impl_enabled and a2a_overlap)
if not constraint11:
logging.error(
"Constraint 11 failed: cuda_graphs=true requires moe_a2a_overlap=false. "
"cuda_graphs=%s moe_a2a_overlap=%s",
cuda_graphs,
"Constraint 11 failed: moe_a2a_overlap must be false when cuda_graph_impl is not 'none'. "
"cuda_graph_impl=%s moe_a2a_overlap=%s",
cgi,
a2a_overlap,
)

Expand Down Expand Up @@ -424,6 +468,27 @@ def _normalize_str_list(val: Optional[Union[str, List[str]]]) -> list[str]:
else:
constraint17 = True

# Constraint 18: Valid (PP, VP) combinations for DeepSeek v3 pipeline layout
# Only specific (pp, vp) pairs are supported by DeepSeek v3's pipeline layout mapping
model_recipe = (self.cmd_args.model_recipe_name or "").lower()
is_deepseek_v3 = "deepseek_v3" in model_recipe or "deepseekv3" in model_recipe

if is_deepseek_v3:
valid_pp_vp_combinations = {(1, 1), (4, 1), (8, 1), (4, 2), (16, 1), (8, 2), (4, 4)}
current_vp = vp if vp is not None else 1
pp_vp_pair = (pp, current_vp)
constraint18 = pp_vp_pair in valid_pp_vp_combinations
if not constraint18:
logging.error(
"Constraint 18 failed: Invalid (PP, VP) combination for DeepSeek v3. pp=%s vp=%s. "
"Valid combinations: %s",
pp,
current_vp,
sorted(valid_pp_vp_combinations),
)
else:
constraint18 = True # Skip this constraint for non-DeepSeek v3 models

return bool(
constraint1
and constraint2
Expand All @@ -442,4 +507,5 @@ def _normalize_str_list(val: Optional[Union[str, List[str]]]) -> list[str]:
and constraint15
and constraint16
and constraint17
and constraint18
)
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ class MegatronBridgeReportGenerationStrategy(ReportGenerationStrategy):
metrics: ClassVar[list[str]] = ["default", "step-time", "tflops-per-gpu"]

def get_log_file(self) -> Path | None:
log = self.test_run.output_path / "megatron_bridge_launcher.log"
log = self.test_run.output_path / "cloudai_megatron_bridge_launcher.log"
return log if log.is_file() else None

@property
def results_file(self) -> Path:
return self.get_log_file() or (self.test_run.output_path / "megatron_bridge_launcher.log")
return self.get_log_file() or (self.test_run.output_path / "cloudai_megatron_bridge_launcher.log")

def can_handle_directory(self) -> bool:
return self.get_log_file() is not None
Expand Down Expand Up @@ -75,8 +75,8 @@ def generate_report(self) -> None:
log_file, step_times_s, gpu_tflops = self._get_extracted_data()
if not log_file:
logging.error(
"No Megatron-Bridge launcher log file found: %s",
self.test_run.output_path / "megatron_bridge_launcher.log",
"No Megatron-Bridge launcher log file found in: %s",
self.test_run.output_path,
)
return

Expand Down Expand Up @@ -130,8 +130,8 @@ def get_metric(self, metric: str) -> float:
log_file, step_times_s, gpu_tflops = self._get_extracted_data()
if not log_file:
logging.error(
"No Megatron-Bridge launcher log file found: %s",
self.test_run.output_path / "megatron_bridge_launcher.log",
"No Megatron-Bridge launcher log file found in: %s",
self.test_run.output_path,
)
return METRIC_ERROR
if not step_times_s:
Expand Down
Loading