NVIDIA-NeMo · yaoyu-33 · Mar 14, 2026 · Mar 19, 2026 · Mar 19, 2026 · Mar 19, 2026
@@ -896,7 +896,7 @@ def parse_cli_args():
         "-cv",
         "--config_variant",
         type=str,
-        help="Config variant to use (e.g., 'v1', 'v2'). Defaults to 'v2' ('v1' if 'v2' doens't exist). Use --list_config_variants to see available options.",
+        help="Config variant for workload base configs (used by setup_experiment.py only). Use --list_config_variants to see available options.",
         default="v2",
     )
     config_variant_args.add_argument(

diff --git a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
@@ -286,6 +286,7 @@ def deepseek_v3_pretrain_config_h100(
         set_full_iter_cg_configs(cfg)
 
     # Disabling to avoid functional errors. TODO: Test with it enabled and keep it enabled if it works.
+    cfg.ddp.overlap_grad_reduce = False
     cfg.comm_overlap.overlap_grad_reduce = False
 
     return cfg
diff --git a/scripts/performance/dump_perf_configs.py b/scripts/performance/dump_perf_configs.py
@@ -19,11 +19,11 @@
 
 import torch
 from argument_parser import parse_cli_args
-from utils.overrides import set_cli_overrides, set_post_overrides, set_user_overrides
-from utils.utils import get_perf_optimized_recipe
+from utils.overrides import set_cli_overrides, set_user_overrides
 
 from megatron.bridge.diffusion.models.wan.wan_step import WanForwardStep
 from megatron.bridge.models.qwen_vl.qwen3_vl_step import forward_step as qwen3_vl_forward_step
+from megatron.bridge.training.config import runtime_config_update
 from megatron.bridge.training.gpt_step import forward_step
 from megatron.bridge.training.pretrain import pretrain
 from megatron.bridge.training.vlm_step import forward_step as vlm_forward_step
@@ -62,6 +62,61 @@ def _dump_env_rank0() -> None:
         logger.warning(f"Failed to write environment dump to {env_path}: {e}")
 
 
+def get_perf_recipe_by_name(model_recipe_name, task, num_gpus, gpu, precision, config_variant=None):
+    """Load a flat perf recipe from megatron.bridge.perf_recipes by convention name.
+
+    Non-default ``config_variant`` (anything other than ``v1``/``v2``) is appended
+    to the function name. E.g. ``config_variant="large_scale"`` resolves to
+    ``{model}_{task}_{N}gpu_{gpu}_{prec}_large_scale_config``.
+    """
+    import importlib  # function-scope import; used only for the module lookup below
+
+    precision_map = {  # compute dtype -> precision token used in recipe function names
+        "bf16": "bf16",
+        "fp8_cs": "fp8cs",
+        "fp8_mx": "fp8mx",
+        "fp8_sc": "fp8sc",
+        "nvfp4": "nvfp4",
+    }
+    prec = precision_map.get(precision.lower(), precision.lower().replace("_", ""))  # fallback: drop "_"
+
+    variant_suffix = f"_{config_variant}" if config_variant and config_variant not in {"v1", "v2"} else ""
+    name = f"{model_recipe_name}_{task}_{num_gpus}gpu_{gpu}_{prec}{variant_suffix}_config"
+
+    family_map = {  # model recipe name -> perf_recipes submodule name
+        "llama3_8b": "llama",
+        "llama3_70b": "llama",
+        "llama31_405b": "llama",
+        "qwen3_235b_a22b": "qwen",
+        "qwen3_30b_a3b": "qwen",
+        "qwen3_next_80b_a3b": "qwen",
+        "deepseek_v3": "deepseek",
+        "nemotronh_56b": "nemotronh",
+        "nemotron_3_nano": "nemotronh",
+        "nemotron_3_super": "nemotronh",
+        "kimi_k2": "kimi",
+        "gpt_oss_120b": "gpt_oss",
+        "qwen3_vl_235b_a22b": "qwen_vl",
+        "qwen3_vl_30b_a3b": "qwen_vl",
+        "qwen35_vl_35b_a3b": "qwen_vl",
+        "qwen35_vl_122b_a10b": "qwen_vl",
+        "qwen35_vl_397b_a17b": "qwen_vl",
+        "wan_14b": "wan",
+    }
+
+    family = family_map.get(model_recipe_name)
+    if not family:  # model name has no family_map entry
+        raise ValueError(
+            f"Unknown model_recipe_name {model_recipe_name!r}. Add it to family_map in get_perf_recipe_by_name."
+        )
+
+    mod = importlib.import_module(f"megatron.bridge.perf_recipes.{family}")
+    recipe_fn = getattr(mod, name, None)  # None when the module has no attribute by that name
+    if recipe_fn is None:
+        raise ValueError(f"No perf recipe {name!r} found in megatron.bridge.perf_recipes.{family}.")
+    return recipe_fn()  # recipe functions are invoked without arguments
+
+
 def main():
     """Main function to run the pretraining/finetuning script."""
     # Parse known args and treat any unknown args as Hydra-style config overrides.
@@ -72,30 +127,22 @@ def main():
     if args.dump_env:
         _dump_env_rank0()
 
-    recipe = get_perf_optimized_recipe(
-        model_family_name=args.model_family_name,
+    recipe = get_perf_recipe_by_name(
         model_recipe_name=args.model_recipe_name,
-        train_task=args.task,
+        task=args.task,
+        num_gpus=args.num_gpus,
         gpu=args.gpu,
-        compute_dtype=args.compute_dtype,
-        mock=args.data == "mock",
-        config_variant=args.config_variant,
-        optimizer_type=getattr(args, "optimizer_type", None),
+        precision=args.compute_dtype,
+        config_variant=getattr(args, "config_variant", None),
     )
 
     recipe = set_cli_overrides(recipe, cli_overrides)
     recipe = set_user_overrides(recipe, args)
-    recipe = set_post_overrides(
-        recipe,
-        args.model_family_name,
-        args.model_recipe_name,
-        args.gpu,
-        args.num_gpus,
-        args.compute_dtype,
-        args.task,
-        user_gbs=args.global_batch_size,
-        config_variant=args.config_variant,
-    )
+
+    # Preserve legacy BF16 Adam precision-aware behavior. Parallelism-dependent
+    # optimizer-step overlap is encoded directly in the flat perf recipes.
+    if args.compute_dtype == "bf16" and recipe.optimizer.optimizer == "adam":
+        recipe.optimizer.use_precision_aware_optimizer = True
 
     # Set NCCL env vars for nccl_ub enabled via recipe config (not just CLI).
     if getattr(recipe.ddp, "nccl_ub", False):
@@ -105,6 +152,11 @@ def main():
     if args.dryrun:
         save_path = args.save_config_filepath or "ConfigContainer.yaml"
         os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
+        if "WORLD_SIZE" not in os.environ and "SLURM_NTASKS" not in os.environ:
+            os.environ["WORLD_SIZE"] = str(args.num_gpus)
+        if "RANK" not in os.environ and "SLURM_PROCID" not in os.environ:
+            os.environ["RANK"] = "0"
+        runtime_config_update(recipe)
         recipe.to_yaml(save_path)
         logger.info(f"ConfigContainer saved to: {os.path.abspath(save_path)}")
         recipe.print_yaml()

diff --git a/src/megatron/bridge/perf_recipes/__init__.py b/src/megatron/bridge/perf_recipes/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Flat performance benchmark recipes for throughput measurement.
+
+Each sub-package corresponds to one model family.  Every recipe function is
+self-contained: it calls a library recipe, overrides parallelism / precision,
+calls ``_benchmark_common()``, and returns a ``ConfigContainer``.
+"""
diff --git a/src/megatron/bridge/perf_recipes/_common.py b/src/megatron/bridge/perf_recipes/_common.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Shared helpers for flat performance benchmark recipes.
+
+``_benchmark_common`` applies throughput-measurement defaults; ``_perf_precision`` returns a mixed-precision
+config for a given dtype; ``_enable_overlap_param_gather_with_optimizer_step`` sets the overlap flags.
+"""
+
+from megatron.bridge.training.config import ConfigContainer
+from megatron.bridge.training.mixed_precision import (
+    bf16_mixed,
+    bf16_with_fp8_current_scaling_mixed,
+    bf16_with_mxfp8_mixed,
+    bf16_with_nvfp4_mixed,
+)
+
+
+def _benchmark_common(cfg: ConfigContainer, cross_entropy_impl: str = "te") -> None:
+    """Apply benchmark-mode defaults that prioritize throughput measurement over convergence.
+
+    Intended for performance benchmark recipes only. Sets short training runs,
+    disables checkpointing/eval, tunes scheduler, and enables perf-oriented kernels.
+
+    Must stay in sync with ``_set_common_perf_overrides`` in
+    ``scripts/performance/utils/overrides.py``.
+
+    Individual recipes may override any of these after calling this function
+    (e.g. Kimi K2 sets ``grad_reduce_in_fp32 = True``).
+    """
+    cfg.train.train_iters = 50  # short fixed-length run
+    cfg.train.eval_iters = 0
+
+    cfg.checkpoint.save = None  # per the docstring: checkpointing is disabled
+
+    cfg.logger.log_interval = 1  # log every iteration
+    cfg.logger.tensorboard_dir = None
+
+    cfg.ddp.check_for_nan_in_grad = False
+    cfg.ddp.check_for_large_grads = False
+
+    cfg.rerun_state_machine.check_for_nan_in_loss = False
+
+    cfg.scheduler.lr_decay_iters = cfg.train.train_iters  # decay horizon equals the iteration count set above
+    cfg.scheduler.lr_warmup_iters = 10
+
+    if hasattr(cfg.model, "use_transformer_engine_op_fuser") and cfg.model.use_transformer_engine_op_fuser:
+        cfg.model.use_transformer_engine_op_fuser = False  # only flips an existing truthy flag
+    cfg.model.apply_rope_fusion = True
+    cfg.model.cross_entropy_fusion_impl = cross_entropy_impl  # caller-selectable, defaults to "te"
+
+    if not isinstance(cfg.mixed_precision, str):  # skipped when mixed_precision is given as a string
+        cfg.mixed_precision.grad_reduce_in_fp32 = False
+    cfg.ddp.grad_reduce_in_fp32 = False
+
+    # mcore may auto-promote cuda_graph_impl from "none" to "full_iteration" when
+    # cuda_graph_scope contains "full_iteration", so consult both fields when
+    # deciding whether CUDA graphs will actually run at training time.
+    cuda_impl = getattr(cfg.model, "cuda_graph_impl", None)  # None when the attribute is absent
+    cuda_scope = getattr(cfg.model, "cuda_graph_scope", None) or []  # absent/None scope becomes []
+    scope_names = {s if isinstance(s, str) else getattr(s, "name", "") for s in cuda_scope}
+    graphs_active = (cuda_impl is not None and cuda_impl != "none") or "full_iteration" in scope_names
+    if cuda_impl == "none":
+        cfg.model.cuda_graph_scope = []  # NOTE(review): graphs_active was computed before this reset — confirm
+    if cuda_impl is not None or scope_names:  # leave RNG tracker flags alone when neither field is set
+        cfg.rng.te_rng_tracker = cfg.model.use_te_rng_tracker = graphs_active
+
+    if getattr(cfg.model, "moe_flex_dispatcher_backend", None) == "hybridep":
+        cfg.model.moe_hybridep_num_sms = 32  # fixed value applied only for the hybridep backend
+
+
+def _enable_overlap_param_gather_with_optimizer_step(cfg: ConfigContainer) -> None:
+    """Enable optimizer-step parameter gather overlap on optimizer and comm-overlap configs."""
+    cfg.optimizer.overlap_param_gather_with_optimizer_step = True  # always set on the optimizer config
+    if cfg.comm_overlap is not None:  # comm-overlap config may be None; then only the optimizer flag is set
+        cfg.comm_overlap.overlap_param_gather_with_optimizer_step = True
+
+
+def _perf_precision(compute_dtype: str):
+    """Return mixed-precision config tuned for perf benchmarks.
+
+    Identical to ``scripts/performance/utils/precision.get_precision_config``
+    but importable from the library side.  Always sets
+    ``grad_reduce_in_fp32=False`` so that callers that replace
+    ``cfg.mixed_precision`` after ``_benchmark_common()`` still get the
+    benchmark-mode default.
+    """
+    if compute_dtype == "bf16":  # exact, case-sensitive match on the dtype string
+        cfg = bf16_mixed()
+    elif compute_dtype == "fp8_cs":
+        cfg = bf16_with_fp8_current_scaling_mixed()
+        cfg.first_last_layers_bf16 = False  # only the fp8_cs branch overrides this field
+    elif compute_dtype == "fp8_mx":
+        cfg = bf16_with_mxfp8_mixed()
+    elif compute_dtype == "nvfp4":
+        cfg = bf16_with_nvfp4_mixed()
+    else:  # NOTE(review): no "fp8_sc" branch here, though fp8sc recipes exist — confirm intended
+        raise ValueError(f"Unknown compute_dtype: {compute_dtype}")
+    cfg.grad_reduce_in_fp32 = False  # applied after every recognized dtype branch
+    return cfg
diff --git a/src/megatron/bridge/perf_recipes/deepseek/__init__.py b/src/megatron/bridge/perf_recipes/deepseek/__init__.py
@@ -0,0 +1,32 @@
+from megatron.bridge.perf_recipes.deepseek.deepseek_v3_perf import (
+    deepseek_v3_pretrain_64gpu_h100_bf16_config,
+    deepseek_v3_pretrain_64gpu_h100_fp8cs_config,
+    deepseek_v3_pretrain_256gpu_b200_bf16_config,
+    deepseek_v3_pretrain_256gpu_b200_fp8cs_config,
+    deepseek_v3_pretrain_256gpu_b200_fp8mx_config,
+    deepseek_v3_pretrain_256gpu_b200_fp8mx_large_scale_config,
+    deepseek_v3_pretrain_256gpu_b200_nvfp4_config,
+    deepseek_v3_pretrain_256gpu_b300_bf16_config,
+    deepseek_v3_pretrain_256gpu_b300_fp8cs_config,
+    deepseek_v3_pretrain_256gpu_b300_fp8mx_config,
+    deepseek_v3_pretrain_256gpu_b300_fp8mx_large_scale_config,
+    deepseek_v3_pretrain_256gpu_b300_nvfp4_config,
+    deepseek_v3_pretrain_256gpu_gb200_bf16_config,
+    deepseek_v3_pretrain_256gpu_gb200_fp8cs_config,
+    deepseek_v3_pretrain_256gpu_gb200_fp8mx_config,
+    deepseek_v3_pretrain_256gpu_gb200_fp8mx_large_scale_config,
+    deepseek_v3_pretrain_256gpu_gb200_nvfp4_config,
+    deepseek_v3_pretrain_256gpu_gb300_bf16_config,
+    deepseek_v3_pretrain_256gpu_gb300_fp8cs_config,
+    deepseek_v3_pretrain_256gpu_gb300_fp8mx_config,
+    deepseek_v3_pretrain_256gpu_gb300_fp8mx_large_scale_config,
+    deepseek_v3_pretrain_256gpu_gb300_nvfp4_config,
+    deepseek_v3_pretrain_256gpu_vr200_bf16_config,
+    deepseek_v3_pretrain_256gpu_vr200_fp8cs_config,
+    deepseek_v3_pretrain_256gpu_vr200_fp8mx_config,
+    deepseek_v3_pretrain_256gpu_vr200_nvfp4_config,
+    deepseek_v3_pretrain_1024gpu_h100_bf16_config,
+    deepseek_v3_pretrain_1024gpu_h100_fp8cs_config,
+    deepseek_v3_pretrain_1024gpu_h100_fp8sc_config,
+    deepseek_v3_pretrain_1024gpu_h100_fp8sc_large_scale_config,
+)