Commit d43aeca

Merge branch 'main' into shanmugamr1992/megatron_inference_ultra
2 parents: 64cf04b + da46946

File tree: 38 files changed (+17838, -2474 lines)

.github/copy-pr-bot.yaml

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 enabled: true
 auto_sync_draft: false
 auto_sync_ready: true
-trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "nanz-nv", "parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]
+trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "nanz-nv", "parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "wplf", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]

.github/oncall_schedule.json

Lines changed: 4 additions & 4 deletions

@@ -1,8 +1,4 @@
 [
-    {
-        "user": "BoxiangW",
-        "date": "2026-03-04"
-    },
     {
         "user": "maanug-nv",
         "date": "2026-03-11"
@@ -46,5 +42,9 @@
     {
         "user": "gautham-kollu",
         "date": "2026-05-20"
+    },
+    {
+        "user": "ilml",
+        "date": "2026-05-27"
     }
 ]

examples/post_training/modelopt/README.md

Lines changed: 1 addition & 0 deletions

@@ -34,6 +34,7 @@ knowledge distillation, pruning, speculative decoding, and more.
 | `moonshotai/Kimi-K2-Instruct` ||| - | - |
 | `nvidia/NVIDIA-Nemotron-Nano-9B-v2` || - |||
 | `nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16` || - |||
+| `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` || - |||
 | `openai/gpt-oss-{20b, 120b}` || **Online** |||
 | `Qwen/Qwen3-{0.6B, 8B}` |||||
 | `Qwen/Qwen3-{30B-A3B, 235B-A22B}` | **WAR** ||||
Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
+#!/bin/bash
+
+if [ -z ${HF_MODEL_CKPT} ]; then
+    HF_MODEL_CKPT=nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
+    TOKENIZER_MODEL=nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
+else
+    TOKENIZER_MODEL=${HF_MODEL_CKPT}
+fi
+
+
+
+MODEL_ARGS=" \
+    --trust-remote-code \
+    --save-interval 100000 \
+    --micro-batch-size 1 \
+    --enable-experimental \
+    --use-fused-weighted-squared-relu \
+    --cross-entropy-loss-fusion \
+    --cross-entropy-fusion-impl native \
+    --num-experts 512 \
+    --moe-router-score-function sigmoid \
+    --moe-grouped-gemm \
+    --moe-aux-loss-coeff 1e-4 \
+    --moe-router-topk 22 \
+    --moe-permute-fusion \
+    --moe-router-topk-scaling-factor 5.0 \
+    --moe-router-enable-expert-bias \
+    --moe-router-dtype fp32 \
+    --moe-router-load-balancing-type seq_aux_loss \
+    --moe-shared-expert-intermediate-size 5376 \
+    --moe-token-dispatcher-type allgather \
+    --moe-latent-size 1024 \
+    \
+    --attention-backend flash \
+    --disable-gloo-process-groups \
+    --is-hybrid-model \
+    --mamba-num-heads 128 \
+    --mamba-head-dim 64 \
+    --hybrid-layer-pattern MEMEMEM*EMEMEMEM*EMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEM*EMEMEMEME \
+    \
+    --use-mcore-models \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --init-method-std 0.014 \
+    --position-embedding-type none \
+    --squared-relu \
+    --hidden-size 4096 \
+    --num-attention-heads 32 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --ffn-hidden-size 2688 \
+    --kv-channels 128 \
+    --normalization RMSNorm \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    \
+    --tokenizer-type HuggingFaceTokenizer \
+    --bf16 \
+    --seq-length 8192 \
+    --max-position-embeddings 8192 \
+    --export-model-type MambaModel \
+"

gpt_builders.py

Lines changed: 18 additions & 19 deletions

@@ -115,43 +115,42 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_

 def _get_transformer_layer_spec(use_te, config):
     """Get transformer layer specification based on configuration.

     Args:
         use_te (bool): Whether to use Transformer Engine
-        args: Training arguments
         config: Model configuration

     Returns:
         transformer_layer_spec: The transformer layer specification
     """
-    args = get_args()
     if use_te:
         return get_gpt_layer_with_transformer_engine_spec(
-            args.num_experts,
-            args.moe_grouped_gemm,
-            args.qk_layernorm,
-            args.multi_latent_attention,
-            args.experimental_attention_variant,
-            qk_l2_norm=args.qk_l2_norm,
+            config.num_moe_experts,
+            config.moe_grouped_gemm,
+            config.qk_layernorm,
+            config.multi_latent_attention,
+            config.experimental_attention_variant,
+            qk_l2_norm=config.qk_l2_norm,
             use_kitchen=config.use_kitchen,
             use_te_activation_func=config.use_te_activation_func,
             use_kitchen_attention=config.use_kitchen_attention,
             kitchen_attention_backend=config.kitchen_attention_backend,
+            mla_down_proj_fusion=getattr(config, "mla_down_proj_fusion", False),
         )
     elif config.transformer_impl == "inference_optimized":
         return get_gpt_layer_with_inference_spec(
-            args.qk_layernorm,
-            args.multi_latent_attention,
-            qk_l2_norm=args.qk_l2_norm,
+            config.qk_layernorm,
+            config.multi_latent_attention,
+            qk_l2_norm=config.qk_l2_norm,
         )
     else:
         return get_gpt_layer_local_spec(
-            args.num_experts,
-            args.moe_grouped_gemm,
-            args.qk_layernorm,
-            args.multi_latent_attention,
-            args.experimental_attention_variant,
-            normalization=args.normalization,
+            config.num_moe_experts,
+            config.moe_grouped_gemm,
+            config.qk_layernorm,
+            config.multi_latent_attention,
+            config.experimental_attention_variant,
+            normalization=config.normalization,
             use_kitchen=config.use_kitchen,
             use_kitchen_attention=config.use_kitchen_attention,
             kitchen_attention_backend=config.kitchen_attention_backend,
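
The refactor above drops the get_args() global in favor of the explicit config argument. A minimal sketch of why that matters; DummyConfig and pick_spec are hypothetical stand-ins for the real spec helpers, not Megatron API:

    from dataclasses import dataclass

    @dataclass
    class DummyConfig:
        num_moe_experts: int = 0
        moe_grouped_gemm: bool = False
        qk_layernorm: bool = False

    def pick_spec(use_te: bool, config: DummyConfig) -> str:
        # Every decision reads from the explicit `config` argument, not from a
        # get_args() global, so the builder is testable outside the training CLI.
        if use_te:
            return f"te_spec(experts={config.num_moe_experts}, qk_ln={config.qk_layernorm})"
        return f"local_spec(experts={config.num_moe_experts})"

    print(pick_spec(True, DummyConfig(num_moe_experts=512)))  # te_spec(experts=512, qk_ln=False)

Beyond testability, config-driven selection keeps the builder usable from inference or conversion entrypoints that never populate the global argument store.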

megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py

Lines changed: 5 additions & 0 deletions

@@ -175,6 +175,11 @@ def validate_uneven_dtensor(dtensor: DTensor) -> None:
     )

     # Check that all boundaries (start and end) are touched.
+    # Skip under fake process group — all_reduce is a no-op so only rank 0's
+    # boundaries are visible, which makes the end-boundary check always fail.
+    if torch.distributed.is_initialized() and torch.distributed.get_backend() == 'fake':
+        return
+
     boundary_checks = torch.tensor(
         [
             [offset == 0, offset + size == dtensor.shape[dim]]
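
For context, PyTorch's fake distributed backend (used for dry runs and tracing) executes collectives such as all_reduce as no-ops, which is what the early return above guards against. A hedged sketch of the same detection, assuming only that torch is importable; should_skip_cross_rank_checks is an illustrative name, not Megatron API:

    import torch.distributed as dist

    def should_skip_cross_rank_checks() -> bool:
        # get_backend() reports the default process group's backend name;
        # "fake" means collectives do nothing and only local data is visible,
        # so any validation that aggregates state across ranks must be skipped.
        return dist.is_initialized() and dist.get_backend() == "fake"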

megatron/core/inference/engines/dynamic_engine.py

Lines changed: 6 additions & 0 deletions

@@ -839,10 +839,16 @@ def _add_request(
             len(request.prompt_tokens) + request.sampling_params.num_tokens_to_generate
             > self.context.max_sequence_length
         ) or (request.sampling_params.num_tokens_to_generate < 0):
+            logging.error(
+                f"{request_id=} Invalid number of tokens to generate. Prompt len: {len(request.prompt_tokens)}, tokens to generate: {request.sampling_params.num_tokens_to_generate}, max seq len: {self.context.max_sequence_length}."
+            )
             request.status = Status.FAILED
             request.add_event_error_nontransient(MaxSequenceLengthOverflowError(request_id))

         if len(request.prompt_tokens) > self.context.max_tokens and not self.enable_chunked_prefill:
+            logging.error(
+                f"{request_id=} Prompt is longer than context.max_tokens. Prompt tokens: {len(request.prompt_tokens)}, context.max_tokens: {self.context.max_tokens}, chunked_prefill: {self.enable_chunked_prefill}"
+            )
             request.status = Status.FAILED
             request.add_event_error_nontransient(TokenOverflowError(request_id))

megatron/core/models/gpt/experimental_attention_variant_module_specs.py

Lines changed: 1 addition & 0 deletions

@@ -397,6 +397,7 @@ def _get_self_attention_module_spec(
         use_te_activation_func=config.use_te_activation_func,
         use_kitchen_attention=config.use_kitchen_attention,
         kitchen_attention_backend=config.kitchen_attention_backend,
+        mla_down_proj_fusion=getattr(config, "mla_down_proj_fusion", False),
     )
     attn_spec = layer_spec.submodules.self_attention
     if config.multi_latent_attention:

megatron/core/models/gpt/gpt_layer_specs.py

Lines changed: 46 additions & 0 deletions

@@ -14,6 +14,7 @@
 from megatron.core.transformer.identity_op import IdentityOp
 from megatron.core.transformer.mlp import MLP, MLPSubmodules
 from megatron.core.transformer.multi_latent_attention import (
+    FusedMLASelfAttention,
     MLASelfAttention,
     MLASelfAttentionSubmodules,
 )

@@ -184,6 +185,7 @@ def get_gpt_layer_with_transformer_engine_submodules(
     use_te_activation_func: bool = False,
     use_kitchen_attention: bool = False,
     kitchen_attention_backend: str = "sdpa",
+    mla_down_proj_fusion: bool = False,
 ) -> TransformerLayerSubmodules:
     """Use these submodules to use lower-level Transformer Engine modules (required for fp8
     training).

@@ -198,6 +200,9 @@
         qk_l2_norm (bool, optional): To use l2 norm for queries/keys. Defaults to False.
         use_te_op_fuser (bool, optional): Use Transformer Engine's operation-based API, which may
             enable certain operation fusions. Defaults to False.
+        mla_down_proj_fusion (bool, optional): Enable the fused q/kv down-projection and fused
+            input layernorm when the backend supports them; otherwise fall back to the
+            unfused MLA.

     Returns:
         TransformerLayerSubmodules: TE modules to construct a TransformerLayer

@@ -243,6 +248,45 @@
         if qk_layernorm
         else backend.column_parallel_linear()
     )
+
+    if mla_down_proj_fusion:
+        fuse_input_layernorm = backend.column_parallel_layer_norm_linear() is not None
+        input_layernorm = IdentityOp if fuse_input_layernorm else backend.layer_norm()
+        down_proj_linear = (
+            backend.column_parallel_layer_norm_linear()
+            if fuse_input_layernorm
+            else backend.linear()
+        )
+        return TransformerLayerSubmodules(
+            input_layernorm=input_layernorm,
+            self_attention=ModuleSpec(
+                module=FusedMLASelfAttention,
+                params={"attn_mask_type": AttnMaskType.causal},
+                submodules=MLASelfAttentionSubmodules(
+                    linear_q_proj=backend.column_parallel_linear(),
+                    linear_qkv_down_proj=down_proj_linear,
+                    linear_q_up_proj=linear_q_up_proj,
+                    linear_kv_up_proj=linear_kv_up_proj,
+                    core_attention=backend.core_attention(),
+                    linear_proj=backend.row_parallel_linear(),
+                    q_layernorm=IdentityOp,
+                    kv_layernorm=IdentityOp,
+                ),
+            ),
+            self_attn_bda=get_bias_dropout_add,
+            pre_mlp_layernorm=backend.layer_norm() if num_experts else IdentityOp,
+            mlp=mlp,
+            mlp_bda=get_bias_dropout_add,
+            sharded_state_dict_keys_map=(
+                {
+                    "self_attention.linear_q_down_proj.layer_norm_": "input_layernorm.",
+                    "self_attention.linear_kv_down_proj.layer_norm_": "input_layernorm.",
+                    "self_attention.linear_qkv_down_proj.layer_norm_": "input_layernorm.",
+                }
+                if fuse_input_layernorm
+                else {}
+            ),
+        )
     return TransformerLayerSubmodules(
         input_layernorm=backend.layer_norm(has_residual=True),
         self_attention=ModuleSpec(

@@ -526,6 +570,7 @@ def get_gpt_decoder_layer_specs(
             use_te_activation_func=config.use_te_activation_func,
             use_kitchen_attention=config.use_kitchen_attention,
             kitchen_attention_backend=config.kitchen_attention_backend,
+            mla_down_proj_fusion=getattr(config, "mla_down_proj_fusion", False),
         )
         moe_layer_spec = get_gpt_layer_with_transformer_engine_spec(
             num_experts=config.num_moe_experts,

@@ -537,6 +582,7 @@
             use_te_activation_func=config.use_te_activation_func,
             use_kitchen_attention=config.use_kitchen_attention,
             kitchen_attention_backend=config.kitchen_attention_backend,
+            mla_down_proj_fusion=getattr(config, "mla_down_proj_fusion", False),
         )
     elif config.transformer_impl == "inference_optimized":
         layer_norm_impl = TENorm
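
The new mla_down_proj_fusion branch probes the backend for a fused layernorm+linear module and, when one exists, absorbs the input layernorm into the q/kv down-projection (hence the sharded_state_dict_keys_map entries remapping the fused weights back to input_layernorm). A minimal sketch of that capability-probe pattern, with stand-in classes rather than the real backend objects:

    from typing import Callable, Optional

    class IdentityOp: ...
    class LayerNorm: ...
    class Linear: ...
    class LayerNormLinear: ...

    def resolve_down_proj(fused_factory: Optional[Callable[[], type]]):
        # Fuse only if the backend actually provides a fused module; otherwise
        # keep the standalone norm followed by a plain linear projection.
        fuse = fused_factory is not None and fused_factory() is not None
        input_layernorm = IdentityOp if fuse else LayerNorm
        down_proj = fused_factory() if fuse else Linear
        return input_layernorm, down_proj

    # With a fusion-capable backend, the norm collapses into the projection:
    print(resolve_down_proj(lambda: LayerNormLinear))  # (IdentityOp, LayerNormLinear)
    print(resolve_down_proj(None))                     # (LayerNorm, Linear)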

megatron/core/pipeline_parallel/schedules.py

Lines changed: 3 additions & 3 deletions

@@ -666,7 +666,7 @@ def forward_backward_no_pipelining(
         force_all_reduce=force_all_reduce,
     )

-    if not forward_only and config.fine_grained_activation_offloading:
+    if getattr(config, 'fine_grained_activation_offloading', False):
         off_interface.reset()

     if config.timers is not None:

@@ -1905,7 +1905,7 @@ def pp_post_backward(input_tensor_grad, vp_stage=None):
         force_all_reduce=force_all_reduce,
     )

-    if not forward_only and config.fine_grained_activation_offloading:
+    if getattr(config, 'fine_grained_activation_offloading', False):
         off_interface.reset()
     # Restore config.grad_sync_func and config.param_sync_func.
     if forward_only:

@@ -2297,7 +2297,7 @@ def enable_grad_sync():
         force_all_reduce=force_all_reduce,
    )

-    if not forward_only and config.fine_grained_activation_offloading:
+    if getattr(config, 'fine_grained_activation_offloading', False):
         off_interface.reset()

     if config.timers is not None:
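
The getattr guard makes the offloading reset tolerant of config objects that predate the fine_grained_activation_offloading field; note it also drops the `not forward_only` condition, so the reset now runs in forward-only mode as well. A tiny sketch of the defensive-access pattern, with a hypothetical LegacyConfig:

    class LegacyConfig:
        pass  # no fine_grained_activation_offloading attribute at all

    cfg = LegacyConfig()
    # Direct attribute access would raise AttributeError on legacy configs:
    #   if cfg.fine_grained_activation_offloading: ...
    # getattr with a False default degrades to a no-op instead:
    if getattr(cfg, "fine_grained_activation_offloading", False):
        print("reset offloading interface")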

0 commit comments