Commit d43aeca

Merge branch 'main' into shanmugamr1992/megatron_inference_ultra
2 parents: 64cf04b + da46946

File tree: 38 files changed (+17838, -2474 lines)

.github/copy-pr-bot.yaml

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 enabled: true
 auto_sync_draft: false
 auto_sync_ready: true
-trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "nanz-nv", "parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]
+trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "CarlosGomes98", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "Phlip79", "QiZhangNV", "RPrenger", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "ahmadki", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cjld", "cspades", "cuichenx", "deepakn94", "dimapihtar", "dingqingy-nv", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "faradawn", "frsun-nvda", "gautham-kollu", "gdengk", "guyueh1", "huvunvidia", "hxbai", "ilml", "jalbericiola", "janEbert", "jaredcasper", "jenchen13", "jiemingz", "jingqiny-99", "jkamalu", "jon-barker", "jstjohn", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "kwyss-nvidia", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mchrzanowski", "mehraakash", "mkhona-nvidia", "nanz-nv", "parthmannan", "prajwal1210", "pthombre", "rhewett-nv", "rogerwaleffe", "sajadn", "sanandaraj5597", "sancha", "santhnm2", "sbak5", "shanmugamr1992", "sharathts", "shengf-nv", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "sudhakarsingh27", "tdene", "theothermike", "thomasdhc", "tomlifu", "trintamaki", "tylerpoon", "wdykas", "wplf", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yueshen2016", "yuzhongw-nvidia", "zhongbozhu"]

.github/oncall_schedule.json

Lines changed: 4 additions & 4 deletions

@@ -1,8 +1,4 @@
 [
-    {
-        "user": "BoxiangW",
-        "date": "2026-03-04"
-    },
     {
         "user": "maanug-nv",
         "date": "2026-03-11"
@@ -46,5 +42,9 @@
     {
         "user": "gautham-kollu",
         "date": "2026-05-20"
+    },
+    {
+        "user": "ilml",
+        "date": "2026-05-27"
     }
 ]

examples/post_training/modelopt/README.md

Lines changed: 1 addition & 0 deletions

@@ -34,6 +34,7 @@ knowledge distillation, pruning, speculative decoding, and more.
 | `moonshotai/Kimi-K2-Instruct` ||| - | - |
 | `nvidia/NVIDIA-Nemotron-Nano-9B-v2` || - |||
 | `nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16` || - |||
+| `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` || - |||
 | `openai/gpt-oss-{20b, 120b}` || **Online** |||
 | `Qwen/Qwen3-{0.6B, 8B}` |||||
 | `Qwen/Qwen3-{30B-A3B, 235B-A22B}` | **WAR** ||||
Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
+#!/bin/bash
+
+if [ -z ${HF_MODEL_CKPT} ]; then
+    HF_MODEL_CKPT=nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
+    TOKENIZER_MODEL=nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
+else
+    TOKENIZER_MODEL=${HF_MODEL_CKPT}
+fi
+
+
+
+MODEL_ARGS=" \
+    --trust-remote-code \
+    --save-interval 100000 \
+    --micro-batch-size 1 \
+    --enable-experimental \
+    --use-fused-weighted-squared-relu \
+    --cross-entropy-loss-fusion \
+    --cross-entropy-fusion-impl native \
+    --num-experts 512 \
+    --moe-router-score-function sigmoid \
+    --moe-grouped-gemm \
+    --moe-aux-loss-coeff 1e-4 \
+    --moe-router-topk 22 \
+    --moe-permute-fusion \
+    --moe-router-topk-scaling-factor 5.0 \
+    --moe-router-enable-expert-bias \
+    --moe-router-dtype fp32 \
+    --moe-router-load-balancing-type seq_aux_loss \
+    --moe-shared-expert-intermediate-size 5376 \
+    --moe-token-dispatcher-type allgather \
+    --moe-latent-size 1024 \
+    \
+    --attention-backend flash \
+    --disable-gloo-process-groups \
+    --is-hybrid-model \
+    --mamba-num-heads 128 \
+    --mamba-head-dim 64 \
+    --hybrid-layer-pattern MEMEMEM*EMEMEMEM*EMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEMEM*EMEMEMEM*EMEMEMEME \
+    \
+    --use-mcore-models \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --init-method-std 0.014 \
+    --position-embedding-type none \
+    --squared-relu \
+    --hidden-size 4096 \
+    --num-attention-heads 32 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --ffn-hidden-size 2688 \
+    --kv-channels 128 \
+    --normalization RMSNorm \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    \
+    --tokenizer-type HuggingFaceTokenizer \
+    --bf16 \
+    --seq-length 8192 \
+    --max-position-embeddings 8192 \
+    --export-model-type MambaModel \
+"

gpt_builders.py

Lines changed: 18 additions & 19 deletions

@@ -115,43 +115,42 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_

 def _get_transformer_layer_spec(use_te, config):
     """Get transformer layer specification based on configuration.

     Args:
         use_te (bool): Whether to use Transformer Engine
-        args: Training arguments
         config: Model configuration

     Returns:
         transformer_layer_spec: The transformer layer specification
     """
-    args = get_args()
     if use_te:
         return get_gpt_layer_with_transformer_engine_spec(
-            args.num_experts,
-            args.moe_grouped_gemm,
-            args.qk_layernorm,
-            args.multi_latent_attention,
-            args.experimental_attention_variant,
-            qk_l2_norm=args.qk_l2_norm,
+            config.num_moe_experts,
+            config.moe_grouped_gemm,
+            config.qk_layernorm,
+            config.multi_latent_attention,
+            config.experimental_attention_variant,
+            qk_l2_norm=config.qk_l2_norm,
             use_kitchen=config.use_kitchen,
             use_te_activation_func=config.use_te_activation_func,
             use_kitchen_attention=config.use_kitchen_attention,
             kitchen_attention_backend=config.kitchen_attention_backend,
+            mla_down_proj_fusion=getattr(config, "mla_down_proj_fusion", False),
         )
     elif config.transformer_impl == "inference_optimized":
         return get_gpt_layer_with_inference_spec(
-            args.qk_layernorm,
-            args.multi_latent_attention,
-            qk_l2_norm=args.qk_l2_norm,
+            config.qk_layernorm,
+            config.multi_latent_attention,
+            qk_l2_norm=config.qk_l2_norm,
         )
     else:
         return get_gpt_layer_local_spec(
-            args.num_experts,
-            args.moe_grouped_gemm,
-            args.qk_layernorm,
-            args.multi_latent_attention,
-            args.experimental_attention_variant,
-            normalization=args.normalization,
+            config.num_moe_experts,
+            config.moe_grouped_gemm,
+            config.qk_layernorm,
+            config.multi_latent_attention,
+            config.experimental_attention_variant,
+            normalization=config.normalization,
             use_kitchen=config.use_kitchen,
             use_kitchen_attention=config.use_kitchen_attention,
             kitchen_attention_backend=config.kitchen_attention_backend,
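
The refactor above drops the get_args() global in favor of the explicit config argument. A minimal sketch of why that matters; DummyConfig and pick_spec are hypothetical stand-ins for the real spec helpers, not Megatron API:

    from dataclasses import dataclass

    @dataclass
    class DummyConfig:
        num_moe_experts: int = 0
        moe_grouped_gemm: bool = False
        qk_layernorm: bool = False

    def pick_spec(use_te: bool, config: DummyConfig) -> str:
        # Every decision reads from the explicit `config` argument, not from a
        # get_args() global, so the builder is testable outside the training CLI.
        if use_te:
            return f"te_spec(experts={config.num_moe_experts}, qk_ln={config.qk_layernorm})"
        return f"local_spec(experts={config.num_moe_experts})"

    print(pick_spec(True, DummyConfig(num_moe_experts=512)))  # te_spec(experts=512, qk_ln=False)

Beyond testability, config-driven selection keeps the builder usable from inference or conversion entrypoints that never populate the global argument store.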

megatron/core/distributed/fsdp/src/megatron_fsdp/uneven_dtensor.py

Lines changed: 5 additions & 0 deletions

@@ -175,6 +175,11 @@ def validate_uneven_dtensor(dtensor: DTensor) -> None:
     )

     # Check that all boundaries (start and end) are touched.
+    # Skip under fake process group — all_reduce is a no-op so only rank 0's
+    # boundaries are visible, which makes the end-boundary check always fail.
+    if torch.distributed.is_initialized() and torch.distributed.get_backend() == 'fake':
+        return
+
     boundary_checks = torch.tensor(
         [
             [offset == 0, offset + size == dtensor.shape[dim]]
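
For context, PyTorch's fake distributed backend (used for dry runs and tracing) executes collectives such as all_reduce as no-ops, which is what the early return above guards against. A hedged sketch of the same detection, assuming only that torch is importable; should_skip_cross_rank_checks is an illustrative name, not Megatron API:

    import torch.distributed as dist

    def should_skip_cross_rank_checks() -> bool:
        # get_backend() reports the default process group's backend name;
        # "fake" means collectives do nothing and only local data is visible,
        # so any validation that aggregates state across ranks must be skipped.
        return dist.is_initialized() and dist.get_backend() == "fake"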

megatron/core/inference/engines/dynamic_engine.py

Lines changed: 6 additions & 0 deletions

@@ -839,10 +839,16 @@ def _add_request(
             len(request.prompt_tokens) + request.sampling_params.num_tokens_to_generate
             > self.context.max_sequence_length
         ) or (request.sampling_params.num_tokens_to_generate < 0):
+            logging.error(
+                f"{request_id=} Invalid number of tokens to generate. Prompt len: {len(request.prompt_tokens)}, tokens to generate: {request.sampling_params.num_tokens_to_generate}, max seq len: {self.context.max_sequence_length}."
+            )
             request.status = Status.FAILED
             request.add_event_error_nontransient(MaxSequenceLengthOverflowError(request_id))

         if len(request.prompt_tokens) > self.context.max_tokens and not self.enable_chunked_prefill:
+            logging.error(
+                f"{request_id=} Prompt is longer than context.max_tokens. Prompt tokens: {len(request.prompt_tokens)}, context.max_tokens: {self.context.max_tokens}, chunked_prefill: {self.enable_chunked_prefill}"
+            )
             request.status = Status.FAILED
             request.add_event_error_nontransient(TokenOverflowError(request_id))

megatron/core/models/gpt/experimental_attention_variant_module_specs.py

Lines changed: 1 addition & 0 deletions

@@ -397,6 +397,7 @@ def _get_self_attention_module_spec(
         use_te_activation_func=config.use_te_activation_func,
         use_kitchen_attention=config.use_kitchen_attention,
         kitchen_attention_backend=config.kitchen_attention_backend,
+        mla_down_proj_fusion=getattr(config, "mla_down_proj_fusion", False),
     )
     attn_spec = layer_spec.submodules.self_attention
     if config.multi_latent_attention:

megatron/core/models/gpt/gpt_layer_specs.py

Lines changed: 46 additions & 0 deletions

@@ -14,6 +14,7 @@
 from megatron.core.transformer.identity_op import IdentityOp
 from megatron.core.transformer.mlp import MLP, MLPSubmodules
 from megatron.core.transformer.multi_latent_attention import (
+    FusedMLASelfAttention,
     MLASelfAttention,
     MLASelfAttentionSubmodules,
 )

@@ -184,6 +185,7 @@ def get_gpt_layer_with_transformer_engine_submodules(
     use_te_activation_func: bool = False,
     use_kitchen_attention: bool = False,
     kitchen_attention_backend: str = "sdpa",
+    mla_down_proj_fusion: bool = False,
 ) -> TransformerLayerSubmodules:
     """Use these submodules to use lower-level Transformer Engine modules (required for fp8
     training).

@@ -198,6 +200,9 @@
         qk_l2_norm (bool, optional): To use l2 norm for queries/keys. Defaults to False.
         use_te_op_fuser (bool, optional): Use Transformer Engine's operation-based API, which may
             enable certain operation fusions. Defaults to False.
+        mla_down_proj_fusion (bool, optional): Enable the fused q/kv down-projection and fused
+            input layernorm when the backend supports them; otherwise fall back to the
+            unfused MLA.

     Returns:
         TransformerLayerSubmodules: TE modules to construct a TransformerLayer

@@ -243,6 +248,45 @@
         if qk_layernorm
         else backend.column_parallel_linear()
     )
+
+    if mla_down_proj_fusion:
+        fuse_input_layernorm = backend.column_parallel_layer_norm_linear() is not None
+        input_layernorm = IdentityOp if fuse_input_layernorm else backend.layer_norm()
+        down_proj_linear = (
+            backend.column_parallel_layer_norm_linear()
+            if fuse_input_layernorm
+            else backend.linear()
+        )
+        return TransformerLayerSubmodules(
+            input_layernorm=input_layernorm,
+            self_attention=ModuleSpec(
+                module=FusedMLASelfAttention,
+                params={"attn_mask_type": AttnMaskType.causal},
+                submodules=MLASelfAttentionSubmodules(
+                    linear_q_proj=backend.column_parallel_linear(),
+                    linear_qkv_down_proj=down_proj_linear,
+                    linear_q_up_proj=linear_q_up_proj,
+                    linear_kv_up_proj=linear_kv_up_proj,
+                    core_attention=backend.core_attention(),
+                    linear_proj=backend.row_parallel_linear(),
+                    q_layernorm=IdentityOp,
+                    kv_layernorm=IdentityOp,
+                ),
+            ),
+            self_attn_bda=get_bias_dropout_add,
+            pre_mlp_layernorm=backend.layer_norm() if num_experts else IdentityOp,
+            mlp=mlp,
+            mlp_bda=get_bias_dropout_add,
+            sharded_state_dict_keys_map=(
+                {
+                    "self_attention.linear_q_down_proj.layer_norm_": "input_layernorm.",
+                    "self_attention.linear_kv_down_proj.layer_norm_": "input_layernorm.",
+                    "self_attention.linear_qkv_down_proj.layer_norm_": "input_layernorm.",
+                }
+                if fuse_input_layernorm
+                else {}
+            ),
+        )
     return TransformerLayerSubmodules(
         input_layernorm=backend.layer_norm(has_residual=True),
         self_attention=ModuleSpec(

@@ -526,6 +570,7 @@ def get_gpt_decoder_layer_specs(
             use_te_activation_func=config.use_te_activation_func,
             use_kitchen_attention=config.use_kitchen_attention,
             kitchen_attention_backend=config.kitchen_attention_backend,
+            mla_down_proj_fusion=getattr(config, "mla_down_proj_fusion", False),
         )
         moe_layer_spec = get_gpt_layer_with_transformer_engine_spec(
             num_experts=config.num_moe_experts,

@@ -537,6 +582,7 @@
             use_te_activation_func=config.use_te_activation_func,
             use_kitchen_attention=config.use_kitchen_attention,
             kitchen_attention_backend=config.kitchen_attention_backend,
+            mla_down_proj_fusion=getattr(config, "mla_down_proj_fusion", False),
         )
     elif config.transformer_impl == "inference_optimized":
         layer_norm_impl = TENorm
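
The new mla_down_proj_fusion branch probes the backend for a fused layernorm+linear module and, when one exists, absorbs the input layernorm into the q/kv down-projection (hence the sharded_state_dict_keys_map entries remapping the fused weights back to input_layernorm). A minimal sketch of that capability-probe pattern, with stand-in classes rather than the real backend objects:

    from typing import Callable, Optional

    class IdentityOp: ...
    class LayerNorm: ...
    class Linear: ...
    class LayerNormLinear: ...

    def resolve_down_proj(fused_factory: Optional[Callable[[], type]]):
        # Fuse only if the backend actually provides a fused module; otherwise
        # keep the standalone norm followed by a plain linear projection.
        fuse = fused_factory is not None and fused_factory() is not None
        input_layernorm = IdentityOp if fuse else LayerNorm
        down_proj = fused_factory() if fuse else Linear
        return input_layernorm, down_proj

    # With a fusion-capable backend, the norm collapses into the projection:
    print(resolve_down_proj(lambda: LayerNormLinear))  # (IdentityOp, LayerNormLinear)
    print(resolve_down_proj(None))                     # (LayerNorm, Linear)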

megatron/core/pipeline_parallel/schedules.py

Lines changed: 3 additions & 3 deletions

@@ -666,7 +666,7 @@ def forward_backward_no_pipelining(
         force_all_reduce=force_all_reduce,
     )

-    if not forward_only and config.fine_grained_activation_offloading:
+    if getattr(config, 'fine_grained_activation_offloading', False):
         off_interface.reset()

     if config.timers is not None:

@@ -1905,7 +1905,7 @@ def pp_post_backward(input_tensor_grad, vp_stage=None):
         force_all_reduce=force_all_reduce,
     )

-    if not forward_only and config.fine_grained_activation_offloading:
+    if getattr(config, 'fine_grained_activation_offloading', False):
         off_interface.reset()
     # Restore config.grad_sync_func and config.param_sync_func.
     if forward_only:

@@ -2297,7 +2297,7 @@ def enable_grad_sync():
         force_all_reduce=force_all_reduce,
    )

-    if not forward_only and config.fine_grained_activation_offloading:
+    if getattr(config, 'fine_grained_activation_offloading', False):
         off_interface.reset()

     if config.timers is not None:
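
The getattr guard makes the offloading reset tolerant of config objects that predate the fine_grained_activation_offloading field; note it also drops the `not forward_only` condition, so the reset now runs in forward-only mode as well. A tiny sketch of the defensive-access pattern, with a hypothetical LegacyConfig:

    class LegacyConfig:
        pass  # no fine_grained_activation_offloading attribute at all

    cfg = LegacyConfig()
    # Direct attribute access would raise AttributeError on legacy configs:
    #   if cfg.fine_grained_activation_offloading: ...
    # getattr with a False default degrades to a no-op instead:
    if getattr(cfg, "fine_grained_activation_offloading", False):
        print("reset offloading interface")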

0 commit comments