Skip to content

Commit d00d246

Browse files
committed
chore: Update Megatron patch with MTP loss div-by-zero guard
Guard against division by zero in MTP loss computation when num_tokens is 0, which can happen with context parallelism when one CP rank has no response tokens after label rolling.
1 parent 7ba4af8 commit d00d246

File tree

4 files changed

+29
-24
lines changed

4 files changed

+29
-24
lines changed

docker/patch/v0.5.7/megatron.patch

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,7 @@ index e21127b87..712793853 100755
379379
),
380380
)
381381
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
382-
index a1230568c..1fd52f65a 100644
382+
index a1230568c..b45e63237 100644
383383
--- a/megatron/core/models/gpt/gpt_model.py
384384
+++ b/megatron/core/models/gpt/gpt_model.py
385385
@@ -446,6 +446,7 @@ class GPTModel(LanguageModule):
@@ -437,7 +437,7 @@ index a1230568c..1fd52f65a 100644
437437
for mtp_layer_number in range(self.config.mtp_num_layers):
438438
# Calc loss for the current Multi-Token Prediction (MTP) layers.
439439
mtp_labels, _ = roll_tensor(
440-
@@ -595,7 +604,7 @@ class GPTModel(LanguageModule):
440+
@@ -595,17 +604,19 @@ class GPTModel(LanguageModule):
441441
sequence_parallel_enabled=self.output_layer.sequence_parallel,
442442
column_parallel_linear=self.output_layer,
443443
col_linear_kwargs={
@@ -446,6 +446,28 @@ index a1230568c..1fd52f65a 100644
446446
'runtime_gather_output': runtime_gather_output,
447447
},
448448
)
449+
450+
mtp_loss = loss_mask * mtp_loss
451+
+ # Guard against division by zero when num_tokens is 0
452+
+ safe_num_tokens = max(num_tokens, 1)
453+
if self.training:
454+
# TODO(shifangx): remove the use of parallel_state here
455+
# after moving loss logging to loss_func in pretrain_gpt.py
456+
MTPLossLoggingHelper.save_loss_to_tracker(
457+
- torch.sum(mtp_loss) / num_tokens,
458+
+ torch.sum(mtp_loss) / safe_num_tokens,
459+
mtp_layer_number,
460+
self.config.mtp_num_layers,
461+
avg_group=parallel_state.get_data_parallel_group(
462+
@@ -619,7 +630,7 @@ class GPTModel(LanguageModule):
463+
)
464+
else:
465+
hidden_states = MTPLossAutoScaler.apply(
466+
- hidden_states, mtp_loss_scale * mtp_loss / num_tokens
467+
+ hidden_states, mtp_loss_scale * mtp_loss / safe_num_tokens
468+
)
469+
sequence_parallel_override = False
470+
449471
diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py
450472
index 6e093f96f..eac21a3ea 100644
451473
--- a/megatron/core/optimizer/distrib_optimizer.py

slime/backends/megatron_utils/megatron_to_hf/qwen3_next.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,7 @@ def _convert_mtp_layer(args, name, param, layer_idx):
5353
if "final_layernorm.weight" in name:
5454
return [("mtp.norm.weight", param)]
5555
if "eh_proj.weight" in name:
56-
if param.dim() < 2:
57-
raise ValueError(f"eh_proj weight expects 2D tensor, got {param.shape}")
58-
first_half, second_half = param.chunk(2, dim=1)
59-
new_param = torch.cat([second_half, first_half], dim=1)
60-
return [("mtp.fc.weight", new_param)]
56+
return [("mtp.fc.weight", param)]
6157

6258
# MTP inner transformer layers (keep layer index)
6359
if "transformer_layer" in name:

slime/backends/megatron_utils/model_provider.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -194,11 +194,7 @@ def model_provider(pre_process: bool = True, post_process: bool = True, vp_stage
194194
if vp_stage is not None:
195195
mtp_kwargs["vp_stage"] = vp_stage
196196

197-
from dataclasses import replace
198-
199-
mtp_config = replace(config, use_gated_attention=True)
200-
object.__setattr__(config, "mtp_config", mtp_config)
201-
mtp_block_spec = get_gpt_mtp_block_spec(mtp_config, transformer_layer_spec, **mtp_kwargs)
197+
mtp_block_spec = get_gpt_mtp_block_spec(config, transformer_layer_spec, **mtp_kwargs)
202198
kwargs["mtp_block_spec"] = mtp_block_spec
203199

204200
with build_model_context(**build_model_context_args):

slime_plugins/mbridge/qwen3_next.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,10 @@ class Qwen3NextBridge(Qwen2MoEBridge):
4747
)
4848

4949
def _get_gptmodel_args(self) -> dict:
50-
"""Override to add MTP block spec with gated attention config."""
51-
from copy import deepcopy
52-
50+
"""Override to add MTP block spec."""
5351
ret = super()._get_gptmodel_args()
5452
if getattr(self.config, "mtp_num_layers", None) is not None:
55-
mtp_config = deepcopy(self.config)
56-
mtp_config.use_gated_attention = True
57-
mtp_block_spec = get_gpt_mtp_block_spec(mtp_config, mtp_config, use_transformer_engine=True)
53+
mtp_block_spec = get_gpt_mtp_block_spec(self.config, self.config, use_transformer_engine=True)
5854
ret["mtp_block_spec"] = mtp_block_spec
5955
return ret
6056

@@ -171,17 +167,11 @@ def _weight_to_mcore_format(
171167
return qgkv
172168

173169
weight = super()._weight_to_mcore_format(mcore_weights_name, hf_weights)
174-
if mcore_weights_name.endswith("eh_proj.weight"):
175-
first_half, second_half = weight.chunk(2, dim=1)
176-
weight = torch.cat([second_half, first_half], dim=1)
177170
return weight
178171

179172
def _weight_to_hf_format(
180173
self, mcore_weights_name: str, mcore_weights: torch.Tensor
181174
) -> tuple[list[str], list[torch.Tensor]]:
182-
if mcore_weights_name.endswith("eh_proj.weight"):
183-
first_half, second_half = mcore_weights.chunk(2, dim=1)
184-
mcore_weights = torch.cat([second_half, first_half], dim=1)
185175
return super()._weight_to_hf_format(mcore_weights_name, mcore_weights)
186176

187177
def _build_config(self):
@@ -211,5 +201,6 @@ def _build_config(self):
211201
# Qwen3 Next specific
212202
attention_output_gate=True,
213203
moe_shared_expert_gate=True,
204+
use_gated_attention=True,
214205
**mtp_args,
215206
)

0 commit comments

Comments (0)