fix: make gMLP use the truncated bias instead of the full bias (#1371)

Mr-Neutr0n · web-flow · commit ea7aefd8f8e1 · 2026-05-19T13:41:24.000-07:00
* fix: correct undefined self.args in TopKTokenChoiceRouter

* fix: use correctly sliced bias in gMLP projection
diff --git a/megatron/model/gmlp.py b/megatron/model/gmlp.py
@@ -80,7 +80,7 @@ def forward(self, x, attention_mask):
             mask = torch.ones(weight.shape[:2], device=device).triu_(1).bool()
             weight = weight.masked_fill(mask, 0.0)
 
-        gate = F.linear(gate.transpose(2, 1), weight, self.proj.bias).transpose(2, 1)
+        gate = F.linear(gate.transpose(2, 1), weight, bias).transpose(2, 1)
 
         if self.use_attn:
             gate = gate + self.attn(x, attention_mask)
diff --git a/megatron/model/router.py b/megatron/model/router.py
@@ -223,8 +223,8 @@ def jitter(self, x):
         Returns:
             torch.Tensor: Jittered input tensor.
         """
-        low = 1.0 - self.args.moe_jitter_eps
-        high = 1.0 + self.args.moe_jitter_eps
+        low = 1.0 - self.jitter_eps
+        high = 1.0 + self.jitter_eps
         noise = torch.rand(x.size(), dtype=x.dtype, device=x.device)
         return low + noise * (high - low)