fix(model): always build SWA mask to preserve causal masking at all seq lengths

nvegesna-netizen · claude · nvegesna-netizen · commit beba296f1c9d · 2026-06-12T09:09:46.000-07:00
The previous optimization skipped get_swa() when sq <= window_size[0] + 1,
assuming the window covered the full sequence. But SWA layers use
attn_mask_type=arbitrary, which routes through ScaledSoftmax (plain softmax,
no causal mask) when mask=None — dropping causal masking entirely for short
sequences, not just the SWA restriction.

get_swa() encodes the causal triangular structure via triu/tril and
degenerates to a pure causal mask when the window covers all positions,
so always calling it is both correct and sufficient.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Nitin Vegesna <nvegesna@nvidia.com>
diff --git a/src/megatron/bridge/models/gemma/gemma2_provider.py b/src/megatron/bridge/models/gemma/gemma2_provider.py
@@ -251,11 +251,12 @@ def forward(
         # mask [b, np, sq, sk], so we unsqueeze to [1, 1, sq, sk] when there is no
         # padding mask. When a padding mask [b, 1, sq, sk] is present, the | already
         # produces a 4D result via broadcasting.
-        # Skip mask generation when the window fully covers the sequence: masking only
-        # fires when query index i > window_size[0], i.e. seq_q > window_size[0] + 1.
-        # For seq_length=4096 with window=4095 this is a no-op, so we stay on the
-        # fast ScaledUpperTriangMaskedSoftmax (and FlexAttention) path.
-        if self.window_size is not None and query.size(0) > self.window_size[0] + 1:
+        # The mask is always generated for SWA layers: attn_mask_type=arbitrary means
+        # FusedScaleMaskSoftmax routes through ScaledSoftmax (no causal masking) when
+        # mask=None, so omitting the mask for short sequences would drop causal masking
+        # entirely. get_swa() encodes causal structure via triu/tril and degenerates to
+        # a pure causal mask when the window fully covers the sequence.
+        if self.window_size is not None:
             swa_mask = get_swa(query.size(0), key.size(0), self.window_size)
             if attention_mask is None:
                 attention_mask = swa_mask.unsqueeze(0).unsqueeze(0)
diff --git a/tests/unit_tests/models/gemma/test_gemma2_provider.py b/tests/unit_tests/models/gemma/test_gemma2_provider.py
@@ -255,15 +255,14 @@ def test_swa_applied_when_attention_mask_is_none(self):
         Prior to the fix, the gate was:
             if attention_mask is not None and self.window_size is not None:
         which was never True on the pretrain path (MCore passes attention_mask=None).
-        After the fix the gate is:
-            if self.window_size is not None and query.size(0) > self.window_size[0] + 1:
-        We verify this by patching get_swa and confirming it is called from forward()
-        when attention_mask=None is passed to an even-numbered layer whose window is
-        smaller than the sequence length.  window=(2, 0) with seq=4: 4 > 3, so the
-        guard fires and the SWA mask is built and unsqueezed to [1, 1, sq, sk].
+        The gate is now simply:
+            if self.window_size is not None:
+        The mask is always built for SWA layers — omitting it when the window covers the
+        full sequence would drop causal masking entirely because attn_mask_type=arbitrary
+        routes through ScaledSoftmax (plain softmax, no causal mask) when mask=None.
+        get_swa() degenerates to a pure causal mask when the window covers all positions.
         """
         seq, batch, heads, head_dim = 4, 1, 8, 32
-        # window=(2, 0): seq=4 > window+1=3, so the SWA guard fires.
         attn = _make_attention(window_size=(2, 0))
         assert attn.window_size == (2, 0), "even layer must have window_size set"
 
@@ -346,9 +345,7 @@ def test_swa_combined_with_padding_mask(self):
         which silently discarded any incoming padding mask. The correct behaviour is:
             attention_mask = swa_mask if attention_mask is None else (swa_mask | attention_mask)
         Both masks use True=masked-out, so logical OR gives the union of blocked positions.
-        window=(2, 0) with seq=4: 4 > 3, so the SWA guard fires.
         """
-        # window=(2, 0): seq=4 > window+1=3, so the SWA guard fires.
         attn = _make_attention(window_size=(2, 0))
 
         seq, batch, heads, head_dim = 4, 2, 8, 32