[None][test] ltx2: self-attn unit tests pass valid pe to match fused contract

luyiyun1021 · luyiyun1021 · commit fbe14ede9266 · 2026-05-21T20:33:29.000-07:00
LTX2Attention now hardcodes fuse_qk_norm_rope=True in __init__, which makes
forward() route every self-attn call through the FUSE_QKV branch that
unpacks `cos, sin = pe` unconditionally. The four self-attn sanity / backend-
equivalence tests previously passed pe=None, which silently fell into the
_forward_unfused path back when fuse_qk_norm_rope defaulted to False. That
implicit reliance is gone, so the tests now hit a TypeError at line 265, where
forward() tries to unpack `cos, sin = pe` from pe=None.

Fix: build an identity-rotation RoPE tuple (cos=1, sin=0, shape [B,T,H,D])
in a `_make_pe` helper and pass it as `pe=` in the four self-attn test
cases. The cos/sin layout mirrors what `_split_freqs_cis` produces in
production, so the tests exercise the same fused norm+RoPE kernel path
without needing real RoPE angles. Identity rotation keeps the resulting
shape and norm checks meaningful (q*1 + rotate_half(q)*0 = q).

Cross-attention tests are unaffected — they go through SEPARATE_QKV and
apply_split_norm_or_norm_rope, which already accepts pe=None as norm-only.

E2E LTX-2 nvfp4 single-stage smoke test still passes (32.7s, 12.13 MB mp4).
All six tests in test_ltx2_attention.py PASS after the change.

Signed-off-by: Yiyun Lu <yiyunl@nvidia.com>
Signed-off-by: Yiyun Lu <55233584+luyiyun1021@users.noreply.github.com>
diff --git a/tests/unittest/_torch/visual_gen/test_ltx2_attention.py b/tests/unittest/_torch/visual_gen/test_ltx2_attention.py
@@ -52,6 +52,27 @@ def _init_weights(module: torch.nn.Module, std: float = 0.02):
                 torch.nn.init.normal_(p, mean=0.0, std=std)
 
 
+def _make_pe(
+    batch_size: int,
+    seq_len: int,
+    heads: int,
+    head_dim: int,
+    dtype: torch.dtype,
+    device: str,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Build an identity-rotation (cos=1, sin=0) RoPE tuple for self-attn tests.
+
+    LTX-2 self-attn forward (fuse_qk_norm_rope=True, head_dim ∈ {64, 128}) requires
+    ``pe`` to be a non-None ``(cos, sin)`` tuple in token-major [B, T, H, D] layout —
+    the same shape ``_split_freqs_cis`` produces in production. cos=1, sin=0 makes
+    the RoPE step an identity (q*1 + rotate_half(q)*0 = q), so the sanity and
+    backend-equivalence checks stay meaningful while exercising the fused norm+RoPE kernel.
+    """
+    cos = torch.ones(batch_size, seq_len, heads, head_dim, device=device, dtype=dtype)
+    sin = torch.zeros(batch_size, seq_len, heads, head_dim, device=device, dtype=dtype)
+    return cos, sin
+
+
 class TestLTX2SelfAttention(unittest.TestCase):
     """Test LTX2Attention self-attention with different backends."""
 
@@ -86,9 +107,10 @@ def test_vanilla_self_attention_sanity(self):
         )
 
         x = torch.randn(batch_size, seq_len, query_dim, device=self.DEVICE, dtype=dtype) * 0.02
+        pe = _make_pe(batch_size, seq_len, heads, head_dim, dtype, self.DEVICE)
 
         with torch.no_grad():
-            output = attn(x, context=None, pe=None)
+            output = attn(x, context=None, pe=pe)
 
         self.assertEqual(output.shape, (batch_size, seq_len, query_dim))
 
@@ -122,9 +144,10 @@ def test_trtllm_self_attention_sanity(self):
         )
 
         x = torch.randn(batch_size, seq_len, query_dim, device=self.DEVICE, dtype=dtype) * 0.02
+        pe = _make_pe(batch_size, seq_len, heads, head_dim, dtype, self.DEVICE)
 
         with torch.no_grad():
-            output = attn(x, context=None, pe=None)
+            output = attn(x, context=None, pe=pe)
 
         self.assertEqual(output.shape, (batch_size, seq_len, query_dim))
 
@@ -248,9 +271,10 @@ def test_gated_self_attention_sanity(self):
         self.assertIsNotNone(attn.to_gate_logits, "Gated attention should create to_gate_logits")
 
         x = torch.randn(batch_size, seq_len, query_dim, device=self.DEVICE, dtype=dtype) * 0.02
+        pe = _make_pe(batch_size, seq_len, heads, head_dim, dtype, self.DEVICE)
 
         with torch.no_grad():
-            output = attn(x, context=None, pe=None)
+            output = attn(x, context=None, pe=pe)
 
         self.assertEqual(output.shape, (batch_size, seq_len, query_dim))
 
@@ -308,10 +332,11 @@ def test_backend_equivalence(self):
         trtllm_attn.load_state_dict(vanilla_attn.state_dict())
 
         x = torch.randn(batch_size, seq_len, query_dim, device=self.DEVICE, dtype=dtype) * 0.02
+        pe = _make_pe(batch_size, seq_len, heads, head_dim, dtype, self.DEVICE)
 
         with torch.no_grad():
-            out_vanilla = vanilla_attn(x.clone(), context=None, pe=None)
-            out_trtllm = trtllm_attn(x.clone(), context=None, pe=None)
+            out_vanilla = vanilla_attn(x.clone(), context=None, pe=pe)
+            out_trtllm = trtllm_attn(x.clone(), context=None, pe=pe)
 
         # Skip comparison if either has NaN/Inf (can happen with random weights)
         has_nan = torch.isnan(out_vanilla).any() or torch.isnan(out_trtllm).any()