Fix ring attention with the native backend in torch 2.10 (#750)

DefTruth · web-flow · commit 2790975af0bd · 2026-01-23T18:11:13.000+08:00
diff --git a/src/cache_dit/parallelism/attention/_attention_dispatch.py b/src/cache_dit/parallelism/attention/_attention_dispatch.py
@@ -196,8 +196,12 @@ def _native_attention_forward_op(
                 is_causal=is_causal,
                 scale=scale,
             )[:2]
-            out = out.transpose(1, 2)
-            lse = lse.transpose(1, 2)
+            # [B, H, N, D] -> [B, N, H, D]
+            out = out.transpose(1, 2)  # type: torch.Tensor
+            lse = lse.transpose(1, 2)  # type: torch.Tensor
+            if lse.dim() == 3:
+                # [B, N, H] -> [B, N, H, 1]
+                lse = lse.unsqueeze(-1)
             return out, lse
 
         query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value))