
Commit ed89c8b

gcanlin, muziyuhui666, and hsliuustc0106 authored
[Bugfix] Fix NPU SDPA attention mask shape and semantics (#1031)
Signed-off-by: gcanlin <canlinguosdu@gmail.com>
Co-authored-by: muziyuhui666 <111362884+muziyuhui666@users.noreply.github.com>
Co-authored-by: Hongsheng Liu <liuhongsheng4@huawei.com>
1 parent 70a5de9 commit ed89c8b

2 files changed: +82 −22 lines changed

vllm_omni/diffusion/attention/backends/flash_attn.py

Lines changed: 27 additions & 19 deletions

@@ -10,26 +10,8 @@
     AttentionMetadata,
 )
 
-# Import flash attention functions with fallback chain from utils/fa.py
-# FA3 (fa3_fwd_interface) -> FA3 (flash_attn_interface) -> FA2 (flash_attn)
-from vllm_omni.diffusion.attention.backends.utils.fa import (
-    HAS_FLASH_ATTN,
-    _pad_input,
-    _unpad_input,
-    _upad_input,
-    flash_attn_func,
-    flash_attn_varlen_func,
-)
-
 logger = init_logger(__name__)
 
-if not HAS_FLASH_ATTN:
-    raise ImportError(
-        "FlashAttentionBackend requires Flash Attention. "
-        "Please install one of: fa3-fwd, flash-attention, or flash-attn. "
-        "Otherwise, use SDPA backend by setting DIFFUSION_ATTENTION_BACKEND=TORCH_SDPA"
-    )
-
 
 class FlashAttentionBackend(AttentionBackend):
     accept_output_buffer: bool = True
@@ -74,6 +56,24 @@ def forward_cuda(
         attn_metadata: AttentionMetadata = None,
     ) -> torch.Tensor:
         """CUDA/ROCm flash attention implementation."""
+        # Import flash attention functions with fallback chain from utils/fa.py
+        # FA3 (fa3_fwd_interface) -> FA3 (flash_attn_interface) -> FA2 (flash_attn)
+        from vllm_omni.diffusion.attention.backends.utils.fa import (
+            HAS_FLASH_ATTN,
+            _pad_input,
+            _unpad_input,
+            _upad_input,
+            flash_attn_func,
+            flash_attn_varlen_func,
+        )
+
+        if not HAS_FLASH_ATTN:
+            raise ImportError(
+                "FlashAttentionBackend requires Flash Attention. "
+                "Please install one of: fa3-fwd, flash-attention, or flash-attn. "
+                "Otherwise, use SDPA backend by setting DIFFUSION_ATTENTION_BACKEND=TORCH_SDPA"
+            )
+
         query_length = query.size(1)
         attention_mask = attn_metadata.attn_mask if attn_metadata is not None else None
         # Contains at least one padding token in the sequence
@@ -122,7 +122,15 @@ def forward_npu(
         attn_metadata: AttentionMetadata = None,
     ) -> torch.Tensor:
         """NPU attention implementation using mindiesd."""
-        from mindiesd import attention_forward
+        try:
+            from mindiesd import attention_forward
+        except ImportError:
+            raise ImportError(
+                "FlashAttentionBackend NPU implementation requires MindIE-SD. "
+                "Please install MindIE-SD to enable NPU attention support. "
+                "For installation details, see https://gitcode.com/Ascend/MindIE-SD"
+                "Otherwise, use SDPA backend by setting DIFFUSION_ATTENTION_BACKEND=TORCH_SDPA"
+            )
 
         attention_mask = attn_metadata.attn_mask if attn_metadata else None
         output = attention_forward(
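
The substance of this file's change is that the flash-attention import and the HAS_FLASH_ATTN check move from module import time into forward_cuda, so the backend module can now be imported (for example on NPU-only hosts) even when no flash-attn package is installed. Below is a minimal sketch of that deferred-import pattern, assuming only the public flash_attn package; the function name and error message are illustrative, not code from this repository.

```python
import torch


def run_flash_attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
    """Sketch of a deferred optional-dependency import: the import happens on the
    first call, so merely importing this module never requires flash-attn."""
    try:
        from flash_attn import flash_attn_func  # resolved lazily, not at module load
    except ImportError as exc:
        raise ImportError(
            "flash-attn is not installed; install it or switch to the SDPA "
            "backend, e.g. by setting DIFFUSION_ATTENTION_BACKEND=TORCH_SDPA"
        ) from exc
    # flash_attn_func expects (batch, seq_len, num_heads, head_dim) tensors.
    return flash_attn_func(query, key, value)
```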

vllm_omni/diffusion/attention/backends/sdpa.py

Lines changed: 55 additions & 3 deletions

@@ -13,6 +13,29 @@
 logger = init_logger(__name__)
 
 
+def _maybe_reshape_attn_mask(query: torch.Tensor, key: torch.Tensor, attn_mask: torch.Tensor | None = None):
+    """
+    Reshape Attention Mask
+    [batch_size, seq_len_k] -> [batch_size, 1, seq_len_q, seq_len_k]
+    """
+    # Skip Attention Mask if all values are 1, `None` mask can speedup the computation
+    if attn_mask is not None and torch.all(attn_mask != 0):
+        attn_mask = None
+
+    # Reshape Attention Mask
+    # [batch_size, seq_len_k] -> [batch_size, 1, seq_len_q, seq_len_k]
+    if (
+        attn_mask is not None
+        and attn_mask.ndim == 2
+        and attn_mask.shape[0] == query.shape[0]
+        and attn_mask.shape[1] == key.shape[1]
+    ):
+        B, Sq, Skv = attn_mask.shape[0], query.shape[1], key.shape[1]
+        attn_mask = attn_mask.to(torch.bool)
+        attn_mask = attn_mask.unsqueeze(1).expand(B, Sq, Skv).unsqueeze(1).contiguous()
+    return attn_mask
+
+
 class SDPABackend(AttentionBackend):
     accept_output_buffer: bool = True
 
@@ -47,16 +70,15 @@ def __init__(
         self.causal = causal
         self.softmax_scale = softmax_scale
 
-    def forward(
+    def forward_cuda(
         self,
         query: torch.Tensor,
         key: torch.Tensor,
         value: torch.Tensor,
-        attn_metadata: AttentionMetadata = None,
+        attn_metadata: AttentionMetadata | None = None,
     ) -> torch.Tensor:
         query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value))
         attention_mask = attn_metadata.attn_mask if attn_metadata else None
-
         output = torch.nn.functional.scaled_dot_product_attention(
             query,
             key,
@@ -68,3 +90,33 @@ def forward(
         )
         out = output.permute(0, 2, 1, 3)
         return out
+
+    def forward_xpu(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_metadata: AttentionMetadata | None = None,
+    ) -> torch.Tensor:
+        return self.forward_cuda(query, key, value, attn_metadata)
+
+    def forward_hip(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_metadata: AttentionMetadata | None = None,
+    ) -> torch.Tensor:
+        return self.forward_cuda(query, key, value, attn_metadata)
+
+    def forward_npu(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_metadata: AttentionMetadata | None = None,
+    ) -> torch.Tensor:
+        if attn_metadata:
+            attention_mask = _maybe_reshape_attn_mask(query, key, attn_metadata.attn_mask)
+            setattr(attn_metadata, "attn_mask", attention_mask)
+        return self.forward_cuda(query, key, value, attn_metadata)
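
The new _maybe_reshape_attn_mask helper is the actual fix for the NPU mask shape and semantics: an all-ones mask is replaced with None so SDPA can skip masking, and any other [batch_size, seq_len_k] padding mask is expanded to the [batch_size, 1, seq_len_q, seq_len_k] boolean layout that torch.nn.functional.scaled_dot_product_attention expects, where True means the query position may attend to that key. The sketch below walks through the same transformation outside the backend, with made-up shapes; it mirrors the helper's logic but is not code from the repository.

```python
import torch
import torch.nn.functional as F

B, H, Sq, Skv, D = 2, 4, 6, 6, 8
query = torch.randn(B, H, Sq, D)
key = torch.randn(B, H, Skv, D)
value = torch.randn(B, H, Skv, D)

# 2-D padding mask in the [batch_size, seq_len_k] layout carried by the metadata:
# 1 = real token, 0 = padding. The second sequence ends with two padded positions.
pad_mask = torch.ones(B, Skv)
pad_mask[1, -2:] = 0

if torch.all(pad_mask != 0):
    # All-ones mask: pass None so SDPA can skip masking entirely.
    attn_mask = None
else:
    # Expand to [batch_size, 1, seq_len_q, seq_len_k] booleans, where True means
    # "this query position may attend to this key position".
    attn_mask = pad_mask.bool().unsqueeze(1).expand(B, Sq, Skv).unsqueeze(1).contiguous()

out = F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask)
print(out.shape)  # torch.Size([2, 4, 6, 8])
```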
