Commit 264caa1

support generate dense attn mask from sparse one

1 parent: 2ce669c

4 files changed: +101 / -22 lines
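
For orientation before the per-file diffs: a minimal sketch (not part of the commit) of what the column-wise sparse "startend row indices" format encodes in its simplest, causal case. Per the docstring added in masking_utils.py below, with a trailing dimension of 1, startend_row_indices[..., j, 0] is the row from which column j of the attention score matrix is masked downwards, which lets packed sequences share one batch row.

import paddle

# Two sequences of length 2 packed into seq_len 4; batch 1, one head.
indices = paddle.to_tensor([[[[2], [2], [4], [4]]]], dtype="int32")  # [1, 1, 4, 1]

# Equivalent dense keep-mask under causal attention:
# keep[i][j] = (j <= i) and (i < indices[..., j, 0])
row = paddle.arange(4, dtype="int32").unsqueeze(1)  # query row index, shape [4, 1]
keep = paddle.logical_and(
    paddle.tril(paddle.ones([4, 4], dtype="bool")),  # causal structure
    row < indices[0, 0, :, 0],  # column-wise start rows, broadcast to [4, 4]
)
# keep:
# [[True,  False, False, False],
#  [True,  True,  False, False],
#  [False, False, True,  False],
#  [False, False, True,  True ]]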

paddleformers/nn/attention/eager_attention.py

Lines changed: 16 additions & 4 deletions
@@ -17,6 +17,7 @@
 import paddle
 import paddle.nn as nn
 
+from ...utils.masking_utils import _gen_from_sparse_attn_mask_indices
 from .utils import repeat_kv
 
 
@@ -37,15 +38,26 @@ def eager_attention_forward(
     key = repeat_kv(key, num_key_value_groups)
     value = repeat_kv(value, num_key_value_groups)
 
+    if attention_mask is None and kwargs.get("attn_mask_startend_row_indices", None) is not None:
+        attn_mask_startend_row_indices = kwargs["attn_mask_startend_row_indices"]
+        if attn_mask_startend_row_indices.ndim == 3:
+            attn_mask_startend_row_indices = attn_mask_startend_row_indices.unsqueeze(-1)
+        if attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.shape[-1] == 1:
+            is_causal = True
+        if attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.shape[-1] == 4:
+            is_causal = False
+
+        attention_mask = _gen_from_sparse_attn_mask_indices(attn_mask_startend_row_indices, query.dtype, is_causal)
+
     perm = [0, 2, 1, 3]  # b l h d -> b h l d
     query = paddle.transpose(x=query, perm=perm)
     key = paddle.transpose(x=key, perm=perm)
     value = paddle.transpose(x=value, perm=perm)
 
-    attn_weights = paddle.matmul(query, key.transpose([0, 1, 3, 2])) * scaling
+    attn_weights = paddle.matmul(x=query * scaling, y=key, transpose_y=True)
     if attention_mask is not None:
-        causal_mask = attention_mask[:, :, :, : key.shape[-2]]
-        attn_weights = attn_weights + causal_mask
+        attention_mask = attention_mask[:, :, :, : key.shape[-2]]
+        attn_weights = attn_weights + attention_mask
 
     if sink is not None:
         sink = sink.reshape([1, -1, 1, 1]).expand([query.shape[0], -1, query.shape[-2], -1])
@@ -54,7 +66,7 @@ def eager_attention_forward(
         scores = probs[..., :-1]  # we drop the sink here
         attn_weights = nn.functional.dropout(scores, p=dropout, training=module.training)
     else:
-        attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype=paddle.float32).astype(query.dtype)
+        attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype=query.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
 
     attn_output = paddle.matmul(attn_weights, value)  # b h l l @ b h l d -> b h l d
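
Two behavioral notes on this file beyond the dense-mask fallback added at the top of eager_attention_forward: the score computation now pre-scales the query instead of scaling after the matmul, and the non-sink softmax now runs in query.dtype rather than float32. A quick equivalence check for the first change (illustrative, not part of the commit):

import paddle

q = paddle.randn([2, 4, 8, 16])  # b h l d
k = paddle.randn([2, 4, 8, 16])
scaling = 16 ** -0.5

old = paddle.matmul(q, k.transpose([0, 1, 3, 2])) * scaling  # scale after matmul
new = paddle.matmul(x=q * scaling, y=k, transpose_y=True)    # pre-scale the query
assert paddle.allclose(old, new, atol=1e-6).item()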

paddleformers/nn/attention/flashmask_attention.py

Lines changed: 2 additions & 4 deletions
@@ -40,11 +40,9 @@ def flashmask_attention_forward(
         is_causal = True
     if attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.shape[-1] == 4:
         is_causal = False
-
+
     if is_causal is None:
-        raise ValueError(
-            f"The `is_causal` argument must be specified when using the Flash Mask Attention kernel."
-        )
+        raise ValueError("The `is_causal` argument must be specified when using the Flash Mask Attention.")
 
     if sink is None:
         out = flashmask_attention(
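
The trailing-dimension rule above now appears in the eager, SDPA, and flashmask paths alike. Distilled into one hypothetical helper (not in the commit; shown only to make the shared rule explicit):

def infer_is_causal(startend_row_indices, is_causal=None):
    # A trailing dim of 1 implies causal, 4 implies bidirectional; 2 is
    # ambiguous, so the caller must supply is_causal or the downstream
    # check raises ValueError.
    if startend_row_indices is not None and startend_row_indices.ndim == 3:
        startend_row_indices = startend_row_indices.unsqueeze(-1)
    if startend_row_indices is not None:
        if startend_row_indices.shape[-1] == 1:
            is_causal = True
        elif startend_row_indices.shape[-1] == 4:
            is_causal = False
    return startend_row_indices, is_causal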

paddleformers/nn/attention/sdpa_attention.py

Lines changed: 6 additions & 2 deletions
@@ -38,10 +38,14 @@ def sdpa_attention_forward(
     if is_causal is None and attn_mask_startend_row_indices is None:
         is_causal = query.shape[1] > 1 and attention_mask is None and getattr(module, "is_causal", True)
     elif attn_mask_startend_row_indices is not None:
-        is_causal = False
         if attn_mask_startend_row_indices.ndim == 3:
             attn_mask_startend_row_indices = attn_mask_startend_row_indices.unsqueeze(-1)
-        attention_mask = _gen_from_sparse_attn_mask_indices(attn_mask_startend_row_indices, query.dtype)
+        if attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.shape[-1] == 1:
+            is_causal = True
+        if attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.shape[-1] == 4:
+            is_causal = False
+
+        attention_mask = _gen_from_sparse_attn_mask_indices(attn_mask_startend_row_indices, query.dtype, is_causal)
 
     if sink is None:
         attn_output = nn.functional.scaled_dot_product_attention(
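
With the dense mask generated up front, the SDPA call receives an additive attn_mask whose masking structure is already baked in. A hedged usage sketch of paddle.nn.functional.scaled_dot_product_attention with such a mask (tensor layout assumed to be [batch, seq, heads, head_dim], per Paddle's SDPA convention; the zero mask here is a placeholder, not output from this commit):

import paddle
import paddle.nn.functional as F

b, s, h, d = 1, 4, 2, 8
q = paddle.randn([b, s, h, d])
k = paddle.randn([b, s, h, d])
v = paddle.randn([b, s, h, d])

# Additive mask: 0.0 keeps a position, a large negative value drops it.
dense_mask = paddle.zeros([b, 1, s, s])
out = F.scaled_dot_product_attention(q, k, v, attn_mask=dense_mask)
print(out.shape)  # [1, 4, 2, 8]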

paddleformers/utils/masking_utils.py

Lines changed: 77 additions & 12 deletions
@@ -13,32 +13,97 @@
 # limitations under the License.
 
 import os
+from typing import Optional
 
 import numpy as np
 import paddle
 
 from .tools import get_env_device
 
 
-def _gen_from_sparse_attn_mask_indices(attn_mask_start_row_indices, dtype):
+def _gen_from_sparse_attn_mask_indices(
+    attn_mask_startend_row_indices: paddle.Tensor,
+    dtype: Optional[paddle.dtype] = paddle.bfloat16,
+    is_causal: Optional[bool] = None,
+):
     """
-    Recover 4-D attention_mask from attn_mask_start_row_indices.
+    Recover a 4-D attention_mask from attn_mask_startend_row_indices.
 
     Args:
-        attn_mask_start_row_indices (paddle.Tensor): The start row indices for the attention mask.
-        dtype (str): The data type of the tensor.
+        attn_mask_startend_row_indices (paddle.Tensor):
+            A column-wise sparse attention mask row indices tensor: a 4-D tensor with shape
+            [batch_size, k_num_heads, k_seq_len, {1, 2, 4}] and dtype int32. k_num_heads can be 1
+            or equal to key's num_heads; when it is 1, it is broadcast to match key's num_heads.
+            Depending on the value of `is_causal`, the last dimension takes different sizes and
+            meanings:
+
+            - When `is_causal=True` and the shape is [batch_size, k_num_heads, k_seq_len, 1],
+              attention is unidirectional. startend_row_indices[..., 0] is the starting row of the
+              lower-left triangular mask in the dense mask: elements of the attention score matrix
+              from that row downwards (inclusive) are masked.
+            - When `is_causal=True` and the shape is [batch_size, k_num_heads, k_seq_len, 2],
+              attention is unidirectional. startend_row_indices[..., 0:2] are the starting and
+              ending rows of the lower-left triangular mask: elements from the
+              startend_row_indices[..., 0]-th row downwards (inclusive) but above the
+              startend_row_indices[..., 1]-th row (exclusive) are masked.
+            - When `is_causal=False` and the shape is [batch_size, k_num_heads, k_seq_len, 2],
+              attention is bidirectional. startend_row_indices[..., 0] is the starting row of the
+              lower-left triangular mask and startend_row_indices[..., 1] is the ending row of the
+              upper-right triangular mask: elements of the lower-left triangle from the
+              startend_row_indices[..., 0]-th row downwards (inclusive) are masked, and elements
+              of the upper-right triangle from the startend_row_indices[..., 1]-th row upwards
+              (exclusive) are masked.
+            - When `is_causal=False` and the shape is [batch_size, k_num_heads, k_seq_len, 4],
+              attention is bidirectional. startend_row_indices[..., 0:4] are the start and end
+              rows of the lower-left triangular mask followed by the start and end rows of the
+              upper-right triangular mask: elements of the lower-left triangle from the
+              startend_row_indices[..., 0]-th row downwards (inclusive) but above the
+              startend_row_indices[..., 1]-th row (exclusive) are masked, and elements of the
+              upper-right triangle from the startend_row_indices[..., 2]-th row downwards
+              (inclusive) but above the startend_row_indices[..., 3]-th row (exclusive) are masked.
+        dtype (paddle.dtype): The data type of the returned mask.
+        is_causal (bool): Whether to enable causal mode.
 
     Returns:
-        paddle.Tensor: The dense attention mask recovered from attn_mask_start_row_indices.
+        paddle.Tensor: The dense attention mask recovered from attn_mask_startend_row_indices.
     """
-    batch_size, _, max_seq_len, _ = attn_mask_start_row_indices.shape
-    base = paddle.arange(max_seq_len, dtype="int32").unsqueeze(1).expand([batch_size, -1, max_seq_len]).unsqueeze(1)
-    mask_indices = attn_mask_start_row_indices
 
-    tril = paddle.tril(
-        paddle.ones([max_seq_len, max_seq_len], dtype="bool").expand([batch_size, 1, max_seq_len, max_seq_len])
-    )
-    attention_mask = paddle.logical_and(base < mask_indices, tril)
+    if attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.ndim == 3:
+        attn_mask_startend_row_indices = attn_mask_startend_row_indices.unsqueeze(-1)
+    if attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.shape[-1] == 1:
+        is_causal = True
+    if attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.shape[-1] == 4:
+        is_causal = False
+
+    if is_causal is None:
+        raise ValueError(
+            "The `is_causal` argument must be specified when recovering the dense attention mask from the column-wise sparse attention mask row indices."
+        )
+
+    batch_size, num_head, seq_len, bound_num = attn_mask_startend_row_indices.shape
+    has_end = (is_causal and bound_num == 2) or ((not is_causal) and bound_num == 4)
+
+    attention_mask = paddle.ones([seq_len, seq_len], dtype="bool").expand([batch_size, num_head, seq_len, seq_len])
+    if is_causal:
+        attention_mask = paddle.tril(attention_mask)
+
+    base = paddle.arange(seq_len, dtype="int32").unsqueeze(1).expand([batch_size, num_head, -1, seq_len])
+
+    # [batch_size, k_num_heads, k_seq_len, {1, 2, 4}] -> [batch_size, k_num_heads, {1, 2, 4}, k_seq_len]
+    mask_indices = attn_mask_startend_row_indices.transpose([0, 1, 3, 2])
+
+    downstart_mask_indices = mask_indices[:, :, 0, :]
+    downstart_mask_indices = downstart_mask_indices.expand([batch_size, num_head, seq_len, -1])
+    lower_tri = base < downstart_mask_indices
+    if has_end:
+        downend_mask_indices = mask_indices[:, :, 1, :]
+        downend_mask_indices = downend_mask_indices.expand([batch_size, num_head, seq_len, -1])
+        lower_tri = paddle.logical_or(lower_tri, base >= downend_mask_indices)
+
+    attention_mask = paddle.logical_and(attention_mask, lower_tri)
+
+    if not is_causal:
+        if has_end:
+            upstart_mask_indices = mask_indices[:, :, 2, :]
+            upstart_mask_indices = upstart_mask_indices.expand([batch_size, num_head, seq_len, -1])
+            upend_mask_indices = mask_indices[:, :, 3, :]
+            upend_mask_indices = upend_mask_indices.expand([batch_size, num_head, seq_len, -1])
+            upper_tri = base >= upend_mask_indices
+            upper_tri = paddle.logical_or(upper_tri, base < upstart_mask_indices)
+        else:
+            upend_mask_indices = mask_indices[:, :, 1, :]
+            upend_mask_indices = upend_mask_indices.expand([batch_size, num_head, seq_len, -1])
+            upper_tri = base >= upend_mask_indices
+
+        attention_mask = paddle.logical_and(attention_mask, upper_tri)
+
     attention_mask = paddle.scale(
         x=attention_mask.astype(dtype),
         scale=1000000.0,
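
A usage sketch for the recovered function (illustrative; the hunk is truncated above before paddle.scale's remaining arguments, so only the sign convention is assumed here: zeros where attention is allowed, large negative values where masked):

import paddle
from paddleformers.utils.masking_utils import _gen_from_sparse_attn_mask_indices

# Two packed sequences of length 2; batch 1, one broadcastable head.
indices = paddle.to_tensor([[[[2], [2], [4], [4]]]], dtype="int32")  # [1, 1, 4, 1]

# Trailing dim of 1, so is_causal is inferred as True inside the function.
mask = _gen_from_sparse_attn_mask_indices(indices, paddle.float32)
print(mask.shape)  # [1, 1, 4, 4], an additive mask in float32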
