Commit acab8aa

Replace naive eager attention with SDPA (#4725)
1 parent 2526aad commit acab8aa

4 files changed, +68 -27 lines


paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_ernie.py

Lines changed: 8 additions & 1 deletion
```diff
@@ -296,7 +296,7 @@ def forward(self, hidden_states):
            3. Scale by learned weight parameter
            - Maintains original dtype for numerical stability during computation
         """
-        if self.config.fuse_rms_norm:
+        if hidden_states.dtype != paddle.float16 and self.config.fuse_rms_norm:
             return fused_rms_norm_ext(
                 hidden_states, self.weight, self.variance_epsilon
             )[0].astype(self.weight.dtype)
```
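As a reading aid, here is a minimal standalone sketch of the dtype guard above, with a plain-Paddle RMSNorm standing in for the unfused path (the fallback body is not part of this commit and is only illustrative; `fused_rms_norm_ext` and the `fuse_rms_norm` flag come from the diff):

```python
import paddle

def rms_norm(hidden_states, weight, variance_epsilon, fuse_rms_norm, fused_op=None):
    # Fused kernel only for non-float16 inputs, mirroring the new guard.
    if hidden_states.dtype != paddle.float16 and fuse_rms_norm and fused_op is not None:
        return fused_op(hidden_states, weight, variance_epsilon)[0].astype(weight.dtype)
    # Illustrative unfused fallback: accumulate the variance in float32.
    x = hidden_states.astype(paddle.float32)
    variance = x.pow(2).mean(axis=-1, keepdim=True)
    x = x * paddle.rsqrt(variance + variance_epsilon)
    return weight * x.astype(weight.dtype)
```
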
```diff
@@ -854,8 +854,15 @@ def core_attn(
         v = tensor.transpose(x=v, perm=perm)
 
         replicate = self.config.num_attention_heads // self.config.num_key_value_heads
+        is_float16 = k.dtype == paddle.float16
+        if is_float16:
+            k = k.cast(paddle.float32)
+            v = v.cast(paddle.float32)
         k = paddle.repeat_interleave(k, replicate, axis=1)
         v = paddle.repeat_interleave(v, replicate, axis=1)
+        if is_float16:
+            k = k.cast(paddle.float16)
+            v = v.cast(paddle.float16)
 
         scale_qk_coeff = self.config.scale_qk_coeff * self.head_dim**0.5
         product = paddle.matmul(x=q.scale(1.0 / scale_qk_coeff), y=k, transpose_y=True)
```
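Taken out of `core_attn`, the cast-around-`repeat_interleave` pattern looks like the sketch below; the `[B, num_heads, L, head_dim]` layout is assumed from the `axis=1` argument, and the toy shapes are made up for the example:

```python
import paddle

def replicate_kv(k, v, num_attention_heads, num_key_value_heads):
    """Expand grouped KV heads so they match the number of query heads."""
    replicate = num_attention_heads // num_key_value_heads
    is_float16 = k.dtype == paddle.float16
    if is_float16:
        # Do the replication in float32, then cast back, as in the diff above.
        k, v = k.cast(paddle.float32), v.cast(paddle.float32)
    k = paddle.repeat_interleave(k, replicate, axis=1)
    v = paddle.repeat_interleave(v, replicate, axis=1)
    if is_float16:
        k, v = k.cast(paddle.float16), v.cast(paddle.float16)
    return k, v

# 2 KV heads expanded to 8 attention heads: [1, 2, 4, 16] -> [1, 8, 4, 16]
k = paddle.randn([1, 2, 4, 16]).astype(paddle.float16)
v = paddle.randn([1, 2, 4, 16]).astype(paddle.float16)
k, v = replicate_kv(k, v, num_attention_heads=8, num_key_value_heads=2)
print(k.shape, k.dtype)
```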

paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_siglip.py

Lines changed: 42 additions & 24 deletions
```diff
@@ -100,15 +100,22 @@ def eager_attention_forward(
     dropout: float = 0.0,
     **kwargs,
 ):
-    attn_weights = paddle.matmul(query, key.transpose((0, 1, 3, 2))) * scaling
+    origin_dtype = query.dtype
+
+    attn_weights = paddle.matmul(x=query.scale(scaling), y=key, transpose_y=True)
+    attn_weights = attn_weights.cast(paddle.float32)
+
     if attention_mask is not None:
+        attention_mask = attention_mask.cast(paddle.float32)
         attn_weights = attn_weights + attention_mask
 
-    attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query.dtype)
+    attn_weights = F.softmax(attn_weights, axis=-1)
+    attn_weights = attn_weights.cast(origin_dtype)
+
     attn_weights = F.dropout(attn_weights, p=dropout, training=module.training)
 
     attn_output = paddle.matmul(attn_weights, value)
-    attn_output = attn_output.transpose((0, 2, 1, 3)).contiguous()
+    attn_output = attn_output.transpose((0, 2, 1, 3))
 
     return attn_output, attn_weights
 
```
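The rewritten weight computation folds the scaling into the query before the matmul and runs the softmax in float32 before casting back. The scaling change itself is a pure refactor, as this small equivalence check sketches (shapes and values are arbitrary):

```python
import paddle

q = paddle.randn([1, 2, 4, 8])
k = paddle.randn([1, 2, 4, 8])
scaling = 8 ** -0.5

# Old form: matmul first, scale afterwards.
old = paddle.matmul(q, k.transpose((0, 1, 3, 2))) * scaling
# New form: scale the query, let matmul transpose the key.
new = paddle.matmul(x=q.scale(scaling), y=k, transpose_y=True)
print(paddle.allclose(old, new, atol=1e-6))  # bool Tensor holding True
```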

```diff
@@ -138,44 +145,55 @@ def forward(
         cu_seqlens: Optional[List[paddle.Tensor]] = None,
         rope_emb: Optional[Tuple[paddle.Tensor, paddle.Tensor]] = None,  # (cos, sin)
     ):
+        if output_attentions:
+            raise NotImplementedError
+
         B, L, D = hidden_states.shape
 
         q = self.q_proj(hidden_states)
         k = self.k_proj(hidden_states)
         v = self.v_proj(hidden_states)
 
         # [B, L, H, Dh]
-
         q = q.reshape([B, L, self.num_heads, self.head_dim])
         k = k.reshape([B, L, self.num_heads, self.head_dim])
         v = v.reshape([B, L, self.num_heads, self.head_dim])
         if rope_emb is not None:
             cos, sin = rope_emb
             q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
 
-        # → [B, H, L, Dh]
-        q = q.transpose([0, 2, 1, 3])
-        k = k.transpose([0, 2, 1, 3])
-        v = v.transpose([0, 2, 1, 3])
-
-        attn_output, attn_weights = eager_attention_forward(
-            self,
-            q,
-            k,
-            v,
-            attention_mask,
-            is_causal=self.is_causal,
-            scaling=self.scale,
-            dropout=0.0 if not self.training else self.dropout,
-        )
-        attn_output = attn_output.reshape([B, L, D]).contiguous()
+        if q.dtype == paddle.float32:
+            # → [B, H, L, Dh]
+            q = q.transpose([0, 2, 1, 3])
+            k = k.transpose([0, 2, 1, 3])
+            v = v.transpose([0, 2, 1, 3])
+
+            attn_output, _ = eager_attention_forward(
+                self,
+                q,
+                k,
+                v,
+                attention_mask,
+                is_causal=self.is_causal,
+                scaling=self.scale,
+                dropout=0.0 if not self.training else self.dropout,
+            )
+            attn_output = attn_output.reshape([B, L, D])
+        else:
+            attn_output = paddle.nn.functional.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                attention_mask,
+                dropout_p=self.dropout,
+                is_causal=self.is_causal,
+                training=self.training,
+            )
+            attn_output = attn_output.reshape([B, L, D])
 
         attn_output = self.out_proj(attn_output)
 
-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights
+        return attn_output, None
 
 
 class SiglipVisionEmbeddings(nn.Layer):
```
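On the non-float32 path the layer now calls Paddle's fused SDPA directly. Below is a minimal standalone call, assuming the `[B, L, num_heads, head_dim]` layout that `paddle.nn.functional.scaled_dot_product_attention` expects (which is why the q/k/v transpose is only performed on the eager branch); it runs only where the fused kernel is available, e.g. a GPU with float16/bfloat16 support:

```python
import paddle
import paddle.nn.functional as F

B, L, H, Dh = 1, 16, 8, 64
q = paddle.randn([B, L, H, Dh]).astype(paddle.float16)
k = paddle.randn([B, L, H, Dh]).astype(paddle.float16)
v = paddle.randn([B, L, H, Dh]).astype(paddle.float16)

# No mask, no dropout at inference time; output stays in [B, L, H, Dh].
out = F.scaled_dot_product_attention(
    q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False, training=False
)
out = out.reshape([B, L, H * Dh])  # ready for the output projection
print(out.shape)
```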

paddlex/inference/models/doc_vlm/predictor.py

Lines changed: 7 additions & 2 deletions
```diff
@@ -29,7 +29,7 @@
 from ....utils.deps import require_genai_client_plugin
 from ....utils.device import TemporaryDeviceChanger
 from ...common.batch_sampler import DocVLMBatchSampler
-from ...utils.misc import is_bfloat16_available
+from ...utils.misc import is_bfloat16_available, is_float16_available
 from ..base import BasePredictor
 from .result import DocVLMResult
 
```

```diff
@@ -54,7 +54,12 @@ def __init__(self, *args, **kwargs):
 
         if self._use_local_model:
             self.device = kwargs.get("device", None)
-            self.dtype = "bfloat16" if is_bfloat16_available(self.device) else "float32"
+            if is_bfloat16_available(self.device):
+                self.dtype = "bfloat16"
+            elif is_float16_available(self.device):
+                self.dtype = "float16"
+            else:
+                self.dtype = "float32"
 
         self.infer, self.processor = self._build(**kwargs)
 
```

paddlex/inference/utils/misc.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -32,3 +32,14 @@ def is_bfloat16_available(device):
     return (
         "npu" in get_device_type() or paddle.amp.is_bfloat16_supported()
     ) and device_type in ("gpu", "npu", "xpu", "mlu")
+
+
+def is_float16_available(device):
+    import paddle.amp
+
+    if device is None:
+        device = get_default_device()
+    device_type, _ = parse_device(device)
+    return (
+        "npu" in get_device_type() or paddle.amp.is_float16_supported()
+    ) and device_type in ("gpu", "npu", "xpu", "mlu", "dcu")
```
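A sketch of how the new helper pairs with `is_bfloat16_available` for dtype selection, mirroring the cascade added to `predictor.py` above (`pick_dtype` is just an illustrative name, not part of the commit):

```python
from paddlex.inference.utils.misc import is_bfloat16_available, is_float16_available

def pick_dtype(device=None):
    # Prefer bfloat16, fall back to float16, then float32, same order as the predictor.
    if is_bfloat16_available(device):
        return "bfloat16"
    if is_float16_available(device):
        return "float16"
    return "float32"

print(pick_dtype("gpu:0"))  # e.g. "bfloat16" on recent GPUs, "float16" on older ones
```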
