Fix error in merge conflict

muskansh-google · muskansh-google · commit be6178e2fee3 · 2026-03-23T15:44:57.000Z
diff --git a/tpu_inference/models/vllm/vllm_model_wrapper.py b/tpu_inference/models/vllm/vllm_model_wrapper.py
@@ -130,79 +130,6 @@ def __init__(self, vllm_config: VllmConfig, rng: PRNGKey, mesh: Mesh):
             MultiHeadLatentAttentionWrapper.register_oot(
                 VllmTPUMultiHeadLatentAttentionWrapper)
 
-    def _patch_sdpa(self):
-        from torchax.ops.jtorch import register_function
-
-        from tpu_inference.layers.common.attention_interface import \
-            sharded_flash_attention
-
-        @register_function(
-            torch.nn.functional.scaled_dot_product_attention,
-            is_jax_function=True,
-            needs_env=False,
-        )
-        def patched_sdpa(
-            query,
-            key,
-            value,
-            attn_mask=None,
-            dropout_p=0.0,
-            is_causal=False,
-            scale=None,
-            enable_gqa=False,
-        ):
-            if dropout_p != 0.0:
-                raise NotImplementedError(
-                    "patched_sdpa does not support dropout_p")
-            if enable_gqa is not False:
-                raise NotImplementedError(
-                    "patched_sdpa does not support enable_gqa")
-
-            # Q, K, V shapes: (batch, num_heads, seq_len, head_dim)
-            batch = query.shape[0]
-            num_heads = query.shape[1]
-            q_seq_len = query.shape[2]
-            kv_seq_len = key.shape[2]
-
-            # padding due to the requirement of sharded_flash_attention
-            q_pad = (128 - (q_seq_len % 128)) % 128
-            kv_pad = (128 - (kv_seq_len % 128)) % 128
-
-            if q_pad > 0:
-                query = jnp.pad(query, ((0, 0), (0, 0), (0, q_pad), (0, 0)))
-            if kv_pad > 0:
-                key = jnp.pad(key, ((0, 0), (0, 0), (0, kv_pad), (0, 0)))
-                value = jnp.pad(value, ((0, 0), (0, 0), (0, kv_pad), (0, 0)))
-
-            # Prevent nan while using -inf
-            mask_value = -0.7 * float(jnp.finfo(jnp.dtype("float32")).max)
-            ab = jnp.zeros((batch, num_heads, q_seq_len, kv_seq_len),
-                           dtype=jnp.float32)
-            if attn_mask is not None:
-                # attn_mask shape: (batch, num_heads, q_len, kv_len)
-                if attn_mask.dtype == jnp.bool_:
-                    ab = jnp.where(attn_mask, ab, mask_value)
-                else:
-                    ab += attn_mask
-
-            if q_pad > 0 or kv_pad > 0:
-                ab = jnp.pad(
-                    ab,
-                    ((0, 0), (0, 0), (0, q_pad), (0, kv_pad)),
-                    mode="constant",
-                    constant_values=mask_value,
-                )
-
-            attn_fn = sharded_flash_attention(self.mesh,
-                                              causal=is_causal,
-                                              sm_scale=scale,
-                                              use_attention_bias=True)
-            out = attn_fn(query, key, value, ab, None)
-
-            if q_pad > 0:
-                out = out[:, :, :q_seq_len, :]
-
-            return out
 
     def _patch_vllm_ops(self):
         # Caution: there is no public api for restore the ops.