
Commit 3ee92f4

Fix type and formatting issues for int4 paged KV
1 parent 4ac20c4 commit 3ee92f4

3 files changed: 53 additions & 25 deletions


flashinfer/prefill.py

Lines changed: 20 additions & 11 deletions
@@ -18,7 +18,7 @@
 import logging
 import math
 from types import SimpleNamespace
-from typing import Any, Dict, List, Literal, Optional, Tuple, Union, overload
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union, cast, overload
 
 import torch
 
@@ -1285,19 +1285,26 @@ def single_prefill_with_kv_cache(
     if return_lse:
         lse = torch.empty((q.size(0), q.size(1)), dtype=torch.float32, device=q.device)
 
+    k_tensor = cast(torch.Tensor, k)
+    v_tensor = cast(torch.Tensor, v)
+
     if is_float8(q):
         # FP8 quant enabled, do sanity check:
         # 1. unsupported feature
         # 2. dtype check
         assert window_left == -1
-        assert q.dtype == k.dtype == v.dtype
-        assert q.shape[-1] == k.shape[-1] == v.shape[-1]
+        assert q.dtype == k_tensor.dtype == v_tensor.dtype
+        assert q.shape[-1] == k_tensor.shape[-1] == v_tensor.shape[-1]
         if scale_q is None:
             scale_q = torch.ones(q.shape[1], dtype=torch.float32, device=q.device)
         if scale_k is None:
-            scale_k = torch.ones(k.shape[1], dtype=torch.float32, device=q.device)
+            scale_k = torch.ones(
+                k_tensor.shape[1], dtype=torch.float32, device=q.device
+            )
         if scale_v is None:
-            scale_v = torch.ones(v.shape[1], dtype=torch.float32, device=q.device)
+            scale_v = torch.ones(
+                v_tensor.shape[1], dtype=torch.float32, device=q.device
+            )
     else:
         if scale_q is not None:
             sm_scale *= scale_q
@@ -1318,21 +1325,23 @@ def single_prefill_with_kv_cache(
             use_fp16_qk_reduction,
             packed_custom_mask is not None,  # use_custom_mask
             q.dtype,
-            k.dtype,
+            k_tensor.dtype,
         )
 
     # o_dtype should be provided for FP8 attention
     if o_dtype is None:
         o_dtype = q.dtype
-    out = torch.empty(q.shape[:-1] + v.shape[-1:], dtype=o_dtype, device=q.device)
+    out = torch.empty(
+        q.shape[:-1] + v_tensor.shape[-1:], dtype=o_dtype, device=q.device
+    )
 
     module = get_single_prefill_module(
         backend,
         q.dtype,
-        k.dtype,
+        k_tensor.dtype,
         out.dtype,
         q.shape[-1],  # head_dim_qk
-        v.shape[-1],  # head_dim_vo
+        v_tensor.shape[-1],  # head_dim_vo
         PosEncodingMode[pos_encoding_mode].value,
         window_left >= 0,  # use_sliding_window
         logits_soft_cap > 0,  # use_logits_soft_cap
@@ -1341,8 +1350,8 @@ def single_prefill_with_kv_cache(
 
     module.run(
         q,
-        k,
-        v,
+        k_tensor,
+        v_tensor,
         tmp,
         out,
         lse,
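
Note: the new `cast(torch.Tensor, k)` / `cast(torch.Tensor, v)` calls only narrow the declared `Union` type for the checker; `typing.cast` is an identity function at runtime. A minimal sketch of the pattern, with an illustrative signature rather than the real `single_prefill_with_kv_cache` one:

from typing import Tuple, Union, cast

import torch

def head_dim_vo(v: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]) -> int:
    # A Tuple has no .shape, so `v.shape[-1]` fails type checking on the Union.
    # cast() tells the checker that v is a plain tensor at this point; it
    # performs no conversion and adds no runtime cost.
    v_tensor = cast(torch.Tensor, v)
    return v_tensor.shape[-1]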

flashinfer/utils.py

Lines changed: 1 addition & 1 deletion
@@ -869,7 +869,7 @@ def is_int4_paged_kv_cache(
         INT4Tensor,
         Tuple[torch.Tensor, torch.Tensor],
         Tuple[INT4Tensor, INT4Tensor],
-    ]
+    ],
 ) -> bool:
     if isinstance(paged_kv_cache, INT4Tensor):
         return True
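
The only change here is the trailing comma after the `Union[...]` annotation's closing bracket, which keeps the one-parameter-per-line layout stable under a formatter such as Black. For reference, a usage sketch of the predicate; only the `INT4Tensor` branch is visible in this hunk, so the tuple behavior shown below is an assumption:

k_int4 = flashinfer.int4_quantize(k)  # INT4Tensor (see the tests below)
v_int4 = flashinfer.int4_quantize(v)

assert is_int4_paged_kv_cache(k_int4)            # single INT4Tensor -> True
assert is_int4_paged_kv_cache((k_int4, v_int4))  # assumed: int4 K/V pair -> True
assert not is_int4_paged_kv_cache((k, v))        # assumed: plain-tensor pair -> False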

tests/attention/test_int4_paged_kv.py

Lines changed: 32 additions & 13 deletions
@@ -149,8 +149,12 @@ def test_append_paged_kv_cache_int4_matches_quantized_layout(
     torch.testing.assert_close(
         v_cache.scale[page_indices, :, page_positions, :], expected_v.scale
     )
-    gathered_k = flashinfer.int4_dequantize(k_cache)[page_indices, :, page_positions]
-    gathered_v = flashinfer.int4_dequantize(v_cache)[page_indices, :, page_positions]
+    gathered_k = flashinfer.int4_dequantize(k_cache)[
+        page_indices, :, page_positions
+    ]
+    gathered_v = flashinfer.int4_dequantize(v_cache)[
+        page_indices, :, page_positions
+    ]
 
     torch.testing.assert_close(
         gathered_k,
@@ -177,11 +181,19 @@ def test_single_decode_with_kv_cache_int4(kv_layout, head_dim, use_tensor_cores)
 
     q = torch.randn(num_qo_heads, head_dim, dtype=torch.float16, device=device)
     if kv_layout == "NHD":
-        k = torch.randn(kv_len, num_kv_heads, head_dim, dtype=torch.float16, device=device)
-        v = torch.randn(kv_len, num_kv_heads, head_dim, dtype=torch.float16, device=device)
+        k = torch.randn(
+            kv_len, num_kv_heads, head_dim, dtype=torch.float16, device=device
+        )
+        v = torch.randn(
+            kv_len, num_kv_heads, head_dim, dtype=torch.float16, device=device
+        )
     else:
-        k = torch.randn(num_kv_heads, kv_len, head_dim, dtype=torch.float16, device=device)
-        v = torch.randn(num_kv_heads, kv_len, head_dim, dtype=torch.float16, device=device)
+        k = torch.randn(
+            num_kv_heads, kv_len, head_dim, dtype=torch.float16, device=device
+        )
+        v = torch.randn(
+            num_kv_heads, kv_len, head_dim, dtype=torch.float16, device=device
+        )
 
     k_int4 = flashinfer.int4_quantize(k)
     v_int4 = flashinfer.int4_quantize(v)
@@ -232,11 +244,19 @@ def test_single_prefill_with_kv_cache_int4(kv_layout, head_dim):
 
     q = torch.randn(qo_len, num_qo_heads, head_dim, dtype=torch.float16, device=device)
     if kv_layout == "NHD":
-        k = torch.randn(kv_len, num_kv_heads, head_dim, dtype=torch.float16, device=device)
-        v = torch.randn(kv_len, num_kv_heads, head_dim, dtype=torch.float16, device=device)
+        k = torch.randn(
+            kv_len, num_kv_heads, head_dim, dtype=torch.float16, device=device
+        )
+        v = torch.randn(
+            kv_len, num_kv_heads, head_dim, dtype=torch.float16, device=device
+        )
     else:
-        k = torch.randn(num_kv_heads, kv_len, head_dim, dtype=torch.float16, device=device)
-        v = torch.randn(num_kv_heads, kv_len, head_dim, dtype=torch.float16, device=device)
+        k = torch.randn(
+            num_kv_heads, kv_len, head_dim, dtype=torch.float16, device=device
+        )
+        v = torch.randn(
+            num_kv_heads, kv_len, head_dim, dtype=torch.float16, device=device
+        )
 
     k_int4 = flashinfer.int4_quantize(k)
     v_int4 = flashinfer.int4_quantize(v)
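
These tests build their references by round-tripping K/V through the int4 helpers. A standalone sketch of that step, assuming the same `int4_quantize` / `int4_dequantize` pair used above; the shapes are illustrative, and 4-bit storage is lossy, so the reconstruction is close but not exact:

import torch
import flashinfer

k = torch.randn(2048, 4, 128, dtype=torch.float16, device="cuda:0")
k_int4 = flashinfer.int4_quantize(k)        # packed int4 values plus a .scale field
k_ref = flashinfer.int4_dequantize(k_int4)  # lossy float16 reconstruction
print((k - k_ref).abs().max())              # small but nonzero: int4 is lossy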
@@ -666,9 +686,8 @@ def test_int4_paged_kv_cache_cuda_graph_unsupported():
     head_dim = 128
     device = "cuda:0"
 
-    kv_indptr = (
-        torch.arange(0, batch_size + 1, device=device, dtype=torch.int32)
-        * ((kv_len + page_size - 1) // page_size)
+    kv_indptr = torch.arange(0, batch_size + 1, device=device, dtype=torch.int32) * (
+        (kv_len + page_size - 1) // page_size
     )
     kv_indices = torch.arange(kv_indptr[-1].item(), device=device, dtype=torch.int32)
     kv_last_page_len = torch.full(
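
The `kv_indptr` rewrite is formatting only; entry `i` is still `i * ceil(kv_len / page_size)`, i.e. every sequence occupies the same number of pages. A quick worked check with illustrative sizes:

import torch

batch_size, kv_len, page_size = 4, 100, 16             # illustrative sizes
pages_per_seq = (kv_len + page_size - 1) // page_size  # ceil(100 / 16) = 7
kv_indptr = torch.arange(0, batch_size + 1, dtype=torch.int32) * pages_per_seq
print(kv_indptr)  # tensor([ 0,  7, 14, 21, 28], dtype=torch.int32)
# kv_indices then enumerates all pages: torch.arange(kv_indptr[-1]) -> 28 page ids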
