Skip to content

Commit 7aa1fe5

Browse files
authored
Add test case for indexer_k_quant_and_cache (vllm-project#201)
1 parent 14631a4 commit 7aa1fe5

2 files changed

Lines changed: 9 additions & 9 deletions

File tree

csrc/cache.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -425,11 +425,10 @@ class indexer_k_quant_and_cache_kernel {
425425

426426
// Compute local amax
427427
float amax = 0.f;
428-
float k_vals[VEC_SIZE];
428+
scalar_t k_vals[VEC_SIZE];
429429
for (int i = 0; i < VEC_SIZE; i++) {
430-
k_vals[i] =
431-
static_cast<float>(k_[token_idx * head_dim_ + head_dim_idx + i]);
432-
amax = sycl::fmax(amax, sycl::fabs(k_vals[i]));
430+
k_vals[i] = k_[token_idx * head_dim_ + head_dim_idx + i];
431+
amax = sycl::fmax(amax, sycl::fabs(static_cast<float>(k_vals[i])));
433432
}
434433

435434
// group-level reduction (sub-group reduce max)

tests/test_indexer_k_quant_and_cache.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@
1313
QUANT_BLOCK_SIZES = [128]
1414
BLOCK_SIZES = [16]
1515
SCALE_FMTS = ["ue8m0", "fp8e4m3"]
16-
# TODO: will add back torch.bfloat16, torch.float16
16+
# TODO: will add back torch.float16
1717
# after fp8_e4m3 acc is verified
18-
DTYPES = [torch.float32]
18+
DTYPES = [torch.float32, torch.bfloat16]
1919

2020
# override pytest parameters when enable mini pytest
2121
MINI_PYTEST_PARAMS = {
@@ -57,11 +57,12 @@ def _pytorch_group_quant(
5757
original_shape = x.shape
5858
num_groups = original_shape[-1] // group_size
5959
group_shape = original_shape[:-1] + (num_groups, group_size)
60-
x_grouped = x.view(group_shape)
6160

61+
# Quantization should be done in FP32 for better accuracy.
62+
x_grouped = x.view(group_shape).float()
6263
abs_max = torch.amax(torch.abs(x_grouped), dim=-1, keepdim=False)
63-
abs_max = torch.maximum(abs_max,
64-
torch.tensor(eps, device=x.device, dtype=x.dtype))
64+
abs_max = torch.maximum(
65+
abs_max, torch.tensor(eps, device=x.device, dtype=torch.float32))
6566

6667
FP8_MAX = torch.finfo(dtype).max
6768
FP8_MIN = torch.finfo(dtype).min

0 commit comments

Comments (0)