Skip to content

Commit 2d03d25

Browse files
authored
[CHUNK_PREFILL] add policy 192 and check conditions (vllm-project#68)
* add policy 192 and check conditions

  Signed-off-by: Yizhou Wang <yizhou.wang@intel.com>

* pre-commit

  Signed-off-by: Yizhou Wang <yizhou.wang@intel.com>

* solve comments

  Signed-off-by: Yizhou Wang <yizhou.wang@intel.com>

---------

Signed-off-by: Yizhou Wang <yizhou.wang@intel.com>
1 parent cc369d2 commit 2d03d25

5 files changed

Lines changed: 62 additions & 3 deletions

File tree

csrc/flash_attn/flash_api.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,44 @@ std::vector<at::Tensor> mha_varlen_fwd(
2121
bool is_causal, int window_size_left, int window_size_right,
2222
const float softcap, const bool return_softmax,
2323
std::optional<at::Generator> gen_) {
24+
auto q_type = q.scalar_type();
25+
TORCH_CHECK(
26+
q_type == at::ScalarType::Half || q_type == at::ScalarType::BFloat16,
27+
"VLLM Kernel XPU only supports fp16 and bf16 type");
28+
29+
TORCH_CHECK(k.scalar_type() == q_type,
30+
"query and key must have the same dtype");
31+
TORCH_CHECK(v.scalar_type() == q_type,
32+
"query and value must have the same dtype");
33+
34+
CHECK_DEVICE(q);
35+
CHECK_DEVICE(k);
36+
CHECK_DEVICE(v);
37+
38+
TORCH_CHECK(q.stride(-1) == 1,
39+
"Input tensor must have contiguous last dimension");
40+
TORCH_CHECK(k.stride(-1) == 1,
41+
"Input tensor must have contiguous last dimension");
42+
TORCH_CHECK(v.stride(-1) == 1,
43+
"Input tensor must have contiguous last dimension");
44+
TORCH_CHECK(q.dim() == 3, "query must be in ragged format");
45+
46+
CHECK_DEVICE(block_table_);
47+
TORCH_CHECK(block_table_.dtype() == torch::kInt32,
48+
"page_table must have dtype torch.int32");
49+
TORCH_CHECK(block_table_.stride(-1) == 1,
50+
"page_table must have contiguous last dimension");
51+
52+
CHECK_DEVICE(cu_seqlens_q);
53+
CHECK_CONTIGUOUS(cu_seqlens_q);
54+
TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32,
55+
"cu_seqlens_q must have dtype torch.int32");
56+
57+
CHECK_DEVICE(cu_seqlens_k);
58+
CHECK_CONTIGUOUS(cu_seqlens_k);
59+
TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32,
60+
"cu_seqlens_k must have dtype torch.int32");
61+
2462
auto& queue = vllm::xpu::vllmGetQueue(q.device().index());
2563

2664
at::Tensor out;

csrc/utils.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
#include <c10/xpu/XPUStream.h>
55
#include <sycl/sycl.hpp>
66

7+
#define CHECK_DEVICE(x) TORCH_CHECK(x.is_xpu(), #x " must be on XPU")
8+
#define CHECK_CONTIGUOUS(x) \
9+
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
10+
711
namespace vllm {
812
namespace xpu {
913

csrc/xpu/cutlass_kernels/chunk_prefill.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,11 +343,20 @@ void cutlass_chunk_prefill_impl(
343343
is_sink};
344344
CutlassType cuType = aten_to_Cutlass_dtype(query);
345345

346+
static constexpr int max_head_size = 256;
347+
TORCH_CHECK(head_size <= max_head_size,
348+
"FMHA forward only supports head dimension at most " +
349+
std::to_string(max_head_size));
350+
346351
if (args.head_size == HEAD_SIZE_LIMIT_0) {
347352
policy_dispatch<chunk_policy_head64>(queue, cuType, args);
348353
} else if (args.head_size == HEAD_SIZE_LIMIT_1) {
349354
policy_dispatch<chunk_policy_head128>(queue, cuType, args);
350355
} else if (args.head_size == HEAD_SIZE_LIMIT_2) {
356+
policy_dispatch<chunk_policy_head192>(queue, cuType, args);
357+
} else if (args.head_size == HEAD_SIZE_LIMIT_3) {
351358
policy_dispatch<chunk_policy_head256>(queue, cuType, args);
359+
} else {
360+
TORCH_CHECK(false, "Unsupported head size for fmha");
352361
}
353362
}

csrc/xpu/cutlass_kernels/fmha_utils.hpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44

55
#define HEAD_SIZE_LIMIT_0 64
66
#define HEAD_SIZE_LIMIT_1 128
7-
#define HEAD_SIZE_LIMIT_2 256
8-
#define HEAD_SIZE_LIMIT_3 512
7+
#define HEAD_SIZE_LIMIT_2 192
8+
#define HEAD_SIZE_LIMIT_3 256
9+
#define HEAD_SIZE_LIMIT_4 512
910

1011
enum class CutlassType {
1112
half,
@@ -40,6 +41,13 @@ struct chunk_policy_head128 {
4041
using SubgroupLayout = Layout<Shape<_16, _1, _1>, Stride<_1, _1, _1>>;
4142
};
4243

44+
struct chunk_policy_head192 {
45+
using ShapeQK = Shape<_256, _64, _64>;
46+
using ShapePV = Shape<_256, _32, _64>;
47+
using ShapeOutPut = Shape<_256, _192, _64>;
48+
using SubgroupLayout = Layout<Shape<_32, _1, _1>, Stride<_1, _1, _1>>;
49+
};
50+
4351
struct chunk_policy_head256 {
4452
using ShapeQK = Shape<_256, _64, _64>;
4553
using ShapePV = Shape<_256, _32, _64>;

tests/flash_attn/test_flash_attn_varlen_func.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from vllm_xpu_kernels.flash_attn_interface import flash_attn_varlen_func
1010

1111
NUM_HEADS = [(4, 4), (8, 2)]
12-
HEAD_SIZES = [64, 128, 256]
12+
HEAD_SIZES = [64, 128, 192, 256]
1313
BLOCK_SIZES = [64]
1414
DTYPES = [torch.bfloat16, torch.half]
1515
QDTYPES = [None]

0 commit comments

Comments (0)