Skip to content

Commit fc2065f

Browse files
authored
[CHUNK_PREFILL] new api refactor phase3 (vllm-project#90)
* add local attn (sliding window)
* solve template issues
* add asm barrier and fence

---------

Signed-off-by: Yizhou Wang <yizhou.wang@intel.com>
1 parent 29c5ee0 commit fc2065f

5 files changed

Lines changed: 110 additions & 58 deletions

File tree

csrc/xpu/cutlass_kernels/chunk_prefill.hpp

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,9 @@ struct KernelLauncher {
148148
static_cast<int*>(args.block_table),
149149
args.block_size,
150150
args.max_blocks_per_seq,
151-
args.total_seqlen_k},
151+
args.total_seqlen_k,
152+
args.window_size_left,
153+
args.window_size_right},
152154
{},
153155
hw_info};
154156

@@ -231,14 +233,10 @@ struct FMHAConfig {
231233
decltype(cutlass::fmha::collective::get_sg_layout_pv(SubgroupLayoutQK{})),
232234
SubgroupLayoutPV_>;
233235

234-
template <
235-
class Scheduler,
236-
bool VarLen,
237-
bool Paged,
238-
bool Causal,
239-
bool Local,
240-
bool Sink>
236+
template <class Scheduler, bool Causal, bool Local, bool Sink>
241237
static void run(sycl::queue& queue, const chunk_prefill_args_t& args) {
238+
constexpr bool VarLen = true;
239+
constexpr bool Paged = true;
242240
cutlass::KernelHardwareInfo hw_info;
243241

244242
using ProblemShapeType = cutlass::fmha::kernel::FMHAProblemShape<VarLen>;
@@ -273,6 +271,7 @@ struct FMHAConfig {
273271
using CollectiveMainloop = cutlass::fmha::collective::FMHAFwdMainloop<
274272
MainloopDispatchPolicy,
275273
Causal,
274+
Local,
276275
Paged,
277276
TiledMMAQK,
278277
TiledMMAPV,
@@ -338,13 +337,7 @@ void policy_dispatch(
338337
half_t,
339338
half_t>::
340339
kernel_dispatch(
341-
queue,
342-
args,
343-
true, // args.is_varlen,
344-
true, // args.is_paged,
345-
args.is_causal,
346-
false, // args.is_local,
347-
args.is_sink);
340+
queue, args, args.is_causal, args.is_local, args.is_sink);
348341
} else {
349342
return FMHAConfig<
350343
typename chunk_policy::ShapeQK,
@@ -354,13 +347,7 @@ void policy_dispatch(
354347
void,
355348
PipelineStages>::
356349
kernel_dispatch(
357-
queue,
358-
args,
359-
true, // args.is_varlen,
360-
true, // args.is_paged,
361-
args.is_causal,
362-
false, // args.is_local,
363-
args.is_sink);
350+
queue, args, args.is_causal, args.is_local, args.is_sink);
364351
}
365352
}
366353

@@ -418,6 +405,10 @@ void cutlass_chunk_prefill_impl(
418405
window_size_left = window_size_left == -1 ? max_seqlen_k : window_size_left;
419406
window_size_right =
420407
window_size_right == -1 ? max_seqlen_k : window_size_right;
408+
if (is_causal) {
409+
window_size_right = 0;
410+
is_causal = false;
411+
}
421412
}
422413

423414
chunk_prefill_args_t args = {

csrc/xpu/cutlass_kernels/chunk_prefill_kernel.hpp

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ class XeFMHAFwdKernel {
115115
// Template Features
116116
static constexpr bool PagedKV = CollectiveMainloop::PagedKV;
117117
static constexpr bool CausalMask = CollectiveMainloop::CausalMask;
118+
static constexpr bool LocalMask = CollectiveMainloop::LocalMask;
118119
static constexpr bool Sink = CollectiveEpilogue::Sink;
119120
using ElementSink = typename CollectiveEpilogue::ElementSink;
120121

@@ -246,7 +247,7 @@ class XeFMHAFwdKernel {
246247
auto [seq_len_qo, seq_len_kv] = sequence_length_shape;
247248
if (blk_q * get<0>(TileShapeQK{}) >= seq_len_qo) continue;
248249

249-
auto offset = cute::min(seq_len_qo, seq_len_kv);
250+
auto offset = seq_len_qo;
250251
auto discard_seq_coord = seq_len_qo - offset;
251252
auto full_tile_offset = seq_len_kv - offset;
252253
int seq_coord =
@@ -256,13 +257,25 @@ class XeFMHAFwdKernel {
256257
// calc sg level seq_len_kv
257258
const int seq_len =
258259
CausalMask
259-
? full_tile_offset +
260-
cute::min(seq_len_kv, seq_coord - discard_seq_coord) +
261-
q_sg_tile
260+
? LocalMask
261+
? cute::min(
262+
seq_len_kv,
263+
full_tile_offset + seq_coord + q_sg_tile +
264+
params.mainloop.local_right)
265+
: cute::min(
266+
seq_len_kv, full_tile_offset + seq_coord + q_sg_tile)
262267
: seq_len_kv;
268+
const int k_block0 =
269+
LocalMask
270+
? cute::max(
271+
seq_coord + full_tile_offset - params.mainloop.local_left,
272+
0) /
273+
get<1>(TileShapeQK{})
274+
: 0;
263275
const int k_blocks = cute::ceil_div(seq_len, get<1>(TileShapeQK{}));
264-
const int k_causal_blocks =
265-
CausalMask ? (seq_len - q_sg_tile) / get<1>(TileShapeQK{}) : 0;
276+
const int k_blocks_causal =
277+
CausalMask ? (seq_coord + full_tile_offset) / get<1>(TileShapeQK{})
278+
: 0;
266279

267280
int offset_q = 0, offset_k = 0, offset_v = 0, offset_o = 0;
268281
if constexpr (is_var_len) {
@@ -330,9 +343,9 @@ class XeFMHAFwdKernel {
330343
tA_sum,
331344
blk_qv,
332345
idx_b,
333-
0,
346+
k_block0,
334347
k_blocks,
335-
k_causal_blocks,
348+
k_blocks_causal,
336349
thr_id,
337350
seq_len,
338351
full_tile_offset,

csrc/xpu/cutlass_kernels/collective/chunk_prefill_mainloop.hpp

Lines changed: 55 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,25 @@ class XeDefault {}; // Default FMHA mainloop, P in registers.
5050

5151
namespace cutlass::fmha::collective {
5252

53+
static inline void sbarrier_wait() { asm volatile("sbarrier.wait\n"); }
54+
55+
static inline void sbarrier_signal() { asm volatile("sbarrier.signal\n"); }
56+
57+
static inline void gfence() { asm volatile("lsc_fence.ugm.none.group\n"); }
58+
59+
static inline void barrier() {
60+
asm volatile("lsc_fence.ugm.none.group\n");
61+
asm volatile("barrier\n");
62+
}
63+
5364
using namespace cute;
5465

5566
/////////////////////////////////////////////////////////////////////////////////////////////////
5667

5768
template <
5869
class DispatchPolicy_,
5970
bool CausalMask_,
71+
bool LocalMask_,
6072
bool PagedKV_,
6173
class TiledMMAQK_, // Tiling for Q*K GEMM
6274
class TiledMMAPV_, // Tiling for P*V GEMM
@@ -78,6 +90,7 @@ struct FMHAFwdMainloop {
7890
template <
7991
int Stages,
8092
bool CausalMask_,
93+
bool LocalMask_,
8194
bool PagedKV_,
8295
class TiledMMAQK_,
8396
class TiledMMAPV_,
@@ -91,6 +104,7 @@ template <
91104
struct FMHAFwdMainloop<
92105
XeDefault<Stages>,
93106
CausalMask_,
107+
LocalMask_,
94108
PagedKV_,
95109
TiledMMAQK_,
96110
TiledMMAPV_,
@@ -165,6 +179,7 @@ struct FMHAFwdMainloop<
165179
using ElementA = typename TiledMMAPV::ValTypeD;
166180

167181
static constexpr bool CausalMask = CausalMask_;
182+
static constexpr bool LocalMask = LocalMask_;
168183
static constexpr bool PagedKV = PagedKV_;
169184

170185
// User-facing arguments
@@ -176,6 +191,8 @@ struct FMHAFwdMainloop<
176191
int page_size;
177192
int max_pages_per_seq;
178193
int total_seqlen_kv;
194+
// Local Mask
195+
int local_left, local_right;
179196
};
180197

181198
// Kernel-facing parameters
@@ -201,7 +218,9 @@ struct FMHAFwdMainloop<
201218
args.ptr_page_table,
202219
args.page_size,
203220
args.max_pages_per_seq,
204-
args.total_seqlen_kv};
221+
args.total_seqlen_kv,
222+
args.local_left,
223+
args.local_right};
205224
}
206225

207226
CUTLASS_HOST_DEVICE static bool can_implement(Arguments const&) {
@@ -312,28 +331,29 @@ struct FMHAFwdMainloop<
312331

313332
// PagedKV
314333
int tiles_per_page = params.page_size / get<1>(TileShapeQK{});
315-
int page_idx, next_page_idx;
334+
int page_idx = blk_k0, next_page_idx;
316335
int b_offset = idx_b * params.max_pages_per_seq;
317336
if constexpr (PagedKV) {
318-
page_idx = params.ptr_page_table[b_offset] * tiles_per_page;
337+
int page_local_idx = page_idx * get<1>(TileShapeQK{}) / params.page_size;
338+
page_idx =
339+
params.ptr_page_table[b_offset + page_local_idx] * tiles_per_page +
340+
page_idx % tiles_per_page;
319341
}
320342

321343
/* Initialization steps for first block: Q/K prefetch, O init */
322344
/* TODO: limit D prefetch for large head size, and reorder K prefetches */
323-
if (blk_k0 == 0) {
324-
for (int D = 0; D < size<3>(pQgQ); D++) {
325-
prefetch(prefetch_q, pQgQ(_, _, _, D));
326-
}
327-
328-
for (int D = 0; D < size<4>(pKgK); D++) {
329-
prefetch(prefetch_k, pKgK(_, _, _, page_idx, D));
330-
}
345+
for (int D = 0; D < size<3>(pQgQ); D++) {
346+
prefetch(prefetch_q, pQgQ(_, _, _, D));
347+
}
331348

332-
clear(tArA);
333-
fill(tA_max, cutlass::platform::numeric_limits<ElementA>::lowest());
334-
clear(tA_sum);
349+
for (int D = 0; D < size<4>(pKgK); D++) {
350+
prefetch(prefetch_k, pKgK(_, _, _, page_idx, D));
335351
}
336352

353+
clear(tArA);
354+
fill(tA_max, cutlass::platform::numeric_limits<ElementA>::lowest());
355+
clear(tA_sum);
356+
337357
/* Check if */
338358
bool check_remainder_k = (seq_len % get<1>(TileShapeQK{}) != 0);
339359

@@ -379,6 +399,23 @@ struct FMHAFwdMainloop<
379399
}
380400
}
381401
}
402+
/* Local masking */
403+
if constexpr (LocalMask) {
404+
Tensor cPgP = make_identity_tensor(make_shape(seq_len, seq_len));
405+
Tensor gP = local_tile(
406+
cPgP, take<0, 2>(TileShapeQK{}), make_coord(get<0>(blk_qv), K));
407+
auto cS_thread = thr_mma_qk.partition_C(gP);
408+
CUTLASS_PRAGMA_UNROLL
409+
for (int i = 0; i < tSrS.size(); ++i) {
410+
int row_idx = get<0>(cS_thread(i)) - discard_seq_coord;
411+
int col_idx = get<1>(cS_thread(i)) - full_tile_offset;
412+
bool left_mask = col_idx < row_idx - params.local_left;
413+
bool right_mask = col_idx > row_idx + params.local_right;
414+
if (left_mask || right_mask) {
415+
tSrS(i) = ElementS(-INFINITY);
416+
}
417+
}
418+
}
382419
/* k masking for remainder tiles */
383420
if (check_remainder_k && K == blk_k1 - 1) {
384421
FragSCol k_rem_mask;
@@ -406,7 +443,8 @@ struct FMHAFwdMainloop<
406443
cute::gemm(mma_pv, tArP, tArV, tArA(_, _, _, VV));
407444
}
408445

409-
sycl::group_barrier(compat::get_nd_item<1>().get_group());
446+
// sycl::group_barrier(compat::get_nd_item<1>().get_group());
447+
barrier();
410448

411449
// next paged_idx
412450
next_page_idx = K + 1;
@@ -456,9 +494,10 @@ struct FMHAFwdMainloop<
456494

457495
/* Scale S and subtract maxima, then exponentiate */
458496
CUTLASS_PRAGMA_UNROLL
459-
for (int i = 0; i < tS.size(); i++)
497+
for (int i = 0; i < tS.size(); i++) {
460498
tS(i) = sycl::native::exp2(
461499
params.scale * tS(i) - broadcast<0>(tS_max, tS, i));
500+
}
462501

463502
/* Rescale existing S sums and O accumulator */
464503
if (!first_block) {

csrc/xpu/cutlass_kernels/fmha_utils.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ inline CutlassType aten_to_Cutlass_dtype(const at::Tensor& input) {
2828

2929
using namespace cute;
3030
struct chunk_policy_head64 {
31-
using ShapeQK = Shape<_128, _64, _32>;
32-
using ShapePV = Shape<_128, _32, _64>;
31+
using ShapeQK = Shape<_128, _32, _32>;
32+
using ShapePV = Shape<_128, _32, _32>;
3333
using ShapeOut = Shape<_128, _64>;
3434
using SubgroupLayoutQK = Layout<Shape<_8, _1, _1>>;
3535
};
@@ -49,15 +49,15 @@ struct chunk_policy_head128 {
4949
};
5050

5151
struct chunk_policy_head192 {
52-
using ShapeQK = Shape<_256, _64, _32>;
53-
using ShapePV = Shape<_256, _32, _64>;
52+
using ShapeQK = Shape<_256, _32, _32>;
53+
using ShapePV = Shape<_256, _32, _32>;
5454
using ShapeOut = Shape<_256, _192>;
5555
using SubgroupLayoutQK = Layout<Shape<_32, _1, _1>>;
5656
};
5757

5858
struct chunk_policy_head256 {
59-
using ShapeQK = Shape<_256, _64, _32>;
60-
using ShapePV = Shape<_256, _32, _64>;
59+
using ShapeQK = Shape<_256, _32, _32>;
60+
using ShapePV = Shape<_256, _32, _32>;
6161
using ShapeOut = Shape<_256, _256>;
6262
using SubgroupLayoutQK = Layout<Shape<_32, _1, _1>>;
6363
};

tests/flash_attn/test_flash_attn_varlen_func.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
# one value small enough to test the schema op check
1919
NUM_BLOCKS = [32768, 2048]
2020
SOFT_CAPS = [None]
21-
SLIDING_WINDOWS = [(-1, 2), (2, -1), (11, 3), (-1, -1)]
21+
SLIDING_WINDOWS = [(-1, 127), (127, -1), (127, 127), (-1, -1)]
2222
SINK = [False, True]
2323
CASUAL = [False, True]
2424

@@ -56,8 +56,10 @@ def ref_paged_attn(query: torch.Tensor,
5656
v = v[:kv_len]
5757

5858
if q.shape[1] != k.shape[1]:
59-
k = torch.repeat_interleave(k, q.shape[1] // k.shape[1], dim=1)
60-
v = torch.repeat_interleave(v, q.shape[1] // v.shape[1], dim=1)
59+
k = torch.repeat_interleave(k, q.shape[1] // k.shape[1],
60+
dim=1).contiguous()
61+
v = torch.repeat_interleave(v, q.shape[1] // v.shape[1],
62+
dim=1).contiguous()
6163
attn = torch.einsum("qhd,khd->hqk", q, k).float()
6264
empty_mask = torch.ones(query_len, kv_len)
6365
mask = torch.triu(empty_mask, diagonal=kv_len - query_len + 1).bool()
@@ -111,7 +113,7 @@ def ref_paged_attn(query: torch.Tensor,
111113
@pytest.mark.parametrize("num_heads", NUM_HEADS)
112114
@pytest.mark.parametrize("head_size", HEAD_SIZES)
113115
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
114-
@pytest.mark.parametrize("window_size", [(-1, -1)])
116+
@pytest.mark.parametrize("window_size", SLIDING_WINDOWS)
115117
@pytest.mark.parametrize("dtype", DTYPES)
116118
@pytest.mark.parametrize("soft_cap", SOFT_CAPS)
117119
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@@ -135,15 +137,20 @@ def test_varlen_with_paged_kv(
135137
is_casual: bool,
136138
) -> None:
137139
torch.set_default_device("xpu")
140+
torch.xpu.set_device("xpu:0")
138141
# # FIXME: remove skip
139142
if (is_casual and seq_lens[1][0]
140143
== 5) and (os.getenv("SKIP_HANG_KERNEL") is not None
141144
and os.getenv("SKIP_HANG_KERNEL") == "1"):
142145
pytest.skip("skip casual for seqlen0 to avoid runtime hang on CI.")
146+
if (window_size[0] != -1 or window_size[1]
147+
!= -1) and (os.getenv("SKIP_HANG_KERNEL") is not None
148+
and os.getenv("SKIP_HANG_KERNEL") == "1"):
149+
pytest.skip("skip local attn to avoid runtime hang on CI.")
143150
# if q_dtype is not None and (dtype != torch.bfloat16 or fa_version == 2):
144151
# pytest.skip("Flash attention with quantized inputs is only "
145152
# "supported on version 3 with bfloat16 base type")
146-
torch.manual_seed(0)
153+
torch.manual_seed(42)
147154
num_seqs = len(seq_lens)
148155
query_lens = [x[0] for x in seq_lens]
149156
kv_lens = [x[1] for x in seq_lens]
@@ -221,8 +228,10 @@ def test_varlen_with_paged_kv(
221228
sink=sink,
222229
window_size_left=window_size[0],
223230
window_size_right=window_size[1])
224-
atol, rtol = 1.5e-2, 1e-2
231+
atol, rtol = 1e-2, 1e-2
225232
if q_dtype is not None:
226233
atol, rtol = 1.5e-1, 1.5e-1
234+
if window_size[0] != -1 or window_size[1] != -1:
235+
atol, rtol = 1.5e-2, 1.5e-2
227236
torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol), \
228237
f"{torch.max(torch.abs(output - ref_output))}"

0 commit comments

Comments
 (0)