zufangzhu
diff --git a/csrc/flash_attn/flash_api.cpp b/csrc/flash_attn/flash_api.cpp
Lines changed: 12 additions & 17 deletions
diff --git a/csrc/xpu/cutlass_kernels/chunk_prefill.hpp b/csrc/xpu/cutlass_kernels/chunk_prefill.hpp
Lines changed: 55 additions & 22 deletions
diff --git a/csrc/xpu/cutlass_kernels/chunk_prefill_kernel.hpp b/csrc/xpu/cutlass_kernels/chunk_prefill_kernel.hpp
Lines changed: 18 additions & 7 deletions
@@ -8,27 +8,18 @@
 namespace FLASH_NAMESPACE {
 
 std::vector<at::Tensor> mha_varlen_fwd(
-    const at::Tensor&
-        q,  // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
-    const at::Tensor& k,  // total_k x num_heads_k x head_size, total_k :=
-                          // \sum_{i=0}^{b} s_i or num_blocks x page_block_size
-                          // x num_heads_k x head_size if there's a block_table.
-    const at::Tensor& v,  // total_k x num_heads_k x head_size, total_k :=
-                          // \sum_{i=0}^{b} s_i or num_blocks x page_block_size
-                          // x num_heads_k x head_size if there's a block_table.
-    std::optional<at::Tensor>&
-        out_,  // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
+    const at::Tensor& q, const at::Tensor& k, const at::Tensor& v,
+    std::optional<at::Tensor>& out_,
     const at::Tensor& cu_seqlens_q,  // b+1
     const at::Tensor& cu_seqlens_k,  // b+1
-    std::optional<at::Tensor>&
-        seqused_k,  // b. If given, only this many elements of each batch
-                    // element's keys are used.
+    std::optional<at::Tensor>& seqused_k,
     std::optional<const at::Tensor>& leftpad_k_,  // batch_size
     at::Tensor& block_table_,  // batch_size x max_num_blocks_per_seq
     std::optional<at::Tensor>& alibi_slopes_,  // num_heads or b x num_heads
     int max_seqlen_q, int max_seqlen_k, float p_dropout, float softmax_scale,
-    const bool zero_tensors, bool is_causal, int window_size_left,
-    int window_size_right, const float softcap, const bool return_softmax,
+    std::optional<const at::Tensor>& softmax_sink_, const bool zero_tensors,
+    bool is_causal, int window_size_left, int window_size_right,
+    const float softcap, const bool return_softmax,
     std::optional<at::Generator> gen_) {
   auto& queue = vllm::xpu::vllmGetQueue();
 
@@ -39,9 +30,13 @@ std::vector<at::Tensor> mha_varlen_fwd(
     out = torch::empty_like(q);
   }
 
+  bool is_local = (window_size_left != -1) | (window_size_right != -1);
+  bool is_sink = softmax_sink_.has_value();
+
   cutlass_chunk_prefill_impl(queue, q, k, v, out, block_table_, cu_seqlens_q,
                              cu_seqlens_k, max_seqlen_q, max_seqlen_k,
-                             softmax_scale, is_causal);
+                             softmax_scale, softmax_sink_, window_size_left,
+                             window_size_right, is_causal, is_local, is_sink);
 
   if (return_softmax) {
     // FIXME: current do not support store softmax_lse out
@@ -61,7 +56,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "Tensor cu_seqlens_k, Tensor? seqused_k, Tensor? leftpad_k, Tensor "
       "block_table, Tensor? alibi_slopes, "
       "int max_seqlen_q, int max_seqlen_k, float p_dropout, float "
-      "softmax_scale, bool zero_tensors, "
+      "softmax_scale, Tensor? softmax_sink, bool zero_tensors, "
       "bool is_causal, int window_size_left, int window_size_right, float "
       "softcap, bool return_softmax, "
       "Generator? gen) -> Tensor[]");
 
@@ -36,16 +36,21 @@ struct chunk_prefill_args_t {
   int total_seqlen_q;
   int total_seqlen_k;
   float sm_scale;
+  void* sm_sink;
   int batch_size;
   int num_heads_q;
   int num_heads_k;
   int head_size;
   int max_blocks_per_seq;
   int block_size;
-  bool is_causal;
+  int window_size_left = -1;
+  int window_size_right = -1;
+  bool is_causal = false;
+  bool is_local = false;
+  bool is_sink = false;
 };
 
-template <class FMHAChunkPrefillKernel, bool isVarLen>
+template <class FMHAChunkPrefillKernel>
 struct KernelLauncher {
   using StrideQ = typename FMHAChunkPrefillKernel::StrideQ;
   using StrideK = typename FMHAChunkPrefillKernel::StrideK;
@@ -62,6 +67,7 @@ struct KernelLauncher {
   using ElementOutput = typename CollectiveEpilogue::ElementOutput;
   using ElementCompute = typename CollectiveEpilogue::ElementCompute;
   using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
+  using ElementSink = typename CollectiveEpilogue::ElementSink;
 
   using ProblemShapeType = typename FMHAChunkPrefillKernel::ProblemShape;
 
@@ -120,9 +126,11 @@ struct KernelLauncher {
          reinterpret_cast<ElementK*>(args.key), stride_K_cache,
          reinterpret_cast<ElementV*>(args.value), stride_V_cache,
          static_cast<int*>(args.block_table), args.block_size,
-         args.max_blocks_per_seq, args.total_seqlen_k, -1, -1},
+         args.max_blocks_per_seq, args.total_seqlen_k, args.window_size_left,
+         args.window_size_right},
         {args.sm_scale},
-        {reinterpret_cast<ElementOutput*>(args.out), stride_O},
+        {reinterpret_cast<ElementOutput*>(args.out), stride_O,
+         reinterpret_cast<ElementSink*>(args.sm_sink)},
         hw_info};
 
     // Define device-global scratch memory
@@ -186,28 +194,29 @@ template <typename TileShapeQK, typename TileShapePV, typename TileShapeOutput,
           typename ElementComputeEpilogue = float,
           typename GmemTiledCopyStore = XE_2D_U16x8x16_ST_N>
 struct FMHAKernel {
-  template <bool isVarLen, bool Causal, bool PagedKV, bool Local,
-            class Scheduler>
+  template <class Scheduler, bool Causal, bool Local, bool Sink>
   static void run(sycl::queue& queue, const chunk_prefill_args_t& args) {
     cutlass::KernelHardwareInfo hw_info;
 
+    static constexpr bool PagedKV = true;
     using LayoutQ = cutlass::layout::RowMajor;
     using LayoutK = cutlass::layout::ColumnMajor;
     using LayoutV = cutlass::layout::RowMajor;
     using LayoutO = cutlass::layout::RowMajor;
 
     using ElementInputKV = ElementInputQ;
     using ElementOutput = ElementInputQ;
+    using ElementSink = ElementInputQ;
 
     using GEMMDispatchPolicy =
         cutlass::gemm::MainloopIntelXeXMX16<PipelineStages>;
     using EpilogueDispatchPolicy = cutlass::epilogue::IntelXeXMX16;
     using CollectiveEpilogue =
         cutlass::flash_attention::collective::FlashChunkPrefillEpilogue<
-            EpilogueDispatchPolicy, MMAOperation, TileShapeOutput,
+            Sink, EpilogueDispatchPolicy, MMAOperation, TileShapeOutput,
             SubgroupLayout, ElementComputeEpilogue, ElementOutput,
             cutlass::gemm::TagToStrideC_t<LayoutO>, ElementOutput,
-            GmemTiledCopyStore>;
+            GmemTiledCopyStore, ElementSink>;
     using CollectiveSoftmaxEpilogue =
         cutlass::flash_attention::collective::FlashChunkPrefillSoftmaxEpilogue<
             Causal, Local, EpilogueDispatchPolicy, ElementAccumulator>;
@@ -216,8 +225,7 @@ struct FMHAKernel {
     using namespace cutlass::fmha::collective;
     using ProblemShapeVarlen =
         cute::tuple<int, int, int, VariableLength, VariableLength, int, int>;
-    using ProblemShapeType =
-        std::conditional_t<isVarLen, ProblemShapeVarlen, ProblemShapeRegular>;
+    using ProblemShapeType = ProblemShapeVarlen;
 
     // Mainloop
     using CollectiveMainloop =
@@ -237,18 +245,26 @@ struct FMHAKernel {
             ProblemShapeType, CollectiveMainloop, CollectiveSoftmaxEpilogue,
             CollectiveEpilogue, Scheduler>;
 
-    KernelLauncher<FMHAChunkPrefillKernel, isVarLen> launcher;
+    KernelLauncher<FMHAChunkPrefillKernel> launcher;
 
     launcher.run(queue, args, hw_info);
   }
 
-  static void dispatch(sycl::queue& queue, const chunk_prefill_args_t& args) {
-    if (args.is_causal) {
-      run<true, true, true, false,
-          cutlass::flash_attention::IndividualScheduler>(queue, args);
+  template <bool... Bs>  // Terminal overload: chosen once no runtime bool arguments remain.
+  static void kernel_dispatch(sycl::queue& queue,
+                              const chunk_prefill_args_t& args) {
+    return run<cutlass::flash_attention::IndividualScheduler, Bs...>(queue,  // Bs... follow the scheduler type in run's template list
+                                                                     args);
+  }
+
+  template <bool... Bs, typename... Ts>  // Recursive overload: consumes one runtime bool `b` per call.
+  static void kernel_dispatch(sycl::queue& queue,
+                              const chunk_prefill_args_t& args, bool b,
+                              Ts... ts) {  // ts: the runtime flags still waiting to be converted
+    if (b) {
+      kernel_dispatch<Bs..., true>(queue, args, ts...);  // b becomes a trailing compile-time `true`
     } else {
-      run<true, false, true, false,
-          cutlass::flash_attention::IndividualScheduler>(queue, args);
+      kernel_dispatch<Bs..., false>(queue, args, ts...);  // b becomes a trailing compile-time `false`
     }
   }
 };
@@ -261,13 +277,17 @@ void policy_dispatch(sycl::queue& queue, CutlassType cuType,
     FMHAKernel<typename chunk_policy::ShapeQK, typename chunk_policy::ShapePV,
                typename chunk_policy::ShapeOutPut,
                typename chunk_policy::SubgroupLayout, PipelineStages,
-               cutlass::half_t, XE_8x16x16_F32F16F16F32_TT>::dispatch(queue,
-                                                                      args);
+               cutlass::half_t,
+               XE_8x16x16_F32F16F16F32_TT>::kernel_dispatch(queue, args,
+                                                            args.is_causal,
+                                                            args.is_local,
+                                                            args.is_sink);
   } else {
     FMHAKernel<typename chunk_policy::ShapeQK, typename chunk_policy::ShapePV,
                typename chunk_policy::ShapeOutPut,
                typename chunk_policy::SubgroupLayout,
-               PipelineStages>::dispatch(queue, args);
+               PipelineStages>::kernel_dispatch(queue, args, args.is_causal,
+                                                args.is_local, args.is_sink);
   }
 }
 
@@ -278,7 +298,9 @@ void cutlass_chunk_prefill_impl(
     const at::Tensor& value_cache, at::Tensor& out,
     const at::Tensor& block_table, const at::Tensor& cu_seqlens_q,
     const at::Tensor& cu_seqlens_k, int max_seqlen_q, int max_seqlen_k,
-    double sm_scale, bool is_causal) {
+    double sm_scale, std::optional<const at::Tensor>& sm_sink_,
+    int window_size_left, int window_size_right, bool is_causal, bool is_local,
+    bool is_sink) {
   int num_block = key_cache.size(0);
   int block_size = key_cache.size(1);
   int num_heads_q = query.size(1);
@@ -289,6 +311,12 @@ void cutlass_chunk_prefill_impl(
   int total_seqlen_q = query.size(0);
   int total_seqlen_k = num_block * block_size;
 
+  if (is_local) {
+    window_size_left = window_size_left == -1 ? max_seqlen_k : window_size_left;
+    window_size_right =
+        window_size_right == -1 ? max_seqlen_k : window_size_right;
+  }
+
   chunk_prefill_args_t args = {query.data_ptr(),
                                key_cache.data_ptr(),
                                value_cache.data_ptr(),
@@ -301,13 +329,18 @@ void cutlass_chunk_prefill_impl(
                                total_seqlen_q,
                                total_seqlen_k,
                                static_cast<float>(sm_scale),
+                               is_sink ? sm_sink_.value().data_ptr() : nullptr,
                                batch_size,
                                num_heads_q,
                                num_heads_kv,
                                head_size,
                                max_blocks_per_seq,
                                block_size,
-                               is_causal};
+                               window_size_left,
+                               window_size_right,
+                               is_causal,
+                               is_local,
+                               is_sink};
   CutlassType cuType = aten_to_Cutlass_dtype(query);
 
   if (args.head_size == HEAD_SIZE_LIMIT_0) {
 
@@ -100,6 +100,9 @@ class FMHAPrefillChunk {
   using EpilogueParams = typename CollectiveEpilogue::Params;
   using TileShapeOutput = typename CollectiveEpilogue::TileShapeOutput;
   using TiledMmaOutput = typename CollectiveEpilogue::TiledMmaOutput;
+  // Sink element type and compile-time enable flag, both taken from the epilogue.
+  using ElementSink = typename CollectiveEpilogue::ElementSink;
+  static constexpr bool Sink = CollectiveEpilogue::Sink;
 
   static_assert(
       cute::is_same_v<ElementAccumulator,
@@ -111,7 +114,8 @@ class FMHAPrefillChunk {
   static constexpr bool CausalMask = CollectiveMainloop::CausalMask;
   static constexpr bool LocalMask = CollectiveMainloop::LocalMask;
 
-  static_assert(!(CausalMask && LocalMask), "Cannot be both causal and local");
+  // NOTE(review): check disabled by this change; confirm causal + local is valid or delete it:
+  // static_assert(!(CausalMask && LocalMask), "Cannot be both causal and local");
   static constexpr bool PagedKV = CollectiveMainloop::PagedKV;
 
   static constexpr int SubgroupSize =
@@ -455,23 +459,23 @@ class FMHAPrefillChunk {
         if constexpr (LocalMask) {
           // mask the elements of each tile where j - left > i || j + right < i
           const int item_id = thread_idx % SubgroupSize;
-          int col_idx = item_id;
-          col_idx += split * cute::min(QK_BLK_N, seq_len_kv_cache);
+          int col_idx = item_id + split * cute::min(QK_BLK_N, seq_len_kv_cache);
 
           CUTLASS_PRAGMA_UNROLL
           for (int n = 0; n < FragsN;
                n++, col_idx += get<1>(MmaAtomShape())) {  // 4
             CUTLASS_PRAGMA_UNROLL
             for (int m = 0; m < FragsM; m++) {  // 2
               int row_idx = m * Vec + seq_coord;
+              int col_ref = seq_len_kv_cache - seq_len_qo;
               CUTLASS_PRAGMA_UNROLL
               for (int row = 0; row < Vec; row++) {  // 8
                 bool left_mask =
-                    col_idx < cute::max(0, row + row_idx + seq_len_kv_cache -
+                    col_idx < cute::max(0, row + row_idx + col_ref -
                                                mainloop_params.window_left);
                 bool right_mask =
                     col_idx > cute::min(seq_len_kv_cache,
-                                        row + row_idx + seq_len_kv_cache +
+                                        row + row_idx + col_ref +
                                             mainloop_params.window_right);
                 if (left_mask || right_mask) {
                   tSr(row, m, n) = ElementAccumulator{-INFINITY};
@@ -544,8 +548,15 @@ class FMHAPrefillChunk {
               batch_coord, q_head_coord);
       CollectiveEpilogue epilogue{epilogue_params, shared_storage.epilogue};
       auto blk_coord_mnkl = make_coord(blk_m_coord, blk_n_coord, _, 0);
-      epilogue(params.problem_shape, sequence_length_shape, blk_coord_mnkl,
-               out_reg, max_reg, sum_reg);
+      if constexpr (Sink) {
+        ElementAccumulator max_scale{max_reg * params.softmax.scale};
+        epilogue(params.problem_shape, sequence_length_shape, blk_coord_mnkl,
+                 out_reg, max_scale, sum_reg,
+                 params.epilogue.ptr_sink[q_head_coord]);
+      } else {
+        epilogue(params.problem_shape, sequence_length_shape, blk_coord_mnkl,
+                 out_reg, max_reg, sum_reg, 0);
+      }
     }
   }
 };