Commit 9c8616f

Authored by baodii, with YizhouZ and jikunshang

Tune attention perf to align with IPEX attention functions (vllm-project#162)

* tune perf for decoding kernel
* add block_dispatch for 64 and 128
* prefetch table before gemm
* temp opt: set num_splits_kv = 1 for llama3-8b
* add strategy for num_splits_kv
* Delete tests/flash_attn/test_flash_attn_varlen_func_perf.py
* make format happy
* fix chunked prefill acc issue when not paged
* update UT
* make format happy
* restore and update UT
* Resolve Copilot review comments from PR vllm-project#162
  - Add bounds check for page_local_idx in chunk_prefill_mainloop.hpp
  - Fix get_num_splits to use batch_size instead of num_tokens in flash_api.cpp
  - Add docstring for num_splits_kv in flash_attn_interface.py
* use sm_count to replace hardcoded 20

Signed-off-by: baodii <di.bao@intel.com>
Signed-off-by: Yizhou Wang <yizhou.wang@intel.com>
Co-authored-by: Yizhou Wang <yizhou.wang@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>

1 parent 368108f · commit 9c8616f

14 files changed: 319 additions & 183 deletions

csrc/flash_attn/flash_api.cpp

Lines changed: 41 additions & 9 deletions
@@ -7,6 +7,33 @@
 
 namespace FLASH_NAMESPACE {
 
+inline int get_num_splits(
+    const sycl::queue& queue,
+    const int& batch_size,
+    const int& num_heads_kv,
+    const int& max_seqlen_k,
+    const int& block_size) {
+  auto device = queue.get_device();
+  int num_xe_cores =
+      device.get_info<sycl::ext::intel::info::device::gpu_slices>() *
+      device
+          .get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>();
+  int parallel_ = num_xe_cores;
+  int parallel_2 = num_xe_cores * 2;
+
+  int cur_parallel_d = batch_size * num_heads_kv;
+
+  int num_splits = (parallel_ + cur_parallel_d - 1) / cur_parallel_d;
+
+  if (cur_parallel_d * num_splits > parallel_ && num_splits > 1) {
+    num_splits = std::ceil(parallel_2 / static_cast<float>(cur_parallel_d)) - 1;
+  }
+
+  int max_splits = (max_seqlen_k + block_size - 1) / block_size;
+  max_splits = std::min(max_splits, parallel_);
+  return std::min(num_splits, max_splits);
+}
+
 std::vector<at::Tensor> mha_varlen_fwd(
     const at::Tensor& q,
     const at::Tensor& k,
@@ -32,7 +59,8 @@ std::vector<at::Tensor> mha_varlen_fwd(
     int window_size_right,
     const float softcap,
     const bool return_softmax,
-    std::optional<at::Generator> gen_) {
+    std::optional<at::Generator> gen_,
+    std::optional<int> num_splits) {
  auto q_type = q.scalar_type();
  auto k_type = k.scalar_type();
  TORCH_CHECK(
@@ -131,18 +159,22 @@ std::vector<at::Tensor> mha_varlen_fwd(
        is_local,
        is_sink);
  } else {
-    constexpr int partition_size = 512;
-    int num_kv_splits = (max_seqlen_k + partition_size - 1) / partition_size;
-    if (num_kv_splits > 20) num_kv_splits = 20;
-
    int num_tokens = q.size(0);
+    int batch_size = static_cast<int>(cu_seqlens_q.size(0)) - 1;
    int num_heads_q = q.size(1);
    int head_dim = q.size(2);
    int num_heads_kv = k.size(2);
    int block_size = k.size(1);
-    at::Tensor tmp_out = at::empty(
-        {num_tokens, num_heads_q * num_kv_splits, head_dim},
-        q.options().device(q.device()));
+
+    int num_kv_splits = num_splits.value_or(get_num_splits(
+        queue, batch_size, num_heads_kv, max_seqlen_k, block_size));
+
+    at::Tensor tmp_out =
+        num_kv_splits == 1
+            ? out
+            : at::empty(
+                  {num_tokens, num_heads_q * num_kv_splits, head_dim},
+                  q.options().device(q.device()));
    at::Tensor max_logits = at::empty(
        {num_tokens, num_heads_q, num_kv_splits},
        q.options().dtype(at::kFloat).device(q.device()));
@@ -200,7 +232,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "float softmax_scale, Tensor? softmax_sink, bool zero_tensors, "
      "bool is_causal, int window_size_left, int window_size_right, float "
      "softcap, bool return_softmax, "
-      "Generator? gen) -> Tensor[]");
+      "Generator? gen, int? num_splits) -> Tensor[]");
  ops.impl(
      "varlen_fwd",
      torch::kXPU,
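
Note on the heuristic: get_num_splits sizes the KV split count to the number of Xe cores reported by the device (replacing the old hardcoded cap of 20), and backs off when batch x KV-heads already saturates the device. A minimal standalone sketch, with the SYCL device query replaced by an assumed fixed core count (names and the core count are illustrative, not from the source):

// Standalone sketch of the split-count heuristic in get_num_splits above.
// num_xe_cores is passed in rather than queried via SYCL so this runs anywhere.
#include <algorithm>
#include <cmath>
#include <cstdio>

int get_num_splits_sketch(int batch_size, int num_heads_kv, int max_seqlen_k,
                          int block_size, int num_xe_cores /* assumed */) {
  int parallel_ = num_xe_cores;       // target: fill every Xe core once
  int parallel_2 = num_xe_cores * 2;  // bound: at most two waves of work

  // Parallelism already available without splitting the KV dimension.
  int cur_parallel_d = batch_size * num_heads_kv;

  // Splits needed so batch * heads * splits covers all cores.
  int num_splits = (parallel_ + cur_parallel_d - 1) / cur_parallel_d;

  // If that overshoots one wave, back off so the last wave stays full.
  if (cur_parallel_d * num_splits > parallel_ && num_splits > 1) {
    num_splits = static_cast<int>(
        std::ceil(parallel_2 / static_cast<float>(cur_parallel_d)) - 1);
  }

  // Never split finer than one KV block per split, nor beyond the core count.
  int max_splits = (max_seqlen_k + block_size - 1) / block_size;
  max_splits = std::min(max_splits, parallel_);
  return std::min(num_splits, max_splits);
}

int main() {
  // Assumed device with 20 Xe cores; 8 KV heads, 8K context, page size 64.
  std::printf("batch 4: %d split(s)\n",
              get_num_splits_sketch(4, 8, 8192, 64, 20));  // -> 1
  std::printf("batch 1: %d split(s)\n",
              get_num_splits_sketch(1, 8, 8192, 64, 20));  // -> 4
  return 0;
}

With enough batch x head parallelism the heuristic returns 1, which is why tmp_out above can alias out directly in that case.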

csrc/xpu/attn/xe_2/collective/chunk_prefill_epilogue.hpp

Lines changed: 3 additions & 5 deletions
@@ -518,8 +518,8 @@ class DecodeFwdEpilogue {
      const TensorLSE2D& max_logits, // Global max logits tensor
      int idx_kv_split,
      int head_group_q,
-      TensorSink& tSink // Sink for current head
-  ) {
+      TensorSink& tSink, // Sink for current head
+      int num_kv_splits) {
    using namespace cute;
    using ElementA = typename FragA::element_type;
 
@@ -535,11 +535,9 @@ class DecodeFwdEpilogue {
 
    auto [rA, rA_max, rA_sum, active] = reduce_A(tArA, tA_max, tA_sum, thr_id);
 
-    int thr_id_sg = thr_id % intel::sg_size;
-
    // store exp sum and max logits for current KV split
    // assume seq_len_qo == 1
-    if (thr_id < head_group_q) {
+    if (thr_id < head_group_q && num_kv_splits > 1) {
      exp_sums(thr_id, idx_kv_split) = rA_sum(0);
      max_logits(thr_id, idx_kv_split) = rA_max(0);
    }
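
Note: with num_kv_splits == 1 the decode epilogue now writes straight to the output and skips the per-split exp_sums/max_logits stores; with several splits, those statistics are what lets the partial results be combined afterwards. A hedged sketch of that standard split-softmax merge, assuming each split's partial output is already normalized by its own exp-sum (illustrative names, not the kernel's code):

// Merge per-split partial attention outputs using each split's max logit m_i
// and exp-sum s_i:
//   out = sum_i o_i * s_i * exp(m_i - m) / sum_i s_i * exp(m_i - m)
#include <cmath>
#include <vector>

std::vector<float> merge_splits(
    const std::vector<std::vector<float>>& o,  // [split][head_dim], normalized
    const std::vector<float>& max_logit,       // [split]
    const std::vector<float>& exp_sum) {       // [split]
  int splits = static_cast<int>(o.size());
  int dim = static_cast<int>(o[0].size());

  // Global max keeps the rescaling numerically stable.
  float m = max_logit[0];
  for (int i = 1; i < splits; i++) m = std::fmax(m, max_logit[i]);

  std::vector<float> out(dim, 0.0f);
  float denom = 0.0f;
  for (int i = 0; i < splits; i++) {
    float scale = exp_sum[i] * std::exp(max_logit[i] - m);
    denom += scale;
    for (int d = 0; d < dim; d++) out[d] += o[i][d] * scale;
  }
  for (int d = 0; d < dim; d++) out[d] /= denom;
  return out;
}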

csrc/xpu/attn/xe_2/collective/chunk_prefill_mainloop.hpp

Lines changed: 29 additions & 29 deletions
@@ -194,7 +194,7 @@ struct FMHAFwdMainloop<
  void* const scale_v;
 
  // Paged KV Cache
-  int* ptr_page_table;
+  const int* ptr_page_table;
  int page_size;
  int max_pages_per_seq;
  int total_seqlen_kv;
@@ -236,6 +236,20 @@
    return true;
  }
 
+  CUTLASS_DEVICE int get_paged_idx(int K, int idx_b) {
+    int tiles_per_page = params.page_size / get<1>(TileShapeQK{});
+    int b_offset = idx_b * params.max_pages_per_seq;
+    int page_local_idx = K * get<1>(TileShapeQK{}) / params.page_size;
+
+    // Clamp page_local_idx to the valid range [0, max_pages_per_seq - 1]
+    if (page_local_idx >= params.max_pages_per_seq) {
+      page_local_idx = params.max_pages_per_seq - 1;
+    }
+
+    return params.ptr_page_table[b_offset + page_local_idx] * tiles_per_page +
+        K % tiles_per_page;
+  }
+
  template <typename QVCoord>
  CUTLASS_DEVICE void operator()(
      TensorQ2D const& Q_2D, // (q,d)
@@ -339,24 +353,21 @@
    // ------
 
    // PagedKV
-    int tiles_per_page = params.page_size / get<1>(TileShapeQK{});
-    int page_idx = blk_k0, next_page_idx;
-    int b_offset = idx_b * params.max_pages_per_seq;
+    int page_idx, next_page_idx;
    if constexpr (PagedKV) {
-      int page_local_idx = page_idx * get<1>(TileShapeQK{}) / params.page_size;
-      page_idx =
-          params.ptr_page_table[b_offset + page_local_idx] * tiles_per_page +
-          page_idx % tiles_per_page;
+      next_page_idx = get_paged_idx(blk_k0, idx_b);
    }
 
    /* Initialization steps for first block: Q/K prefetch, O init */
    /* TODO: limit D prefetch for large head size, and reorder K prefetches */
+    CUTLASS_PRAGMA_UNROLL
    for (int D = 0; D < size<3>(pQgQ); D++) {
      prefetch(prefetch_q, pQgQ(_, _, _, D));
    }
 
+    CUTLASS_PRAGMA_UNROLL
    for (int D = 0; D < size<4>(pKgK); D++) {
-      prefetch(prefetch_k, pKgK(_, _, _, page_idx, D));
+      prefetch(prefetch_k, pKgK(_, _, _, next_page_idx, D));
    }
 
    clear(tArA);
@@ -378,6 +389,12 @@
      /* Split barrier to keep threads together */
      // barrier_arrive(ScopeSubgroup);
 
+      page_idx = next_page_idx;
+      // next paged_idx
+      if constexpr (PagedKV) {
+        next_page_idx = get_paged_idx(K + 1, idx_b);
+      }
+
      auto tKgK_cache =
          PagedKV ? tKgK(_, _, _, page_idx, _) : tKgK(_, _, _, K, _);
      auto tVgV_cache =
@@ -473,29 +490,12 @@
      }
 
      // sycl::group_barrier(compat::get_nd_item<1>().get_group());
-      barrier();
-
-      // next paged_idx
-      next_page_idx = K + 1;
-      if constexpr (PagedKV) {
-        int next_page_local_idx =
-            next_page_idx * get<1>(TileShapeQK{}) / params.page_size;
-        bool valid_page = next_page_local_idx < params.max_pages_per_seq;
-        if (valid_page) {
-          next_page_idx =
-              params.ptr_page_table[b_offset + next_page_local_idx] *
-                  tiles_per_page +
-              next_page_idx % tiles_per_page;
-        } else {
-          // set to last page
-          next_page_idx = params.max_pages_per_seq * tiles_per_page - 1;
-        }
-      }
-      page_idx = next_page_idx;
+      // barrier();
 
      /* K prefetch */
+      CUTLASS_PRAGMA_UNROLL
      for (int D = 0; D < size<4>(pKgK); D++) {
-        prefetch(prefetch_k, pKgK(_, _, _, page_idx, D));
+        prefetch(prefetch_k, pKgK(_, _, _, next_page_idx, D));
      }
 
      // barrier_wait(ScopeSubgroup);
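
Note: get_paged_idx translates a logical KV tile index into a physical tile index through the per-sequence page table, and the mainloop now computes next_page_idx one iteration ahead so the K prefetch overlaps the current GEMM instead of trailing it. A host-side sketch of the index math under assumed values (names mirror the kernel, the setup is hypothetical):

// Paged-KV index translation, run on the host for clarity.
#include <algorithm>
#include <cstdio>

int get_paged_idx_sketch(const int* page_table, int max_pages_per_seq,
                         int page_size, int kv_tile, int K, int idx_b) {
  int tiles_per_page = page_size / kv_tile;      // e.g. 128 / 64 = 2
  int b_offset = idx_b * max_pages_per_seq;      // this sequence's table row
  int page_local_idx = K * kv_tile / page_size;  // logical page holding tile K

  // Clamp so the speculative K + 1 lookup past the end stays in bounds.
  page_local_idx = std::min(page_local_idx, max_pages_per_seq - 1);

  // Physical tile = physical page * tiles_per_page + offset within the page.
  return page_table[b_offset + page_local_idx] * tiles_per_page +
         K % tiles_per_page;
}

int main() {
  // Hypothetical table: sequence 0 occupies physical pages 5, 2, 9.
  const int page_table[] = {5, 2, 9};
  for (int K = 0; K < 6; K++)  // 6 logical tiles, 2 tiles per page
    std::printf("logical tile %d -> physical tile %d\n", K,
                get_paged_idx_sketch(page_table, 3, 128, 64, K, 0));
  return 0;  // prints 10 11 4 5 18 19
}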

csrc/xpu/attn/xe_2/fmha_utils.hpp

Lines changed: 22 additions & 8 deletions
@@ -83,14 +83,28 @@ struct chunk_policy_head256 {
  using SubgroupLayoutQK = Layout<Shape<_32, _1, _1>>;
};
 
-// define macro for decode policy
-#define DECODE_NUM_SG _4
-#define DECODE_KV_TILE _64 // KV tile size is set to 64 for page size is 64
-
-template <class q_packed, class head_dim>
+// define decode policy
+template <typename q_packed, typename head_dim, typename kv_tile>
struct decode_policy_qpacked_head {
-  using ShapeQK = Shape<q_packed, DECODE_KV_TILE, _64>;
-  using ShapePV = Shape<q_packed, _32, DECODE_KV_TILE>;
+  static_assert(
+      cute::is_same_v<kv_tile, _64> || cute::is_same_v<kv_tile, _128>,
+      "Unsupported kv_tile(page_size) for decode_policy_qpacked_head");
+};
+
+// kv_tile == _64
+template <typename q_packed, typename head_dim>
+struct decode_policy_qpacked_head<q_packed, head_dim, _64> {
+  using ShapeQK = Shape<q_packed, _64, _64>;
+  using ShapePV = Shape<q_packed, _32, _64>;
+  using ShapeOut = Shape<q_packed, head_dim>;
+  using SubgroupLayoutQK = Layout<Shape<_1, _4, _1>>;
+};
+
+// kv_tile == _128
+template <typename q_packed, typename head_dim>
+struct decode_policy_qpacked_head<q_packed, head_dim, _128> {
+  using ShapeQK = Shape<q_packed, _128, _64>;
+  using ShapePV = Shape<q_packed, _32, _128>;
  using ShapeOut = Shape<q_packed, head_dim>;
-  using SubgroupLayoutQK = Layout<Shape<_1, DECODE_NUM_SG, _1>>;
+  using SubgroupLayoutQK = Layout<Shape<_1, _8, _1>>;
};
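
Note: the kv_tile template parameter replaces the old DECODE_KV_TILE/DECODE_NUM_SG macros, so the decode tile shape follows the KV-cache page size at compile time (4 subgroups for page size 64, 8 for 128) and any other size fails the static_assert. A self-contained sketch of that dispatch pattern (stand-in types, not the repo's real launcher):

// Compile-time policy selection keyed on page size, mirroring the diff's
// primary-template static_assert plus per-size specializations.
#include <cstdio>
#include <stdexcept>

template <int N> struct Int { static constexpr int value = N; };
using _64 = Int<64>;   // stand-ins for the cute integral-constant types
using _128 = Int<128>;

template <typename kv_tile> struct decode_policy {
  static_assert(kv_tile::value == 64 || kv_tile::value == 128,
                "Unsupported kv_tile(page_size) for decode_policy");
};
template <> struct decode_policy<_64> { static constexpr int num_sg = 4; };
template <> struct decode_policy<_128> { static constexpr int num_sg = 8; };

template <typename Policy> void run_decode() {
  std::printf("decode launch with %d subgroups\n", Policy::num_sg);
}

void dispatch_decode(int page_size) {
  switch (page_size) {
    case 64:  run_decode<decode_policy<_64>>();  break;
    case 128: run_decode<decode_policy<_128>>(); break;
    default:
      // Other sizes would trip the static_assert at compile time, so
      // reject them at runtime before a policy is ever instantiated.
      throw std::invalid_argument("unsupported page size");
  }
}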

csrc/xpu/attn/xe_2/fmha_xe2.cpp

Lines changed: 2 additions & 2 deletions
@@ -89,10 +89,10 @@ void cutlass_chunk_prefill_impl(
    // query: [batch, num_heads, seq, head_size]
    batch_size = query.size(0);
    num_heads_q = query.size(1);
-    num_heads_kv = key_cache.size(1);
+    num_heads_kv = is_paged ? key_cache.size(2) : key_cache.size(1);
    head_size = query.size(3);
    max_seqlen_q = query.size(2);
-    max_seqlen_k = key_cache.size(2);
+    max_seqlen_k = is_paged ? max_seqlen_q : key_cache.size(2);
  }
  if (is_paged) {
    num_blocks = key_cache.size(0);
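
Note: the is_paged branches select between two cache layouts that expose different dimensions. The contiguous cache appears to be [batch, num_heads_kv, seq, head_size], while the paged cache is [num_blocks, block_size, num_heads_kv, head_size] (consistent with num_blocks = key_cache.size(0) above), and the paged path apparently falls back to the query length as its seqlen bound since the true KV length lives in the block tables. A sketch under those assumed layouts:

// Reading KV-cache dimensions for the two layouts assumed above.
// Tensor4D is a stand-in for at::Tensor restricted to 4-D shapes.
#include <cstdint>

struct Tensor4D {
  int64_t dims[4];
  int64_t size(int i) const { return dims[i]; }
};

void read_kv_dims(const Tensor4D& key_cache, bool is_paged,
                  int64_t max_seqlen_q, int64_t& num_heads_kv,
                  int64_t& max_seqlen_k) {
  if (is_paged) {
    // [num_blocks, block_size, num_heads_kv, head_size]
    num_heads_kv = key_cache.size(2);
    // KV length comes from the block tables, not this tensor's shape.
    max_seqlen_k = max_seqlen_q;
  } else {
    // [batch, num_heads_kv, seq, head_size]
    num_heads_kv = key_cache.size(1);
    max_seqlen_k = key_cache.size(2);
  }
}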

csrc/xpu/attn/xe_2/kernel/chunk_prefill_kernel.hpp

Lines changed: 16 additions & 14 deletions
@@ -299,17 +299,18 @@ class XeFMHAFwdKernel {
      offset_o = s.num_heads_q * s.head_size_vo * qo_cumulative[idx_b];
    }
 
-    auto batch_dim = is_var_len ? 1 : s.batch;
+    auto batch_dim_qo = is_var_len ? 1 : s.batch;
+    auto batch_dim_kv = (PagedKV || is_var_len) ? 1 : s.batch;
    auto total_seqlen_kv =
        PagedKV ? params.mainloop.total_seqlen_kv : seq_len_kv;
    auto shape_Q =
-        make_shape(seq_len_qo, s.head_size_qk, s.num_heads_q, batch_dim);
+        make_shape(seq_len_qo, s.head_size_qk, s.num_heads_q, batch_dim_qo);
    auto shape_K = make_shape(
-        total_seqlen_kv, s.head_size_qk, s.num_heads_kv, batch_dim);
+        total_seqlen_kv, s.head_size_qk, s.num_heads_kv, batch_dim_kv);
    auto shape_V = make_shape(
-        s.head_size_vo, total_seqlen_kv, s.num_heads_kv, batch_dim);
+        s.head_size_vo, total_seqlen_kv, s.num_heads_kv, batch_dim_kv);
    auto shape_O =
-        make_shape(seq_len_qo, s.head_size_vo, s.num_heads_q, batch_dim);
+        make_shape(seq_len_qo, s.head_size_vo, s.num_heads_q, batch_dim_qo);
 
    auto dcQ = const_cast<ElementQ*>(p.Q + offset_q);
    auto dcK = const_cast<ElementK*>(p.K + offset_k);
@@ -319,10 +320,10 @@
    auto layout_q = is_var_len
        ? make_ordered_layout(shape_Q, Step<_2, _0, _1, _3>{})
        : make_layout(shape_Q, p.dQ);
-    auto layout_k = is_var_len
+    auto layout_k = (PagedKV || is_var_len)
        ? make_ordered_layout(shape_K, Step<_2, _0, _1, _3>{})
        : make_layout(shape_K, p.dK);
-    auto layout_v = is_var_len
+    auto layout_v = (PagedKV || is_var_len)
        ? make_ordered_layout(shape_V, Step<_0, _2, _1, _3>{})
        : make_layout(shape_V, p.dV);
    auto layout_o = is_var_len
@@ -339,12 +340,13 @@
    FragARow tA_max, tA_sum;
 
    // Main loop
-    int l_coord = is_var_len ? 0 : idx_b;
+    int l_coord_qo = is_var_len ? 0 : idx_b;
+    int l_coord_kv = (PagedKV || is_var_len) ? 0 : idx_b;
    CollectiveMainloop mainloop(params.mainloop, shared_storage.mainloop);
    mainloop(
-        Q(_, _, head_q, l_coord),
-        K(_, _, head, l_coord),
-        V(_, _, head, l_coord),
+        Q(_, _, head_q, l_coord_qo),
+        K(_, _, head, l_coord_kv),
+        V(_, _, head, l_coord_kv),
        tArA,
        tA_max,
        tA_sum,
@@ -368,7 +370,7 @@
    if constexpr (Sink) {
      ElementSink s_head = p.ptr_S[head_q];
      epilogue(
-          O(_, _, head_q, l_coord),
+          O(_, _, head_q, l_coord_qo),
          tArA,
          tA_max,
          tA_sum,
@@ -377,7 +379,7 @@
          thr_id);
    } else {
      epilogue(
-          O(_, _, head_q, l_coord),
+          O(_, _, head_q, l_coord_qo),
          tArA,
          tA_max,
          tA_sum,
@@ -389,4 +391,4 @@
  }
};
 
-} // namespace cutlass::fmha::kernel
\ No newline at end of file
+} // namespace cutlass::fmha::kernel
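
Note: splitting batch_dim into batch_dim_qo and batch_dim_kv encodes that a paged KV cache, like the varlen path, is addressed as one flat pool: per-sequence selection happens through the page table, so the K/V batch mode collapses to 1 while Q/O keep the real batch dimension. A small sketch of the coordinate logic, mirroring the diff's conditionals (illustrative struct, not the kernel's types):

// How the batch coordinate differs for Q/O versus paged/varlen K/V.
struct BatchCoords {
  int batch_dim_qo, batch_dim_kv;  // extent of the batch mode in the layout
  int l_coord_qo, l_coord_kv;      // coordinate used to slice that mode
};

BatchCoords make_coords(bool paged_kv, bool is_var_len, int batch, int idx_b) {
  BatchCoords c;
  // Q/O are flattened only for varlen inputs.
  c.batch_dim_qo = is_var_len ? 1 : batch;
  c.l_coord_qo = is_var_len ? 0 : idx_b;
  // K/V are also flat when paged: pages index a global pool, so the
  // per-sequence offset is applied via the page table instead.
  c.batch_dim_kv = (paged_kv || is_var_len) ? 1 : batch;
  c.l_coord_kv = (paged_kv || is_var_len) ? 0 : idx_b;
  return c;
}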
