Skip to content

Commit 1b4770e

Browse files
baodii and Copilot authored
[Decode Attn] Change strategy of num_splits to avoid acc issue (vllm-project#204)
* change strategy of num_splits Signed-off-by: baodii <di.bao@intel.com> * Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: baodi <di.bao@intel.com> --------- Signed-off-by: baodii <di.bao@intel.com> Signed-off-by: baodi <di.bao@intel.com> Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
1 parent 69da26d commit 1b4770e

3 files changed

Lines changed: 95 additions & 39 deletions

File tree

csrc/flash_attn/flash_api.cpp

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,20 +18,41 @@ inline int get_num_splits(
1818
device.get_info<sycl::ext::intel::info::device::gpu_slices>() *
1919
device
2020
.get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>();
21-
int parallel_ = num_xe_cores;
22-
int parallel_2 = num_xe_cores * 2;
2321

24-
int cur_parallel_d = batch_size * num_heads_kv;
22+
int cur_parallel = batch_size * num_heads_kv;
23+
int kv_blocks = (max_seqlen_k + block_size - 1) / block_size;
2524

26-
int num_splits = (parallel_ + cur_parallel_d - 1) / cur_parallel_d;
25+
// Below 128 KV blocks the per-split FMHA compute is too small relative
26+
// to the ReduceSplitK overhead, regardless of block size.
27+
if (kv_blocks < 128) return 1;
2728

28-
if (cur_parallel_d * num_splits > parallel_ && num_splits > 1) {
29-
num_splits = std::ceil(parallel_2 / static_cast<float>(cur_parallel_d)) - 1;
29+
int target_splits;
30+
if (cur_parallel < num_xe_cores) {
31+
// Under-utilized: fill GPU cores.
32+
// Scale by block_size since larger blocks mean more compute per WG.
33+
int eff_parallel = cur_parallel * block_size / 64;
34+
eff_parallel = std::max(1, eff_parallel);
35+
target_splits = (num_xe_cores + eff_parallel - 1) / eff_parallel;
36+
} else if (cur_parallel <= num_xe_cores * 2) {
37+
// Well-utilized zone (1x-2x oversubscription):
38+
// GPU is busy, splitting adds overhead without benefit.
39+
return 1;
40+
} else {
41+
// Heavily oversubscribed (>2x): shorter WGs help.
42+
// But gate out when compute is already saturated.
43+
int eff_parallel = cur_parallel * block_size / 64;
44+
if (eff_parallel >= num_xe_cores * 8) return 1;
45+
target_splits = std::max(1, kv_blocks / 64);
46+
int par_cap = std::max(1, num_xe_cores * 8 / cur_parallel);
47+
target_splits = std::min(target_splits, par_cap);
3048
}
3149

32-
int max_splits = (max_seqlen_k + block_size - 1) / block_size;
33-
max_splits = std::min(max_splits, parallel_);
34-
return std::min(num_splits, max_splits);
50+
// Each split must process at least 32 KV blocks.
51+
int max_splits_blocks = std::max(1, kv_blocks / 32);
52+
// Hard cap: more splits give diminishing returns and increase
53+
// ReduceSplitK overhead and temporary buffer memory.
54+
int num_splits = std::min({target_splits, max_splits_blocks, 8});
55+
return std::max(1, num_splits);
3556
}
3657

3758
std::vector<at::Tensor> mha_varlen_fwd(
@@ -181,10 +202,11 @@ std::vector<at::Tensor> mha_varlen_fwd(
181202
: at::empty(
182203
{num_tokens, num_heads_q * num_kv_splits, head_dim},
183204
q.options().device(q.device()));
184-
at::Tensor max_logits = at::empty(
205+
at::Tensor max_logits = at::full(
185206
{num_tokens, num_heads_q, num_kv_splits},
207+
-std::numeric_limits<float>::infinity(),
186208
q.options().dtype(at::kFloat).device(q.device()));
187-
at::Tensor exp_sums = at::empty(
209+
at::Tensor exp_sums = at::zeros(
188210
{num_tokens, num_heads_q, num_kv_splits},
189211
q.options().dtype(at::kFloat).device(q.device()));
190212

csrc/xpu/attn/xe_2/collective/chunk_prefill_epilogue.hpp

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,8 @@ class DecodeFwdEpilogue {
519519
int idx_kv_split,
520520
int head_group_q,
521521
TensorSink& tSink, // Sink for current head
522-
int num_kv_splits) {
522+
int num_kv_splits,
523+
bool is_single_split) {
523524
using namespace cute;
524525
using ElementA = typename FragA::element_type;
525526

@@ -535,25 +536,36 @@ class DecodeFwdEpilogue {
535536

536537
auto [rA, rA_max, rA_sum, active] = reduce_A(tArA, tA_max, tA_sum, thr_id);
537538

538-
// store exp sum and max logits for current KV split
539+
// Always store exp sum and max logits for current KV split.
539540
// assume seq_len_qo == 1
540-
if (thr_id < head_group_q && num_kv_splits > 1) {
541-
exp_sums(thr_id, idx_kv_split) = rA_sum(0);
542-
max_logits(thr_id, idx_kv_split) = rA_max(0);
541+
if (thr_id < head_group_q) {
542+
if (is_single_split) {
543+
// Sentinel values: make ReduceSplitK a pass-through copy.
544+
exp_sums(thr_id, idx_kv_split) = ElementA(1);
545+
max_logits(thr_id, idx_kv_split) = ElementA(0);
546+
} else if (num_kv_splits > 1) {
547+
exp_sums(thr_id, idx_kv_split) = rA_sum(0);
548+
max_logits(thr_id, idx_kv_split) = rA_max(0);
549+
}
543550
}
544551

545552
/* Some subgroups may not have any work to do; if so, quit early. */
546553
if (!active) return;
547554

548-
/* Complete softmax, dividing out sums. */
549-
CUTLASS_PRAGMA_UNROLL
550-
for (int i = 0; i < rA_sum.size(); i++) {
551-
rA_sum(i) = ElementA(1) / rA_sum(i);
552-
}
555+
/* Complete softmax: normalize output for single-split sequences
556+
(so ReduceSplitK pass-through gives correct result).
557+
For multi-split, store unnormalized to avoid divide-multiply
558+
precision loss in the reduce roundtrip. */
559+
if (is_single_split || num_kv_splits <= 1) {
560+
CUTLASS_PRAGMA_UNROLL
561+
for (int i = 0; i < rA_sum.size(); i++) {
562+
rA_sum(i) = ElementA(1) / rA_sum(i);
563+
}
553564

554-
CUTLASS_PRAGMA_UNROLL
555-
for (int i = 0; i < rA.size(); i++) {
556-
rA(i) *= broadcast<0>(rA_sum, rA, i);
565+
CUTLASS_PRAGMA_UNROLL
566+
for (int i = 0; i < rA.size(); i++) {
567+
rA(i) *= broadcast<0>(rA_sum, rA, i);
568+
}
557569
}
558570

559571
/* Tile output */
@@ -585,8 +597,7 @@ class DecodeFwdEpilogue {
585597
using namespace sycl::ext::oneapi::this_work_item;
586598

587599
if constexpr (ReduceK{} == _1{}) {
588-
ReduceFragARow rA_max;
589-
return std::make_tuple(tArA, rA_max, tA_sum, true);
600+
return std::make_tuple(tArA, tA_max, tA_sum, true);
590601
} else {
591602
/* Identify A tile ID and k block for this subgroup. */
592603
auto thr_vak = group<1, 3>(TiledMMAPV{}.get_thr_layout_vmnk())

csrc/xpu/attn/xe_2/kernel/paged_decode_kernel.hpp

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -319,10 +319,29 @@ class XeFMHAFwdSplitKVKernel {
319319

320320
int num_blocks_per_split =
321321
cute::ceil_div(windowed_k_blocks, num_kv_splits);
322-
int kv_split_offset = k_block0 + idx_kv_split * num_blocks_per_split;
323-
int num_effective_kv_blocks = cute::min(
324-
windowed_k_blocks - idx_kv_split * num_blocks_per_split,
325-
num_blocks_per_split);
322+
323+
// Per-sequence split decision: short sequences are treated as
324+
// single-split even when num_kv_splits > 1, avoiding precision
325+
// loss from the split-reduce roundtrip.
326+
constexpr int kMinBlocksForSplit = 128;
327+
bool is_single_split =
328+
(num_kv_splits > 1) && (windowed_k_blocks < kMinBlocksForSplit);
329+
330+
int kv_split_offset;
331+
int num_effective_kv_blocks;
332+
if (is_single_split) {
333+
// Split 0 processes all blocks; splits 1+ skip entirely.
334+
if (idx_kv_split > 0) {
335+
continue;
336+
}
337+
kv_split_offset = k_block0;
338+
num_effective_kv_blocks = windowed_k_blocks;
339+
} else {
340+
kv_split_offset = k_block0 + idx_kv_split * num_blocks_per_split;
341+
num_effective_kv_blocks = cute::min(
342+
windowed_k_blocks - idx_kv_split * num_blocks_per_split,
343+
num_blocks_per_split);
344+
}
326345

327346
if (num_effective_kv_blocks <= 0) {
328347
// no need computation
@@ -409,7 +428,8 @@ class XeFMHAFwdSplitKVKernel {
409428
idx_kv_split,
410429
head_group_q,
411430
sinks_per_kv,
412-
num_kv_splits);
431+
num_kv_splits,
432+
is_single_split);
413433
} else {
414434
epilogue(
415435
O(_, _, head, idx_kv_split, l_coord),
@@ -423,7 +443,8 @@ class XeFMHAFwdSplitKVKernel {
423443
idx_kv_split,
424444
head_group_q,
425445
sinks,
426-
num_kv_splits);
446+
num_kv_splits,
447+
is_single_split);
427448
}
428449
}
429450
}
@@ -702,16 +723,18 @@ class ReduceSplitK {
702723
ElementLSE local_max_logit = shared_storage.max_logits_slm_array[i];
703724
ElementLSE local_exp_sum = shared_storage.exp_sums_slm_array[i];
704725

726+
// Skip splits with no valid data (short sequences treated as
727+
// single-split have exp_sums=0 / max_logits=-inf for unused splits).
728+
if (local_exp_sum <= ElementLSE(0)) continue;
729+
705730
ElementLSE rescale =
706731
sycl::native::exp2(local_max_logit - global_max_logits);
707732

708-
// in FMHA epilogue, it's divided by local_exp_sum, here we multiply
709-
// back
710-
ElementLSE adjusted_o_accum =
711-
static_cast<ElementLSE>(
712-
Oaccum(seq_idx, idx, i * num_heads_q + head_q, l_coord)) *
713-
local_exp_sum;
714-
acc += adjusted_o_accum * rescale;
733+
// Partial outputs are unnormalized (not divided by exp_sum in the
734+
// epilogue), so combine them directly with the rescale factor.
735+
ElementLSE o_accum_val = static_cast<ElementLSE>(
736+
Oaccum(seq_idx, idx, i * num_heads_q + head_q, l_coord));
737+
acc += o_accum_val * rescale;
715738

716739
// update global exp sum
717740
global_exp_sums += local_exp_sum * rescale;

0 commit comments

Comments (0)