Fix workgroup barrier deadlock

frost-intel · frost-intel · commit e08162faed4d · 2026-04-24T09:42:19.000-07:00
diff --git a/.github/workflows/ut.yaml b/.github/workflows/ut.yaml
@@ -119,7 +119,7 @@ jobs:
       - name: test 
         run: |
           echo "Running tests with XPU_KERNEL_TEST_SCOPE=${{ env.XPU_KERNEL_TEST_SCOPE }}"
-          XPU_KERNEL_TEST_SCOPE=${{ env.XPU_KERNEL_TEST_SCOPE }} ZE_AFFINITY_MASK=0,1 SKIP_HANG_KERNEL=1 SKIP_ACC_ERROR_KERNEL=1 pytest -v -s tests/
+          XPU_KERNEL_TEST_SCOPE=${{ env.XPU_KERNEL_TEST_SCOPE }} ZE_AFFINITY_MASK=0,1 SKIP_ACC_ERROR_KERNEL=1 pytest -v -s tests/
           VLLM_XPU_FORCE_XE_DEFAULT_KERNEL=1 XPU_KERNEL_TEST_SCOPE=${{ env.XPU_KERNEL_TEST_SCOPE }} ZE_AFFINITY_MASK=0,1 pytest -v -s tests/fused_moe/test_grouped_gemm.py::test_grouped_gemm
 
   clean-repo-pvc:
diff --git a/csrc/xpu/attn/xe_2/kernel/chunk_prefill_kernel.hpp b/csrc/xpu/attn/xe_2/kernel/chunk_prefill_kernel.hpp
@@ -265,7 +265,7 @@ class XeFMHAFwdKernel {
           cute::min(seq_len_qo, (blk_q * get<0>(TileShapeQK{}) + q_offset_sg));
 
       // calc sg level seq_len_kv
-      const int seq_len =
+      const int sg_seq_len =
           CausalMask
               ? LocalMask
                     ? cute::min(
@@ -275,17 +275,39 @@ class XeFMHAFwdKernel {
                     : cute::min(
                           seq_len_kv, full_tile_offset + seq_coord + q_sg_tile)
               : seq_len_kv;
-      const int k_block0 =
+      const int sg_k_block0 =
           LocalMask
               ? cute::max(
                     seq_coord + full_tile_offset - params.mainloop.local_left,
                     0) /
                     get<1>(TileShapeQK{})
               : 0;
-      const int k_blocks = cute::ceil_div(seq_len, get<1>(TileShapeQK{}));
-      const int k_blocks_causal =
-          CausalMask ? (seq_coord + full_tile_offset) / get<1>(TileShapeQK{})
-                     : 0;
+      const int sg_k_blocks =
+          cute::ceil_div(sg_seq_len, get<1>(TileShapeQK{}));
+      const int sg_k_blocks_causal =
+          CausalMask
+              ? (seq_coord + full_tile_offset) / get<1>(TileShapeQK{})
+              : 0;
+
+      // The mainloop wraps each K iteration in a workgroup-scoped barrier
+      // pair, so every subgroup in the workgroup must execute the same
+      // K-loop trip count. Reduce the per-SG bounds across the WG:
+      //   k_block0        = min across WG (start no later than any SG)
+      //   k_blocks        = max across WG (end no earlier than any SG)
+      //   k_blocks_causal = min across WG (turn on causal masking no later
+      //                                    than any SG needs it)
+      // Per-element causal / local / remainder masking inside the mainloop
+      // handles the widened range safely for SGs that didn't need it.
+      auto wg = sycl::ext::oneapi::this_work_item::get_work_group<3>();
+      const int k_block0 = LocalMask
+          ? sycl::reduce_over_group(wg, sg_k_block0, sycl::minimum<int>{})
+          : 0;
+      const int k_blocks = (CausalMask || LocalMask)
+          ? sycl::reduce_over_group(wg, sg_k_blocks, sycl::maximum<int>{})
+          : sg_k_blocks;
+      const int k_blocks_causal = CausalMask
+          ? sycl::reduce_over_group(wg, sg_k_blocks_causal, sycl::minimum<int>{})
+          : 0;
 
       int offset_q = 0, offset_k = 0, offset_v = 0, offset_o = 0;
       if constexpr (is_var_len) {
@@ -347,7 +369,7 @@ class XeFMHAFwdKernel {
           k_blocks,
           k_blocks_causal,
           thr_id,
-          seq_len,
+          sg_seq_len,
           full_tile_offset);
 
       // return softmax_lse
diff --git a/tests/flash_attn/test_flash_attn_varlen_func.py b/tests/flash_attn/test_flash_attn_varlen_func.py
@@ -193,15 +193,6 @@ def test_varlen_with_paged_kv(
 ) -> None:
     torch.set_default_device("xpu")
     torch.xpu.set_device("xpu:0")
-    # # FIXME: remove skip
-    if (is_casual and seq_lens[1][0]
-            == 5) and (os.getenv("SKIP_HANG_KERNEL") is not None
-                       and os.getenv("SKIP_HANG_KERNEL") == "1"):
-        pytest.skip("skip casual for seqlen0 to avoid runtime hang on CI.")
-    if (window_size[0] != -1 or window_size[1]
-            != -1) and (os.getenv("SKIP_HANG_KERNEL") is not None
-                        and os.getenv("SKIP_HANG_KERNEL") == "1"):
-        pytest.skip("skip local attn to avoid runtime hang on CI.")
     if block_size == 128 and num_blocks == 32768 and head_size >= 192:
         pytest.skip("skip test cases that may run out of Memory.")
     if stride_pad > 0 and fp8_dtype is not None:
@@ -393,15 +384,6 @@ def test_varlen_with_interleaved_paged_kv(
 ) -> None:
     torch.set_default_device("xpu")
     torch.xpu.set_device("xpu:0")
-    # # FIXME: remove skip
-    if (is_casual and seq_lens[1][0]
-            == 5) and (os.getenv("SKIP_HANG_KERNEL") is not None
-                       and os.getenv("SKIP_HANG_KERNEL") == "1"):
-        pytest.skip("skip casual for seqlen0 to avoid runtime hang on CI.")
-    if (window_size[0] != -1 or window_size[1]
-            != -1) and (os.getenv("SKIP_HANG_KERNEL") is not None
-                        and os.getenv("SKIP_HANG_KERNEL") == "1"):
-        pytest.skip("skip local attn to avoid runtime hang on CI.")
     if block_size == 128 and num_blocks == 32768 and head_size >= 192:
         pytest.skip("skip test cases that may run out of Memory.")
 
@@ -808,9 +790,6 @@ def test_varlen_with_softmax_lse(
 ) -> None:
     torch.set_default_device("xpu")
     torch.xpu.set_device("xpu:0")
-    if (is_casual and seq_lens[1][0]
-            == 5) and (os.getenv("SKIP_HANG_KERNEL") == "1"):
-        pytest.skip("skip casual for seqlen0 to avoid runtime hang on CI.")
     torch.manual_seed(4242)
 
     query_lens = [x[0] for x in seq_lens]