# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
43"""Tests for merge_attn_states function.
54
65Run `pytest tests/test_merge_attn_states.py`.
76"""

import logging

import pytest
import torch

from tests.register_ops import merge_attn_states as merge_attn_states_xpu

logger = logging.getLogger("vllm_xpu_kernel")


# Naive PyTorch implementation of section 2.2 of
# https://www.arxiv.org/pdf/2501.01005; can be used to combine partial
# attention results (in the split-KV case).
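#
# In equations (restating what the function below computes), with
# m = max(lse_p, lse_s) taken elementwise:
#   a = exp(lse_p - m),  b = exp(lse_s - m)
#   o = (a * o_p + b * o_s) / (a + b)
#   lse = log(a + b) + m
# The scales a / (a + b) and b / (a + b) are transposed to the
# [NUM_TOKENS, NUM_HEADS, 1] layout before the weighted sum.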
def merge_attn_states_torch(
        output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
        prefix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
        prefix_lse: torch.Tensor,  # [NUM_HEADS, NUM_TOKENS]
        suffix_output: torch.Tensor,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
        suffix_lse: torch.Tensor,  # [NUM_HEADS, NUM_TOKENS]
        output_lse: torch.Tensor | None = None,  # [NUM_HEADS, NUM_TOKENS]
):
    p_lse = prefix_lse
    s_lse = suffix_lse
    # inf -> -inf, so a fully masked split contributes zero weight
    p_lse[p_lse == torch.inf] = -torch.inf
    s_lse[s_lse == torch.inf] = -torch.inf
    # Subtract the elementwise max for numerical stability
    max_lse = torch.maximum(p_lse, s_lse)
    p_lse = p_lse - max_lse
    s_lse = s_lse - max_lse
    p_lse_exp = torch.exp(p_lse)
    s_lse_exp = torch.exp(s_lse)
    out_se = p_lse_exp + s_lse_exp
    if output_lse is not None:
        output_lse = torch.log(out_se) + max_lse
    p_scale = p_lse_exp / out_se  # [NUM_HEADS, NUM_TOKENS]
    s_scale = s_lse_exp / out_se  # [NUM_HEADS, NUM_TOKENS]
    p_scale = torch.transpose(p_scale, 0,
                              1).unsqueeze(2)  # [NUM_TOKENS, NUM_HEADS, 1]
    s_scale = torch.transpose(s_scale, 0,
                              1).unsqueeze(2)  # [NUM_TOKENS, NUM_HEADS, 1]
    output = prefix_output * p_scale + suffix_output * s_scale
    return output, output_lse
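

# A minimal usage sketch (hypothetical shapes, not part of the test suite):
#   out = torch.empty(4, 2, 8)  # 4 tokens, 2 heads, head size 8
#   merged, merged_lse = merge_attn_states_torch(
#       out, torch.randn(4, 2, 8), torch.randn(2, 4),
#       torch.randn(4, 2, 8), torch.randn(2, 4),
#       output_lse=torch.empty(2, 4))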


# Test parameter grids and the per-case result accumulator (the original
# definitions are not shown here; the values below are assumed from the
# upstream vLLM merge_attn_states test).
NUM_BATCH_TOKENS = [256, 512, 613, 1024, 1536, 4096]
NUM_QUERY_HEADS = [4, 8, 16, 32, 48, 64]
HEAD_SIZES = [32, 48, 64, 96, 128, 256]
DTYPES = [torch.float32, torch.float16, torch.bfloat16]

all_case_info: list[tuple] = []


def generate_markdown_table():
    global all_case_info
    table_header = ("| tokens | heads | headsize | dtype "
                    "| device | torch | xpu | speedup |")
    table_separator = "| --- | --- | --- | --- | --- | --- | --- | --- |"
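    # Example row (illustrative values only):
    # | 512 | 16 | 128 | bfloat16 | XPU | 0.12345ms | 0.05678ms | 2.1740x |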

    def shortly_dtype(dtype: torch.dtype) -> str:
        return str(dtype).removeprefix("torch.")

    # shortly_device maps a device string to a display name (the original
    # body is not shown; this mapping is an assumption)
    def shortly_device(device: str) -> str:
        device_map = {"xpu": "XPU"}
        return device_map.get(device, device)

    print(table_header)
    print(table_separator)
    for info in all_case_info:
        (
            num_tokens,
            num_heads,
            head_size,
            dtype,
            device,
            avg_time_torch_kernel,
            avg_time_xpu_kernel,
            performance_improved,
        ) = info
        dtype = shortly_dtype(dtype)
        device = shortly_device(device)
        print(f"| {num_tokens} | {num_heads} | {head_size} "
              f"| {dtype} | {device} | {avg_time_torch_kernel:.5f}ms "
              f"| {avg_time_xpu_kernel:.5f}ms "
              f"| {performance_improved:.4f}x |")


@pytest.mark.parametrize("num_tokens", NUM_BATCH_TOKENS)
@pytest.mark.parametrize("num_query_heads", NUM_QUERY_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("output_dtype", DTYPES)
@torch.inference_mode()
def test_merge_attn_states(num_tokens: int, num_query_heads: int,
                           head_size: int, output_dtype: torch.dtype):

    NUM_TOKENS = num_tokens
    NUM_HEADS = num_query_heads
    HEAD_SIZE = head_size

    # prefix_lse and suffix_lse contain inf and normal values
    prefix_lse = torch.randn(NUM_HEADS,
                             NUM_TOKENS,
                             dtype=torch.float32,
                             device="xpu")
    suffix_lse = torch.randn(NUM_HEADS,
                             NUM_TOKENS,
                             dtype=torch.float32,
                             device="xpu")

    # Generate boolean masks
    mask_prefix = torch.rand(NUM_HEADS, NUM_TOKENS) < 0.1
    mask_suffix = torch.rand(NUM_HEADS, NUM_TOKENS) < 0.1
    # Ensure that the same position is not True in both masks at once
    combined_mask = torch.logical_and(mask_prefix, mask_suffix)
    mask_prefix = torch.logical_and(mask_prefix, ~combined_mask)
    mask_suffix = torch.logical_and(mask_suffix, ~combined_mask)

    # Place inf at the masked positions (the kernels replace them with -inf)
    prefix_lse[mask_prefix] = float("inf")
    suffix_lse[mask_suffix] = float("inf")

    # Other input tensors (they only need to be initialized;
    # no actual calculation on them is required here)
    output = torch.zeros((NUM_TOKENS, NUM_HEADS, HEAD_SIZE),
                         dtype=output_dtype,
                         device="xpu")
    output_lse = torch.zeros((NUM_HEADS, NUM_TOKENS),
                             dtype=torch.float32,
                             device="xpu")
    prefix_output = torch.randn((NUM_TOKENS, NUM_HEADS, HEAD_SIZE),
                                dtype=output_dtype,
                                device="xpu")
    suffix_output = torch.randn((NUM_TOKENS, NUM_HEADS, HEAD_SIZE),
                                dtype=output_dtype,
                                device="xpu")

    warmup_times = 2
    repeat_times = 20
    # Benchmark loop (not shown here): each kernel is warmed up
    # warmup_times times and then timed over repeat_times runs, producing
    # output_torch / output_lse_torch from merge_attn_states_torch,
    # output_xpu / output_lse_xpu from merge_attn_states_xpu, and the
    # average latencies avg_time_torch_kernel and avg_time_xpu_kernel.

    # 2. Performance comparison
    performance_improved = avg_time_torch_kernel / avg_time_xpu_kernel
    # print(f" Torch time: {avg_time_torch_kernel:.6f}ms")
    # print(f" XPU time: {avg_time_xpu_kernel:.6f}ms, "
    #       f"Performance: {performance_improved:.5f}x")
    # print("-" * 100)

    # 3. Correctness comparison
    # Liger Kernel: Efficient Triton Kernels for LLM Training
    # https://arxiv.org/pdf/2410.10989, 3.3 Correctness
    # use rtol = 1e-2 for bfloat16.
    rtol = 1e-2 if output_dtype == torch.bfloat16 else 1e-3

    # Use the torch output as the reference
    torch.testing.assert_close(output_xpu.float(),
                               output_torch.float(),
                               atol=1e-3,
                               rtol=rtol)

    torch.testing.assert_close(output_lse_xpu.float(),
                               output_lse_torch.float(),
                               atol=1e-3,
                               rtol=rtol)

    device = "xpu"
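    # Record this case; once every parametrized combination has run, the
    # accumulated results are emitted as a markdown table.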
    all_case_info.append((
        NUM_TOKENS,
        NUM_HEADS,
        HEAD_SIZE,
        output_dtype,
        device,
        avg_time_torch_kernel,
        avg_time_xpu_kernel,
        performance_improved,
    ))
    if len(all_case_info) == (len(NUM_BATCH_TOKENS) * len(HEAD_SIZES) *
                              len(NUM_QUERY_HEADS) * len(DTYPES)):
        generate_markdown_table()