Skip to content

Commit 14631a4

Browse files
Vectorize act-and-mul kernels for speedup (vllm-project#207)
Add vectorized memory access to the activation-and-mul kernels using aligned_vec loads/stores with dynamic vec_size dispatch (1-16). Switch from a 3D to a 1D nd_range for simpler indexing. All 4 fused ops (silu_and_mul, mul_and_silu, gelu_and_mul, gelu_tanh_and_mul) now use the vectorized path. The original scalar kernel is retained as the VEC_SIZE=1 fallback.

Benchmark results (avg GPU time in us, 200 iterations, no per-iter sync):

| Model | Tokens | Dtype | d (intermediate_size) | Baseline (us) | Vectorized (us) | Change |
|-------|--------|-------|-----------------------|---------------|-----------------|--------|
| llama3-70b | 128 | fp16 | 28672 | 24.01 | 8.96 | -62.7% |
| llama3-70b | 128 | bf16 | 28672 | 27.25 | 11.25 | -58.7% |
| llama3-70b | 512 | fp16 | 28672 | 262.79 | 202.13 | -23.1% |
| llama3-70b | 512 | bf16 | 28672 | 261.46 | 202.67 | -22.5% |
| llama3-70b | 1024 | fp16 | 28672 | 545.11 | 424.03 | -22.2% |
| llama3-70b | 1024 | bf16 | 28672 | 545.13 | 424.82 | -22.1% |
| llama3-70b | 2048 | fp16 | 28672 | 1108.82 | 872.10 | -21.3% |
| llama3-70b | 2048 | bf16 | 28672 | 1108.13 | 872.70 | -21.2% |
| llama3-8b | 128 | fp16 | 14336 | 33.05 | 6.51 | -80.3% |
| llama3-8b | 128 | bf16 | 14336 | 26.65 | 6.15 | -76.9% |
| llama3-8b | 512 | fp16 | 14336 | 169.74 | 92.10 | -45.7% |
| llama3-8b | 512 | bf16 | 14336 | 139.62 | 93.25 | -33.2% |
| llama3-8b | 1024 | fp16 | 14336 | 261.68 | 201.64 | -22.9% |
| llama3-8b | 1024 | bf16 | 14336 | 260.92 | 201.73 | -22.7% |
| llama3-8b | 2048 | fp16 | 14336 | 539.98 | 420.75 | -22.1% |
| llama3-8b | 2048 | bf16 | 14336 | 541.28 | 422.87 | -21.9% |
| qwen-14b | 512 | fp16 | 13824 | 116.04 | 85.32 | -26.5% |
| qwen-14b | 512 | bf16 | 13824 | 114.37 | 85.69 | -25.1% |
| qwen-14b | 1024 | fp16 | 13824 | 238.41 | 193.29 | -18.9% |
| qwen-14b | 1024 | bf16 | 13824 | 254.00 | 193.76 | -23.7% |
| qwen-14b | 2048 | fp16 | 13824 | 527.05 | 407.07 | -22.8% |
| qwen-14b | 2048 | bf16 | 13824 | 521.38 | 403.80 | -22.6% |
| qwen-32b | 128 | fp16 | 27648 | 20.65 | 6.29 | -69.5% |
| qwen-32b | 128 | bf16 | 27648 | 21.35 | 6.89 | -67.7% |
| qwen-32b | 512 | fp16 | 27648 | 253.84 | 193.79 | -23.7% |
| qwen-32b | 512 | bf16 | 27648 | 253.64 | 193.84 | -23.6% |
| qwen-32b | 1024 | fp16 | 27648 | 526.81 | 407.99 | -22.6% |
| qwen-32b | 1024 | bf16 | 27648 | 523.08 | 408.52 | -21.9% |
| qwen-32b | 2048 | fp16 | 27648 | 1069.97 | 838.01 | -21.7% |
| qwen-32b | 2048 | bf16 | 27648 | 1068.91 | 838.34 | -21.6% |

Signed-off-by: Ma, Liangliang <liangliang.ma@intel.com>
1 parent 1b4770e commit 14631a4

1 file changed

Lines changed: 91 additions & 4 deletions

File tree

csrc/activation.cpp

Lines changed: 91 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,49 @@ class act_and_mul_kernel {
121121
const int d_;
122122
};
123123

124+
// Vectorized version of act_and_mul_kernel using aligned vector loads/stores.
125+
// Each work-item processes VEC_SIZE elements per iteration, reducing memory
126+
// transactions and improving bandwidth utilization.
127+
template <
128+
typename scalar_t,
129+
scalar_t (*ACT_FN)(const scalar_t&),
130+
bool act_first,
131+
int VEC_SIZE>
132+
class act_and_mul_vec_kernel {
133+
public:
134+
act_and_mul_vec_kernel(
135+
scalar_t* __restrict__ out,
136+
const scalar_t* __restrict__ input,
137+
const int d)
138+
: out_(out), input_(input), d_(d) {}
139+
140+
void operator()(sycl::nd_item<1> item) const {
141+
using vec_t = vllm::xpu::aligned_vec<scalar_t, VEC_SIZE>;
142+
const int64_t token_idx = item.get_group(0);
143+
const int64_t offset = item.get_local_linear_id();
144+
const int64_t step = item.get_local_range(0);
145+
const int64_t bound = d_ / VEC_SIZE;
146+
147+
for (int64_t i = offset; i < bound; i += step) {
148+
auto x_vec =
149+
reinterpret_cast<const vec_t*>(input_)[token_idx * bound * 2 + i];
150+
auto y_vec = reinterpret_cast<const vec_t*>(
151+
input_)[token_idx * bound * 2 + i + bound];
152+
vec_t out_vec;
153+
#pragma unroll
154+
for (int j = 0; j < VEC_SIZE; ++j) {
155+
out_vec[j] = compute<scalar_t, ACT_FN, act_first>(x_vec[j], y_vec[j]);
156+
}
157+
reinterpret_cast<vec_t*>(out_)[token_idx * bound + i] = out_vec;
158+
}
159+
}
160+
161+
private:
162+
scalar_t* __restrict__ out_;
163+
const scalar_t* __restrict__ input_;
164+
const int d_;
165+
};
166+
124167
template <typename T>
125168
[[intel::device_indirectly_callable]] inline __attribute__((always_inline)) T
126169
swigluoai_and_mul(const T& gate, const T& up, float alpha, float limit) {
@@ -201,12 +244,56 @@ class swigluoai_and_mul_kernel {
201244
(sycl_t*)out_ptr, (sycl_t*)input_ptr, d)); \
202245
});
203246

247+
// Emits one `case N:` arm that launches the vectorized act-and-mul kernel
// with VEC_SIZE = N over a 1D nd_range (one work-group per token).
// Expects `queue`, `num_tokens`, `wg_size`, `out_ptr`, `input_ptr`, `d`
// and `sycl_t` to be in scope (set up by LAUNCH_ACTIVATION_GATE_KERNEL_VEC).
#define VEC_LAUNCH_ACT_AND_MUL(KERNEL, ACT_FIRST, N)                  \
  case N: {                                                           \
    queue.submit([&](sycl::handler& cgh) {                            \
      cgh.parallel_for(                                               \
          sycl::nd_range<1>(num_tokens * wg_size, wg_size),           \
          vllm::act_and_mul_vec_kernel<sycl_t, KERNEL, ACT_FIRST, N>( \
              (sycl_t*)out_ptr, (sycl_t*)input_ptr, d));              \
    });                                                               \
    break;                                                            \
  }

// Vectorized launch: picks the widest vec_size in {16,8,4,2,1} that both
// keeps every work-item busy and evenly divides d, then dispatches via the
// switch below. vec_size=1 is the scalar fallback.
#define LAUNCH_ACTIVATION_GATE_KERNEL_VEC(KERNEL, ACT_FIRST)               \
  using sycl_t = vllm::xpu::SyclTypeTrait<scalar_t>::Type;                 \
  int d = input.size(-1) / 2;                                              \
  int64_t num_tokens = input.numel() / input.size(-1);                     \
  if (num_tokens == 0) {                                                   \
    return;                                                                \
  }                                                                        \
  auto out_ptr = out.data_ptr<scalar_t>();                                 \
  auto input_ptr = input.data_ptr<scalar_t>();                             \
  at::DeviceGuard device_guard(input.device());                            \
  auto& queue = vllm::xpu::vllmGetQueue();                                 \
  /* Start at a 16-byte access per work-item (8 for fp16/bf16, 4 fp32). */ \
  int vec_size = static_cast<int>(sizeof(float) * 4 / sizeof(scalar_t));   \
  {                                                                        \
    /* Shrink while a narrower width still covers d in one pass, so */     \
    /* small rows keep all work-items of the group busy. */                \
    int64_t tmp_wg =                                                       \
        std::min(static_cast<int64_t>(d), static_cast<int64_t>(1024));     \
    while (vec_size > 1 && (vec_size >> 1) * tmp_wg >= d) {                \
      vec_size = vec_size >> 1;                                            \
    }                                                                      \
  }                                                                        \
  /* Halve until vec_size divides d; previously this jumped straight */    \
  /* to 1, forcing the scalar path even when 2/4/8 would divide d. */      \
  while (vec_size > 1 && d % vec_size != 0) {                              \
    vec_size >>= 1;                                                        \
  }                                                                        \
  int64_t wg_size = std::min(                                              \
      static_cast<int64_t>(d / vec_size), static_cast<int64_t>(1024));     \
  switch (vec_size) {                                                      \
    VEC_LAUNCH_ACT_AND_MUL(KERNEL, ACT_FIRST, 1);                          \
    VEC_LAUNCH_ACT_AND_MUL(KERNEL, ACT_FIRST, 2);                          \
    VEC_LAUNCH_ACT_AND_MUL(KERNEL, ACT_FIRST, 4);                          \
    VEC_LAUNCH_ACT_AND_MUL(KERNEL, ACT_FIRST, 8);                          \
    VEC_LAUNCH_ACT_AND_MUL(KERNEL, ACT_FIRST, 16);                         \
    default:                                                               \
      TORCH_CHECK(false, "Unsupported vector size: ", vec_size);           \
  }
290+
204291
// Fused SiLU-and-multiply over the last dimension: the row's first half is
// passed through vllm::silu_kernel first (act_first=true) and combined with
// the second half via the compute<> helper. Dispatches on input dtype and
// launches the vectorized kernel path.
void silu_and_mul(
    torch::Tensor& out,   // [..., d]
    torch::Tensor& input) // [..., 2 * d]
{
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "silu_and_mul", [&] {
    LAUNCH_ACTIVATION_GATE_KERNEL_VEC(vllm::silu_kernel, true);
  });
}
212299

@@ -215,7 +302,7 @@ void mul_and_silu(
// Mirror of silu_and_mul with the operand order flipped (act_first=false):
// the activation is applied to the second operand of compute<>. Same dtype
// dispatch and vectorized launch path.
void mul_and_silu(
    torch::Tensor& out,   // [..., d]
    torch::Tensor& input) // [..., 2 * d]
{
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "mul_and_silu", [&] {
    LAUNCH_ACTIVATION_GATE_KERNEL_VEC(vllm::silu_kernel, false);
  });
}
221308

@@ -224,7 +311,7 @@ void gelu_and_mul(
// Fused GELU-and-multiply: same structure as silu_and_mul but with
// vllm::gelu_kernel as the activation (act_first=true). Uses the
// vectorized launch path.
void gelu_and_mul(
    torch::Tensor& out,   // [..., d]
    torch::Tensor& input) // [..., 2 * d]
{
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_and_mul", [&] {
    LAUNCH_ACTIVATION_GATE_KERNEL_VEC(vllm::gelu_kernel, true);
  });
}
230317

@@ -233,7 +320,7 @@ void gelu_tanh_and_mul(
// Fused tanh-approximation GELU-and-multiply: identical dispatch/launch
// shape to gelu_and_mul, substituting vllm::gelu_tanh_kernel.
void gelu_tanh_and_mul(
    torch::Tensor& out,   // [..., d]
    torch::Tensor& input) // [..., 2 * d]
{
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_tanh_and_mul", [&] {
    LAUNCH_ACTIVATION_GATE_KERNEL_VEC(vllm::gelu_tanh_kernel, true);
  });
}
239326

0 commit comments

Comments
 (0)