Skip to content

Commit 20f2472

Browse files
authored
[Fusion][Torch.compiler] Add fuse_norm_quant, fuse_act_quant and fused_qk_norm_rope kernel (vllm-project#267)
* Add fuse_norm_quant, fuse_act_quant and fused_qk_norm_rope kernel
* Fix format
* Fix format
* Add fused_qk_norm_rope head_dim=512 case and update vec_size

Signed-off-by: Lai, Yejing <yejing.lai@intel.com>
Signed-off-by: Yejing Lai <yejing.lai@intel.com>
1 parent 3fb2f1e commit 20f2472

13 files changed

Lines changed: 2306 additions & 18 deletions

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -439,8 +439,10 @@ if(BASIC_KERNELS_ENABLED)
439439
set(VLLM_EXT_SRC
440440
"csrc/cache.cpp"
441441
"csrc/layernorm.cpp"
442+
"csrc/layernorm_quant.cpp"
442443
"csrc/activation.cpp"
443444
"csrc/pos_encoding_kernels.cpp"
445+
"csrc/fused_qknorm_rope.cpp"
444446
"csrc/torch_bindings.cpp"
445447
"csrc/quantization/fp8/fp8_quant.cpp"
446448
"csrc/quantization/fp4/mxfp4_quant.cpp"

csrc/activation.cpp

Lines changed: 122 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,14 @@
11
#include <sycl/sycl.hpp>
22
#include <cmath>
33
#include <algorithm>
4+
#include <numeric>
45
#include "utils.h"
56
#include "dispatch_utils.h"
67

8+
#include <c10/util/Float8_e4m3fn.h>
9+
#include <c10/util/Float8_e5m2.h>
10+
#include "quantization/fp8/quant_utils.h"
11+
712
#define VLLM_LDG(arg) *(arg)
813

914
namespace vllm {
@@ -172,6 +177,56 @@ class act_and_mul_vec_kernel {
172177
const int d_;
173178
};
174179

180+
template <
181+
typename scalar_t,
182+
scalar_t (*ACT_FN)(const scalar_t&),
183+
typename fp8_type,
184+
int VEC_SIZE>
185+
class act_and_mul_quant_vec_kernel {
186+
public:
187+
act_and_mul_quant_vec_kernel(
188+
fp8_type* __restrict__ out, // [..., d]
189+
const scalar_t* __restrict__ input, // [..., 2 * d]
190+
const float* __restrict__ scale, // [1]
191+
const int d)
192+
: out_(out), input_(input), scale_(scale), d_(d) {}
193+
194+
void operator()(sycl::nd_item<1> item) const {
195+
using vec_t = vllm::xpu::aligned_vec<scalar_t, VEC_SIZE>;
196+
197+
const int64_t token_idx = item.get_group(0);
198+
const int64_t offset = item.get_local_linear_id();
199+
const int64_t step = item.get_local_range(0);
200+
const int64_t bound = d_ / VEC_SIZE;
201+
202+
const float inv_scale = 1.0f / (*scale_);
203+
const float fp8_max = static_cast<float>(fp8::quant_type_max_v<fp8_type>);
204+
205+
// x and y halves are laid out contiguously: [x0..xd-1, y0..yd-1]
206+
const auto* v_x =
207+
reinterpret_cast<const vec_t*>(input_) + token_idx * bound * 2;
208+
const auto* v_y = v_x + bound;
209+
210+
for (int64_t i = offset; i < bound; i += step) {
211+
vec_t xv = v_x[i];
212+
vec_t yv = v_y[i];
213+
#pragma unroll
214+
for (int j = 0; j < VEC_SIZE; j++) {
215+
float val = static_cast<float>(ACT_FN(xv[j]) * yv[j]) * inv_scale;
216+
float clamped = sycl::fmax(-fp8_max, sycl::fmin(val, fp8_max));
217+
out_[token_idx * d_ + i * VEC_SIZE + j] =
218+
static_cast<fp8_type>(clamped);
219+
}
220+
}
221+
}
222+
223+
private:
224+
fp8_type* __restrict__ out_; // [..., d]
225+
const scalar_t* __restrict__ input_; // [..., 2 * d]
226+
const float* __restrict__ scale_; // [1]
227+
const int d_;
228+
};
229+
175230
template <typename T>
176231
[[intel::device_indirectly_callable]] inline __attribute__((always_inline)) T
177232
swigluoai_and_mul(const T& gate, const T& up, float alpha, float limit) {
@@ -352,6 +407,73 @@ void silu_and_mul(
352407
});
353408
}
354409

410+
// Fused SiLU + Mul + FP8 Quantization
// Input: [..., 2*d] in FP16/BF16, Output: [..., d] in FP8
// Emits one `case N:` arm of the vec_size dispatch switch: picks a
// work-group size covering the d/N vectors of one token (capped at 1024
// work-items) and enqueues one work-group per token.
#define LAUNCH_ACT_AND_MUL_QUANT_VEC(KERNEL, N)                           \
  case N: {                                                               \
    int64_t group_size = std::min<int64_t>(d / N, 1024);                  \
    VLLM_DISPATCH_FP8_TYPES(                                              \
        out.scalar_type(), "act_and_mul_quant_vec_kernel_fp8", [&] {      \
          auto out_ptr = out.data_ptr<fp8_t>();                           \
          queue.submit([&](sycl::handler& cgh) {                          \
            cgh.parallel_for(                                             \
                sycl::nd_range<1>(num_tokens * group_size, group_size),   \
                vllm::act_and_mul_quant_vec_kernel<sycl_t, KERNEL, fp8_t, \
                                                   N>(                    \
                    out_ptr, reinterpret_cast<const sycl_t*>(input_ptr),  \
                    scale_ptr, d));                                       \
          });                                                             \
        });                                                               \
    break;                                                                \
  }
429+
430+
// Shared body for fused act+mul+quant entry points: computes the launch
// geometry, selects the widest vector width that divides d while keeping
// a full work-group busy, then dispatches to the matching kernel arm.
#define LAUNCH_ACTIVATION_GATE_QUANT_KERNEL(KERNEL)                          \
  using sycl_t = vllm::xpu::SyclTypeTrait<scalar_t>::Type;                   \
  int d = input.size(-1) / 2;                                                \
  int64_t num_tokens = input.numel() / input.size(-1);                       \
  if (num_tokens == 0) {                                                     \
    return;                                                                  \
  }                                                                          \
  at::DeviceGuard device_guard(input.device());                              \
  auto& queue = vllm::xpu::vllmGetQueue();                                   \
  auto input_ptr = input.data_ptr<scalar_t>();                               \
  auto scale_ptr = scale.data_ptr<float>();                                  \
  /* Start at a 16-byte vector and halve while a work-group of up to */      \
  /* 1024 items would still cover all of d at the narrower width.    */      \
  int vec_size = static_cast<int>(sizeof(float) * 4 / sizeof(sycl_t));       \
  {                                                                          \
    int64_t wg_cap =                                                         \
        std::min(static_cast<int64_t>(d), static_cast<int64_t>(1024));       \
    while (vec_size > 1 && (vec_size / 2) * wg_cap >= d) {                   \
      vec_size = vec_size / 2;                                               \
    }                                                                        \
  }                                                                          \
  if (d % vec_size != 0) vec_size = 1;                                       \
  switch (vec_size) {                                                        \
    LAUNCH_ACT_AND_MUL_QUANT_VEC(KERNEL, 1);                                 \
    LAUNCH_ACT_AND_MUL_QUANT_VEC(KERNEL, 2);                                 \
    LAUNCH_ACT_AND_MUL_QUANT_VEC(KERNEL, 4);                                 \
    LAUNCH_ACT_AND_MUL_QUANT_VEC(KERNEL, 8);                                 \
    LAUNCH_ACT_AND_MUL_QUANT_VEC(KERNEL, 16);                                \
    default:                                                                 \
      TORCH_CHECK(false, "Unsupported vector size: ", vec_size);             \
  }
460+
461+
// Fused SiLU-and-mul with FP8 output quantization.
//
//   out   [..., d]      fp8 (e4m3fn or e5m2); receives silu(x) * y / scale
//   input [..., 2 * d]  fp16/bf16; x and y halves stored contiguously per row
//   scale [1]           fp32 quantization scale
//
// NOTE(review): the kernel indexes raw pointers, so both tensors are
// assumed contiguous — callers are expected to guarantee this.
void silu_and_mul_quant(
    torch::Tensor& out, // [..., d] FP8
    torch::Tensor& input, // [..., 2 * d] FP16/BF16
    torch::Tensor& scale) // [1] FP32
{
  TORCH_CHECK(
      out.dtype() == torch::kFloat8_e4m3fn ||
          out.dtype() == torch::kFloat8_e5m2,
      "silu_and_mul_quant: out must be fp8 (e4m3fn or e5m2), got ",
      out.dtype());
  TORCH_CHECK(
      input.dtype() == torch::kFloat16 || input.dtype() == torch::kBFloat16,
      "silu_and_mul_quant: input must be fp16 or bf16, got ", input.dtype());
  TORCH_CHECK(
      input.size(-1) % 2 == 0,
      "silu_and_mul_quant: input last dim must be even, got ",
      input.size(-1));
  // The kernel writes rows of width d = input.size(-1) / 2; a mismatched
  // output tensor would be written out of bounds.
  TORCH_CHECK(
      out.size(-1) * 2 == input.size(-1),
      "silu_and_mul_quant: out.size(-1) must equal input.size(-1) / 2");
  // The kernel dereferences exactly one float from scale.
  TORCH_CHECK(
      scale.numel() == 1,
      "silu_and_mul_quant: scale must hold exactly one element, got ",
      scale.numel());
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "silu_and_mul_quant", [&] {
    LAUNCH_ACTIVATION_GATE_QUANT_KERNEL(vllm::silu_kernel);
  });
}
476+
355477
void mul_and_silu(
356478
torch::Tensor& out, // [..., d]
357479
torch::Tensor& input) // [..., 2 * d]

0 commit comments

Comments
 (0)