 namespace vllm {
 
 template <typename scalar_t>
+struct alignas(8) vec4_t {
+  scalar_t val[4];
+};
+
+// The vector width is fixed at 4 to avoid excessive branching in the kernel,
+// which could degrade performance.
+template <typename scalar_t, int NUM_DIMS, int VEC_SIZE = 4>
 class rms_norm_kernel {
  public:
   rms_norm_kernel(
       scalar_t* out_,
       const scalar_t* input_,
-      const int64_t input_stride_,
+      const int64_t input_stride_d2_,  // input.stride(-2)
+      const int64_t input_stride_d3_,  // input.stride(-3)
+      const int64_t input_stride_d4_,  // input.stride(-4)
+      const int64_t input_shape_d2_,   // input.size(-2)
+      const int64_t input_shape_d3_,   // input.size(-3)
       const scalar_t* weight_,
       const float epsilon_,
       const int num_tokens_,
       const int hidden_size_,
       sycl::local_accessor<float, 1> s_variance_)
       : out(out_),
         input(input_),
-        input_stride(input_stride_),
+        input_stride_d2(input_stride_d2_),
+        input_stride_d3(input_stride_d3_),
+        input_stride_d4(input_stride_d4_),
+        input_shape_d2(input_shape_d2_),
+        input_shape_d3(input_shape_d3_),
         weight(weight_),
         epsilon(epsilon_),
         num_tokens(num_tokens_),
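Note: passing per-dimension strides and sizes, rather than a single row stride, is what lets the kernel walk non-contiguous views directly. A minimal sketch of the addressing this enables, assuming PyTorch-style strides for a 3D [batch_size, num_heads, head_size] tensor (the helper name is ours, not from this patch):

    // Sketch: map a flattened row index back to a base pointer, mirroring
    // the kernel's NUM_DIMS == 3 branch below.
    template <typename scalar_t>
    const scalar_t* row_base(const scalar_t* base, int64_t row,
                             int64_t num_heads, int64_t stride_d3,
                             int64_t stride_d2) {
      int64_t batch_idx = row / num_heads;  // which batch entry
      int64_t head_idx = row % num_heads;   // which head inside that entry
      return base + batch_idx * stride_d3 + head_idx * stride_d2;
    }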
@@ -33,10 +48,80 @@ class rms_norm_kernel {
         s_variance.template get_multi_ptr<sycl::access::decorated::no>().get();
     float variance = 0.0f;
 
-    for (int idx = item_ct1.get_local_id(2); idx < hidden_size;
-         idx += item_ct1.get_local_range(2)) {
-      const float x = (float)input[item_ct1.get_group(2) * input_stride + idx];
+    const scalar_t* input_row;
+    if constexpr (NUM_DIMS == 2) {
+      // 2D: the common layernorm case [batch_size, hidden_size]
+      input_row = input + item_ct1.get_group(2) * input_stride_d2;
+    } else if constexpr (NUM_DIMS == 3) {
+      // 3D: q/k norm [batch_size, num_heads, head_size]
+      int batch_idx = item_ct1.get_group(2) / input_shape_d2;
+      int head_idx = item_ct1.get_group(2) % input_shape_d2;
+      input_row =
+          input + batch_idx * input_stride_d3 + head_idx * input_stride_d2;
+    } else if constexpr (NUM_DIMS == 4) {
+      // 4D: transformers model_impl qk norm [batch, seq, head, head_dim]
+      int batch_idx = item_ct1.get_group(2) / (input_shape_d3 * input_shape_d2);
+      int remaining = item_ct1.get_group(2) % (input_shape_d3 * input_shape_d2);
+      int seq_idx = remaining / input_shape_d2;
+      int head_idx = remaining % input_shape_d2;
+      input_row = input + batch_idx * input_stride_d4 +
+                  seq_idx * input_stride_d3 + head_idx * input_stride_d2;
+    }
+
+    auto vec_op = [&variance](
+                      const vec4_t<scalar_t>& vec, int vec_size = VEC_SIZE) {
+      for (int i = 0; i < vec_size; ++i) {
+        float x = static_cast<float>(vec.val[i]);
+        variance += x * x;
+      }
+    };
+    auto scalar_op = [&variance](const scalar_t& val) {
+      float x = static_cast<float>(val);
       variance += x * x;
+    };
+
+    constexpr int WIDTH = VEC_SIZE * sizeof(scalar_t);
+    uintptr_t addr_in = reinterpret_cast<uintptr_t>(input_row);
+
+    // fast path when the whole region is already aligned
+    bool can_vec =
+        ((addr_in & (WIDTH - 1)) == 0) && ((hidden_size & (VEC_SIZE - 1)) == 0);
+    if (can_vec) {
+      int64_t const num_vec_elems = hidden_size / VEC_SIZE;
+      auto const* vec_in = reinterpret_cast<const vec4_t<scalar_t>*>(input_row);
+      for (int i = item_ct1.get_local_id(2); i < num_vec_elems;
+           i += item_ct1.get_local_range(2)) {
+        vec4_t<scalar_t> tmp = vec_in[i];
+        vec_op(tmp);
+      }
+    } else {
+      int misalignment_offset = addr_in & (WIDTH - 1);
+      int alignment_bytes = WIDTH - misalignment_offset;
+      int prefix_elems = alignment_bytes & (WIDTH - 1);
+      prefix_elems /= sizeof(scalar_t);
+      prefix_elems = prefix_elems < hidden_size ? prefix_elems : hidden_size;
+
+      // 1. handle the possibly unaligned prefix with scalar access.
+      for (int i = item_ct1.get_local_id(2); i < prefix_elems;
+           i += item_ct1.get_local_range(2)) {
+        scalar_op(input_row[i]);
+      }
+
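+      // 2. process the aligned middle region with vectorized access.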
+      int64_t const num_vec_elems = (hidden_size - prefix_elems) / VEC_SIZE;
+      auto const* vec_in =
+          reinterpret_cast<const vec4_t<scalar_t>*>(input_row + prefix_elems);
+      for (int i = item_ct1.get_local_id(2); i < num_vec_elems;
+           i += item_ct1.get_local_range(2)) {
+        vec4_t<scalar_t> tmp = vec_in[i];
+        vec_op(tmp);
+      }
+
+      // 3. handle remaining tail elements.
+      for (int i = item_ct1.get_local_id(2) + num_vec_elems * VEC_SIZE;
+           i < hidden_size - prefix_elems;
+           i += item_ct1.get_local_range(2)) {
+        scalar_op((input_row + prefix_elems)[i]);
+      }
     }
 
     variance = sycl::reduce_over_group(
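The loop above accumulates the per-row sum of squares, vectorizing wherever alignment allows. The same prefix/body/tail decomposition, distilled into a single-threaded sketch (our own illustration, not code from this patch):

    #include <cstdint>

    // Sketch: sum of squares over n floats using the kernel's alignment
    // strategy: scalar prefix up to a 16-byte boundary, then a 4-wide
    // vector body, then a scalar tail.
    float sum_squares(const float* p, int n) {
      constexpr int VEC = 4;                      // elements per vector
      constexpr int WIDTH = VEC * sizeof(float);  // 16 bytes
      std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(p);
      int prefix = static_cast<int>(
          ((WIDTH - (addr & (WIDTH - 1))) & (WIDTH - 1)) / sizeof(float));
      prefix = prefix < n ? prefix : n;
      float acc = 0.0f;
      for (int i = 0; i < prefix; ++i) acc += p[i] * p[i];  // unaligned prefix
      const int body = (n - prefix) / VEC;  // whole aligned vectors
      for (int v = 0; v < body; ++v) {
        for (int j = 0; j < VEC; ++j) {
          float x = p[prefix + v * VEC + j];
          acc += x * x;
        }
      }
      for (int i = prefix + body * VEC; i < n; ++i)  // remaining tail
        acc += p[i] * p[i];
      return acc;
    }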
@@ -49,18 +134,47 @@ class rms_norm_kernel {
 
     item_ct1.barrier(sycl::access::fence_space::local_space);
 
-    for (int idx = item_ct1.get_local_id(2); idx < hidden_size;
-         idx += item_ct1.get_local_range(2)) {
-      float x = (float)input[item_ct1.get_group(2) * input_stride + idx];
-      out[item_ct1.get_group(2) * hidden_size + idx] =
-          ((scalar_t)(x * (*s_variance_ptr))) * weight[idx];
+    scalar_t* out_row = out + item_ct1.get_group(2) * hidden_size;
+    uintptr_t addr_weight = reinterpret_cast<uintptr_t>(weight);
+    uintptr_t addr_out = reinterpret_cast<uintptr_t>(out_row);
+    bool can_vec_out = ((addr_in & (WIDTH - 1)) == 0) &&
+                       ((addr_weight & (WIDTH - 1)) == 0) &&
+                       ((addr_out & (WIDTH - 1)) == 0) &&
+                       ((hidden_size & (VEC_SIZE - 1)) == 0);
+    if (can_vec_out) {
+      auto* v_in = reinterpret_cast<const vec4_t<scalar_t>*>(input_row);
+      auto* v_w = reinterpret_cast<const vec4_t<scalar_t>*>(weight);
+      auto* v_out = reinterpret_cast<vec4_t<scalar_t>*>(out_row);
+      int64_t const out_num_vec_elems = hidden_size / VEC_SIZE;
+      float s_variance_val = *s_variance_ptr;
+      for (int idx = item_ct1.get_local_id(2); idx < out_num_vec_elems;
+           idx += item_ct1.get_local_range(2)) {
+        vec4_t<scalar_t> dst;
+        vec4_t<scalar_t> src1 = v_in[idx];
+        vec4_t<scalar_t> src2 = v_w[idx];
+        for (int j = 0; j < VEC_SIZE; j++) {
+          float x = static_cast<float>(src1.val[j]);
+          dst.val[j] = ((scalar_t)(x * s_variance_val)) * src2.val[j];
+        }
+        v_out[idx] = dst;
+      }
+    } else {
+      for (int idx = item_ct1.get_local_id(2); idx < hidden_size;
+           idx += item_ct1.get_local_range(2)) {
+        float x = (float)input_row[idx];
+        out_row[idx] = ((scalar_t)(x * (*s_variance_ptr))) * weight[idx];
+      }
     }
   }
 
  private:
   scalar_t* __restrict__ out;          // [..., hidden_size]
   const scalar_t* __restrict__ input;  // [..., hidden_size]
-  const int64_t input_stride;
+  const int64_t input_stride_d2;
+  const int64_t input_stride_d3;
+  const int64_t input_stride_d4;
+  const int64_t input_shape_d2;
+  const int64_t input_shape_d3;
   const scalar_t* __restrict__ weight;  // [hidden_size]
   const float epsilon;
   const int num_tokens;
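Both write paths apply the standard RMSNorm transform y[i] = x[i] / sqrt(mean(x^2) + eps) * w[i], with *s_variance_ptr holding the precomputed reciprocal RMS. A minimal scalar reference (our own sketch, not code from this patch):

    #include <cmath>

    // Sketch: RMSNorm over one row; inv_rms plays the role of *s_variance_ptr.
    void rms_norm_row(float* y, const float* x, const float* w, int n,
                      float eps) {
      float ss = 0.0f;
      for (int i = 0; i < n; ++i) ss += x[i] * x[i];
      float inv_rms = 1.0f / std::sqrt(ss / n + eps);
      for (int i = 0; i < n; ++i) y[i] = (x[i] * inv_rms) * w[i];
    }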
@@ -77,26 +191,39 @@ void call_rms_norm_kernel(
   using sycl_t = typename vllm::xpu::SyclTypeTrait<scalar_t>::Type;
   int hidden_size = input.size(-1);
   int num_tokens = input.numel() / hidden_size;
-  int64_t input_stride = input.stride(-2);
+  int num_dims = input.dim();
+  int64_t input_stride_d2 = input.stride(-2);
+  int64_t input_stride_d3 = (num_dims >= 3) ? input.stride(-3) : 0;
+  int64_t input_stride_d4 = (num_dims >= 4) ? input.stride(-4) : 0;
+  int64_t input_shape_d2 = (num_dims >= 3) ? input.size(-2) : 0;
+  int64_t input_shape_d3 = (num_dims >= 4) ? input.size(-3) : 0;
+
   auto out_ptr = out.data_ptr<scalar_t>();
   auto input_ptr = input.data_ptr<scalar_t>();
   auto weight_ptr = weight.data_ptr<scalar_t>();
   sycl::range<3> grid(1, 1, num_tokens);
   sycl::range<3> block(1, 1, std::min(hidden_size, 1024));
   auto& queue = vllm::xpu::vllmGetQueue();
-  queue.submit([&](sycl::handler& cgh) {
-    sycl::local_accessor<float, 1> s_variance(sycl::range<1>(1), cgh);
-    cgh.parallel_for(
-        sycl::nd_range<3>(grid * block, block),
-        vllm::rms_norm_kernel<sycl_t>(
-            (sycl_t*)out_ptr,
-            (const sycl_t*)input_ptr,
-            input_stride,
-            (const sycl_t*)weight_ptr,
-            epsilon,
-            num_tokens,
-            hidden_size,
-            s_variance));
+
+  VLLM_DISPATCH_RANK234(num_dims, [&]() {
+    queue.submit([&](sycl::handler& cgh) {
+      sycl::local_accessor<float, 1> s_variance(sycl::range<1>(1), cgh);
+      cgh.parallel_for(
+          sycl::nd_range<3>(grid * block, block),
+          vllm::rms_norm_kernel<sycl_t, tensor_rank>(
+              (sycl_t*)out_ptr,
+              (const sycl_t*)input_ptr,
+              input_stride_d2,
+              input_stride_d3,
+              input_stride_d4,
+              input_shape_d2,
+              input_shape_d3,
+              (const sycl_t*)weight_ptr,
+              epsilon,
+              num_tokens,
+              hidden_size,
+              s_variance));
+    });
   });
 }
 
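VLLM_DISPATCH_RANK234 is not shown in this diff; it evidently binds the runtime num_dims to the compile-time tensor_rank used to instantiate the kernel. A hypothetical definition of such a macro, in the style of PyTorch's AT_DISPATCH family (an assumption, not the repository's actual code):

    // Hypothetical sketch: expand the caller's lambda inside a switch so
    // each supported rank gets its own constexpr tensor_rank and therefore
    // its own kernel instantiation.
    #define VLLM_DISPATCH_CASE_RANK(RANK, ...) \
      case RANK: {                             \
        constexpr int tensor_rank = RANK;      \
        __VA_ARGS__();                         \
        break;                                 \
      }

    #define VLLM_DISPATCH_RANK234(NUM_DIMS, ...)         \
      switch (NUM_DIMS) {                                 \
        VLLM_DISPATCH_CASE_RANK(2, __VA_ARGS__)           \
        VLLM_DISPATCH_CASE_RANK(3, __VA_ARGS__)           \
        VLLM_DISPATCH_CASE_RANK(4, __VA_ARGS__)           \
        default:                                          \
          TORCH_CHECK(false, "unsupported tensor rank");  \
      }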
@@ -205,7 +332,10 @@ void rms_norm(
     torch::Tensor& weight,
     double epsilon) {
   TORCH_CHECK(out.is_contiguous());
-  input = input.contiguous();
+  if (input.stride(-1) != 1) {
+    input = input.contiguous();
+  }
+  TORCH_CHECK(input.stride(-1) == 1);
   TORCH_CHECK(weight.is_contiguous());
   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "call_rms_norm_kernel", [&] {
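With this change, an input is copied only when its innermost dimension is strided; views that are non-contiguous across outer dimensions, such as q/k slices, now reach the kernel without a copy. A hypothetical caller-side illustration (shapes invented for the example):

    // Hypothetical example: a narrowed q view is non-contiguous overall,
    // but its last dimension has stride 1, so no .contiguous() copy is made.
    auto qkv = torch::randn({8, 3 * 1024});
    auto q = qkv.narrow(/*dim=*/1, /*start=*/0, /*length=*/1024)
                 .view({8, 8, 128});  // strides {3072, 128, 1}
    // rms_norm(out, q, weight, 1e-6);  // dispatches the rank-3 kernel, no copy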