Code review update

clee30 · clee30 · commit dd777a060f30 · 2025-04-09T15:50:28.000+08:00
Also fix the functional test for SDPA.
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp
@@ -268,6 +268,9 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive<scaled_dot_prod
         if (query_shape[query_shape.size() - 1].is_static())
             config.k_head_size = query_shape[query_shape.size() - 1].get_length();
 
+        if (value_shape[value_shape.size() - 1].is_static())
+            config.v_head_size = value_shape[value_shape.size() - 1].get_length();
+
         config.is_causal = desc->is_causal;
 
         if (desc->scale_val.has_value()) {
diff --git a/src/plugins/intel_gpu/src/graph/paged_attention.cpp b/src/plugins/intel_gpu/src/graph/paged_attention.cpp
@@ -21,13 +21,19 @@ layout paged_attention_inst::calc_output_layout(const paged_attention_node& /*no
 
 template<typename ShapeType>
 std::vector<layout> paged_attention_inst::calc_output_layouts(paged_attention_node const& /*node*/, kernel_impl_params const& impl_param) {
-    auto q_layout = impl_param.get_input_layout(0);
-    auto v_layout = impl_param.get_input_layout(2);
+    const auto& q_layout = impl_param.get_input_layout(0);
+    const auto& v_layout = impl_param.get_input_layout(2);
     auto data_layout = q_layout;
 
     if (v_layout.is_static()) {
-        ShapeType v_shape = v_layout.get_shape();
-        data_layout = layout{v_shape, data_layout.data_type, data_layout.format};
+        const auto& key_cache_ps = impl_param.get_input_layout(3).get_partial_shape();
+        const auto& value_cache_ps = impl_param.get_input_layout(4).get_partial_shape();
+        // Output layout takes the value layout's shape when the key and value caches have different head sizes
+        if (key_cache_ps[2].get_length() != value_cache_ps[3].get_length() ||
+            key_cache_ps[3].get_length() != value_cache_ps[2].get_length()) {
+            ShapeType v_shape = v_layout.get_shape();
+            data_layout = data_layout.clone_with_other_shape(v_shape);
+        }
     }
 
     data_layout.data_padding = padding();
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_kv_cache_update_ref.cl
@@ -4,20 +4,21 @@
 
 #include "include/batch_headers/common.cl"
 
-inline void FUNC(quantize_and_save_k)(__global const INPUT0_TYPE* in_data,
+inline void FUNC(quantize_and_save)(__global const INPUT0_TYPE* in_data,
                                     const uint in_data_offset,
                                     __global OUTPUT_TYPE* out_data,
                                     const uint out_data_offset,
                                     const uint out_data_pitch,
                                     const uint comp_offset,
                                     const uint token_pos_in_block,
-                                    const uint sglid) {
-    INPUT0_TYPE input_data[K_HEAD_SIZE / SUBGROUP_SIZE];
+                                    const uint sglid,
+                                    const uint num_groups,
+                                    INPUT0_TYPE* input_data) {
     INPUT0_TYPE grp_max = 0.001;
     INPUT0_TYPE max_value = INPUT0_VAL_MIN;
     INPUT0_TYPE min_value = INPUT0_VAL_MAX;
 
-    unroll_for (uint i = 0; i < K_HEAD_SIZE / SUBGROUP_SIZE; i++) {
+    unroll_for (uint i = 0; i < num_groups; i++) {
         input_data[i] = BLOCK_READN(INPUT0_TYPE, 1, in_data, in_data_offset + i * SUBGROUP_SIZE);
         max_value = fmax(max_value, input_data[i]);
         min_value = fmin(min_value, input_data[i]);
@@ -35,54 +36,7 @@ inline void FUNC(quantize_and_save_k)(__global const INPUT0_TYPE* in_data,
     INPUT0_TYPE zp = (INPUT1_TYPE)(zp_tmp);
     #undef ACCUMULATOR_TYPE
 
-    unroll_for (uint i = 0; i < K_HEAD_SIZE / SUBGROUP_SIZE; i++) {
-        OUTPUT_TYPE res = convert_char_rte(input_data[i] * scale + zp);
-
-        uint offset = out_data_offset + (i * SUBGROUP_SIZE + sglid) * out_data_pitch;
-        out_data[offset] = res;
-    }
-
-    INPUT0_TYPE* comp_ptr = out_data + comp_offset;
-
-    if (sglid == 0) {
-        comp_ptr[token_pos_in_block] = 1.0 / scale;
-        comp_ptr[PAGED_ATTENTION_BLOCK_SIZE + token_pos_in_block] = zp;
-    }
-}
-
-inline void FUNC(quantize_and_save_v)(__global const INPUT0_TYPE* in_data,
-                                    const uint in_data_offset,
-                                    __global OUTPUT_TYPE* out_data,
-                                    const uint out_data_offset,
-                                    const uint out_data_pitch,
-                                    const uint comp_offset,
-                                    const uint token_pos_in_block,
-                                    const uint sglid) {
-    INPUT0_TYPE input_data[V_HEAD_SIZE / SUBGROUP_SIZE];
-    INPUT0_TYPE grp_max = 0.001;
-    INPUT0_TYPE max_value = INPUT0_VAL_MIN;
-    INPUT0_TYPE min_value = INPUT0_VAL_MAX;
-
-    unroll_for (uint i = 0; i < V_HEAD_SIZE / SUBGROUP_SIZE; i++) {
-        input_data[i] = BLOCK_READN(INPUT0_TYPE, 1, in_data, in_data_offset + i * SUBGROUP_SIZE);
-        max_value = fmax(max_value, input_data[i]);
-        min_value = fmin(min_value, input_data[i]);
-    }
-
-    min_value = sub_group_reduce_min(min_value);
-    max_value = sub_group_reduce_max(max_value);
-
-    // If the range of input data is zero, it is adjusted to the minimum value(0.001).
-    #define ACCUMULATOR_TYPE float
-    ACCUMULATOR_TYPE diff_value = max_value == min_value ? (grp_max) : (max_value - min_value);
-    ACCUMULATOR_TYPE scale_tmp = (ACCUMULATOR_TYPE)((CHAR_MAX - CHAR_MIN) / diff_value);
-    ACCUMULATOR_TYPE zp_tmp = (ACCUMULATOR_TYPE)(-min_value * scale_tmp) + CHAR_MIN;
-    INPUT0_TYPE scale = (INPUT1_TYPE)(scale_tmp);
-    INPUT0_TYPE zp = (INPUT1_TYPE)(zp_tmp);
-    #undef ACCUMULATOR_TYPE
-
-
-    unroll_for (uint i = 0; i < V_HEAD_SIZE / SUBGROUP_SIZE; i++) {
+    unroll_for (uint i = 0; i < num_groups; i++) {
         OUTPUT_TYPE res = convert_char_rte(input_data[i] * scale + zp);
 
         uint offset = out_data_offset + (i * SUBGROUP_SIZE + sglid) * out_data_pitch;
@@ -178,11 +132,19 @@ KERNEL(pa_kv_cache_update)(
         }
 
 #else // IS_KV_COMPRESSED
-        // key processing
-        FUNC_CALL(quantize_and_save_k)(key_data, key_in_offset, key_cache_data, key_out_offset, PAGED_ATTENTION_BLOCK_SIZE, comp_k_offset, current_token_pos_in_block, sglid);
+        {
+            // key processing
+            INPUT0_TYPE input_data[K_HEAD_SIZE / SUBGROUP_SIZE];
+            FUNC_CALL(quantize_and_save)(key_data, key_in_offset, key_cache_data, key_out_offset, PAGED_ATTENTION_BLOCK_SIZE, comp_k_offset,
+                current_token_pos_in_block, sglid, K_HEAD_SIZE / SUBGROUP_SIZE, &input_data[0]);
+        }
 
-        // value processing
-        FUNC_CALL(quantize_and_save_v)(value_data, value_in_offset, value_cache_data, value_out_offset, 1, comp_v_offset, current_token_pos_in_block, sglid);
+        {
+            // value processing
+            INPUT0_TYPE input_data[V_HEAD_SIZE / SUBGROUP_SIZE];
+            FUNC_CALL(quantize_and_save)(value_data, value_in_offset, value_cache_data, value_out_offset, 1, comp_v_offset,
+                current_token_pos_in_block, sglid, V_HEAD_SIZE / SUBGROUP_SIZE, &input_data[0]);
+        }
 #endif // IS_KV_COMPRESSED
     } else {
         // 1st token
@@ -343,11 +305,19 @@ KERNEL(pa_kv_cache_update)(
             }
 
 #else // IS_KV_COMPRESSED
+            {
                 // key processing
-                FUNC_CALL(quantize_and_save_k)(key_data, key_in_offset, key_cache_data, key_out_offset, PAGED_ATTENTION_BLOCK_SIZE, comp_k_offset, token_num, sglid);
+                INPUT0_TYPE input_data[K_HEAD_SIZE / SUBGROUP_SIZE];
+                FUNC_CALL(quantize_and_save)(key_data, key_in_offset, key_cache_data, key_out_offset, PAGED_ATTENTION_BLOCK_SIZE,
+                    comp_k_offset, token_num, sglid, K_HEAD_SIZE / SUBGROUP_SIZE, &input_data[0]);
+            }
 
+            {
                 // value processing
-                FUNC_CALL(quantize_and_save_v)(value_data, value_in_offset, value_cache_data, value_out_offset, 1, comp_v_offset, token_num, sglid);
+                INPUT0_TYPE input_data[V_HEAD_SIZE / SUBGROUP_SIZE];
+                FUNC_CALL(quantize_and_save)(value_data, value_in_offset, value_cache_data, value_out_offset, 1,
+                    comp_v_offset, token_num, sglid, V_HEAD_SIZE / SUBGROUP_SIZE, &input_data[0]);
+            }
 #endif // IS_KV_COMPRESSED
 
                 key_in_offset += (KV_HEADS_NUM * K_HEAD_SIZE + INPUT0_PAD_AFTER_FEATURE_NUM + INPUT0_PAD_BEFORE_FEATURE_NUM);
@@ -379,14 +349,22 @@ KERNEL(pa_kv_cache_update)(
                         uint value_offset = value_out_offset + head_idx_index + sglid + SUBGROUP_SIZE * i;
                         value_cache_data[value_offset] = input_data;
                     }
-               }
+                }
 
 #else // IS_KV_COMPRESSED
-                // key processing
-                FUNC_CALL(quantize_and_save_k)(key_data, key_in_offset, key_cache_data, key_out_offset, PAGED_ATTENTION_BLOCK_SIZE, comp_k_offset, token_start_pos + token_num, sglid);
+                {
+                    // key processing
+                    INPUT0_TYPE input_data[K_HEAD_SIZE / SUBGROUP_SIZE];
+                    FUNC_CALL(quantize_and_save)(key_data, key_in_offset, key_cache_data, key_out_offset, PAGED_ATTENTION_BLOCK_SIZE,
+                        comp_k_offset, token_start_pos + token_num, sglid, K_HEAD_SIZE / SUBGROUP_SIZE, &input_data[0]);
+                }
 
-                // value processing
-                FUNC_CALL(quantize_and_save_v)(value_data, value_in_offset, value_cache_data, value_out_offset, 1, comp_v_offset, token_start_pos + token_num, sglid);
+                {
+                    // value processing
+                    INPUT0_TYPE input_data[V_HEAD_SIZE / SUBGROUP_SIZE];
+                    FUNC_CALL(quantize_and_save)(value_data, value_in_offset, value_cache_data, value_out_offset, 1,
+                        comp_v_offset, token_start_pos + token_num, sglid, V_HEAD_SIZE / SUBGROUP_SIZE, &input_data[0]);
+                }
 #endif // IS_KV_COMPRESSED
                 key_in_offset += (KV_HEADS_NUM * K_HEAD_SIZE + INPUT0_PAD_AFTER_FEATURE_NUM + INPUT0_PAD_BEFORE_FEATURE_NUM);
                 value_in_offset += (KV_HEADS_NUM * V_HEAD_SIZE + INPUT1_PAD_AFTER_FEATURE_NUM + INPUT1_PAD_BEFORE_FEATURE_NUM);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp
@@ -137,7 +137,6 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const {
 
     TransposedDimensionAccessHelperJit dims_q(params.inputs[0], params.input0_order);
     const auto num_heads = params.conf.is_paged_attention ? std::to_string(params.conf.heads_num) : dims_q.f();
-//    TransposedDimensionAccessHelperJit dims_v(params.inputs[2], params.input2_order);
     jit.AddConstant(MakeJitConstant("TARGET_SEQ_LEN", dims_q.y()));
     jit.AddConstant(MakeJitConstant("NUM_HEADS", num_heads));
     jit.AddConstant(MakeJitConstant("NUM_KV_HEADS", params.conf.kv_heads_num));
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp
@@ -254,11 +254,12 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag
     const auto& V = params.inputs[2];
 
     auto& out = params.outputs[0];
-    const auto head_size = params.conf.k_head_size;
-    const auto d_max = get_d_max(head_size);
+    const auto k_head_size = params.conf.k_head_size;
+    const auto v_head_size = params.conf.v_head_size;
+    const auto d_max = get_d_max(k_head_size);
     const Tensor::Dim n_keys = get_seq_length(params, K, params.input1_order);
     const Tensor::Dim n_queries = get_seq_length(params, Q, params.input0_order);
-    const Tensor::Dim n_values = Tensor::Dim(head_size);
+    const Tensor::Dim n_values = Tensor::Dim(v_head_size);
     const auto batch = out.Batch().v * out.Feature().v;
 
     /* Retrieve pre-tuned kernel configuration */
@@ -269,13 +270,15 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag
                         (V.GetDType() == Datatype::UINT8 || V.GetDType() == Datatype::INT8);
     switch (params.engineInfo.arch) {
         case gpu_arch::xe_hpg: {
-            config = choose_config_xehpg(static_cast<int32_t>(head_size), static_cast<int32_t>(n_keys.v), thin_q, is_quantized, params.conf.is_paged_attention);
+            config = choose_config_xehpg(static_cast<int32_t>(k_head_size), static_cast<int32_t>(n_keys.v), thin_q,
+                is_quantized, params.conf.is_paged_attention);
             break;
         }
         case gpu_arch::xe_hpc:
         case gpu_arch::xe2:
         case gpu_arch::xe3: {
-            config = choose_config_xehpc(static_cast<int32_t>(head_size), static_cast<int32_t>(n_keys.v), thin_q, is_quantized, params.conf.is_paged_attention);
+            config = choose_config_xehpc(static_cast<int32_t>(k_head_size), static_cast<int32_t>(n_keys.v), thin_q,
+                is_quantized, params.conf.is_paged_attention);
             break;
         }
         default: break;
@@ -334,7 +337,7 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag
 
     problem_kq.B.layout = micro::MatrixLayout::Pr;
     problem_kq.C.layout = micro::MatrixLayout::T;
-    problem_kq.A.setAlignment(micro::alignment_for_ld(head_size * problem.Ta));
+    problem_kq.A.setAlignment(micro::alignment_for_ld(k_head_size * problem.Ta));
     problem_kq.B.setAlignment(64); // Q is packed in VNNI format in SLM
     problem_kq.B.crosspack = 2;
     problem_kq.B.tileR = d_max;
@@ -344,7 +347,7 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag
     micro::SizeParams sizes;
     sizes.m = static_cast<int64_t>(n_keys.v);
     sizes.n = static_cast<int64_t>(n_queries.v);
-    sizes.k = static_cast<int64_t>(head_size);
+    sizes.k = static_cast<int64_t>(k_head_size);
     sizes.batch = static_cast<int64_t>(batch);
 
     /* Set up microkernel requirements */
@@ -390,7 +393,7 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag
     }
 
     if (params.conf.is_kv_compressed) {
-        problem_vs.aqGroupM = (vs_common_scales || vs_common_zp) ? 1 : micro::rnd_up_pow2(params.conf.k_head_size);
+        problem_vs.aqGroupM = (vs_common_scales || vs_common_zp) ? 1 : micro::rnd_up_pow2(v_head_size);
         problem_vs.aqGroupK = 1;
     }
 
@@ -399,7 +402,7 @@ void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Packag
 
     problem_vs.B.layout = micro::MatrixLayout::Pr;
     problem_vs.C.layout = micro::MatrixLayout::N;
-    problem_vs.A.setAlignment(micro::alignment_for_ld(head_size * problem.Ta));
+    problem_vs.A.setAlignment(micro::alignment_for_ld(v_head_size * problem.Ta));
     problem_vs.B.setAlignment(64); // S is packed in SLM
     problem_vs.B.crosspack = 16;
     sizes.m = static_cast<int64_t>(n_values.v);
@@ -474,6 +477,9 @@ bool SDPAKernelMicro::Validate(const Params& p) const {
     if (params.conf.k_head_size > 256)
         return false;
 
+    if (params.conf.v_head_size > 256)
+        return false;
+
     // TODO: To support sdpa_micro kernel with non-const scalar mask / scale inputs
     if (!params.conf.is_paged_attention) {
         const auto mask_idx = 3lu;
@@ -512,18 +518,18 @@ JitConstants SDPAKernelMicro::GetJitConstants(const sdpa_params& params, const m
     const auto& K = prim_params.inputs[1];
     const auto& V = prim_params.inputs[2];
 
-    const auto head_size = prim_params.conf.k_head_size;
-    const auto v_head_size = prim_params.conf.k_head_size;
+    const auto k_head_size = prim_params.conf.k_head_size;
+    const auto v_head_size = prim_params.conf.v_head_size;
 
-    auto ldq = head_size * Q.ElementSize();
-    auto ldk = head_size * K.ElementSize();
+    auto ldq = k_head_size * Q.ElementSize();
+    auto ldk = k_head_size * K.ElementSize();
     auto ldv = v_head_size * V.ElementSize();
-    auto lda = head_size * prim_params.outputs[0].ElementSize();
+    auto lda = k_head_size * prim_params.outputs[0].ElementSize();
 
-    const auto d_max = get_d_max(head_size);
+    const auto d_max = get_d_max(k_head_size);
     const auto n_keys = get_seq_length(params, K, prim_params.input1_order);
     const auto n_queries = get_seq_length(params, Q, prim_params.input0_order);
-    const auto n_values = Tensor::Dim(head_size);
+    const auto n_values = Tensor::Dim(v_head_size);
 
     auto data_inputs = params.inputs.size();
     if (params.conf.is_paged_attention)
@@ -533,7 +539,7 @@ JitConstants SDPAKernelMicro::GetJitConstants(const sdpa_params& params, const m
     jit.AddConstant(MakeJitConstant("SUBGROUP_SIZE", subgroup_size(prim_params.engineInfo.arch)));
     jit.AddConstant(MakeJitConstant("INVERT_SCALE", false));
     jit.AddConstant(MakeJitConstant("SCALE_DATA_T", "half"));
-    jit.AddConstant(MakeJitConstant("HEAD_SIZE", head_size));
+    jit.AddConstant(MakeJitConstant("HEAD_SIZE", k_head_size));
 
     size_t attn_input_idx = 3;
     size_t scale_input_idx = 4;
@@ -616,8 +622,8 @@ JitConstants SDPAKernelMicro::GetJitConstants(const sdpa_params& params, const m
     int tile_q = gemm_kq.getSetting("wg_tile_n");
     int tile_v = gemm_vs.getSetting("wg_tile_m");
 
-    bool d_full = (head_size == d_max);
-    bool v_full = (head_size == tile_v);
+    bool d_full = (k_head_size == d_max);
+    bool v_full = (v_head_size == tile_v);
     bool k_full = !n_keys.is_dynamic && (n_keys.v % tile_k) == 0;
     bool q_full = !n_queries.is_dynamic && (n_queries.v % tile_q) == 0;
 
@@ -814,11 +820,11 @@ clKernelData SDPAKernelMicro::get_kernel_data(const sdpa_params& params, bool is
     const auto n_queries = get_seq_length(params, Q, params.input0_order);
     const auto n_keys = get_seq_length(params, K, params.input1_order);
 
-    auto head_size = params.conf.k_head_size;
+    auto k_head_size = params.conf.k_head_size;
 
     ScalarDescriptor s_d;
     s_d.t = ScalarDescriptor::Types::INT32;
-    s_d.v.s32 = static_cast<uint32_t>(head_size);
+    s_d.v.s32 = static_cast<uint32_t>(k_head_size);
 
     ScalarDescriptor s_k;
     s_k.t = ScalarDescriptor::Types::INT32;
@@ -890,11 +896,11 @@ void SDPAKernelMicro::GetUpdateDispatchDataFunc(KernelData& kd) const {
         const auto n_queries = get_seq_length(prim_params, Q, prim_params.input0_order);
         const auto n_keys = get_seq_length(prim_params, K, prim_params.input1_order);
 
-        auto head_size = prim_params.conf.k_head_size;
+        auto k_head_size = prim_params.conf.k_head_size;
 
         ScalarDescriptor s_d;
         s_d.t = ScalarDescriptor::Types::INT32;
-        s_d.v.s32 = static_cast<uint32_t>(head_size);
+        s_d.v.s32 = static_cast<uint32_t>(k_head_size);
 
         ScalarDescriptor s_k;
         s_k.t = ScalarDescriptor::Types::INT32;
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_ref.cpp
@@ -49,8 +49,8 @@ JitConstants SDPAKernelRef::GetJitConstants(const sdpa_params& params) const {
         jit.AddConstant(MakeJitConstant("STATIC_SCALE_VALUE", params.conf.scale_val));
         jit.AddConstant(MakeJitConstant("STATIC_SCALE_VALUE_INV", 1.0f / params.conf.scale_val));
     } else {
-        jit.AddConstant(MakeJitConstant("STATIC_SCALE_VALUE_INV", std::sqrt(static_cast<float>(params.conf.head_size))));
-        jit.AddConstant(MakeJitConstant("STATIC_SCALE_VALUE", 1.0f / std::sqrt(static_cast<float>(params.conf.head_size))));
+        jit.AddConstant(MakeJitConstant("STATIC_SCALE_VALUE_INV", std::sqrt(static_cast<float>(params.conf.k_head_size))));
+        jit.AddConstant(MakeJitConstant("STATIC_SCALE_VALUE", 1.0f / std::sqrt(static_cast<float>(params.conf.k_head_size))));
     }
 
     return jit;