-
Notifications
You must be signed in to change notification settings - Fork 3.1k
[GPU] Add the capability for KV cache to update past KV #33114
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
5a95cc6
84d8095
ab5be7c
a0ed479
1c72628
4d67227
ff94cfe
c85841f
817c983
b9d5f30
07c75d8
1586380
d338bcc
648a5fc
d7043fe
145e0f5
e01cedf
4c2d73a
9db4cc7
73a739e
7914ddd
8e74647
9bfb862
5326963
f321f9b
ea925ed
6cc91a1
561bc54
d836186
6344883
4d06b8e
644cc75
abcdf72
a1ecd57
f8f1a58
3d65d54
313a748
c429feb
6781639
6a3bbd2
3df8bac
765e167
0dc75bc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,4 +1,4 @@ | ||||||
| // Copyright (C) 2023-2024 Intel Corporation | ||||||
| // Copyright (C) 2023-2025 Intel Corporation | ||||||
| // SPDX-License-Identifier: Apache-2.0 | ||||||
| // | ||||||
|
|
||||||
|
|
@@ -16,6 +16,8 @@ | |||||
| #include "beam_table_update/beam_table_update_kernel_ref.hpp" | ||||||
| #include "dynamic_quantize/dynamic_quantize_kernel_selector.h" | ||||||
| #include "dynamic_quantize/dynamic_quantize_kernel_kv_cache.h" | ||||||
| #include "reorder_kv_cache/reorder_kv_cache_kernel_selector.hpp" | ||||||
| #include "reorder_kv_cache/reorder_kv_cache_kernel_ref.hpp" | ||||||
| #include "openvino/core/dimension.hpp" | ||||||
|
|
||||||
| #include <limits.h> | ||||||
|
|
@@ -71,34 +73,38 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> { | |||||
| return make_deep_copy<kv_cache_impl, kernel_params_t>(*this); | ||||||
| } | ||||||
|
|
||||||
| const size_t concat_stage = 0; | ||||||
| const size_t beam_table_stage = 1; | ||||||
| const size_t dq_stage = 2; | ||||||
| const size_t scale_concat_stage = 3; | ||||||
| const size_t zp_concat_stage = 4; | ||||||
| const size_t reorder_trim_stage = 0; | ||||||
| const size_t concat_stage = 1; | ||||||
| const size_t beam_table_stage = 2; | ||||||
| const size_t dq_stage = 3; | ||||||
| const size_t scale_concat_stage = 4; | ||||||
| const size_t zp_concat_stage = 5; | ||||||
|
|
||||||
| cldnn::memory::ptr beam_table_prev = nullptr; | ||||||
| cldnn::memory::ptr beam_table_new = nullptr; | ||||||
| size_t indirect_offset = 0; | ||||||
|
|
||||||
| void load(BinaryInputBuffer& ib) override { | ||||||
| parent::load(ib); | ||||||
| if (is_dynamic()) { | ||||||
| auto& kernel_selector = kernel_selector_t::Instance(); | ||||||
| auto kernel_impl = kernel_selector.GetImplementation(_kernels_data[concat_stage].kernelName); | ||||||
| kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[concat_stage]); | ||||||
| if (_kernels_data.size() >= 2) { | ||||||
| auto reorder_kernel_impl = kernel_selector.GetImplementation(_kernels_data[reorder_trim_stage].kernelName); | ||||||
| reorder_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[reorder_trim_stage]); | ||||||
| auto concat_kernel_impl = kernel_selector.GetImplementation(_kernels_data[concat_stage].kernelName); | ||||||
| concat_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[concat_stage]); | ||||||
| if (_kernels_data.size() >= 3) { | ||||||
| auto& bt_kernel_selector = bt_kernel_selector_t::Instance(); | ||||||
| auto bt_kernel_impl = bt_kernel_selector.GetImplementation(_kernels_data[beam_table_stage].kernelName); | ||||||
| bt_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[beam_table_stage]); | ||||||
| } | ||||||
|
|
||||||
| if (_kernels_data.size() >= 3) { | ||||||
| if (_kernels_data.size() >= 4) { | ||||||
| auto& dq_kernel_selector = dq_kernel_selector_t::Instance(); | ||||||
| auto dq_kernel_impl = dq_kernel_selector.GetImplementation(_kernels_data[dq_stage].kernelName); | ||||||
| dq_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[dq_stage]); | ||||||
| } | ||||||
|
|
||||||
| if (_kernels_data.size() >= 4) { | ||||||
| if (_kernels_data.size() >= 5) { | ||||||
| auto& scale_zp_concat_kernel_selector = kernel_selector_t::Instance(); | ||||||
| auto scale_zp_concat_kernel_impl = scale_zp_concat_kernel_selector.GetImplementation(_kernels_data[scale_concat_stage].kernelName); | ||||||
| scale_zp_concat_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[scale_concat_stage]); | ||||||
|
|
@@ -112,7 +118,10 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> { | |||||
| // output buffers order: [current, (beam_table), (current_scale), (current_zp)] | ||||||
| kernel_arguments_data args; | ||||||
| args.shape_info = instance.shape_info_memory_ptr(); | ||||||
| if (stage == concat_stage) { | ||||||
| if (stage == reorder_trim_stage) { | ||||||
| args.inputs = { instance.input_memory_ptr(0), instance.input_memory_ptr(3 + indirect_offset), instance.input_memory_ptr(4 + indirect_offset) }; | ||||||
| args.outputs = {instance.input_memory_ptr(0)}; | ||||||
| } else if (stage == concat_stage) { | ||||||
| args.inputs = { instance.input_memory_ptr(0), instance.input_memory_ptr(1) }; | ||||||
| args.outputs = { instance.output_memory_ptr(0) }; | ||||||
| } else if (stage == beam_table_stage) { | ||||||
|
|
@@ -186,10 +195,17 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> { | |||||
| const auto& desc = instance.get_typed_desc<kv_cache>(); | ||||||
| auto& variable = instance.get_network().get_variable(desc->variable_info.variable_id); | ||||||
| std::vector<event::ptr> res_events; | ||||||
| const auto& impl_param = *instance.get_impl_params(); | ||||||
|
|
||||||
| if (impl_param.input_layouts.size() >= 3) { | ||||||
| indirect_offset = desc->indirect ? 1 : 0; | ||||||
| if (instance.input_memory_ptr(0) && instance.input_memory_ptr(3 + indirect_offset)->size()) { | ||||||
| execute_stage(events, instance, res_events, reorder_trim_stage); | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| execute_stage(events, instance, res_events, concat_stage); | ||||||
|
|
||||||
| const auto& impl_param = *instance.get_impl_params(); | ||||||
| const auto& kv_in_shape = impl_param.input_layouts[0].get_partial_shape(); | ||||||
| const auto& kv_out_shape = impl_param.output_layouts[0].get_partial_shape(); | ||||||
| if (desc->indirect && ((kv_out_shape[desc->gather_axis].get_length() > 1) || | ||||||
|
|
@@ -296,6 +312,38 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> { | |||||
| return layout{beam_table_shape, impl_param.output_layouts[1].data_type, format::get_default_format(beam_table_shape.size())}; | ||||||
| } | ||||||
|
|
||||||
| static kernel_selector::reorder_kv_cache_params get_reorder_trim_kernel_params(const kernel_impl_params& impl_param, bool is_shape_agnostic = false) { | ||||||
| const auto& primitive = impl_param.typed_desc<kv_cache>(); | ||||||
|
||||||
| const auto& primitive = impl_param.typed_desc<kv_cache>(); | |
| auto primitive = impl_param.typed_desc<kv_cache>(); |
Kotomi-Du marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,7 +22,13 @@ struct typed_program_node<kv_cache> : public typed_program_node_base<kv_cache> { | |
|
|
||
| program_node& input() const { return get_dependency(0); } | ||
|
|
||
| std::vector<size_t> get_shape_infer_dependencies() const override { return {}; } | ||
| std::vector<size_t> get_shape_infer_dependencies() const override { | ||
| std::vector<size_t> vec; | ||
| for (size_t i = 1; i < get_dependencies().size(); i++) { | ||
|
||
| vec.push_back(i); | ||
| } | ||
| return vec; | ||
| } | ||
|
|
||
| std::vector<layout> get_shape_info_input_layouts() const override { | ||
| std::vector<layout> res; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,6 +10,7 @@ | |
| #include "primitive_type_base.h" | ||
| #include <sstream> | ||
| #include <json_object.h> | ||
| #include "utils.hpp" | ||
|
|
||
| namespace cldnn { | ||
| GPU_DEFINE_PRIMITIVE_TYPE_ID(kv_cache) | ||
|
|
@@ -31,6 +32,17 @@ std::vector<layout> kv_cache_inst::calc_output_layouts(kv_cache_node const& /*no | |
|
|
||
| std::vector<ShapeType> input_shapes = {impl_param.get_input_layout(0).get<ShapeType>(), | ||
| impl_param.get_input_layout(1).get<ShapeType>()}; | ||
|
|
||
| std::unordered_map<size_t, ov::Tensor> const_data; | ||
| if (desc->trim) { | ||
| if(impl_param.memory_deps.count(2) > 0) | ||
|
||
| { | ||
| auto past_seq_len_mem = impl_param.memory_deps.at(2); | ||
| cldnn::mem_lock<uint8_t, mem_lock_type::read> past_seq_len_mem_lock(past_seq_len_mem, impl_param.get_stream()); | ||
| const_data.emplace(1, make_tensor(past_seq_len_mem->get_layout(), past_seq_len_mem_lock.data())); | ||
| } | ||
| } | ||
|
|
||
| if (desc->indirect) { | ||
| input_shapes.push_back(impl_param.get_input_layout(2).get<ShapeType>()); | ||
| } | ||
|
|
@@ -50,14 +62,44 @@ std::vector<layout> kv_cache_inst::calc_output_layouts(kv_cache_node const& /*no | |
| op.set_concat_axis(desc->concat_axis); | ||
| op.set_gather_axis(desc->gather_axis); | ||
| op.set_quantization_attrs(desc->quantization_attributes); | ||
| if (desc->trim) { | ||
| if (auto past_dim_updated = ov::op::get_input_const_data_as<ov::PartialShape, int64_t>(&op, 1, ov::make_tensor_accessor(const_data))) { | ||
| auto past_dim_stored = input_shapes[0][desc->concat_axis]; | ||
| if (past_dim_stored.is_static()) { | ||
| auto trim_length = past_dim_stored.get_length() - (*past_dim_updated)[0]; | ||
| if (trim_length > 0) { | ||
| op.set_trim_length(static_cast<uint64_t>(trim_length)); | ||
| impl_param.kv_cache_trim_length = trim_length; | ||
| } else { | ||
| op.set_trim_length(static_cast<uint64_t>(0)); | ||
| impl_param.kv_cache_trim_length = 0; | ||
|
|
||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| output_shapes = shape_infer(&op, input_shapes); | ||
| } else { | ||
| ov::intel_gpu::op::KVCache op; | ||
| op.set_output_size(desc->num_outputs); | ||
| op.set_concat_axis(desc->concat_axis); | ||
| op.set_gather_axis(desc->gather_axis); | ||
|
|
||
| if (desc->trim) { | ||
| if (auto past_dim_updated = ov::op::get_input_const_data_as<ov::PartialShape, int64_t>(&op, 1, ov::make_tensor_accessor(const_data))) { | ||
| auto past_dim_stored = input_shapes[0][desc->concat_axis]; | ||
| if (past_dim_stored.is_static()) { | ||
| auto trim_length = past_dim_stored.get_length() - (*past_dim_updated)[0]; | ||
| if (trim_length > 0) { | ||
| op.set_trim_length(static_cast<uint64_t>(trim_length)); | ||
| impl_param.kv_cache_trim_length = trim_length; | ||
| } else { | ||
| op.set_trim_length(static_cast<uint64_t>(0)); | ||
| impl_param.kv_cache_trim_length = 0; | ||
| } | ||
| } | ||
| } | ||
| } | ||
| output_shapes = shape_infer(&op, input_shapes); | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not a right place to add such kv-cache specific field.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you suggest any other place to put this? Here is the investigation on our side.
The table below shows all the available KVCache-related files, but none of them is a suitable place for this parameter, which needs to be updated on every iteration at runtime.
Specifically, for
`kv_cache_inst.h`, `kv_cache_inst::trim_length` couldn't be updated in the static function `calc_output_layout()`. It also doesn't make sense to make `kv_cache_inst::trim_length` static to work around this, because it would lead to a data race across multiple KV-cache instances or threads. Furthermore,
`kernel_impl_params.h` also includes other op-specific variables with a TODO comment (prior-box). So, it seems acceptable in our case as well.
`ov::intel_gpu::op::KVCache` — src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp; `cldnn::kv_cache` — src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp; `cldnn::typed_primitive_inst<kv_cache>` — src/plugins/intel_gpu/src/graph/include/kv_cache_inst.h
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What about introducing a separate method in kv_cache_inst.h? You can make a non-static method and use it to store the information in primitive_inst. Then this API can simply be called after shape inference. You should not place this field in kernel_impl_params.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated implementation looks good for trim_length.