Merged
43 commits
5a95cc6
fuse GQA slice node into kvCache for in-place crop
Kotomi-Du Nov 25, 2025
84d8095
fix conformance issue
Kotomi-Du Dec 2, 2025
ab5be7c
Use RemoteTensor to reorder KV cache
mdvoretc-intel Nov 17, 2025
a0ed479
Add a kernel to reorder KV cache
mdvoretc-intel Nov 19, 2025
1c72628
Add KVCache index fusion for reorder
mdvoretc-intel Nov 20, 2025
4d67227
Fix basic issues
mdvoretc-intel Nov 21, 2025
ff94cfe
Prevent KV reorder execution for cases where it's not required
mdvoretc-intel Nov 21, 2025
c85841f
Fix scalar arguments bug, remove debug prints
mdvoretc-intel Nov 27, 2025
817c983
Remove unused gather_by_axis code
mdvoretc-intel Dec 2, 2025
b9d5f30
Fix input offsets
mdvoretc-intel Dec 3, 2025
07c75d8
Add unit test case
mdvoretc-intel Dec 3, 2025
1586380
Add feature bounds check
mdvoretc-intel Dec 3, 2025
d338bcc
clean up code
Kotomi-Du Dec 4, 2025
648a5fc
clean up execution stage
Kotomi-Du Dec 5, 2025
d7043fe
use scatterElementUpdate kernel instead of self customized kernel
Kotomi-Du Dec 5, 2025
145e0f5
delete customized kernel path
Kotomi-Du Dec 5, 2025
e01cedf
clean up code
Kotomi-Du Dec 6, 2025
4c2d73a
fix code style
Kotomi-Du Dec 10, 2025
9db4cc7
adjust index for compressed KV stage when update_kv stage exists
Kotomi-Du Dec 10, 2025
73a739e
refactor tests, merge duplicated code
ZackyLake Dec 18, 2025
7914ddd
refactor kvcache stage.
ZackyLake Dec 18, 2025
8e74647
remove update_kv logic on compress kv.
ZackyLake Dec 30, 2025
9bfb862
remove indirect support on kv_update due to lack of test.
ZackyLake Dec 30, 2025
5326963
add debug print for skipped kernel
ZackyLake Jan 7, 2026
f321f9b
Merge branch 'master' into update_kvcache_node
ZackyLake Jan 7, 2026
ea925ed
Merge branch 'master' into update_kvcache_node
Kotomi-Du Jan 7, 2026
6cc91a1
fix kv fusion pattern.
ZackyLake Jan 10, 2026
561bc54
Merge branch 'master' into update_kvcache_node
ZackyLake Jan 10, 2026
d836186
fix test
ZackyLake Jan 12, 2026
6344883
move trim_length to kv_cache_inst.h
Kotomi-Du Jan 7, 2026
4d06b8e
fix kv fusion for test(stridedslice)
ZackyLake Jan 15, 2026
644cc75
include trim-only support
ZackyLake Jan 15, 2026
abcdf72
fix concat_axis signness
ZackyLake Jan 15, 2026
a1ecd57
fix fusion logic
ZackyLake Jan 15, 2026
f8f1a58
fix signedness
ZackyLake Jan 15, 2026
3d65d54
allow trim on indirect kvcache.
ZackyLake Jan 16, 2026
313a748
Merge branch 'master' into update_kvcache_node
Kotomi-Du Jan 16, 2026
c429feb
Make CompressedKV compatible with trim.
ZackyLake Jan 17, 2026
6781639
fix
ZackyLake Jan 17, 2026
6a3bbd2
Merge branch 'master' into update_kvcache_node
Kotomi-Du Jan 17, 2026
3df8bac
Merge branch 'master' into update_kvcache_node
Kotomi-Du Jan 18, 2026
765e167
Merge branch 'master' into update_kvcache_node
Kotomi-Du Jan 19, 2026
0dc75bc
add comment
ZackyLake Jan 20, 2026
@@ -131,6 +131,9 @@ ov::OutputVector ov::pass::GroupQueryAttentionDecomposition::decompose(
v0::Constant::create(ov::element::i64, ov::Shape{1}, {past_key.get_partial_shape()[2].get_length()}));
past_key = register_new_node<v8::Slice>(past_key, current_kv_len_const, past_kv_len_const, one, two);
past_value = register_new_node<v8::Slice>(past_value, current_kv_len_const, past_kv_len_const, one, two);
} else {
past_key = register_new_node<v8::Slice>(past_key, zero, past_seqlen, one, two);
past_value = register_new_node<v8::Slice>(past_value, zero, past_seqlen, one, two);
}
K = construct_kv_cache(past_key, K);
V = construct_kv_cache(past_value, V);
40 changes: 40 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/op/kv_cache.hpp
@@ -33,6 +33,34 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension {
int64_t gather_axis,
const ov::element::Type output_type = ov::element::dynamic);

/// KVCache with seq_len trimming
KVCache(const Output<Node>& past,
const Output<Node>& new_token_data,
const Output<Node>& past_seq_len,
const std::shared_ptr<ov::op::util::Variable>& past_values,
int64_t concat_axis,
const ov::element::Type output_type = ov::element::dynamic);

/// KVCache with seq_len trimming and beam_idx
KVCache(const Output<Node>& past,
const Output<Node>& new_token_data,
const Output<Node>& beam_idx,
const Output<Node>& past_seq_len,
const std::shared_ptr<ov::op::util::Variable>& past_values,
int64_t concat_axis,
int64_t gather_axis,
const ov::element::Type output_type = ov::element::dynamic);

/// KVCache with update&trimming for tree-based speculative decoding
KVCache(const Output<Node>& past,
const Output<Node>& new_token_data,
const Output<Node>& past_seq_len,
const Output<Node>& dst_idx,
const Output<Node>& update_data,
const std::shared_ptr<ov::op::util::Variable>& past_values,
int64_t concat_axis,
const ov::element::Type output_type = ov::element::dynamic);

bool visit_attributes(ov::AttributeVisitor& visitor) override;

void validate_and_infer_types() override;
@@ -51,18 +79,30 @@ class KVCache : public ov::op::Op, public ov::op::util::VariableExtension {
void set_gather_axis(int64_t axis) { m_gather_axis = axis; }

bool get_indirect() const { return m_indirect; }
bool get_trim() const { return m_trim; }
bool get_update_kv() const { return m_update_kv; }

void set_trim(bool trim) { m_trim = trim; }
void set_update_kv(bool update_kv) { m_update_kv = update_kv; }

int64_t get_trim_length() const { return m_trim_length; }
void set_trim_length(int64_t trim_length) { m_trim_length = trim_length; }

protected:
KVCache(const OutputVector& inputs,
const std::shared_ptr<ov::op::util::Variable>& past_values,
bool indirect,
bool trim,
int64_t concat_axis,
int64_t gather_axis,
const ov::element::Type output_type = ov::element::dynamic);

int64_t m_concat_axis = 0;
int64_t m_gather_axis = 0;
bool m_indirect = false;
bool m_trim = false;
bool m_update_kv = false;
int64_t m_trim_length = 0;

ov::element::Type m_output_type;
};
@@ -21,6 +21,7 @@ class KVCacheCompressed : public ov::intel_gpu::op::KVCache {

KVCacheCompressed(const OutputVector& inputs,
const std::shared_ptr<ov::op::util::Variable>& past_values,
bool trim,
int64_t concat_axis,
int64_t gather_axis,
const QuantizationAttrs& quantization_attrs,
23 changes: 20 additions & 3 deletions src/plugins/intel_gpu/include/intel_gpu/primitives/kv_cache.hpp
@@ -27,18 +27,27 @@ struct kv_cache : public primitive_base<kv_cache> {
const ov::op::util::VariableInfo& variable_info,
const int64_t concat_axis,
const int64_t gather_axis,
const bool indirect)
const bool indirect,
const bool trim,
const bool update_kv)
: primitive_base(id, inputs)
, variable_info(variable_info)
, concat_axis(concat_axis)
, gather_axis(gather_axis)
, indirect(indirect) {}
, indirect(indirect)
, trim(trim)
, update_kv(update_kv) {
if (update_kv) {
OPENVINO_ASSERT(trim, "update_kv must use trim");
}
}

ov::op::util::VariableInfo variable_info;
int64_t concat_axis = 0;
int64_t gather_axis = 0;
bool indirect = false;

bool trim = false;
bool update_kv = false;
bool compressed = false;
QuantizationAttributes quantization_attributes;

@@ -47,6 +56,8 @@ struct kv_cache : public primitive_base<kv_cache> {
seed = hash_combine(seed, concat_axis);
seed = hash_combine(seed, gather_axis);
seed = hash_combine(seed, indirect);
seed = hash_combine(seed, trim);
seed = hash_combine(seed, update_kv);
seed = hash_combine(seed, compressed);
seed = hash_range(seed, quantization_attributes.scales_zp_output_order.begin(), quantization_attributes.scales_zp_output_order.end());
seed = hash_range(seed, quantization_attributes.group_sizes.begin(), quantization_attributes.group_sizes.end());
@@ -69,6 +80,8 @@ struct kv_cache : public primitive_base<kv_cache> {
concat_axis == rhs_casted.concat_axis &&
gather_axis == rhs_casted.gather_axis &&
indirect == rhs_casted.indirect &&
trim == rhs_casted.trim &&
update_kv == rhs_casted.update_kv &&
compressed == rhs_casted.compressed &&
quantization_attributes.scales_zp_output_order == rhs_casted.quantization_attributes.scales_zp_output_order &&
quantization_attributes.output_storage_type == rhs_casted.quantization_attributes.output_storage_type &&
@@ -88,6 +101,8 @@ struct kv_cache : public primitive_base<kv_cache> {
ob << concat_axis;
ob << gather_axis;
ob << indirect;
ob << trim;
ob << update_kv;
ob << compressed;
ob << make_data(&quantization_attributes.quantization_type, sizeof(quantization_attributes.quantization_type));
ob << make_data(&quantization_attributes.quantization_dt, sizeof(quantization_attributes.quantization_dt));
@@ -110,6 +125,8 @@ struct kv_cache : public primitive_base<kv_cache> {
ib >> concat_axis;
ib >> gather_axis;
ib >> indirect;
ib >> trim;
ib >> update_kv;
ib >> compressed;
ib >> make_data(&quantization_attributes.quantization_type, sizeof(quantization_attributes.quantization_type));
ib >> make_data(&quantization_attributes.quantization_dt, sizeof(quantization_attributes.quantization_dt));
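The hash, equality, save, and load changes above all fold the new `trim` and `update_kv` flags into the primitive's identity, so kernels cached for one flag combination are never reused for another. A minimal sketch of that field-by-field seed chaining, assuming the common boost-style `hash_combine` formula (the actual cldnn helper may use different constants):

```cpp
#include <cstddef>
#include <cstdint>
#include <functional>

// Boost-style hash_combine: fold one value into a running seed.
// Assumption: cldnn's hash_combine follows this well-known formula;
// only the chaining pattern matters for the sketch.
inline size_t hash_combine(size_t seed, size_t value) {
    return seed ^ (std::hash<size_t>{}(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}

// Illustrative mirror of kv_cache::hash(): every field that affects kernel
// selection must be folded in, in a fixed order, or two differently
// configured primitives could collide in the kernel cache.
struct kv_cache_key {
    int64_t concat_axis = 0;
    int64_t gather_axis = 0;
    bool indirect = false;
    bool trim = false;
    bool update_kv = false;

    size_t hash() const {
        size_t seed = 0;
        seed = hash_combine(seed, static_cast<size_t>(concat_axis));
        seed = hash_combine(seed, static_cast<size_t>(gather_axis));
        seed = hash_combine(seed, static_cast<size_t>(indirect));
        seed = hash_combine(seed, static_cast<size_t>(trim));
        seed = hash_combine(seed, static_cast<size_t>(update_kv));
        return seed;
    }
};
```

Note that `operator==`, `save`, and `load` must list the same fields in the same order; forgetting one of the three is a classic source of stale-cache bugs.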
@@ -444,15 +444,37 @@ bool crop_in_place_optimization::can_crop_be_optimized_simple_data_format(const
}

static bool can_read_value_be_optimize(const read_value_node& node) {
std::unordered_set<const cldnn::program_node*> unique_users(node.get_users().begin(), node.get_users().end());
if (unique_users.size() == 1)
std::unordered_set<const cldnn::program_node*> unique_users;
for (const auto user : node.get_users()) {
if (!user->is_type<shape_of>()) {
unique_users.insert(user);
}
}
if (unique_users.size() <= 1)
return true;

// The following pattern can be optimized; otherwise it could lead to corrupted data.
// readvalue's users eventually have to pass through kvcache before assign, which makes the kvcache node
// the dominator of the assign node, so it can safely be treated as if readvalue were directly connected to kvcache.
// readvalue --> any
// | |
// | v
// ------> kvcache
Contributor: Could you elaborate more on why/how it can be optimized?

Contributor: If read_value is not optimized, we get incorrect results in scatterelementupdate, so some change here is needed.

The original code simply checked whether readvalue has a single user; to be honest, I am not sure that proves anything: that user could actually be a no-op with multiple further users.

From the comment in its caller, it looks like the check is really trying to ensure that assign will not impact any subsequent user of readvalue, so the original logic was already not very convincing.

In any case, for our pattern, readvalue's user eventually has to pass through kvcache before assign, which makes the kvcache node the dominator of the assign node. It can therefore safely be treated as if readvalue were directly connected to kvcache, and can be optimized.

Contributor: Actually, my ask here was to add a comment on "why/how". As it is not blocking the code merge, could you follow up in a separate PR?
if (unique_users.size() == 2) {
const auto user0 = *unique_users.begin();
const auto user1 = *(++unique_users.begin());
const bool is_user0_kvcache = user0->is_type<kv_cache>();
const auto kvcache = is_user0_kvcache ? user0 : (user1->is_type<kv_cache>() ? user1 : nullptr);
if (kvcache) {
const auto other_user = is_user0_kvcache ? user1 : user0;
const bool only_used_by_kvcache = std::none_of(other_user->get_users().begin(), other_user->get_users().end(), [kvcache](const auto user) {
return user != kvcache && !user->template is_type<shape_of>();
});
if (only_used_by_kvcache) {
return true;
}
}
}

return false;
}
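The relaxed two-user check can be sketched on a toy node graph; `node`, `is_kv_cache`, `is_shape_of`, and `can_optimize` here are illustrative stand-ins for `cldnn::program_node` and its real type queries:

```cpp
#include <algorithm>
#include <vector>

// Toy graph node: flags replace cldnn's is_type<kv_cache>() / is_type<shape_of>().
struct node {
    bool is_kv_cache = false;
    bool is_shape_of = false;
    std::vector<const node*> users;
};

// Sketch of the relaxed read_value in-place check: a read_value with two
// (non-shape_of) users is still optimizable when the non-kv_cache user
// ultimately feeds only the kv_cache node, because kv_cache then dominates
// the assign and no user can observe stale variable data.
bool can_optimize(const node& read_value) {
    std::vector<const node*> unique_users;
    for (const node* user : read_value.users) {
        if (!user->is_shape_of &&
            std::find(unique_users.begin(), unique_users.end(), user) == unique_users.end())
            unique_users.push_back(user);
    }
    if (unique_users.size() <= 1)
        return true;
    if (unique_users.size() == 2) {
        const node* kvcache = unique_users[0]->is_kv_cache ? unique_users[0]
                            : unique_users[1]->is_kv_cache ? unique_users[1]
                            : nullptr;
        if (kvcache != nullptr) {
            const node* other = (unique_users[0] == kvcache) ? unique_users[1] : unique_users[0];
            // The other user may only reach the kv_cache node (shape_of users are ignored).
            return std::none_of(other->users.begin(), other->users.end(),
                                [kvcache](const node* u) { return u != kvcache && !u->is_shape_of; });
        }
    }
    return false;
}
```

A full dominator analysis would generalize this, but the two-user special case is all the fused GQA pattern produces.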