Commit 21789e3

Merge branch 'master' into tensor-as-long-pr

2 parents e0052c0 + 07e8148

File tree: 14 files changed, +1652 -61 lines

modules/custom_operations/tests/requirements.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -3,3 +3,4 @@ onnx
 tensorboard
 pytest
 # open3d==0.16.0 - need to update with new release
+onnxscript==0.5.4
```

modules/custom_operations/tests/run_tests.py

Lines changed: 8 additions & 3 deletions
```diff
@@ -44,13 +44,17 @@ def test_fft(shape, inverse, centered, test_onnx, dims):
     from examples.fft.export_model import export

     if len(shape) == 3 and dims != [1] or \
-       len(shape) == 4 and dims == [2, 3] or \
-       len(shape) == 5 and dims == [1] or \
+       len(shape) == 4 and dims in ([1, 2], [2, 3]) or \
+       len(shape) == 5 and dims in ([1], [1, 2], [2, 3]) or \
        centered and len(dims) != 2:
         pytest.skip("unsupported configuration")

+    if len(shape) == 4 and dims == [1]:
+        pytest.skip("Custom FFT executed but there is accuracy error, requires FFT::evaluate fix")
+
+
     inp, ref = export(shape, inverse, centered, dims)
-    run_test(inp, ref, test_onnx=test_onnx)
+    run_test(inp, ref, test_onnx=test_onnx)


 @pytest.mark.parametrize("shape", [[3, 2, 4, 8, 2], [3, 1, 4, 8, 2]])
@@ -86,6 +90,7 @@ def test_sparse_conv_transpose(in_channels, filters, kernel_size, out_pos):
     run_test(inp, ref, test_onnx=True, threshold=1e-4)


+@pytest.mark.skip(reason="Exported model do not contains calculate_grid operator")
 def test_calculate_grid():
     from examples.calculate_grid.export_model import export
     inp, ref = export(num_points=10, max_grid_extent=5)
```

modules/custom_operations/user_ie_extensions/fft.cpp

Lines changed: 8 additions & 5 deletions
```diff
@@ -112,7 +112,10 @@ void FFT::validate_and_infer_types() {
 }

 std::shared_ptr<ov::Node> FFT::clone_with_new_inputs(const ov::OutputVector& new_args) const {
-    OPENVINO_ASSERT(new_args.size() == 2, "Incorrect number of new arguments");
+    const ov::Dimension exp_no_inputs{2};
+    OPENVINO_ASSERT(exp_no_inputs.compatible(new_args.size()),
+                    "Incorrect number of new arguments, provided: ",
+                    new_args.size());
     return std::make_shared<FFT>(new_args, inverse, centered);
 }

@@ -128,15 +131,15 @@ bool FFT::visit_attributes(ov::AttributeVisitor& visitor) {

 bool FFT::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
     //const_cast because the cvSetData use user pointer as non-const, should be ok as it looks like input data
-    float *inpData = reinterpret_cast<float *>(const_cast<void*>(inputs[0].data()));
+    auto *inpData = const_cast<float*>(inputs[0].data<float>());

     if (inputs[1].get_element_type() != ov::element::i32)
         OPENVINO_THROW("Unexpected dims type: " + inputs[1].get_element_type().to_string());

-    const int32_t *signalDimsData = reinterpret_cast<const int32_t *>(inputs[1].data());
-    float* outData = reinterpret_cast<float*>(outputs[0].data());
+    auto *signalDimsData = inputs[1].data<int32_t>();
+    auto *outData = outputs[0].data<float>();
     std::vector<size_t> dims = inputs[0].get_shape();
-    const size_t numSignalDims = inputs[1].get_shape()[0];
+    const size_t numSignalDims = inputs[1].get_shape().empty() ? 1: inputs[1].get_shape().size();

     if (!((dims.size() == 3 && numSignalDims == 1 && signalDimsData[0] == 1) ||
           (dims.size() == 4 && ((numSignalDims == 1 && signalDimsData[0] == 1) ||
```
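The `numSignalDims` change above derives the number of signal dimensions from the rank of the dims input rather than indexing its shape, so a scalar (rank-0) dims tensor no longer reads past an empty shape. A minimal NumPy restatement of that normalization; the function name is illustrative, not from the repository:

```python
import numpy as np

def num_signal_dims(dims_input: np.ndarray) -> int:
    # Mirrors: inputs[1].get_shape().empty() ? 1 : inputs[1].get_shape().size()
    # A scalar (rank-0) dims tensor counts as one signal dimension;
    # otherwise the tensor's rank is used.
    return 1 if dims_input.ndim == 0 else dims_input.ndim

assert num_signal_dims(np.array(1, dtype=np.int32)) == 1       # scalar dims input
assert num_signal_dims(np.array([2, 3], dtype=np.int32)) == 1  # rank-1 dims tensor
```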

modules/custom_operations/user_ie_extensions/ov_extension.cpp

Lines changed: 3 additions & 1 deletion
```diff
@@ -29,7 +29,9 @@
 # include "fft.hpp"
 # define FFT_EXT \
     std::make_shared<ov::OpExtension<TemplateExtension::FFT>>(), \
-    std::make_shared<ov::frontend::OpExtension<TemplateExtension::FFT>>(),
+    std::make_shared<ov::frontend::OpExtension<TemplateExtension::FFT>>( \
+        "DFT", \
+        std::map<std::string, std::string>{ {"centered", "onesided"}, {"inverse", "inverse"} }),
 #else
 # define FFT_EXT
 #endif
```
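With the mapping above, the ONNX frontend can translate DFT nodes into `TemplateExtension::FFT`, reading the ONNX `onesided` and `inverse` attributes into the custom op's `centered` and `inverse`. A minimal sketch of loading the built extension library from Python; the library and model file names are assumptions, not part of this commit:

```python
import openvino as ov

core = ov.Core()
# Path to the built custom-operations library (name assumed; depends on the build)
core.add_extension("libuser_ov_extensions.so")
# An ONNX model containing a DFT node can now be read and compiled directly
model = core.read_model("model_with_dft.onnx")
compiled = core.compile_model(model, "CPU")
```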

modules/genai_optimizations/README.md

Lines changed: 15 additions & 0 deletions
```diff
@@ -6,6 +6,7 @@ This module provides experimental optimizations for GenAI models in PyTorch. The

 - Text Generation Using LLMs
 - Visual language text generation
+- Reasoning and Problem Solving

 ## Supported Generative AI Optimization Methods

@@ -34,6 +35,14 @@ This module provides experimental optimizations for GenAI models in PyTorch. The
   Paper: https://arxiv.org/pdf/2306.14048
 - **SnapKV Mode** – Modifies the *H2O* approach by computing token importance within a small sliding window of the most recent queries during the prefill stage, then reverting to the H2O strategy during decoding. The authors observed that only a small subset of prompt tokens is sufficient for accurate response generation.
   Paper: https://arxiv.org/pdf/2404.14469
+- **RKV Mode** - Computes token importance scores based on attention weights over a sliding window of the most recent queries during both the prefill and decode stages. Importance scores are stabilized using per-token max-pooling and then averaged across attention heads.
+
+Refined modes enhance standard eviction strategies by selecting the most representative tokens or blocks from the evictable (intermediate) region. These methods aim to balance contextual importance with redundancy reduction to optimize cache efficiency. If `refined_algorithm` is enabled but `refined_tokens` is not specified or set to 0, the number of refined tokens is determined dynamically as part of the intermediate token budget. Budget for primary algorithm is allocated by selecting the minimal number of tokens or groups that together capture at least 90% of the total attention mass, ensuring that all high-importance tokens are retained. For the remaining eviction budget, each token’s dissimilarity is computed relative to the already retained set, promoting information diversity and reducing redundancy.
+
+Supported refined modes:
+- **KVCrush Mode** - Selects representative blocks based on diversity rather than raw importance. This is achieved by generating binary indicators for each token, constructing an anchor point (reference pattern) using one of several modes: `random`, `zeros`, `ones`, `mean`, `alternate`, and selecting blocks with the highest Hamming distance to the anchor point.
+  Paper: https://arxiv.org/pdf/2503.00022
+- **DiverseKV Mode** – Implements a dynamic redundancy scoring mechanism to identify and de-prioritize repetitive tokens based on cosine similarity of key vectors with already retained tokens. Key vectors are normalized, and cosine similarities are computed with diagonal values zeroed to avoid self-similarity. Similarities are thresholded on a per-head basis—only values greater than or equal to the mean similarity for each head are kept and then aggregated across heads. For the remaining eviction budget, each token or group's dissimilarity to already retained tokens or groups is calculated. Tokens/groups with the highest dissimilarity scores are retained, maximizing contextual diversity while reducing redundancy.

 ## Supported and tested models

@@ -53,6 +62,12 @@ Multimodal Large Language Models:
 - [Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)
 - [Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)

+Large Reasoning Models:
+
+- [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)
+- [Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B)
+- [microsoft/Phi-4-mini-reasoning](https://huggingface.co/microsoft/Phi-4-mini-reasoning)
+
 ## Prerequisites

 Before running algorithms, ensure you have **Python 3.10+** installed and set up your environment.
```
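To make the RKV description in the README diff above concrete, here is a hedged sketch of the scoring idea: attention weights over a sliding window of recent queries are accumulated per key token, smoothed with max-pooling, and averaged across heads. The function name, window size, and pooling width are illustrative assumptions, not the module's API:

```python
import torch
import torch.nn.functional as F

def rkv_importance(attn: torch.Tensor, window: int = 32, pool: int = 7) -> torch.Tensor:
    """attn: [num_heads, q_len, kv_len] attention probabilities."""
    recent = attn[:, -window:, :]          # sliding window of the most recent queries
    scores = recent.sum(dim=1)             # weight each key token receives: [H, kv_len]
    # per-token max-pooling along the key axis stabilizes noisy scores
    scores = F.max_pool1d(scores.unsqueeze(1), kernel_size=pool,
                          stride=1, padding=pool // 2).squeeze(1)
    return scores.mean(dim=0)              # average across attention heads -> [kv_len]
```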
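The dynamic budget allocation for refined modes reads as: keep the smallest prefix of importance-sorted tokens that covers at least 90% of the total attention mass, and hand the remainder of the budget to the refined algorithm. A minimal sketch under that reading, with illustrative names:

```python
import torch

def split_budget(importance: torch.Tensor, budget: int, mass: float = 0.9):
    vals, _ = importance.sort(descending=True)
    covered = vals.cumsum(dim=0) / vals.sum()
    primary = int((covered < mass).sum().item()) + 1  # smallest prefix reaching `mass`
    primary = min(primary, budget)
    return primary, budget - primary                  # (primary, refined) token counts
```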
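A hedged sketch of the KVCrush selection step: binarize tokens against their mean importance, build an anchor bit-pattern in one of the listed modes, and keep the blocks with the largest Hamming distance to the anchor. Function and argument names are illustrative:

```python
import torch

def kvcrush_select(scores: torch.Tensor, block: int, keep_blocks: int,
                   anchor_mode: str = "mean") -> torch.Tensor:
    bits = (scores >= scores.mean()).float()       # binary indicator per token
    n_blocks = bits.numel() // block
    bits = bits[: n_blocks * block].reshape(n_blocks, block)
    if anchor_mode == "zeros":
        anchor = torch.zeros(block)
    elif anchor_mode == "ones":
        anchor = torch.ones(block)
    elif anchor_mode == "alternate":
        anchor = torch.arange(block).remainder(2).float()
    elif anchor_mode == "random":
        anchor = torch.randint(0, 2, (block,)).float()
    else:                                          # "mean": majority bit pattern
        anchor = (bits.mean(dim=0) >= 0.5).float()
    hamming = (bits != anchor).sum(dim=1)          # distance of each block to anchor
    return hamming.topk(keep_blocks).indices       # most diverse blocks
```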
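And a hedged sketch of DiverseKV's redundancy score: L2-normalized keys, pairwise cosine similarity with a zeroed diagonal, per-head mean thresholding, then aggregation across heads. Tokens with the lowest resulting redundancy (highest dissimilarity to the retained set) would be preferred when filling the remaining eviction budget. Names are illustrative:

```python
import torch

def diversekv_redundancy(keys: torch.Tensor) -> torch.Tensor:
    """keys: [num_heads, kv_len, head_dim] -> per-token redundancy score [kv_len]."""
    k = torch.nn.functional.normalize(keys, dim=-1)
    sim = k @ k.transpose(-1, -2)                  # cosine similarity [H, T, T]
    sim.diagonal(dim1=-2, dim2=-1).zero_()         # ignore self-similarity
    mean = sim.mean(dim=(-2, -1), keepdim=True)    # per-head threshold
    sim = torch.where(sim >= mean, sim, torch.zeros_like(sim))
    return sim.sum(dim=-1).mean(dim=0)             # aggregate over keys, then heads
```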

modules/genai_optimizations/benchmarks/README.md

Lines changed: 31 additions & 0 deletions
````diff
@@ -10,6 +10,8 @@ This [example](./longbench.py) demonstrates how to evaluate and optimize LLMs us

 Sparse attention speeds up the prefill stage in LLMs by attending only to the most relevant query-key blocks. Static patterns like Tri-Shape and dynamic mechanisms like XAttention reduce memory and computation without significant accuracy loss, enabling efficient handling of long prompts.

+KV-Cache Token Eviction accelerates the decoding stage in LLMs by removing less important cached tokens while preserving those essential for contextual understanding, allowing efficient long-sequence inference under constrained memory.
+
 ### Run Example

 ```bash
@@ -100,3 +102,32 @@ This will automatically:
 - Evaluate the model and report the score

 </details>
+
+<details>
+<summary><b>Large Reasoning Models Optimization Example: MATH500 and GSM8K Benchmarks</b></summary>
+
+This [example](./math500_gsm_bench.py) demonstrates how to evaluate and optimize LRMs using the KV-Cache Token Eviction algorithm. The example leverages [MATH500](https://huggingface.co/datasets/HuggingFaceH4/MATH-500) and [GSM8K](https://huggingface.co/datasets/openai/gsm8k) datasets.
+MATH500 contains a subset of 500 problems from the [MATH](https://github.com/hendrycks/math) benchmark, originally introduced in OpenAI’s Let’s Verify Step by Step paper. The subset covers six domains: algebra, geometry, intermediate algebra, number theory, precalculus, and probability.
+GSM8K (Grade School Math 8K) is a dataset of 8,500 high-quality, linguistically diverse grade-school math word problems. While the problems are conceptually simple, they often require multi-step reasoning, making them challenging for state-of-the-art language models due to the high diversity of problems.
+
+
+### Run Example
+
+```bash
+python math500_gsm_bench.py \
+    --dataset MATH500 \
+    --model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+    --max_tokens 5000 \
+    --max_examples 100 \
+    --enable_eviction \
+    --algorithm rkv \
+    --granularity per_group \
+    --intermediate_tokens 512
+```
+
+This will automatically:
+
+- Download the selected model and dataset
+- Apply token eviction during the decoding stage
+- Evaluate the model and report the score
+
+</details>
````
