# GenAI Optimizations

This module provides experimental optimizations for generative AI models in PyTorch. The goal is to improve efficiency and performance for generative AI tasks while minimizing accuracy loss. This is proof-of-concept code, intended to be compatible with OpenVINO GenAI.

## Supported Generative AI Scenarios

- Text generation using LLMs
- Visual-language text generation using VLMs

## Supported Generative AI Optimization Methods

- [**Visual Token Pruning**](./visual_token_pruning.py):
  Designed to accelerate inference in VLMs, where the number of input visual tokens is often significantly larger than the number of textual tokens. Pruning these tokens reduces first-token latency and overall FLOPs while preserving accuracy. This repository implements a visual token pruning method called [CDPruner](https://arxiv.org/pdf/2506.10967), which maximizes the conditional diversity of retained tokens. According to the paper, it can reduce FLOPs by 95% and CUDA latency by 78% while maintaining 94% of the original accuracy. A minimal sketch of the idea follows.
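
  For intuition, here is a simplified, self-contained sketch of conditional-diversity selection in the spirit of CDPruner; the function and argument names are illustrative, not the module's actual API. A DPP kernel is built from visual-token similarity weighted by relevance to the text tokens, and tokens are selected with fast greedy MAP inference:

  ```python
  import torch

  def prune_visual_tokens(visual_feats: torch.Tensor, text_feats: torch.Tensor, keep: int) -> torch.Tensor:
      """Return indices of `keep` visual tokens that are both relevant to the
      text and mutually diverse (hypothetical helper, for illustration only)."""
      v = torch.nn.functional.normalize(visual_feats, dim=-1)   # (N, D) visual tokens
      t = torch.nn.functional.normalize(text_feats, dim=-1)     # (M, D) text tokens
      rel = (v @ t.T).mean(dim=1)                               # (N,) mean cosine similarity to text
      rel = (rel - rel.min()) / (rel.max() - rel.min() + 1e-6)  # scale relevance to [0, 1]
      kernel = rel[:, None] * (v @ v.T) * rel[None, :]          # conditional DPP kernel

      # Fast greedy MAP inference for the DPP (Chen et al., 2018).
      n = kernel.shape[0]
      cis = torch.zeros(keep, n)
      di2 = kernel.diagonal().clone()
      selected = []
      for i in range(keep):
          j = int(torch.argmax(di2))                 # most "useful" remaining token
          selected.append(j)
          eis = (kernel[j] - cis[:i].T @ cis[:i, j]) / di2[j].sqrt()
          cis[i] = eis
          di2 = di2 - eis**2                         # discount tokens similar to j
          di2[j] = -float("inf")                     # never reselect j
      return torch.tensor(sorted(selected))          # keep original token order
  ```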

- [**Sparse Attention**](./sparse_attention.py):
  Designed to accelerate the prefill stage in LLMs and multimodal LLMs with long prompts, high-resolution images, or videos by attending only to the most relevant query-key blocks. This block-wise attention mechanism reduces memory usage and FLOPs while preserving model accuracy. Supported modes (a sketch of the static pattern follows the list):
  - **Tri-Shape Mode** – A static block-sparse attention pattern that preserves the initial tokens, local windows, and the final segment of the query, forming a triangular structure that captures critical tokens while maintaining instruction-following performance in both turn-0 and multi-request scenarios. Paper: https://arxiv.org/pdf/2412.10319
  - **XAttention Mode** – A dynamic block-sparse attention mechanism that accelerates inference by focusing computation on the most important regions of the attention matrix, selected via antidiagonal block scoring, reducing FLOPs and memory usage without significant loss of accuracy. Paper: https://arxiv.org/pdf/2503.16428
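
  As a rough illustration, the Tri-Shape pattern can be expressed as a boolean mask over (query block, key block) pairs; the parameter names below are illustrative, not the module's actual API. A prefill kernel would then compute attention only where the mask is `True`:

  ```python
  import torch

  def tri_shape_block_mask(n_q_blocks: int, n_k_blocks: int,
                           sink_blocks: int = 1, local_blocks: int = 2,
                           last_blocks: int = 1) -> torch.Tensor:
      """Hypothetical helper: True marks query/key blocks to compute."""
      q = torch.arange(n_q_blocks)[:, None]
      k = torch.arange(n_k_blocks)[None, :]
      causal = k <= q                          # block-causal base pattern
      sink = k < sink_blocks                   # initial (sink) tokens, visible to all queries
      local = (q - k) < local_blocks           # local window along the diagonal
      last = q >= n_q_blocks - last_blocks     # final query rows attend to the full prefix
      return causal & (sink | local | last)
  ```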

- [**KV Cache Token Eviction**](./token_eviction.py):
  Designed to optimize KV cache memory usage during autoregressive generation in LLMs. It selectively removes less important cached tokens while preserving those crucial for contextual understanding, enabling efficient long-sequence inference under constrained memory. Note that currently eviction starts only after the full prompt has been processed; i.e., no eviction takes place during the prefill phase. A minimal code sketch follows the mode list below.

  The KV cache is split into three parts: **start**, **intermediate (evictable)**, and **recent**. The size of each part is configurable:
  - **Start Area** – Initial tokens that are never evicted.
  - **Intermediate Area** – Tokens that can be evicted based on importance scores.
  - **Recent Area** – Most recent tokens that are preserved (not evicted while in this area, but naturally migrate toward the evictable area as text generation continues).

  Eviction granularity can be **per-token** or **per-group**:
  - **Per-token** – Tokens are evicted independently from the KV cache.
  - **Per-group** – Only fully filled blocks from the evictable area are removed. Tokens are managed in consecutive, non-overlapping groups, following the concept of *Paged Attention*, which organizes the KV cache into pages. Each token belongs to a single page and remains there for the entire generation process. To maximize eviction efficiency, entire pages are evicted rather than individual tokens. The `group_size` is a configurable algorithm parameter.

  Supported modes:
  - **H2O Mode** – Evicts tokens using the *Heavy-Hitter Oracle* strategy, which accumulates attention scores to identify and retain high-impact tokens. It also preserves recent tokens due to their strong correlation with the current context. Scores are accumulated throughout the entire generation process, and their weighting can be adjusted via the `normalize_scores` parameter, which controls whether attention scores are normalized by the number of times each token was attended to.
    Paper: https://arxiv.org/pdf/2306.14048
  - **SnapKV Mode** – Modifies the *H2O* approach by computing token importance within a small sliding window of the most recent queries during the prefill stage, then reverting to the H2O strategy during decoding. The authors observed that only a small subset of prompt tokens is sufficient for accurate response generation.
    Paper: https://arxiv.org/pdf/2404.14469
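
  To make the cache layout concrete, here is a minimal per-token sketch in the spirit of H2O, assuming the cache is ordered `[start | intermediate | recent]` along the sequence dimension; all names are illustrative rather than the module's actual API:

  ```python
  import torch

  def evict_tokens(keys, values, scores, start_size: int, recent_size: int, n_evict: int):
      """Hypothetical helper: drop the `n_evict` lowest-scoring tokens from the
      intermediate (evictable) area. `scores` holds accumulated attention mass
      per cached token, e.g. updated each step via scores += attn.sum(dim=0)."""
      total = keys.shape[0]                                   # cached sequence length
      inter = torch.arange(start_size, total - recent_size)   # evictable positions only
      worst = torch.topk(scores[inter], n_evict, largest=False).indices
      keep = torch.ones(total, dtype=torch.bool)
      keep[inter[worst]] = False                              # start/recent areas untouched
      return keys[keep], values[keep], scores[keep]
  ```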

## Supported and Tested Models

Large Language Models:

- [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)
- [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
- [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)
- [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct)
- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)

Multimodal Large Language Models:

- [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
- [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
- [Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)
- [Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)
## Prerequisites

Before running the algorithms, ensure you have **Python 3.10+** installed and your environment set up.

### 1. Create and activate a virtual environment

```bash
python3 -m venv env
source env/bin/activate  # On Windows: env\Scripts\activate.bat
```

### 2. Installation

You can install the package directly from the repository. To avoid running out of memory during the build, you can limit the number of parallel build jobs with `MAX_JOBS=4`. Note that the URL must be quoted, since `&` is a shell metacharacter:

```bash
MAX_JOBS=4 pip install "git+https://github.com/openvinotoolkit/openvino_contrib.git#egg=genai_opt&subdirectory=modules/genai_optimizations"
```

Or install it locally with extra dependencies for benchmark support:

```bash
pip install ".[benchmarks]"
```