Commit d063b50

Merge branch 'master' into tensor-as-long-pr
2 parents af7d19c + fda6fb5 commit d063b50

190 files changed: +20817 / -262 lines changed


.github/workflows/code_style.yml
Lines changed: 7 additions & 1 deletion

@@ -16,8 +16,14 @@ jobs:
     steps:
       - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6

+      - uses: actions/setup-java@dded0888837ed1f317902acf8a20df0ad188d165 # v5.0.0
+        with:
+          distribution: "temurin"
+          java-version: 21
+
       - name: Fix code java style
-        uses: axel-op/googlejavaformat-action@dbff853fb823671ec5781365233bf86543b13215 # v3
+        uses: axel-op/googlejavaformat-action@c1134ebd196c4cbffb077f9476585b0be8b6afcd # v4
         with:
           args: "--set-exit-if-changed -a -i"
           commit-message: "[github actions] Apply google-java-format code style fixes"
+          files: "modules/java_api/**/*.java"

.github/workflows/linux.yml
Lines changed: 14 additions & 12 deletions

@@ -1,4 +1,4 @@
-name: Linux (Ubuntu 20.04, Python 3.11)
+name: Linux (Ubuntu 22.04, Python 3.11)
 on:
   workflow_dispatch:
   pull_request:
@@ -25,9 +25,9 @@ jobs:
     defaults:
       run:
         shell: bash
-    runs-on: ubuntu-20.04-16-cores
+    runs-on: ubuntu-22.04-16-cores
     container:
-      image: ubuntu:20.04
+      image: ubuntu:22.04
     env:
       DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input
       CMAKE_BUILD_TYPE: 'Release'
@@ -120,6 +120,8 @@ jobs:
           -DCMAKE_VERBOSE_MAKEFILE=ON \
           -DCMAKE_BUILD_TYPE=${{ env.CMAKE_BUILD_TYPE }} \
           -DBUILD_nvidia_plugin=OFF \
+          -DBUILD_ollama_openvino=OFF \
+          -DBUILD_llama_cpp_plugin=OFF \
           -DENABLE_INTEL_GPU=OFF \
           -DENABLE_OV_TF_FRONTEND=OFF \
           -DENABLE_OV_PADDLE_FRONTEND=OFF \
@@ -131,7 +133,6 @@
           -DENABLE_WHEEL=ON \
           -DENABLE_TESTS=ON \
           -DENABLE_INTEL_NPU=OFF \
-          -DBUILD_ollama_openvino=OFF \
           -DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} \
           -DCMAKE_C_COMPILER_LAUNCHER=${{ env.CMAKE_C_COMPILER_LAUNCHER }} \
           -S ${OPENVINO_REPO} \
@@ -214,9 +215,9 @@ jobs:
     defaults:
      run:
        shell: bash
-    runs-on: ubuntu-20.04-16-cores
+    runs-on: ubuntu-22.04-16-cores
     container:
-      image: nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
+      image: nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
     env:
       CMAKE_BUILD_TYPE: 'Release'
       CMAKE_GENERATOR: 'Ninja Multi-Config'
@@ -298,20 +299,21 @@ jobs:

       - name: Install CUDA
         run: |
-          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
-          mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
+          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin
+          mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600

-          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub
-          add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /"
+          apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub
+          add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /"
           apt update
           apt install -y --allow-downgrades --allow-change-held-packages \
+            linux-headers-5.15.0-25-generic \
             libcudnn8=8.9.4.*-1+cuda11.8 \
             libcudnn8-dev=8.9.4.*-1+cuda11.8 \
             libcudnn8-samples=8.9.4.*-1+cuda11.8 \
             cuda-runtime-11-8 \
             cuda-11-8 \
-            libcutensor1=1.6.1.5-1 \
-            libcutensor-dev=1.6.1.5-1 \
+            libcutensor1=1.7.0.1-1 \
+            libcutensor-dev=1.7.0.1-1 \
             cuda-drivers=520.61.05-1

 #

.github/workflows/mac.yml
Lines changed: 2 additions & 1 deletion

@@ -101,6 +101,8 @@ jobs:
           -DCMAKE_VERBOSE_MAKEFILE=ON \
           -DCMAKE_BUILD_TYPE=${{ env.CMAKE_BUILD_TYPE }} \
           -DBUILD_nvidia_plugin=OFF \
+          -DBUILD_ollama_openvino=OFF \
+          -DBUILD_llama_cpp_plugin=OFF \
           -DOPENVINO_EXTRA_MODULES=${{ env.OPENVINO_CONTRIB_REPO }}/modules \
           -DCMAKE_OSX_DEPLOYMENT_TARGET=${{ env.MACOSX_DEPLOYMENT_TARGET }} \
           -DCMAKE_OSX_ARCHITECTURES=${{ env.OSX_ARCHITECTURES }} \
@@ -112,7 +114,6 @@
           -DENABLE_OV_PYTORCH_FRONTEND=OFF \
           -DENABLE_CPPLINT=OFF \
           -DENABLE_INTEL_NPU=OFF \
-          -DBUILD_ollama_openvino=OFF \
           -S ${{ env.OPENVINO_REPO }} \
           -B ${{ env.BUILD_DIR }}

.github/workflows/ollama_openvino_build_and_test.yml
Lines changed: 2 additions & 2 deletions

@@ -9,8 +9,8 @@ on:
 permissions: read-all

 jobs:
-  test_ubuntu20:
-    runs-on: ubuntu-20.04
+  test_ubuntu22:
+    runs-on: ubuntu-22.04
     steps:
       - name: Download repo
         uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6

.github/workflows/windows.yml
Lines changed: 3 additions & 2 deletions

@@ -26,7 +26,7 @@ jobs:
     defaults:
       run:
         shell: pwsh
-    runs-on: windows-2019-16-core
+    runs-on: windows-2022-16-core
     env:
       CMAKE_BUILD_TYPE: 'Release'
       CMAKE_CXX_COMPILER_LAUNCHER: ccache
@@ -127,6 +127,8 @@ jobs:
         cmake -GNinja `
           -DCMAKE_BUILD_TYPE=${{ env.CMAKE_BUILD_TYPE }} `
           -DBUILD_nvidia_plugin=OFF `
+          -DBUILD_ollama_openvino=OFF `
+          -DBUILD_llama_cpp_plugin=OFF `
           -DENABLE_OV_TF_FRONTEND=OFF `
           -DENABLE_OV_PADDLE_FRONTEND=OFF `
           -DENABLE_OV_TF_LITE_FRONTEND=OFF `
@@ -137,7 +139,6 @@
           -DENABLE_PYTHON=ON `
           -DENABLE_INTEL_NPU=OFF `
           -DENABLE_JS=OFF `
-          -DBUILD_ollama_openvino=OFF `
           -DOPENVINO_EXTRA_MODULES=${{ env.OPENVINO_CONTRIB_REPO }}/modules `
           -DCMAKE_CXX_COMPILER_LAUNCHER=${{ env.CMAKE_CXX_COMPILER_LAUNCHER }} `
           -DCMAKE_C_COMPILER_LAUNCHER=${{ env.CMAKE_C_COMPILER_LAUNCHER }} `

README.md
Lines changed: 3 additions & 0 deletions

@@ -15,6 +15,7 @@ This list gives an overview of all modules available inside the contrib repository
 * [**Token Merging**](./modules/token_merging/): adaptation of [Token Merging method](https://arxiv.org/abs/2210.09461) for OpenVINO.
 * [**OpenVINO Code**](./modules/openvino_code): VSCode extension for AI code completion with OpenVINO.
 * [**Ollama-OpenVINO**](./modules/ollama_openvino): OpenVINO GenAI empowered Ollama which accelerate LLM on Intel platforms(including CPU, iGPU/dGPU, NPU).
+* [**ov_training_kit**](./modules/ov_training_kit): Training Kit Python library -- provides scikit-learn, PyTorch and Tensorflow wrappers for training, optimization, and deployment with OpenVINO on AI PCs.

 ## How to build OpenVINO with extra modules
 You can build OpenVINO, so it will include the modules from this repository. Contrib modules are under constant development and it is recommended to use them alongside the master branch or latest releases of OpenVINO.
@@ -38,6 +39,8 @@ Additional build instructions are available for the following modules:
 * [**nvidia_plugin**](./modules/nvidia_plugin/README.md)
 * [**custom_operations**](./modules/custom_operations/README.md)
 * [**ollama_OpenVINO**](./modules/ollama_openvino)
+* [**openvino-langchain**](./modules/openvino-langchain): LangChain.js integrations for OpenVINO™
+
 ## Update the repository documentation
 In order to keep a clean overview containing all contributed modules, the following files need to be created/adapted:

Lines changed: 50 additions & 0 deletions

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# pyenv
.python-version

# dotenv
.env

# virtualenv
.venv
venv/
env*

# datasets
*.tar*
MileBench/

# VSCode
.vscode/

Lines changed: 79 additions & 0 deletions

# GenAI Optimizations

This module provides experimental optimizations for GenAI models in PyTorch. The goal is to improve efficiency and performance for generative AI tasks while minimizing accuracy loss. This is PoC code and is intended to be compatible with OpenVINO GenAI.

## Supported Generative AI Scenarios

- Text generation using LLMs
- Visual-language text generation

## Supported Generative AI Optimization Methods

- [**Visual Token Pruning**](./visual_token_pruning.py):
  Designed to accelerate inference in VLMs, where the number of input visual tokens is often significantly larger than the number of textual tokens. Pruning these tokens reduces first-token latency and overall FLOPs while preserving accuracy. This repository implements the visual token pruning method [CDPruner](https://arxiv.org/pdf/2506.10967), which maximizes the conditional diversity of retained tokens. It can reduce FLOPs by 95% and CUDA latency by 78% while maintaining 94% of the original accuracy (an illustrative sketch follows this list).

- [**Sparse Attention**](./sparse_attention.py):
  Designed to accelerate the prefill stage in LLMs and MLLMs with long prompts, high-resolution images, or videos by attending only to the most relevant query-key blocks. This block-wise attention mechanism reduces memory usage and FLOPs while preserving model accuracy. Supported modes (a sketch of the Tri-Shape pattern follows this list):
  - **Tri-Shape Mode** – A static block-sparse attention pattern that preserves the initial tokens, local windows, and the final segment of the query, forming a triangular structure that captures critical tokens while maintaining instruction-following performance in both turn-0 and multi-request scenarios. Paper: https://arxiv.org/pdf/2412.10319
  - **XAttention Mode** – A dynamic block-sparse attention mechanism that accelerates inference by focusing computation on the most important regions of the attention matrix using antidiagonal block scoring, reducing FLOPs and memory usage without significant loss of accuracy. Paper: https://arxiv.org/pdf/2503.16428

- [**KV Cache Token Eviction**](./token_eviction.py):
  Designed to optimize KV cache memory usage during autoregressive generation in LLMs. It selectively removes less important cached tokens while preserving those crucial for contextual understanding, enabling efficient long-sequence inference under constrained memory. Note that eviction currently starts only after the full prompt has been processed, i.e., no eviction takes place during the prefill phase. (A sketch of the cache partitioning appears after this list.)

  The KV cache is split into three parts: **start**, **intermediate (evictable)**, and **recent**. The size of each part is configurable:
  - **Start Area** – Initial tokens that are never evicted.
  - **Intermediate Area** – Tokens that can be evicted based on importance scores.
  - **Recent Area** – Most recent tokens that are preserved (not evicted while in this area, but they naturally migrate toward the evictable area as generation continues).

  Eviction granularity can be **per-token** or **per-group**:
  - **Per-token** – Tokens are evicted from the KV cache independently.
  - **Per-group** – Only fully filled blocks from the evictable area are removed. Tokens are managed in consecutive, non-overlapping groups, following the concept of *Paged Attention*, which organizes the KV cache into pages. Each token belongs to a single page and remains there for the entire generation process. To maximize eviction efficiency, entire pages are evicted rather than individual tokens. The `group_size` is a configurable algorithm parameter.

  Supported modes:
  - **H2O Mode** – Evicts tokens using the *Heavy-Hitter Oracle* strategy, which accumulates attention scores to identify and retain high-impact tokens. It also preserves recent tokens due to their strong correlation with the current context. Scores are accumulated throughout the entire generation process, and their weighting can be adjusted via the `normalize_scores` parameter, which controls whether attention scores are normalized by the number of times each token was attended to. Paper: https://arxiv.org/pdf/2306.14048
  - **SnapKV Mode** – Modifies the *H2O* approach by computing token importance within a small sliding window of the most recent queries during the prefill stage, then reverting to the H2O strategy during decoding. The authors observed that only a small subset of prompt tokens is sufficient for accurate response generation. Paper: https://arxiv.org/pdf/2404.14469
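
To make the pruning idea concrete, here is a minimal, illustrative sketch of diversity-aware visual token selection in the spirit of CDPruner. It is not the implementation in `visual_token_pruning.py`; the function name, the `theta` relevance/diversity trade-off, and the greedy selection rule are simplifying assumptions.

```python
import torch
import torch.nn.functional as F

def prune_visual_tokens(visual: torch.Tensor, text: torch.Tensor,
                        num_keep: int, theta: float = 0.5):
    """Greedily keep `num_keep` visual tokens that are relevant to the text prompt
    while staying mutually diverse (a simplified, CDPruner-like heuristic).

    visual: [N, D] visual token embeddings
    text:   [M, D] text token embeddings
    theta:  trade-off between relevance (1.0) and diversity (0.0)
    """
    v = F.normalize(visual, dim=-1)
    t = F.normalize(text, dim=-1)

    relevance = (v @ t.T).max(dim=-1).values   # [N], similarity to the closest text token
    sim = v @ v.T                              # [N, N], redundancy between visual tokens

    keep, redundancy = [], torch.zeros_like(relevance)
    for _ in range(min(num_keep, visual.shape[0])):
        score = theta * relevance - (1.0 - theta) * redundancy
        if keep:                               # never pick the same token twice
            score[keep] = float("-inf")
        idx = int(score.argmax())
        keep.append(idx)
        # Track each token's highest similarity to anything already kept.
        redundancy = torch.maximum(redundancy, sim[idx])
    keep = sorted(keep)                        # restore spatial order
    return visual[keep], keep

# Example: keep 128 of 1024 visual tokens, mirroring `--num_keep_tokens 128 --theta 0.5`.
pruned, kept_idx = prune_visual_tokens(torch.randn(1024, 512), torch.randn(32, 512), 128)
```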
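
Below is a rough sketch of how a static Tri-Shape block mask can be built, assuming attention is evaluated block-wise over the prompt. Block counts and parameter names are illustrative, not the `sparse_attention.py` API.

```python
import torch

def tri_shape_block_mask(num_blocks: int, sink_blocks: int = 1,
                         local_blocks: int = 2, last_query_blocks: int = 1) -> torch.Tensor:
    """Boolean [query_block, key_block] mask; True means the block is computed.

    On top of the causal constraint, the pattern keeps:
      * the first `sink_blocks` key blocks for every query (initial tokens),
      * a local window of `local_blocks` along the diagonal,
      * every causal key block for the last `last_query_blocks` query rows,
    which together form the triangular "tri-shape" structure.
    """
    q = torch.arange(num_blocks).unsqueeze(1)  # query block index
    k = torch.arange(num_blocks).unsqueeze(0)  # key block index
    causal = k <= q
    sinks = k < sink_blocks
    local = (q - k) < local_blocks
    last_rows = q >= num_blocks - last_query_blocks
    return causal & (sinks | local | last_rows)

print(tri_shape_block_mask(8).int())  # visualize the pattern on an 8x8 block grid
```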
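
And a minimal sketch of the start / intermediate / recent split with score-based eviction, assuming per-token granularity and externally accumulated H2O-style attention scores; a per-group variant would drop whole fixed-size pages from the evictable area instead of individual tokens. Names and signatures are illustrative, not the `token_eviction.py` API.

```python
import torch

def evict_kv(keys: torch.Tensor, values: torch.Tensor, scores: torch.Tensor,
             start_size: int, recent_size: int, intermediate_size: int):
    """Shrink the evictable middle of the cache down to `intermediate_size` tokens.

    keys/values: [seq_len, ...] cached tensors for one layer/head
    scores:      [seq_len] importance scores, e.g. accumulated attention (H2O)
    The first `start_size` and last `recent_size` tokens are always kept.
    """
    seq_len = keys.shape[0]
    evictable = seq_len - start_size - recent_size
    if evictable <= intermediate_size:
        return keys, values, scores  # nothing to evict yet

    mid_scores = scores[start_size:seq_len - recent_size]
    # Keep the highest-scoring tokens of the evictable area, preserving their order.
    keep_mid = torch.topk(mid_scores, intermediate_size).indices.sort().values
    keep = torch.cat([
        torch.arange(start_size),                      # start area: never evicted
        start_size + keep_mid,                         # surviving intermediate tokens
        torch.arange(seq_len - recent_size, seq_len),  # recent area: always kept
    ])
    return keys[keep], values[keep], scores[keep]
```

In a generation loop this would run after each decoding step (or once per filled group), so newly generated tokens enter the recent area and gradually migrate into the evictable region.
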
## Supported and tested models

Large Language Models:

- [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)
- [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
- [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)
- [Qwen/Qwen2-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct)
- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)

Multimodal Large Language Models:

- [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)
- [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
- [Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct)
- [Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)

## Prerequisites

Before running the algorithms, ensure you have **Python 3.10+** installed and set up your environment.

### 1. Create and activate a virtual environment

```bash
python3 -m venv env
source env/bin/activate  # On Windows: env\Scripts\activate.bat
```

### 2. Installation

You can install the package directly from the repository. To avoid running out of memory during the build, you can limit the number of parallel build jobs with `MAX_JOBS=4`:

```bash
pip install "git+https://github.com/openvinotoolkit/openvino_contrib.git#egg=genai_opt&subdirectory=modules/genai_optimizations"
```

Or install it locally with the extra dependencies needed for the benchmarks:

```bash
pip install ".[benchmarks]"
```

Lines changed: 102 additions & 0 deletions

# Generative AI Models Optimization Examples

This folder provides examples for evaluating and optimizing Generative AI models across different scenarios.

<details>
<summary><b>Large Language Models Optimization Example: LongBench</b></summary>

This [example](./longbench.py) demonstrates how to evaluate and optimize LLMs using [LongBench](https://arxiv.org/pdf/2308.14508), a bilingual, multi-task benchmark designed to assess long-context understanding. LongBench includes 21 datasets across six task categories (single-document QA, multi-document QA, summarization, few-shot learning, synthetic reasoning, and code completion) in both English and Chinese.

Sparse attention speeds up the prefill stage in LLMs by attending only to the most relevant query-key blocks. Static patterns like Tri-Shape and dynamic mechanisms like XAttention reduce memory and computation without significant accuracy loss, enabling efficient handling of long prompts.

### Run Example

```bash
python longbench.py \
    --subset samsum \
    --model meta-llama/Llama-3.2-1B-Instruct \
    --use_custom_attention \
    --prefill_impl tri-shape \
    --enable_eviction \
    --algorithm h2o \
    --granularity per_group \
    --normalize_scores \
    --intermediate_tokens 1024
```

This will automatically:

- Download the selected model and dataset
- Apply sparse attention computation during the prefill stage
- Apply token eviction during the decoding stage
- Evaluate the model and report the score
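
For orientation, here is a minimal sketch of the kind of generation loop such an evaluation runs, written with plain Hugging Face `transformers` and none of the module's optimizations; `longbench.py` itself additionally wires sparse attention into the prefill and token eviction into decoding. The prompt string is a placeholder.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Baseline (unoptimized) generation loop, for orientation only.
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)

prompt = "Summarize the following dialogue:\n..."  # one samsum-style LongBench sample
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=128, do_sample=False)

# Decode only the newly generated tokens; the benchmark then scores this answer.
answer = tokenizer.decode(
    output[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(answer)
```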

</details>

<details>
<summary><b>Multimodal Large Language Models Optimization Example: MME Benchmark</b></summary>

This [example](./mmebench.py) demonstrates how to evaluate and optimize MLLMs using the [MME benchmark](https://arxiv.org/pdf/2306.13394), which measures both perception and cognition abilities across 14 subtasks. Its concise instruction design enables fair comparison of MLLMs without the need for extensive prompt engineering.

Visual token pruning enables significant acceleration of inference in VLMs, where the number of input visual tokens is often much larger than the number of textual tokens. By pruning these tokens, we reduce first-token latency and overall FLOPs while preserving accuracy.

Sparse attention speeds up the prefill stage in LLMs and MLLMs by attending only to the most relevant query-key blocks. Static patterns like Tri-Shape and dynamic mechanisms like XAttention reduce memory and computation without significant accuracy loss, enabling efficient handling of long prompts, high-resolution images, and multi-frame videos.

### Run Example

```bash
python mmebench.py \
    --subset artwork \
    --model Qwen/Qwen2.5-VL-3B-Instruct \
    --enable_visual_pruning \
    --num_keep_tokens 128 \
    --theta 0.5 \
    --use_custom_attention \
    --prefill_impl x-attention \
    --enable_eviction \
    --algorithm snapkv \
    --granularity per_group \
    --window_size 8
```

This will automatically:

- Download the selected model and dataset
- Apply the visual token pruning algorithm
- Apply sparse attention computation during the prefill stage
- Apply token eviction during the decoding stage
- Evaluate the model and report the score

</details>

<details>
<summary><b>Multimodal Large Language Models Optimization Example: MileBench</b></summary>

This [example](./milebench.py) demonstrates how to optimize MLLMs using an experimental visual token pruning algorithm. The example leverages [MileBench](https://arxiv.org/pdf/2404.18532), a pioneering benchmark designed to rigorously evaluate the multimodal long-context capabilities of MLLMs. MileBench encompasses diverse tasks requiring both comprehension and generation, and introduces two distinct evaluation sets, diagnostic and realistic, that systematically assess models' capacity for long-context adaptation and effective task completion.

### Run Example

```bash
python milebench.py \
    --subset WikiVQA \
    --model Qwen/Qwen2-VL-2B-Instruct \
    --enable_visual_pruning \
    --num_keep_tokens 64 \
    --theta 0.5 \
    --use_custom_attention \
    --prefill_impl tri-shape \
    --enable_eviction \
    --algorithm snapkv \
    --granularity per_group \
    --window_size 8
```

This will automatically:

- Download the selected model and dataset
- Apply the visual token pruning algorithm
- Apply sparse attention computation during the prefill stage
- Apply token eviction during the decoding stage
- Evaluate the model and report the score

</details>
