Skip to content

Commit e0f80e0

Browse files
[ONNX] LLM compression example for ONNX (openvinotoolkit#3513)
### Changes Add LLM weight compression example for ONNX ### Reason for changes Ref: 168070 ### Related tickets Ref: 168070 ### Tests TBD
1 parent ff41e82 commit e0f80e0

5 files changed

Lines changed: 115 additions & 0 deletions

File tree

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Large Language Models Weight Compression Example
2+
3+
This example demonstrates how to optimize Large Language Models (LLMs) in ONNX format using the NNCF weight compression API. The example applies 4/8-bit mixed-precision quantization to the weights of Linear (Fully-connected) layers of the [PY007/TinyLlama-1.1B-Chat-v0.3](https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.3) model (the `MODEL_ID` used in `main.py`). This leads to a significant decrease in model footprint and a performance improvement with ONNX Runtime.
4+
5+
## Prerequisites
6+
7+
To use this example:
8+
9+
- Create a separate Python* environment and activate it: `python3 -m venv nncf_env && source nncf_env/bin/activate`
10+
- Install dependencies:
11+
12+
```bash
13+
pip install -U pip
14+
pip install -r requirements.txt
15+
pip install ../../../../
16+
```
17+
18+
## Run Example
19+
20+
To run the example:
21+
22+
```bash
23+
python main.py
24+
```
25+
26+
It will automatically download the baseline model, compress its weights, and save the resulting model to the `tinyllama_compressed` directory next to `main.py`.
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Copyright (c) 2025 Intel Corporation
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
# http://www.apache.org/licenses/LICENSE-2.0
6+
# Unless required by applicable law or agreed to in writing, software
7+
# distributed under the License is distributed on an "AS IS" BASIS,
8+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
# See the License for the specific language governing permissions and
10+
# limitations under the License.
11+
12+
import time
13+
from pathlib import Path
14+
15+
import onnx
16+
from optimum.onnxruntime import ORTModelForCausalLM
17+
from transformers import AutoTokenizer
18+
19+
import nncf
20+
from nncf.onnx.quantization.backend_parameters import BackendParameters
21+
22+
ROOT = Path(__file__).parent.resolve()
23+
24+
25+
MODEL_ID = "PY007/TinyLlama-1.1B-Chat-v0.3"
26+
OUTPUT_DIR = ROOT / "tinyllama_compressed"
27+
28+
29+
def main() -> str:
    """Compress the weights of an ONNX LLM with NNCF and run a sample generation.

    The function exports ``MODEL_ID`` to ONNX under ``OUTPUT_DIR``, compresses the
    exported model with ``nncf.compress_weights`` (``INT4_SYM`` mode, ``ratio=0.8``),
    overwrites ``OUTPUT_DIR / "model.onnx"`` with the compressed model, reloads it
    and generates an answer to a fixed prompt.

    Returns:
        The decoded text generated by the compressed model.
    """
    # Export the pretrained model in ONNX format. The OUTPUT_DIR directory
    # will contain model.onnx, model.onnx_data, and some metadata files.
    exported_model = ORTModelForCausalLM.from_pretrained(MODEL_ID, export=True)
    exported_model.save_pretrained(OUTPUT_DIR)
    # The exported model is not used again once its files are on disk; drop the
    # reference so it can be garbage-collected before compression starts.
    del exported_model

    # Load the exported pretrained model as an ONNX model. For models larger than 2GB,
    # set `load_external_data=False` to load only the model's topology without the weights.
    # The weights will be loaded on the fly during compression. To enable this, specify the
    # `BackendParameters.EXTERNAL_DATA_DIR` parameter, which should be the absolute path to
    # the directory containing the model's external data files.
    onnx_model = onnx.load(OUTPUT_DIR / "model.onnx", load_external_data=False)

    compressed_onnx_model = nncf.compress_weights(
        onnx_model,
        mode=nncf.CompressWeightsMode.INT4_SYM,
        ratio=0.8,
        advanced_parameters=nncf.AdvancedCompressionParameters(
            backend_params={BackendParameters.EXTERNAL_DATA_DIR: OUTPUT_DIR}
        ),
    )

    # Replace the original model with the compressed model.
    onnx.save(compressed_onnx_model, OUTPUT_DIR / "model.onnx", save_as_external_data=True)

    # Infer Model.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    ort_model = ORTModelForCausalLM.from_pretrained(OUTPUT_DIR)
    # Move the prompt tensors to the device of the model that runs generation
    # (the reloaded compressed model), not the export-time model.
    inputs = tokenizer("What is PyTorch?", return_tensors="pt").to(device=ort_model.device)

    # perf_counter() is the clock intended for measuring elapsed intervals.
    start_t = time.perf_counter()
    output = ort_model.generate(**inputs, max_new_tokens=100)
    print("Elapsed time: ", time.perf_counter() - start_t)

    output_text = tokenizer.decode(output[0])
    print(output_text)
    return output_text
66+
67+
68+
if __name__ == "__main__":
69+
main()
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
transformers
2+
openvino==2025.1
3+
optimum-intel[openvino]
4+
onnx==1.17.0

tests/cross_fw/examples/example_scope.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,14 @@
218218
"word_count": 65
219219
}
220220
},
221+
"llm_compression_onnx": {
222+
"backend": "onnx",
223+
"requirements": "examples/llm_compression/onnx/tiny_llama/requirements.txt",
224+
"cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz",
225+
"accuracy_metrics": {
226+
"word_count": 77
227+
}
228+
},
221229
"llm_tune_params": {
222230
"backend": "openvino",
223231
"requirements": "examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt",

tests/cross_fw/examples/run_example.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,14 @@ def llm_compression() -> dict[str, float]:
168168
return {"word_count": len(result.split())}
169169

170170

171+
def llm_compression_onnx() -> dict[str, float]:
    """Run the ONNX TinyLlama weight compression example and count its output words.

    Returns:
        A one-entry mapping ``{"word_count": N}``, where ``N`` is the number of
        whitespace-separated words in the text returned by the example's ``main``.
    """
    # Function-scope import: the example module is loaded only when this runner is called.
    from examples.llm_compression.onnx.tiny_llama.main import main as run_tiny_llama_onnx

    generated_text = run_tiny_llama_onnx()
    word_count = len(generated_text.split())
    return {"word_count": word_count}
177+
178+
171179
def llm_tune_params() -> dict[str, float]:
172180
from examples.llm_compression.openvino.tiny_llama_find_hyperparams.main import main as llm_tune_params_main
173181

0 commit comments

Comments
 (0)