[ONNX] LLM compression example for ONNX (openvinotoolkit#3513)

andrey-churkin · web-flow · commit e0f80e01088b · 2025-05-28T10:47:44.000+01:00
### Changes

Add LLM weight compression example for ONNX

### Reason for changes

Ref: 168070

### Related tickets

Ref: 168070

### Tests

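Added the `llm_compression_onnx` example test (`tests/cross_fw/examples/example_scope.json`, `tests/cross_fw/examples/run_example.py`).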
diff --git a/examples/llm_compression/onnx/tiny_llama/README.md b/examples/llm_compression/onnx/tiny_llama/README.md
@@ -0,0 +1,26 @@
+# Large Language Models Weight Compression Example
+
+This example demonstrates how to optimize Large Language Models (LLMs) in ONNX format using the NNCF weight compression API. The example applies 4/8-bit mixed-precision quantization to the weights of the Linear (fully connected) layers of the [PY007/TinyLlama-1.1B-Chat-v0.3](https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.3) model, which is the model that `main.py` downloads. This leads to a significant decrease in model footprint and a performance improvement with ONNX Runtime.
+
+## Prerequisites
+
+To use this example:
+
+- Create a separate Python* environment and activate it: `python3 -m venv nncf_env && source nncf_env/bin/activate`
+- Install dependencies:
+
+```bash
+pip install -U pip
+pip install -r requirements.txt
+pip install ../../../../
+```
+
+## Run Example
+
+To run the example:
+
+```bash
+python main.py
+```
+
+It will automatically download the baseline model, compress its weights, and save the resulting model to the `tinyllama_compressed` directory next to `main.py`.
diff --git a/examples/llm_compression/onnx/tiny_llama/main.py b/examples/llm_compression/onnx/tiny_llama/main.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2025 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+from pathlib import Path
+
+import onnx
+from optimum.onnxruntime import ORTModelForCausalLM
+from transformers import AutoTokenizer
+
+import nncf
+from nncf.onnx.quantization.backend_parameters import BackendParameters
+
+# Directory that contains this script.
+ROOT = Path(__file__).parent.resolve()
+
+MODEL_ID = "PY007/TinyLlama-1.1B-Chat-v0.3"  # model identifier passed to `from_pretrained`
+OUTPUT_DIR = ROOT / "tinyllama_compressed"  # exported ONNX model is saved here, then replaced by the compressed one
+
+
+def main():
+    """Export TinyLlama to ONNX, compress its weights with NNCF, and return text generated for a sample prompt."""
+    # Export the pretrained model in ONNX format. The OUTPUT_DIR directory
+    # will contain model.onnx, model.onnx_data, and some metadata files.
+    model = ORTModelForCausalLM.from_pretrained(MODEL_ID, export=True)
+    model.save_pretrained(OUTPUT_DIR)
+
+    # Load the exported pretrained model as an ONNX model. For models larger than 2GB, set
+    # `load_external_data=False` to load only the model's topology without the weights. The weights will
+    # be loaded on the fly during compression. To enable this, specify the `BackendParameters.EXTERNAL_DATA_DIR`
+    # parameter, which should be the absolute path to the directory containing the model's external data files.
+    onnx_model = onnx.load(OUTPUT_DIR / "model.onnx", load_external_data=False)
+
+    compressed_onnx_model = nncf.compress_weights(
+        onnx_model,
+        mode=nncf.CompressWeightsMode.INT4_SYM,
+        ratio=0.8,
+        advanced_parameters=nncf.AdvancedCompressionParameters(
+            backend_params={BackendParameters.EXTERNAL_DATA_DIR: OUTPUT_DIR}
+        ),
+    )
+
+    # Replace the original model with the compressed model.
+    onnx.save(compressed_onnx_model, OUTPUT_DIR / "model.onnx", save_as_external_data=True)
+
+    # Run inference with the compressed model; inputs are moved to the device of the model that generates.
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    ort_model = ORTModelForCausalLM.from_pretrained(OUTPUT_DIR)
+    input_ids = tokenizer("What is PyTorch?", return_tensors="pt").to(device=ort_model.device)
+
+    start_t = time.perf_counter()  # monotonic clock intended for measuring intervals
+    output = ort_model.generate(**input_ids, max_new_tokens=100)
+    print("Elapsed time: ", time.perf_counter() - start_t)
+
+    output_text = tokenizer.decode(output[0])
+    print(output_text)
+    return output_text
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/llm_compression/onnx/tiny_llama/requirements.txt b/examples/llm_compression/onnx/tiny_llama/requirements.txt
@@ -0,0 +1,4 @@
+transformers
+openvino==2025.1
+optimum-intel[openvino]
+onnx==1.17.0
diff --git a/tests/cross_fw/examples/example_scope.json b/tests/cross_fw/examples/example_scope.json
@@ -218,6 +218,14 @@
             "word_count": 65
         }
     },
+    "llm_compression_onnx": {
+        "backend": "onnx",
+        "requirements": "examples/llm_compression/onnx/tiny_llama/requirements.txt",
+        "cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz",
+        "accuracy_metrics": {
+            "word_count": 77
+        }
+    },
     "llm_tune_params": {
         "backend": "openvino",
         "requirements": "examples/llm_compression/openvino/tiny_llama_find_hyperparams/requirements.txt",
diff --git a/tests/cross_fw/examples/run_example.py b/tests/cross_fw/examples/run_example.py
@@ -168,6 +168,14 @@ def llm_compression() -> dict[str, float]:
     return {"word_count": len(result.split())}
 
 
+def llm_compression_onnx() -> dict[str, float]:
+    """Run the ONNX TinyLlama compression example and report the word count of the text it returns."""
+    from examples.llm_compression.onnx.tiny_llama.main import main as run_onnx_llm_example
+
+    generated_text = run_onnx_llm_example()
+    return {"word_count": len(generated_text.split())}
+
+
 def llm_tune_params() -> dict[str, float]:
     from examples.llm_compression.openvino.tiny_llama_find_hyperparams.main import main as llm_tune_params_main