Commits (33)
283403f  [OpenVINO] Support Zamba2 by OpenVINO (rkazants, Jun 20, 2025)
e6ef129  Merge remote-tracking branch 'upstream/main' into support_zamba2_ov (rkazants, Jul 27, 2025)
f535012  Apply suggestions from code review (rkazants, Jul 27, 2025)
112af9a  Apply suggestions from code review (rkazants, Jul 27, 2025)
708f52c  Apply suggestions from code review (rkazants, Jul 27, 2025)
f956e00  Apply suggestions from code review (rkazants, Jul 27, 2025)
ff1dbc6  Apply suggestions from code review (rkazants, Jul 27, 2025)
32cfb33  Apply suggestions from code review (rkazants, Jul 27, 2025)
6169f62  Apply suggestions from code review (rkazants, Jul 27, 2025)
78b21de  Apply suggestions from code review (rkazants, Jul 27, 2025)
191a3f4  Apply suggestions from code review (rkazants, Jul 27, 2025)
018d81a  Revert changes in notebooks/openvino/stable_diffusion_hybrid_quantiza… (rkazants, Jul 27, 2025)
7be4c4b  Add tests (rkazants, Jul 28, 2025)
c6ef767  Fix formatting (rkazants, Jul 28, 2025)
3b97ea5  Merge remote-tracking branch 'upstream/main' into support_zamba2_ov (rkazants, Jul 31, 2025)
ff470f7  Re-implement exporting Zamba2 model (rkazants, Jul 31, 2025)
c906220  Fix export_cli_int8 test (rkazants, Jul 31, 2025)
196827e  Merge remote-tracking branch 'upstream/main' into support_zamba2_ov (rkazants, Oct 9, 2025)
34a4ee3  Apply suggestion from @rkazants (rkazants, Oct 9, 2025)
f094f78  Apply suggestion from @rkazants (rkazants, Oct 9, 2025)
4c0ffc5  Apply suggestion from @rkazants (rkazants, Oct 9, 2025)
06ef4e0  Update optimum/exporters/openvino/model_configs.py (rkazants, Oct 9, 2025)
56bff2e  Update optimum/exporters/openvino/model_configs.py (rkazants, Oct 9, 2025)
a4e3bd0  Update tests/openvino/test_exporters_cli.py (rkazants, Oct 9, 2025)
7db344b  Apply suggestion from @rkazants (rkazants, Oct 9, 2025)
3aca613  Apply suggestion from @rkazants (rkazants, Oct 9, 2025)
0825f43  Fix formatting (rkazants, Oct 9, 2025)
b11d517  (rkazants, Oct 13, 2025)
04d1496  ^^X (rkazants, Oct 13, 2025)
bd427b2  Merge remote-tracking branch 'origin/support_zamba2_ov' into support_… (rkazants, Oct 13, 2025)
ef52983  Introduce hybrid cache for both mamba and zamba2 models (rkazants, Oct 13, 2025)
74e4da7  Handle hybrid cache (rkazants, Oct 14, 2025)
f4712b3  Fix model config to set correct dimension for sequence length (rkazants, Oct 16, 2025)
1 change: 1 addition & 0 deletions docs/source/openvino/models.mdx
@@ -145,6 +145,7 @@ Here is the list of the supported architectures :
- XLM
- XLM-Roberta
- XVERSE
- Zamba2

## [Diffusers](https://huggingface.co/docs/diffusers/index)
- Stable Diffusion
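With Zamba2 added to the list of supported architectures above, a Zamba2 checkpoint can be exported to OpenVINO IR and run through the existing `OVModelForCausalLM` API. Below is a minimal usage sketch; the checkpoint id `Zyphra/Zamba2-1.2B` is an assumption used for illustration, not something pinned down by this PR.

```python
# Minimal sketch: export a Zamba2 checkpoint to OpenVINO and run generation.
# The model id below is an assumption for illustration; substitute any Zamba2 checkpoint.
from transformers import AutoTokenizer

from optimum.intel import OVModelForCausalLM

model_id = "Zyphra/Zamba2-1.2B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# export=True converts the PyTorch checkpoint to OpenVINO IR on the fly
model = OVModelForCausalLM.from_pretrained(model_id, export=True)

inputs = tokenizer("OpenVINO is", return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
```

The export can also be done ahead of time with `optimum-cli export openvino --model <model_id> <output_dir>` (the flow the CLI export tests in this PR exercise) and the resulting directory then loaded without `export=True`.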
2 changes: 1 addition & 1 deletion examples/neural_compressor/language-modeling/run_clm.py
@@ -503,7 +503,7 @@ def main():
else:
model = AutoModelForCausalLM.from_config(config)
n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params")

# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
4 changes: 2 additions & 2 deletions examples/neural_compressor/question-answering/run_qa.py
@@ -563,8 +563,8 @@ def num_param(model):

def get_logits(teacher_model_qa, train_dataset):
logger.info("***** Getting logits of teacher model *****")
logger.info(f" Num examples = {len(train_dataset) }")
logger.info(f" Batch Size = {training_args.per_device_eval_batch_size }")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Batch Size = {training_args.per_device_eval_batch_size}")

sampler = None
if accelerator.num_processes > 1:
@@ -15,6 +15,7 @@
"""
A subclass of `INCTrainer` specific to Question-Answering tasks
"""

import math
import time

1 change: 1 addition & 0 deletions examples/neural_compressor/question-answering/utils_qa.py
@@ -15,6 +15,7 @@
"""
Post-processing utilities for question answering.
"""

import collections
import json
import logging
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

""" Finetuning the library models for sequence classification on Clinc."""
"""Finetuning the library models for sequence classification on Clinc."""
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.

import copy
2 changes: 1 addition & 1 deletion examples/neural_compressor/text-classification/run_glue.py
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

""" Finetuning the library models for sequence classification on GLUE."""
"""Finetuning the library models for sequence classification on GLUE."""
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.

import logging
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

""" Finetuning the library models for sequence classification on GLUE."""
"""Finetuning the library models for sequence classification on GLUE."""
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.

import logging
4 changes: 1 addition & 3 deletions examples/neural_compressor/text-generation/run_generation.py
@@ -14,9 +14,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)
"""

"""Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)"""

import argparse
import logging
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

""" Example for stable-diffusion to generate a picture from a text ."""
"""Example for stable-diffusion to generate a picture from a text ."""
# You can also adapt this script on your own text to image task. Pointers for this are left as comments.

import argparse
6 changes: 5 additions & 1 deletion notebooks/ipex/langchain_hf_pipelines.ipynb
@@ -38,6 +38,7 @@
"source": [
"from optimum.intel.version import __version__\n",
"\n",
"\n",
"print(\"optimum-intel version is\", __version__)"
]
},
@@ -49,6 +50,7 @@
"source": [
"from optimum.intel.utils.import_utils import _langchain_hf_version\n",
"\n",
"\n",
"print(\"langchain-huggingface version is\", _langchain_hf_version)"
]
},
@@ -69,6 +71,7 @@
"source": [
"from langchain_huggingface.llms import HuggingFacePipeline\n",
"\n",
"\n",
"hf = HuggingFacePipeline.from_model_id(\n",
" model_id=\"gpt2\",\n",
" task=\"text-generation\",\n",
@@ -94,6 +97,7 @@
"source": [
"from langchain_core.prompts import PromptTemplate\n",
"\n",
"\n",
"template = \"\"\"Question: {question}\n",
"\n",
"Answer: Let's think step by step.\"\"\"\n",
@@ -103,7 +107,7 @@
"\n",
"question = \"What is electroencephalography?\"\n",
"\n",
"print(chain.invoke({\"question\": question}))\n"
"print(chain.invoke({\"question\": question}))"
]
},
{
9 changes: 7 additions & 2 deletions notebooks/ipex/text_generation.ipynb
@@ -22,6 +22,7 @@
"source": [
"import torch\n",
"from transformers import AutoTokenizer\n",
"\n",
"from optimum.intel import IPEXModelForCausalLM"
]
},
@@ -62,9 +63,13 @@
"source": [
"model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16)\n",
"tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
"input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n",
"input_sentence = [\n",
" \"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"\n",
"]\n",
"model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n",
"generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True)\n",
"generation_kwargs = dict(\n",
" max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True\n",
")\n",
"\n",
"generated_ids = model.generate(**model_inputs, **generation_kwargs)\n",
"output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
72 changes: 32 additions & 40 deletions notebooks/openvino/demos/quantized_generation_demo.ipynb
@@ -122,7 +122,7 @@
" \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n",
" },\n",
" \"compile\": False,\n",
" \"quantization_config\": quantization_config\n",
" \"quantization_config\": quantization_config,\n",
"}\n",
"\n",
"# Check whether the model was already exported\n",
@@ -144,8 +144,8 @@
"\n",
"# TODO Optional: export to huggingface/hub\n",
"\n",
"model_size = os.stat(os.path.join(save_name, \"openvino_model.bin\")).st_size / 1024 ** 3\n",
"print(f'Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB')"
"model_size = os.stat(os.path.join(save_name, \"openvino_model.bin\")).st_size / 1024**3\n",
"print(f\"Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB\")"
]
},
{
@@ -214,7 +214,7 @@
"\n",
"\n",
"# Tokenize the sample\n",
"inputs = tokenizer([sample], return_tensors='pt')\n",
"inputs = tokenizer([sample], return_tensors=\"pt\")\n",
"\n",
"# Call generate on the inputs\n",
"out = model.generate(\n",
@@ -296,7 +296,7 @@
"\n",
"\n",
"# Tokenize the sample\n",
"inputs = tokenizer([sample], return_tensors='pt')\n",
"inputs = tokenizer([sample], return_tensors=\"pt\")\n",
"\n",
"out = stateless_model.generate(\n",
" **inputs,\n",
@@ -360,7 +360,7 @@
" \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n",
" },\n",
" \"compile\": False,\n",
" \"quantization_config\": quantization_config\n",
" \"quantization_config\": quantization_config,\n",
"}\n",
"\n",
"# Check whether the model was already exported\n",
@@ -482,10 +482,11 @@
" out = self.model_generate(*args, **kwargs)\n",
" self.seq_lens[-1].append(out.shape[-1])\n",
" return out\n",
"\n",
" self.model.generate = generate_wrapper\n",
" return self\n",
"\n",
" def __exit__(self, type, value, traceback):\n",
" def __exit__(self, type, value, traceback):\n",
" self.model.forward = self.model_forward\n",
" self.model.generate = self.model_generate\n",
" self.model_forward = None\n",
@@ -506,9 +507,8 @@
" sl = np.array(sl, dtype=np.float64)\n",
" ws = np.array(ws, dtype=np.float64)\n",
" out_lens = sl - ws\n",
" accepted = (out_lens[1:] - out_lens[:-1] - 1)\n",
" ar_per_win.append(np.divide(accepted, ws[:-1],\n",
" out=np.zeros_like(accepted),where=ws[:-1] != 0))\n",
" accepted = out_lens[1:] - out_lens[:-1] - 1\n",
" ar_per_win.append(np.divide(accepted, ws[:-1], out=np.zeros_like(accepted), where=ws[:-1] != 0))\n",
" ar_per_win = np.hstack(ar_per_win)\n",
" # Normalized AR doesn't take into account windows with size 0\n",
" if normalize:\n",
@@ -548,7 +548,7 @@
"samples_number = 30\n",
"with AcceptanceRateRecorder(stateless_model) as ar_recorder:\n",
" for text in tqdm(dataset[:samples_number]):\n",
" tokenized_prompt = tokenizer([prompt_template.format(text=text)], return_tensors='pt')\n",
" tokenized_prompt = tokenizer([prompt_template.format(text=text)], return_tensors=\"pt\")\n",
" stateless_model.generate(\n",
" **tokenized_prompt,\n",
" max_new_tokens=128,\n",
@@ -627,7 +627,6 @@
" return False\n",
"\n",
"\n",
"\n",
"# Set the chat template to the tokenizer. The chat template implements the simple template of\n",
"# User: content\n",
"# Assistant: content\n",
@@ -655,11 +654,7 @@
" if model_msg:\n",
" messages.append({\"role\": \"Assistant\", \"content\": model_msg})\n",
" input_token = tokenizer.apply_chat_template(\n",
" messages,\n",
" add_generation_prompt=True,\n",
" tokenize=True,\n",
" return_tensors=\"pt\",\n",
" return_dict=True\n",
" messages, add_generation_prompt=True, tokenize=True, return_tensors=\"pt\", return_dict=True\n",
" )\n",
" return input_token\n",
"\n",
@@ -683,13 +678,13 @@
" # Construct the input message string for the model by concatenating the current system message and conversation history\n",
" # Tokenize the messages string\n",
" inputs = prepare_history_for_model(history)\n",
" input_length = inputs['input_ids'].shape[1]\n",
" input_length = inputs[\"input_ids\"].shape[1]\n",
" # truncate input in case it is too long.\n",
" # TODO improve this\n",
" if input_length > 2000:\n",
" history = [history[-1]]\n",
" inputs = prepare_history_for_model(history)\n",
" input_length = inputs['input_ids'].shape[1]\n",
" input_length = inputs[\"input_ids\"].shape[1]\n",
"\n",
" prompt_char = \"▌\"\n",
" history[-1][1] = prompt_char\n",
@@ -710,11 +705,14 @@
" eos_token_id=[tokenizer.eos_token_id],\n",
" pad_token_id=tokenizer.eos_token_id,\n",
" )\n",
" generate_kwargs = dict(\n",
" streamer=streamer,\n",
" generation_config=generation_config,\n",
" stopping_criteria=stopping_criteria,\n",
" ) | inputs\n",
" generate_kwargs = (\n",
" dict(\n",
" streamer=streamer,\n",
" generation_config=generation_config,\n",
" stopping_criteria=stopping_criteria,\n",
" )\n",
" | inputs\n",
" )\n",
"\n",
" if assisted:\n",
" target_generate = stateless_model.generate\n",
Expand All @@ -741,7 +739,7 @@
" yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n",
" history[-1][1] = partial_text\n",
" generation_time = time.perf_counter() - start\n",
" yield history, f'Generation time: {generation_time:.2f} sec', *([gr.update(interactive=True)] * 4)"
" yield history, f\"Generation time: {generation_time:.2f} sec\", *([gr.update(interactive=True)] * 4)"
]
},
{
@@ -786,7 +784,9 @@
" [\"Can you explain to me briefly what is Python programming language?\"],\n",
" [\"Explain the plot of Cinderella in a sentence.\"],\n",
" [\"Write a Python function to perform binary search over a sorted list. Use markdown to write code\"],\n",
" [\"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?\"],\n",
" [\n",
" \"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?\"\n",
" ],\n",
"]\n",
"\n",
"\n",
@@ -802,7 +802,7 @@
" \"\"\"\n",
" # Append current user message to history with a blank assistant message which will be generated by the model\n",
" history.append([message, None])\n",
" return ('', history)\n",
" return (\"\", history)\n",
"\n",
"\n",
"def prepare_for_regenerate(history):\n",
@@ -826,7 +826,7 @@
" msg = gr.Textbox(placeholder=\"Enter message here...\", show_label=False, autofocus=True, scale=75)\n",
" status = gr.Textbox(\"Status: Idle\", show_label=False, max_lines=1, scale=15)\n",
" with gr.Row():\n",
" submit = gr.Button(\"Submit\", variant='primary')\n",
" submit = gr.Button(\"Submit\", variant=\"primary\")\n",
" regenerate = gr.Button(\"Regenerate\")\n",
" clear = gr.Button(\"Clear\")\n",
" with gr.Accordion(\"Advanced Options:\", open=False):\n",
@@ -865,9 +865,7 @@
" step=0.1,\n",
" interactive=True,\n",
" )\n",
" gr.Examples(\n",
" EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\"\n",
" )\n",
" gr.Examples(EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\")\n",
"\n",
" # Sets generate function to be triggered when the user submit a new message\n",
" gr.on(\n",
@@ -881,20 +879,14 @@
" inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n",
" outputs=[chatbot, status, msg, submit, regenerate, clear],\n",
" concurrency_limit=1,\n",
" queue=True\n",
" )\n",
" regenerate.click(\n",
" fn=prepare_for_regenerate,\n",
" inputs=chatbot,\n",
" outputs=chatbot,\n",
" queue=True,\n",
" concurrency_limit=1\n",
" ).then(\n",
" )\n",
" regenerate.click(fn=prepare_for_regenerate, inputs=chatbot, outputs=chatbot, queue=True, concurrency_limit=1).then(\n",
" fn=generate,\n",
" inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n",
" outputs=[chatbot, status, msg, submit, regenerate, clear],\n",
" concurrency_limit=1,\n",
" queue=True\n",
" queue=True,\n",
" )\n",
" clear.click(fn=lambda: (None, \"Status: Idle\"), inputs=None, outputs=[chatbot, status], queue=False)"
]
4 changes: 3 additions & 1 deletion notebooks/openvino/optimum_openvino_inference.ipynb
@@ -489,7 +489,9 @@
"source": [
"# Set the device directly with `.from_pretrained()`\n",
"if \"GPU\" in Core().available_devices:\n",
" model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp16\", device=\"GPU\")"
" model = OVModelForQuestionAnswering.from_pretrained(\n",
" \"distilbert-base-uncased-distilled-squad-ov-fp16\", device=\"GPU\"\n",
" )"
]
},
{
2 changes: 1 addition & 1 deletion notebooks/openvino/question_answering_quantization.ipynb
@@ -945,7 +945,7 @@
"print(cpu_device_name)\n",
"print(f\"Latency of original FP32 model: {original_latency:.2f} ms\")\n",
"print(f\"Latency of quantized model: {quantized_latency:.2f} ms\")\n",
"print(f\"Speedup: {(original_latency/quantized_latency):.2f}x\")"
"print(f\"Speedup: {(original_latency / quantized_latency):.2f}x\")"
]
}
],
1 change: 1 addition & 0 deletions notebooks/openvino/sentence_transformer_quantization.ipynb
@@ -188,6 +188,7 @@
"\n",
"quantizer = OVQuantizer.from_pretrained(model)\n",
"\n",
"\n",
"def preprocess_function(examples, tokenizer):\n",
" return tokenizer(examples[\"sentence\"], padding=\"max_length\", max_length=384, truncation=True)\n",
"\n",