Commits (33)
283403f  [OpenVINO] Support Zamba2 by OpenVINO (rkazants, Jun 20, 2025)
e6ef129  Merge remote-tracking branch 'upstream/main' into support_zamba2_ov (rkazants, Jul 27, 2025)
f535012  Apply suggestions from code review (rkazants, Jul 27, 2025)
112af9a  Apply suggestions from code review (rkazants, Jul 27, 2025)
708f52c  Apply suggestions from code review (rkazants, Jul 27, 2025)
f956e00  Apply suggestions from code review (rkazants, Jul 27, 2025)
ff1dbc6  Apply suggestions from code review (rkazants, Jul 27, 2025)
32cfb33  Apply suggestions from code review (rkazants, Jul 27, 2025)
6169f62  Apply suggestions from code review (rkazants, Jul 27, 2025)
78b21de  Apply suggestions from code review (rkazants, Jul 27, 2025)
191a3f4  Apply suggestions from code review (rkazants, Jul 27, 2025)
018d81a  Revert changes in notebooks/openvino/stable_diffusion_hybrid_quantiza… (rkazants, Jul 27, 2025)
7be4c4b  Add tests (rkazants, Jul 28, 2025)
c6ef767  Fix formatting (rkazants, Jul 28, 2025)
3b97ea5  Merge remote-tracking branch 'upstream/main' into support_zamba2_ov (rkazants, Jul 31, 2025)
ff470f7  Re-implement exporting Zamba2 model (rkazants, Jul 31, 2025)
c906220  Fix export_cli_int8 test (rkazants, Jul 31, 2025)
196827e  Merge remote-tracking branch 'upstream/main' into support_zamba2_ov (rkazants, Oct 9, 2025)
34a4ee3  Apply suggestion from @rkazants (rkazants, Oct 9, 2025)
f094f78  Apply suggestion from @rkazants (rkazants, Oct 9, 2025)
4c0ffc5  Apply suggestion from @rkazants (rkazants, Oct 9, 2025)
06ef4e0  Update optimum/exporters/openvino/model_configs.py (rkazants, Oct 9, 2025)
56bff2e  Update optimum/exporters/openvino/model_configs.py (rkazants, Oct 9, 2025)
a4e3bd0  Update tests/openvino/test_exporters_cli.py (rkazants, Oct 9, 2025)
7db344b  Apply suggestion from @rkazants (rkazants, Oct 9, 2025)
3aca613  Apply suggestion from @rkazants (rkazants, Oct 9, 2025)
0825f43  Fix formatting (rkazants, Oct 9, 2025)
b11d517  (rkazants, Oct 13, 2025)
04d1496  ^^X (rkazants, Oct 13, 2025)
bd427b2  Merge remote-tracking branch 'origin/support_zamba2_ov' into support_… (rkazants, Oct 13, 2025)
ef52983  Introduce hybrid cache for both mamba and zamba2 models (rkazants, Oct 13, 2025)
74e4da7  Handle hybrid cache (rkazants, Oct 14, 2025)
f4712b3  Fix model config to set correct dimension for sequence length (rkazants, Oct 16, 2025)
1 change: 1 addition & 0 deletions docs/source/openvino/models.mdx
@@ -145,6 +145,7 @@ Here is the list of the supported architectures :
- XLM
- XLM-Roberta
- XVERSE
- Zamba2

## [Diffusers](https://huggingface.co/docs/diffusers/index)
- Stable Diffusion
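With Zamba2 added to the list of supported architectures above, a Zamba2 checkpoint can be exported to OpenVINO IR and run through the existing `OVModelForCausalLM` API. Below is a minimal usage sketch; the checkpoint id `Zyphra/Zamba2-1.2B` is an assumption used for illustration, not something pinned down by this PR.

```python
# Minimal sketch: export a Zamba2 checkpoint to OpenVINO and run generation.
# The model id below is an assumption for illustration; substitute any Zamba2 checkpoint.
from transformers import AutoTokenizer

from optimum.intel import OVModelForCausalLM

model_id = "Zyphra/Zamba2-1.2B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# export=True converts the PyTorch checkpoint to OpenVINO IR on the fly
model = OVModelForCausalLM.from_pretrained(model_id, export=True)

inputs = tokenizer("OpenVINO is", return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
```

The export can also be done ahead of time with `optimum-cli export openvino --model <model_id> <output_dir>` (the flow the CLI export tests in this PR exercise) and the resulting directory then loaded without `export=True`.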
2 changes: 1 addition & 1 deletion examples/neural_compressor/language-modeling/run_clm.py
@@ -503,7 +503,7 @@ def main():
else:
model = AutoModelForCausalLM.from_config(config)
n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params")

# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
4 changes: 2 additions & 2 deletions examples/neural_compressor/question-answering/run_qa.py
@@ -563,8 +563,8 @@ def num_param(model):

def get_logits(teacher_model_qa, train_dataset):
logger.info("***** Getting logits of teacher model *****")
logger.info(f" Num examples = {len(train_dataset) }")
logger.info(f" Batch Size = {training_args.per_device_eval_batch_size }")
logger.info(f" Num examples = {len(train_dataset)}")
logger.info(f" Batch Size = {training_args.per_device_eval_batch_size}")

sampler = None
if accelerator.num_processes > 1:
@@ -15,6 +15,7 @@
"""
A subclass of `INCTrainer` specific to Question-Answering tasks
"""

import math
import time

1 change: 1 addition & 0 deletions examples/neural_compressor/question-answering/utils_qa.py
@@ -15,6 +15,7 @@
"""
Post-processing utilities for question answering.
"""

import collections
import json
import logging
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

""" Finetuning the library models for sequence classification on Clinc."""
"""Finetuning the library models for sequence classification on Clinc."""
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.

import copy
2 changes: 1 addition & 1 deletion examples/neural_compressor/text-classification/run_glue.py
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

""" Finetuning the library models for sequence classification on GLUE."""
"""Finetuning the library models for sequence classification on GLUE."""
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.

import logging
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

""" Finetuning the library models for sequence classification on GLUE."""
"""Finetuning the library models for sequence classification on GLUE."""
# You can also adapt this script on your own text classification task. Pointers for this are left as comments.

import logging
4 changes: 1 addition & 3 deletions examples/neural_compressor/text-generation/run_generation.py
@@ -14,9 +14,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)
"""

"""Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)"""

import argparse
import logging
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

""" Example for stable-diffusion to generate a picture from a text ."""
"""Example for stable-diffusion to generate a picture from a text ."""
# You can also adapt this script on your own text to image task. Pointers for this are left as comments.

import argparse
6 changes: 5 additions & 1 deletion notebooks/ipex/langchain_hf_pipelines.ipynb
@@ -38,6 +38,7 @@
"source": [
"from optimum.intel.version import __version__\n",
"\n",
"\n",
"print(\"optimum-intel version is\", __version__)"
]
},
@@ -49,6 +50,7 @@
"source": [
"from optimum.intel.utils.import_utils import _langchain_hf_version\n",
"\n",
"\n",
"print(\"langchain-huggingface version is\", _langchain_hf_version)"
]
},
@@ -69,6 +71,7 @@
"source": [
"from langchain_huggingface.llms import HuggingFacePipeline\n",
"\n",
"\n",
"hf = HuggingFacePipeline.from_model_id(\n",
" model_id=\"gpt2\",\n",
" task=\"text-generation\",\n",
@@ -94,6 +97,7 @@
"source": [
"from langchain_core.prompts import PromptTemplate\n",
"\n",
"\n",
"template = \"\"\"Question: {question}\n",
"\n",
"Answer: Let's think step by step.\"\"\"\n",
@@ -103,7 +107,7 @@
"\n",
"question = \"What is electroencephalography?\"\n",
"\n",
"print(chain.invoke({\"question\": question}))\n"
"print(chain.invoke({\"question\": question}))"
]
},
{
9 changes: 7 additions & 2 deletions notebooks/ipex/text_generation.ipynb
@@ -22,6 +22,7 @@
"source": [
"import torch\n",
"from transformers import AutoTokenizer\n",
"\n",
"from optimum.intel import IPEXModelForCausalLM"
]
},
@@ -62,9 +63,13 @@
"source": [
"model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16)\n",
"tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
"input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n",
"input_sentence = [\n",
" \"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"\n",
"]\n",
"model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n",
"generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True)\n",
"generation_kwargs = dict(\n",
" max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True\n",
")\n",
"\n",
"generated_ids = model.generate(**model_inputs, **generation_kwargs)\n",
"output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
72 changes: 32 additions & 40 deletions notebooks/openvino/demos/quantized_generation_demo.ipynb
@@ -122,7 +122,7 @@
" \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n",
" },\n",
" \"compile\": False,\n",
" \"quantization_config\": quantization_config\n",
" \"quantization_config\": quantization_config,\n",
"}\n",
"\n",
"# Check whether the model was already exported\n",
@@ -144,8 +144,8 @@
"\n",
"# TODO Optional: export to huggingface/hub\n",
"\n",
"model_size = os.stat(os.path.join(save_name, \"openvino_model.bin\")).st_size / 1024 ** 3\n",
"print(f'Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB')"
"model_size = os.stat(os.path.join(save_name, \"openvino_model.bin\")).st_size / 1024**3\n",
"print(f\"Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB\")"
]
},
{
@@ -214,7 +214,7 @@
"\n",
"\n",
"# Tokenize the sample\n",
"inputs = tokenizer([sample], return_tensors='pt')\n",
"inputs = tokenizer([sample], return_tensors=\"pt\")\n",
"\n",
"# Call generate on the inputs\n",
"out = model.generate(\n",
@@ -296,7 +296,7 @@
"\n",
"\n",
"# Tokenize the sample\n",
"inputs = tokenizer([sample], return_tensors='pt')\n",
"inputs = tokenizer([sample], return_tensors=\"pt\")\n",
"\n",
"out = stateless_model.generate(\n",
" **inputs,\n",
@@ -360,7 +360,7 @@
" \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n",
" },\n",
" \"compile\": False,\n",
" \"quantization_config\": quantization_config\n",
" \"quantization_config\": quantization_config,\n",
"}\n",
"\n",
"# Check whether the model was already exported\n",
@@ -482,10 +482,11 @@
" out = self.model_generate(*args, **kwargs)\n",
" self.seq_lens[-1].append(out.shape[-1])\n",
" return out\n",
"\n",
" self.model.generate = generate_wrapper\n",
" return self\n",
"\n",
" def __exit__(self, type, value, traceback):\n",
" def __exit__(self, type, value, traceback):\n",
" self.model.forward = self.model_forward\n",
" self.model.generate = self.model_generate\n",
" self.model_forward = None\n",
@@ -506,9 +507,8 @@
" sl = np.array(sl, dtype=np.float64)\n",
" ws = np.array(ws, dtype=np.float64)\n",
" out_lens = sl - ws\n",
" accepted = (out_lens[1:] - out_lens[:-1] - 1)\n",
" ar_per_win.append(np.divide(accepted, ws[:-1],\n",
" out=np.zeros_like(accepted),where=ws[:-1] != 0))\n",
" accepted = out_lens[1:] - out_lens[:-1] - 1\n",
" ar_per_win.append(np.divide(accepted, ws[:-1], out=np.zeros_like(accepted), where=ws[:-1] != 0))\n",
" ar_per_win = np.hstack(ar_per_win)\n",
" # Normalized AR doesn't take into account windows with size 0\n",
" if normalize:\n",
@@ -548,7 +548,7 @@
"samples_number = 30\n",
"with AcceptanceRateRecorder(stateless_model) as ar_recorder:\n",
" for text in tqdm(dataset[:samples_number]):\n",
" tokenized_prompt = tokenizer([prompt_template.format(text=text)], return_tensors='pt')\n",
" tokenized_prompt = tokenizer([prompt_template.format(text=text)], return_tensors=\"pt\")\n",
" stateless_model.generate(\n",
" **tokenized_prompt,\n",
" max_new_tokens=128,\n",
@@ -627,7 +627,6 @@
" return False\n",
"\n",
"\n",
"\n",
"# Set the chat template to the tokenizer. The chat template implements the simple template of\n",
"# User: content\n",
"# Assistant: content\n",
@@ -655,11 +654,7 @@
" if model_msg:\n",
" messages.append({\"role\": \"Assistant\", \"content\": model_msg})\n",
" input_token = tokenizer.apply_chat_template(\n",
" messages,\n",
" add_generation_prompt=True,\n",
" tokenize=True,\n",
" return_tensors=\"pt\",\n",
" return_dict=True\n",
" messages, add_generation_prompt=True, tokenize=True, return_tensors=\"pt\", return_dict=True\n",
" )\n",
" return input_token\n",
"\n",
@@ -683,13 +678,13 @@
" # Construct the input message string for the model by concatenating the current system message and conversation history\n",
" # Tokenize the messages string\n",
" inputs = prepare_history_for_model(history)\n",
" input_length = inputs['input_ids'].shape[1]\n",
" input_length = inputs[\"input_ids\"].shape[1]\n",
" # truncate input in case it is too long.\n",
" # TODO improve this\n",
" if input_length > 2000:\n",
" history = [history[-1]]\n",
" inputs = prepare_history_for_model(history)\n",
" input_length = inputs['input_ids'].shape[1]\n",
" input_length = inputs[\"input_ids\"].shape[1]\n",
"\n",
" prompt_char = \"▌\"\n",
" history[-1][1] = prompt_char\n",
@@ -710,11 +705,14 @@
" eos_token_id=[tokenizer.eos_token_id],\n",
" pad_token_id=tokenizer.eos_token_id,\n",
" )\n",
" generate_kwargs = dict(\n",
" streamer=streamer,\n",
" generation_config=generation_config,\n",
" stopping_criteria=stopping_criteria,\n",
" ) | inputs\n",
" generate_kwargs = (\n",
" dict(\n",
" streamer=streamer,\n",
" generation_config=generation_config,\n",
" stopping_criteria=stopping_criteria,\n",
" )\n",
" | inputs\n",
" )\n",
"\n",
" if assisted:\n",
" target_generate = stateless_model.generate\n",
Expand All @@ -741,7 +739,7 @@
" yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n",
" history[-1][1] = partial_text\n",
" generation_time = time.perf_counter() - start\n",
" yield history, f'Generation time: {generation_time:.2f} sec', *([gr.update(interactive=True)] * 4)"
" yield history, f\"Generation time: {generation_time:.2f} sec\", *([gr.update(interactive=True)] * 4)"
]
},
{
@@ -786,7 +784,9 @@
" [\"Can you explain to me briefly what is Python programming language?\"],\n",
" [\"Explain the plot of Cinderella in a sentence.\"],\n",
" [\"Write a Python function to perform binary search over a sorted list. Use markdown to write code\"],\n",
" [\"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?\"],\n",
" [\n",
" \"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?\"\n",
" ],\n",
"]\n",
"\n",
"\n",
@@ -802,7 +802,7 @@
" \"\"\"\n",
" # Append current user message to history with a blank assistant message which will be generated by the model\n",
" history.append([message, None])\n",
" return ('', history)\n",
" return (\"\", history)\n",
"\n",
"\n",
"def prepare_for_regenerate(history):\n",
@@ -826,7 +826,7 @@
" msg = gr.Textbox(placeholder=\"Enter message here...\", show_label=False, autofocus=True, scale=75)\n",
" status = gr.Textbox(\"Status: Idle\", show_label=False, max_lines=1, scale=15)\n",
" with gr.Row():\n",
" submit = gr.Button(\"Submit\", variant='primary')\n",
" submit = gr.Button(\"Submit\", variant=\"primary\")\n",
" regenerate = gr.Button(\"Regenerate\")\n",
" clear = gr.Button(\"Clear\")\n",
" with gr.Accordion(\"Advanced Options:\", open=False):\n",
@@ -865,9 +865,7 @@
" step=0.1,\n",
" interactive=True,\n",
" )\n",
" gr.Examples(\n",
" EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\"\n",
" )\n",
" gr.Examples(EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\")\n",
"\n",
" # Sets generate function to be triggered when the user submit a new message\n",
" gr.on(\n",
@@ -881,20 +879,14 @@
" inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n",
" outputs=[chatbot, status, msg, submit, regenerate, clear],\n",
" concurrency_limit=1,\n",
" queue=True\n",
" )\n",
" regenerate.click(\n",
" fn=prepare_for_regenerate,\n",
" inputs=chatbot,\n",
" outputs=chatbot,\n",
" queue=True,\n",
" concurrency_limit=1\n",
" ).then(\n",
" )\n",
" regenerate.click(fn=prepare_for_regenerate, inputs=chatbot, outputs=chatbot, queue=True, concurrency_limit=1).then(\n",
" fn=generate,\n",
" inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n",
" outputs=[chatbot, status, msg, submit, regenerate, clear],\n",
" concurrency_limit=1,\n",
" queue=True\n",
" queue=True,\n",
" )\n",
" clear.click(fn=lambda: (None, \"Status: Idle\"), inputs=None, outputs=[chatbot, status], queue=False)"
]
4 changes: 3 additions & 1 deletion notebooks/openvino/optimum_openvino_inference.ipynb
@@ -489,7 +489,9 @@
"source": [
"# Set the device directly with `.from_pretrained()`\n",
"if \"GPU\" in Core().available_devices:\n",
" model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp16\", device=\"GPU\")"
" model = OVModelForQuestionAnswering.from_pretrained(\n",
" \"distilbert-base-uncased-distilled-squad-ov-fp16\", device=\"GPU\"\n",
" )"
]
},
{
2 changes: 1 addition & 1 deletion notebooks/openvino/question_answering_quantization.ipynb
@@ -945,7 +945,7 @@
"print(cpu_device_name)\n",
"print(f\"Latency of original FP32 model: {original_latency:.2f} ms\")\n",
"print(f\"Latency of quantized model: {quantized_latency:.2f} ms\")\n",
"print(f\"Speedup: {(original_latency/quantized_latency):.2f}x\")"
"print(f\"Speedup: {(original_latency / quantized_latency):.2f}x\")"
]
}
],
1 change: 1 addition & 0 deletions notebooks/openvino/sentence_transformer_quantization.ipynb
@@ -188,6 +188,7 @@
"\n",
"quantizer = OVQuantizer.from_pretrained(model)\n",
"\n",
"\n",
"def preprocess_function(examples, tokenizer):\n",
" return tokenizer(examples[\"sentence\"], padding=\"max_length\", max_length=384, truncation=True)\n",
"\n",