8 changes: 8 additions & 0 deletions optimum/commands/export/openvino.py
@@ -274,6 +274,13 @@ def parse_args_openvino(parser: "ArgumentParser"):
"reduces quantization error. Valid only when activations quantization is enabled."
),
)
optional_group.add_argument(
    "--eagle3",
    action="store_true",
    help=(
        "Indicates that the model being exported is a draft model of the EAGLE3 speculative decoding pipeline."
    ),
)
optional_group.add_argument(
"--model-kwargs",
type=json.loads,
@@ -576,6 +583,7 @@ def run(self):
library_name=library_name,
variant=self.args.variant,
model_kwargs=self.args.model_kwargs,
eagle3=self.args.eagle3,
# **input_shapes,
)

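With the new flag, exporting a draft model from the command line could look like the following sketch (the local model path and output directory are hypothetical; the export patches config.json on disk, so a locally downloaded copy of the draft model is assumed):

optimum-cli export openvino -m ./EAGLE3-LLaMA3.1-Instruct-8B --task text-generation-with-past --eagle3 ./ov_eagle3_draft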
56 changes: 56 additions & 0 deletions optimum/exporters/openvino/__main__.py
@@ -16,12 +16,16 @@
import logging
import operator
import warnings
import json
import os
import importlib.util
from functools import reduce
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union

from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from requests.exceptions import ConnectionError as RequestsConnectionError
from safetensors.torch import save_file
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase, ProcessorMixin
from transformers.utils import is_torch_available

@@ -105,6 +109,50 @@ def infer_task(
)
return task

def eagle3_config(model_path: str):
    """Temporarily rewrite config.json so that the Auto* classes resolve to the EAGLE3 classes in model_patcher."""
    config_file = os.path.join(model_path, 'config.json')
    # keep the original config under a new name
    org_config_file = os.path.join(model_path, 'config_org.json')
    os.rename(config_file, org_config_file)

    # read the original config
    with open(org_config_file, 'r', encoding='utf-8') as f:
        config = json.load(f)
    # append the eagle3 suffix to the model type
    if 'model_type' in config:
        org_type = config['model_type']
        if 'eagle3' not in org_type:
            config['model_type'] = org_type + 'eagle3'
    # point auto_map at the EAGLE3 classes shipped with the OpenVINO exporter
    module_name = 'optimum.exporters.openvino.model_patcher'
    spec = importlib.util.find_spec(module_name)
    if spec and spec.origin:
        module_path = os.path.dirname(spec.origin)
        config['auto_map'] = {
            "AutoConfig": module_path + "--model_patcher.LlamaEagle3Config",
            "AutoModel": module_path + "--model_patcher.LlamaEagle3Model",
            "AutoModelForCausalLM": module_path + "--model_patcher.LlamaEagle3ForCausalLM",
        }
    # write the new config.json
    with open(config_file, 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=2)

def extract_d2t(model_path: str, output_path: str):
    """Extract the d2t/t2d mapping tensors from pytorch_model.bin and save them as eagle3.safetensors."""
    load_model_path = os.path.join(model_path, "pytorch_model.bin")
    output_path = os.path.join(output_path, "eagle3.safetensors")
    target_keys = ['d2t', 't2d']
    if os.path.exists(load_model_path):
        state_dict = torch.load(load_model_path, map_location=torch.device('cpu'))
        extracted = {k: state_dict[k] for k in target_keys if k in state_dict}
        # save the extracted tensors
        save_file(extracted, output_path)

def restore_config(model_path: str, ov_path: str):
    """Restore the original config.json and, if the output directory exists, extract the d2t/t2d tensors into it."""
    config_file = os.path.join(model_path, 'config.json')
    org_config_file = os.path.join(model_path, 'config_org.json')
    os.rename(org_config_file, config_file)
    if os.path.exists(ov_path):
        extract_d2t(model_path, ov_path)
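To make the effect of eagle3_config concrete: for a llama-type draft model it temporarily turns config.json into something like the excerpt below, where the install-path prefix is hypothetical (it is resolved at runtime from the location of model_patcher.py); restore_config later puts the original file back.

# Sketch of the rewritten config.json contents (excerpt); the path prefix is an assumed install location.
rewritten_config_excerpt = {
    "model_type": "llamaeagle3",
    "auto_map": {
        "AutoConfig": "/site-packages/optimum/exporters/openvino--model_patcher.LlamaEagle3Config",
        "AutoModel": "/site-packages/optimum/exporters/openvino--model_patcher.LlamaEagle3Model",
        "AutoModelForCausalLM": "/site-packages/optimum/exporters/openvino--model_patcher.LlamaEagle3ForCausalLM",
    },
}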

def main_export(
model_name_or_path: str,
@@ -130,6 +178,7 @@ def main_export(
library_name: Optional[str] = None,
model_loading_kwargs: Optional[Dict[str, Any]] = None,
variant: Optional[str] = None,
eagle3: bool = False,
**kwargs_shapes,
):
"""
@@ -187,6 +236,8 @@
especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success.
stateful (`bool`, defaults to `True`):
Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. Applicable only for decoder models.
eagle3 (`bool`, defaults to `False`):
    Whether the model being exported is an EAGLE3 draft model. Enables the EAGLE3-specific export behavior (config patching, the extra hidden_states input, and extraction of the d2t/t2d tensors).
**kwargs_shapes (`Dict`):
Shapes to use during inference. This argument allows to override the default shapes used during the ONNX export.

@@ -251,6 +302,9 @@
dtype = getattr(torch, dtype) if dtype != "auto" else dtype

    if library_name == "transformers":
        if eagle3:
            # patch the draft model's config before it is loaded below
            eagle3_config(model_name_or_path)

        config = AutoConfig.from_pretrained(
            model_name_or_path,
            subfolder=subfolder,
@@ -539,6 +593,8 @@ class StoreAttr(object):
torch.cuda.is_available = orig_cuda_check
if do_gptq_patching:
GPTQQuantizer.post_init_model = orig_post_init_model
if eagle3 and library_name == "transformers":
restore_config(model_name_or_path, output)


def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None, task=None):
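Taken together, the changes in this file allow a Python-level export along the lines of the sketch below (paths are hypothetical, and a local draft-model directory containing config.json and pytorch_model.bin is assumed); afterwards the original config.json is restored and the d2t/t2d tensors are written next to the OpenVINO IR:

from safetensors.torch import load_file

from optimum.exporters.openvino import main_export

draft_dir = "./EAGLE3-LLaMA3.1-Instruct-8B"   # local draft-model directory (hypothetical path)
output_dir = "./ov_eagle3_draft"

# eagle3=True enables the config rewrite, the llamaeagle3 export path and the d2t/t2d extraction
main_export(draft_dir, output=output_dir, task="text-generation-with-past", eagle3=True)

# extract_d2t() writes the mapping tensors next to the exported model
tensors = load_file(f"{output_dir}/eagle3.safetensors")
print(sorted(tensors))  # ['d2t', 't2d'] when both keys are present in pytorch_model.bin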
13 changes: 13 additions & 0 deletions optimum/exporters/openvino/convert.py
@@ -131,6 +131,9 @@ def _save_model(

    runtime_options = config.runtime_options if hasattr(config, "runtime_options") else {}
    model = _add_runtime_options_to_rt_info(model, runtime_options)

    if 'eagle3' in config._config.model_type:
        model = _add_eagle3_mode_to_rt_info(model)
    save_model(model, path, compress_to_fp16)
    del model
    gc.collect()
@@ -831,6 +834,16 @@ def _add_runtime_options_to_rt_info(model: Model, options: Dict):

return model

def _add_eagle3_mode_to_rt_info(model: Model):
    """
    Add an eagle3_mode flag to the model's runtime info.
    """
    try:
        model.set_rt_info("True", ["eagle3_mode"])
    except Exception:
        pass

    return model

def _add_version_info_to_model(model: Model, library_name: Optional[str] = None):
"""
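As a quick sanity check, the flag written by _add_eagle3_mode_to_rt_info can be read back from the saved IR; a sketch, reusing the hypothetical output directory from the earlier example:

import openvino as ov

core = ov.Core()
ov_model = core.read_model("./ov_eagle3_draft/openvino_model.xml")

# rt_info entry set via set_rt_info("True", ["eagle3_mode"]) during export
if ov_model.has_rt_info(["eagle3_mode"]):
    print(ov_model.get_rt_info(["eagle3_mode"]))  # "True"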
46 changes: 46 additions & 0 deletions optimum/exporters/openvino/model_configs.py
@@ -4278,3 +4278,49 @@ class GPT2OpenVINOConfig(GPT2OnnxConfig):
)
class VisionEncoderDecoderOpenVINOConfig(VisionEncoderDecoderOnnxConfig):
_MODEL_PATCHER = OVSeq2SeqModelPatcher


class EAGLE3DummyGenerator(DummyInputGenerator):
    """
    Generates dummy hidden_states inputs whose last dimension is 3 * hidden_size.
    """

    SUPPORTED_INPUT_NAMES = ("hidden_states",)

    def __init__(
        self,
        task: str,
        normalized_config: NormalizedTextConfig,
        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
        sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"],
        **kwargs,
    ):
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.hidden_size = normalized_config.hidden_size

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        shape = (
            self.batch_size,
            self.sequence_length,
            self.hidden_size * 3,
        )
        return self.random_float_tensor(shape, framework=framework, dtype=float_dtype)

@register_in_tasks_manager("llamaeagle3", *["text-generation", "text-generation-with-past"], library_name="transformers")
Collaborator:
What kind of model can we convert with this addition? I am asking because the original model has a different model type, llama3.
Can you only convert a local copy with a modified model type? I am not sure it is capable of converting the original eagle3 llama model.
Also, the implemented solution does not look scalable to other eagle3 models such as https://huggingface.co/nvidia/gpt-oss-120b-Eagle3

Contributor:
We have verified the conversion and the GenAI pipeline locally with yuhuili/EAGLE3-LLaMA3.1-Instruct-8B and Tengyunw/qwen3_8b_eagle3, and AngelSlim/Qwen3-1.7B_eagle3 will be added to the GenAI repo tests in openvinotoolkit/openvino.genai#2740. We checked the list on the EAGLE3 GitHub repo: most of the models are llama type, so they can be converted in theory or with limited updates. Can we merge this PR first and leave further verification to follow OpenVINO base-model support progress and customer requirements?

AngelSlim/Qwen3-14B_eagle3/config.json:  "model_type": "qwen3",
AngelSlim/Qwen3-a3B_eagle3/config.json:  "model_type": "llama",
AngelSlim/Qwen3-32B_eagle3/config.json:  "model_type": "llama",
AngelSlim/Qwen3-4B_eagle3/config.json:  "model_type": "llama",
AngelSlim/Qwen3-8B_eagle3/config.json:  "model_type": "llama",
AngelSlim/Qwen3-1.7B_eagle3/config.json:  "model_type": "llama",
linglingdan/Eagle3_for_MiniCPM4/config.json:  "model_type": "llama", 
lmsys/EAGLE3-gpt-oss-120b-bf16/config.json:  "model_type": "llama",
lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1/config.json:  "model_type": "llama",
lmsys/Qwen3-235B-A22B-EAGLE3/config.json:  "model_type": "llama",
lmsys/sglang-EAGLE3-Llama-4-Maverick-17B-128E-Instruct-v1/config.json:  "model_type": "llama",
nvidia/gpt-oss-120b-Eagle3/config.json:  "model_type": "llama",
nvidia/Qwen3-235B-A22B-Eagle3/config.json:  "model_type": "llama",
nvidia/Llama-4-Maverick-17B-128E-Eagle3 ??,
Tengyunw/qwen3_30b_moe_eagle3/config.json:  "model_type": "llama",
Tengyunw/qwen3_8b_eagle3/config.json:  "model_type": "llama",
wantsleep/OLMoE_1B_7B_Eagle3/config.json:  "model_type": "olmoe",
yuhuili/EAGLE3-LLaMA3.3-Instruct-70B/config.json:  "model_type": "llama",
yuhuili/EAGLE3-DeepSeek-R1-Distill-LLaMA-8B/config.json:  "model_type": "llama",
yuhuili/EAGLE3-LLaMA3.1-Instruct-8B/config.json:  "model_type": "llama",
yuhuili/EAGLE3-Vicuna1.3-13B/config.json:  "model_type": "llama",
Zjcxy-SmartAI/Eagle3-Qwen3-4B-Instruct-2507-zh/config.json:  "model_type": "llama",

@rkazants (Collaborator), Oct 14, 2025:
Why don't we use the original model type? For this, it relies on a different model type that seems to have been modified manually by you; that is not how it should work. These changes should allow converting the original model. Where does the llamaeagle3 model type come from?
Does it mean that the user should re-create every eagle3 model and modify its model type, etc.?

Contributor:
@rkazants Discussed with Fang; work is in progress to avoid the config.json modification by passing model_type="llamaeagle3" to AutoConfig.from_pretrained.

@peterchen-intel (Contributor), Oct 15, 2025:
Regarding "why don't we use original model type?": the llama modeling in transformers cannot support the eagle3 draft model; the modeling for the eagle3 draft model comes from https://github.com/SafeAILab/EAGLE/blob/main/eagle/model/cnets.py. The current PR supports the conversion of eagle3 draft models with model_type: "llama" in config.json.

class LlamaEagle3OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig):
    DEFAULT_ONNX_OPSET = 14  # Llama now uses F.scaled_dot_product_attention by default for torch>=2.1.1.
    DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator, EAGLE3DummyGenerator)
    DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator
    NORMALIZED_CONFIG_CLASS = NormalizedTextConfig

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        common_inputs = super().inputs
        common_inputs["hidden_states"] = {0: "batch_size", 1: "sequence_length", 2: "hidden_size"}
        return common_inputs

    def patch_model_for_export(
        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
    ) -> "ModelPatcher":
        return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs)
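For reference, the hidden_states input produced by EAGLE3DummyGenerator (and therefore expected by the exported draft model) has a last dimension of three times the hidden size; a minimal shape sketch with assumed values:

import torch

batch_size, sequence_length, hidden_size = 1, 8, 4096  # assumed example values

# matches the shape built in EAGLE3DummyGenerator.generate()
hidden_states = torch.randn(batch_size, sequence_length, 3 * hidden_size)
print(tuple(hidden_states.shape))  # (1, 8, 12288)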