Forward VLM convert model spec runtime settings

geoHeil · geoHeil · commit 2fd433ecce50 · 2026-04-17T08:59:08.000+02:00
Signed-off-by: Georg Heiler <georg.kf.heiler@gmail.com>
diff --git a/docling/datamodel/stage_model_specs.py b/docling/datamodel/stage_model_specs.py
@@ -9,6 +9,7 @@
 from __future__ import annotations
 
 import logging
+from copy import deepcopy
 from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Set
 
 from pydantic import BaseModel, Field
@@ -159,10 +160,24 @@ class VlmModelSpec(BaseModel):
         default_factory=list, description="Stop strings for generation"
     )
 
+    temperature: float = Field(
+        default=0.0, description="Sampling temperature for generation"
+    )
+
     max_new_tokens: int = Field(
         default=4096, description="Maximum number of new tokens to generate"
     )
 
+    extra_generation_config: Dict[str, Any] = Field(
+        default_factory=dict, description="Additional generation configuration"
+    )
+
+    _RUNTIME_INPUT_OVERRIDE_KEYS: ClassVar[Set[str]] = {
+        "transformers_prompt_style",
+        "extra_processor_kwargs",
+        "custom_stopping_criteria",
+    }
+
     def get_repo_id(self, engine_type: VlmEngineType) -> str:
         """Get the repository ID for a specific engine.
 
@@ -248,6 +263,34 @@ def get_engine_config(self, engine_type: VlmEngineType) -> EngineModelConfig:
             extra_config=extra_config,
         )
 
+    def get_runtime_input_extra_config(
+        self, engine_type: VlmEngineType
+    ) -> Dict[str, Any]:
+        """Build runtime input config for a specific engine.
+
+        This returns only the subset of model/engine configuration that should
+        flow into ``VlmEngineInput.extra_generation_config``. Load-time engine
+        options such as ``torch_dtype`` or ``transformers_model_type`` remain in
+        ``EngineModelConfig.extra_config`` and are intentionally excluded.
+        """
+
+        runtime_config: Dict[str, Any] = deepcopy(self.extra_generation_config)
+
+        if engine_type not in self.engine_overrides:
+            return runtime_config
+
+        override_config = self.engine_overrides[engine_type].extra_config
+        nested_generation_config = override_config.get("extra_generation_config")
+
+        if isinstance(nested_generation_config, dict):
+            runtime_config.update(deepcopy(nested_generation_config))
+
+        for key in self._RUNTIME_INPUT_OVERRIDE_KEYS:
+            if key in override_config:
+                runtime_config[key] = deepcopy(override_config[key])
+
+        return runtime_config
+
     def has_explicit_engine_export(self, engine_type: VlmEngineType) -> bool:
         """Check if this model has an explicit export for the given engine.
 
diff --git a/docling/models/inference_engines/vlm/api_openai_compatible_engine.py b/docling/models/inference_engines/vlm/api_openai_compatible_engine.py
@@ -56,28 +56,21 @@ def __init__(
         super().__init__(options, model_config=model_config)
         self.enable_remote_services = enable_remote_services
         self.options: ApiVlmEngineOptions = options
+        self.model_api_params: dict[str, object] = {}
+        self.user_params: dict[str, object] = self.options.params.copy()
 
         if not self.enable_remote_services:
             raise OperationNotAllowed(
                 "Connections to remote services is only allowed when set explicitly. "
                 "pipeline_options.enable_remote_services=True."
             )
 
-        # Merge model_config extra_config (which contains API params from model spec)
-        # with runtime options params. Runtime options take precedence.
+        # Keep model spec API params as defaults only when the user has not
+        # provided explicit API params; explicit runtime params are treated as
+        # complete overrides to avoid vendor-specific conflicts.
         if model_config and "api_params" in model_config.extra_config:
-            # Model spec provides API params (e.g., model name)
-            model_api_params = model_config.extra_config["api_params"]
-
-            # Only use model spec params if user hasn't provided any params
-            # This prevents conflicts when users provide custom params (e.g., model_id for watsonx)
             if not self.options.params:
-                self.merged_params = model_api_params.copy()
-            else:
-                # User provided params - use them as-is (don't merge with model spec)
-                self.merged_params = self.options.params.copy()
-        else:
-            self.merged_params = self.options.params.copy()
+                self.model_api_params = model_config.extra_config["api_params"].copy()
 
     def initialize(self) -> None:
         """Initialize the API engine.
@@ -122,19 +115,21 @@ def _process_single_input(input_data: VlmEngineInput) -> VlmEngineOutput:
             images = preprocess_image_batch([input_data.image])
             image = images[0]
 
-            # Prepare API parameters: engine defaults first, then user/model
-            # params override. This allows users to set Azure-specific params
-            # like max_completion_tokens or override temperature (#3112).
-            api_params: dict[str, object] = {
-                "temperature": input_data.temperature,
-            }
+            # Apply in this order (later entries override earlier ones):
+            # 1. model spec API defaults
+            # 2. per-request generation settings from VlmEngineInput
+            # 3. explicit user API params from engine_options.params
+            api_params: dict[str, object] = self.model_api_params.copy()
+            api_params["temperature"] = input_data.temperature
 
             # Add max_tokens if specified
             if input_data.max_new_tokens:
                 api_params["max_tokens"] = input_data.max_new_tokens
 
-            # User/model spec params take precedence over engine defaults
-            api_params.update(self.merged_params)
+            # Explicit user params take precedence over per-request defaults.
+            # This allows users to set Azure-specific params like
+            # max_completion_tokens or override temperature (#3112).
+            api_params.update(self.user_params)
 
             # If user specified max_completion_tokens, remove conflicting
             # max_tokens (required for Azure OpenAI compatibility)
diff --git a/docling/models/stages/vlm_convert/vlm_convert_model.py b/docling/models/stages/vlm_convert/vlm_convert_model.py
@@ -5,9 +5,9 @@
 """
 
 import logging
+import time
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Optional, Union
 
 from PIL import Image as PILImage
 
@@ -19,6 +19,7 @@
 from docling.models.inference_engines.vlm import (
     BaseVlmEngine,
     VlmEngineInput,
+    VlmEngineType,
     create_vlm_engine,
 )
 from docling.utils.profiling import TimeRecorder
@@ -42,7 +43,7 @@ def __init__(
         self,
         enabled: bool,
         enable_remote_services: bool,
-        artifacts_path: Optional[Union[Path, str]],
+        artifacts_path: Path | str | None,
         options: VlmConvertOptions,
         accelerator_options: AcceleratorOptions,
     ):
@@ -81,6 +82,26 @@ def __init__(
 
         _log.info("VlmConvertModel initialized successfully")
 
+    def _get_runtime_engine_type(self) -> VlmEngineType:
+        selected_engine_type = getattr(self.engine, "selected_engine_type", None)
+        if selected_engine_type is not None:
+            return selected_engine_type
+        return self.options.engine_options.engine_type
+
+    def _build_engine_input(self, image: PILImage.Image, prompt: str) -> VlmEngineInput:
+        model_spec = self.options.model_spec
+        runtime_engine_type = self._get_runtime_engine_type()
+        return VlmEngineInput(
+            image=image,
+            prompt=prompt,
+            temperature=model_spec.temperature,
+            max_new_tokens=model_spec.max_new_tokens,
+            stop_strings=list(model_spec.stop_strings),
+            extra_generation_config=model_spec.get_runtime_input_extra_config(
+                runtime_engine_type
+            ),
+        )
+
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -106,33 +127,43 @@ def __call__(
             images = []
             prompts = []
             valid_pages = []
+            rasterize_time = 0.0
+            scale_resize_time = 0.0
+            max_size_resize_time = 0.0
 
             for page in page_list:
-                if page.image is None:
+                rasterize_start = time.perf_counter()
+                image = page.image
+                rasterize_time += time.perf_counter() - rasterize_start
+
+                if image is None:
                     _log.warning(
                         f"Page {page.page_no} has no image, skipping VLM conversion"
                     )
                     continue
 
                 # Scale image if needed
-                image = page.image
                 if self.options.scale != 1.0:
+                    resize_start = time.perf_counter()
                     new_size = (
                         int(image.width * self.options.scale),
                         int(image.height * self.options.scale),
                     )
                     image = image.resize(new_size, PILImage.Resampling.LANCZOS)
+                    scale_resize_time += time.perf_counter() - resize_start
 
                 # Apply max_size constraint if specified
                 if self.options.max_size is not None:
                     max_dim = max(image.width, image.height)
                     if max_dim > self.options.max_size:
+                        resize_start = time.perf_counter()
                         scale_factor = self.options.max_size / max_dim
                         new_size = (
                             int(image.width * scale_factor),
                             int(image.height * scale_factor),
                         )
                         image = image.resize(new_size, PILImage.Resampling.LANCZOS)
+                        max_size_resize_time += time.perf_counter() - resize_start
 
                 images.append(image)
                 prompts.append(self.options.model_spec.prompt)
@@ -143,22 +174,29 @@ def __call__(
                 return
 
             # Process through runtime using batch prediction
-            _log.debug(f"Processing {len(images)} pages through VLM engine (batched)")
+            _log.debug(
+                "Prepared %s pages for VLM engine: rasterize=%.3fs, scale_resize=%.3fs, max_size_resize=%.3fs",
+                len(images),
+                rasterize_time,
+                scale_resize_time,
+                max_size_resize_time,
+            )
 
             try:
                 # Create batch of runtime inputs
                 engine_inputs = [
-                    VlmEngineInput(
-                        image=img,
-                        prompt=prompt,
-                        temperature=0.0,  # Use from options if needed
-                        max_new_tokens=4096,  # Use from options if needed
-                    )
+                    self._build_engine_input(image=img, prompt=prompt)
                     for img, prompt in zip(images, prompts)
                 ]
 
                 # Run batch inference
+                batch_start = time.perf_counter()
                 outputs = self.engine.predict_batch(engine_inputs)
+                _log.debug(
+                    "Processed %s pages through VLM engine in %.3fs",
+                    len(engine_inputs),
+                    time.perf_counter() - batch_start,
+                )
 
                 # Attach predictions to pages
                 for page, output in zip(valid_pages, outputs):
@@ -226,12 +264,7 @@ def process_images(
 
         # Process batch of images
         engine_inputs = [
-            VlmEngineInput(
-                image=img,
-                prompt=p,
-                temperature=0.0,
-                max_new_tokens=4096,
-            )
+            self._build_engine_input(image=img, prompt=p)
             for img, p in zip(images, prompts)
         ]
 
diff --git a/tests/test_api_vlm_engine.py b/tests/test_api_vlm_engine.py
@@ -0,0 +1,106 @@
+from PIL import Image
+
+from docling.datamodel.stage_model_specs import EngineModelConfig
+from docling.datamodel.vlm_engine_options import ApiVlmEngineOptions
+from docling.models.inference_engines.vlm.api_openai_compatible_engine import (
+    ApiVlmEngine,
+)
+from docling.models.inference_engines.vlm.base import VlmEngineInput, VlmEngineType
+
+
+def test_api_vlm_engine_uses_request_generation_settings_over_model_defaults(
+    monkeypatch,
+) -> None:
+    captured = {}
+
+    def _fake_api_image_request(**kwargs):
+        captured.update(kwargs)
+        return "ok", 1, "stop"
+
+    monkeypatch.setattr(
+        "docling.models.inference_engines.vlm.api_openai_compatible_engine.api_image_request",
+        _fake_api_image_request,
+    )
+
+    engine = ApiVlmEngine(
+        enable_remote_services=True,
+        options=ApiVlmEngineOptions(
+            engine_type=VlmEngineType.API_OPENAI,
+            url="http://localhost:11434/v1/chat/completions",
+        ),
+        model_config=EngineModelConfig(
+            extra_config={
+                "api_params": {
+                    "model": "test-model",
+                    "max_tokens": 4096,
+                    "temperature": 0.0,
+                }
+            }
+        ),
+    )
+
+    outputs = engine.predict_batch(
+        [
+            VlmEngineInput(
+                image=Image.new("RGB", (8, 8), "white"),
+                prompt="Prompt",
+                temperature=0.4,
+                max_new_tokens=128,
+                stop_strings=["</doctag>"],
+            )
+        ]
+    )
+
+    assert [output.text for output in outputs] == ["ok"]
+    assert captured["model"] == "test-model"
+    assert captured["temperature"] == 0.4
+    assert captured["max_tokens"] == 128
+    assert captured["stop"] == ["</doctag>"]
+
+
+def test_api_vlm_engine_allows_explicit_user_params_to_override_request_settings(
+    monkeypatch,
+) -> None:
+    captured = {}
+
+    def _fake_api_image_request(**kwargs):
+        captured.update(kwargs)
+        return "ok", 1, "stop"
+
+    monkeypatch.setattr(
+        "docling.models.inference_engines.vlm.api_openai_compatible_engine.api_image_request",
+        _fake_api_image_request,
+    )
+
+    engine = ApiVlmEngine(
+        enable_remote_services=True,
+        options=ApiVlmEngineOptions(
+            engine_type=VlmEngineType.API_OPENAI,
+            url="http://localhost:11434/v1/chat/completions",
+            params={
+                "model": "override-model",
+                "temperature": 0.8,
+                "max_completion_tokens": 256,
+            },
+        ),
+        model_config=EngineModelConfig(
+            extra_config={"api_params": {"model": "default-model", "max_tokens": 4096}}
+        ),
+    )
+
+    outputs = engine.predict_batch(
+        [
+            VlmEngineInput(
+                image=Image.new("RGB", (8, 8), "white"),
+                prompt="Prompt",
+                temperature=0.4,
+                max_new_tokens=128,
+            )
+        ]
+    )
+
+    assert [output.text for output in outputs] == ["ok"]
+    assert captured["model"] == "override-model"
+    assert captured["temperature"] == 0.8
+    assert captured["max_completion_tokens"] == 256
+    assert "max_tokens" not in captured
diff --git a/tests/test_vlm_convert_model.py b/tests/test_vlm_convert_model.py