55"""
66
77import logging
8+ import time
89from collections .abc import Iterable
910from pathlib import Path
10- from typing import Optional , Union
1111
1212from PIL import Image as PILImage
1313
1919from docling .models .inference_engines .vlm import (
2020 BaseVlmEngine ,
2121 VlmEngineInput ,
22+ VlmEngineType ,
2223 create_vlm_engine ,
2324)
2425from docling .utils .profiling import TimeRecorder
@@ -42,7 +43,7 @@ def __init__(
4243 self ,
4344 enabled : bool ,
4445 enable_remote_services : bool ,
45- artifacts_path : Optional [ Union [ Path , str ]] ,
46+ artifacts_path : Path | str | None ,
4647 options : VlmConvertOptions ,
4748 accelerator_options : AcceleratorOptions ,
4849 ):
@@ -81,6 +82,26 @@ def __init__(
8182
8283 _log .info ("VlmConvertModel initialized successfully" )
8384
def _get_runtime_engine_type(self) -> VlmEngineType:
    """Return the engine type to use when building runtime inputs.

    Prefers the ``selected_engine_type`` attribute of ``self.engine`` when
    that attribute exists and is not ``None``; otherwise falls back to the
    configured ``self.options.engine_options.engine_type``.

    Returns:
        The engine type reported by the engine instance, or the configured
        one when the engine does not report any.
    """
    # getattr with a default: not every engine object is guaranteed to
    # expose ``selected_engine_type``.
    # NOTE(review): presumably set by engines that resolve a concrete
    # backend at runtime -- confirm against the engine implementations.
    selected_engine_type = getattr(self.engine, "selected_engine_type", None)
    if selected_engine_type is not None:
        return selected_engine_type
    return self.options.engine_options.engine_type
90+
def _build_engine_input(self, image: PILImage.Image, prompt: str) -> VlmEngineInput:
    """Build a single ``VlmEngineInput`` for one image/prompt pair.

    Generation settings are read from ``self.options.model_spec``
    (``temperature``, ``max_new_tokens``, ``stop_strings``), and the extra
    generation config is obtained from
    ``model_spec.get_runtime_input_extra_config`` for the engine type
    returned by ``self._get_runtime_engine_type()``.

    Args:
        image: Image to pass to the engine.
        prompt: Prompt text to pair with the image.

    Returns:
        The populated ``VlmEngineInput``.
    """
    model_spec = self.options.model_spec
    runtime_engine_type = self._get_runtime_engine_type()
    return VlmEngineInput(
        image=image,
        prompt=prompt,
        temperature=model_spec.temperature,
        max_new_tokens=model_spec.max_new_tokens,
        # Copy into a fresh list so each input owns its own list object
        # rather than sharing the one held by the model spec.
        stop_strings=list(model_spec.stop_strings),
        extra_generation_config=model_spec.get_runtime_input_extra_config(
            runtime_engine_type
        ),
    )
104+
84105 def __call__ (
85106 self , conv_res : ConversionResult , page_batch : Iterable [Page ]
86107 ) -> Iterable [Page ]:
@@ -106,33 +127,43 @@ def __call__(
106127 images = []
107128 prompts = []
108129 valid_pages = []
130+ rasterize_time = 0.0
131+ scale_resize_time = 0.0
132+ max_size_resize_time = 0.0
109133
110134 for page in page_list :
111- if page .image is None :
135+ rasterize_start = time .perf_counter ()
136+ image = page .image
137+ rasterize_time += time .perf_counter () - rasterize_start
138+
139+ if image is None :
112140 _log .warning (
113141 f"Page { page .page_no } has no image, skipping VLM conversion"
114142 )
115143 continue
116144
117145 # Scale image if needed
118- image = page .image
119146 if self .options .scale != 1.0 :
147+ resize_start = time .perf_counter ()
120148 new_size = (
121149 int (image .width * self .options .scale ),
122150 int (image .height * self .options .scale ),
123151 )
124152 image = image .resize (new_size , PILImage .Resampling .LANCZOS )
153+ scale_resize_time += time .perf_counter () - resize_start
125154
126155 # Apply max_size constraint if specified
127156 if self .options .max_size is not None :
128157 max_dim = max (image .width , image .height )
129158 if max_dim > self .options .max_size :
159+ resize_start = time .perf_counter ()
130160 scale_factor = self .options .max_size / max_dim
131161 new_size = (
132162 int (image .width * scale_factor ),
133163 int (image .height * scale_factor ),
134164 )
135165 image = image .resize (new_size , PILImage .Resampling .LANCZOS )
166+ max_size_resize_time += time .perf_counter () - resize_start
136167
137168 images .append (image )
138169 prompts .append (self .options .model_spec .prompt )
@@ -143,22 +174,29 @@ def __call__(
143174 return
144175
145176 # Process through runtime using batch prediction
146- _log .debug (f"Processing { len (images )} pages through VLM engine (batched)" )
177+ _log .debug (
178+ "Prepared %s pages for VLM engine: rasterize=%.3fs, scale_resize=%.3fs, max_size_resize=%.3fs" ,
179+ len (images ),
180+ rasterize_time ,
181+ scale_resize_time ,
182+ max_size_resize_time ,
183+ )
147184
148185 try :
149186 # Create batch of runtime inputs
150187 engine_inputs = [
151- VlmEngineInput (
152- image = img ,
153- prompt = prompt ,
154- temperature = 0.0 , # Use from options if needed
155- max_new_tokens = 4096 , # Use from options if needed
156- )
188+ self ._build_engine_input (image = img , prompt = prompt )
157189 for img , prompt in zip (images , prompts )
158190 ]
159191
160192 # Run batch inference
193+ batch_start = time .perf_counter ()
161194 outputs = self .engine .predict_batch (engine_inputs )
195+ _log .debug (
196+ "Processed %s pages through VLM engine in %.3fs" ,
197+ len (engine_inputs ),
198+ time .perf_counter () - batch_start ,
199+ )
162200
163201 # Attach predictions to pages
164202 for page , output in zip (valid_pages , outputs ):
@@ -226,12 +264,7 @@ def process_images(
226264
227265 # Process batch of images
228266 engine_inputs = [
229- VlmEngineInput (
230- image = img ,
231- prompt = p ,
232- temperature = 0.0 ,
233- max_new_tokens = 4096 ,
234- )
267+ self ._build_engine_input (image = img , prompt = p )
235268 for img , p in zip (images , prompts )
236269 ]
237270
0 commit comments