     AutoImageProcessor = None  # type: ignore
     AutoModelForDepthEstimation = None  # type: ignore

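+# Optional ONNX dependencies: imports are guarded so the module still loads
+# when they are not installed.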
+try:
+    import onnx
+except ImportError:
+    onnx = None  # type: ignore
+
+try:
+    from onnxruntime.transformers.float16 import convert_float_to_float16  # type: ignore[import-untyped]
+
+    HAS_ONNX_QUANTIZATION = True
+except ImportError:
+    convert_float_to_float16 = None  # type: ignore
+    HAS_ONNX_QUANTIZATION = False
+

 logger = logging.getLogger(__name__)

@@ -93,12 +106,53 @@ def ensure_yolo_model_downloaded(
         raise RuntimeError(error_msg) from e


+# Ops that don't work well in FP16 on CPU (this block list can be relaxed on GPU)
+FP16_OP_BLOCK_LIST = [
+    "Resize",
+    "Upsample",
+]
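+# Blocked ops stay in FP32; the converter inserts Cast nodes around them where
+# needed (see the docstring below), trading a little size reduction for CPU
+# robustness.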
+
+
+def convert_onnx_to_fp16(model_path: Path) -> None:
+    """Convert ONNX model to FP16 (mixed precision) in-place.
+
+    Uses ONNX Runtime's float16 converter, which properly handles:
+    - Keeping inputs/outputs as FP32 for compatibility
+    - Blocking problematic ops (Resize, Upsample) from FP16 conversion
+    - Inserting Cast nodes where needed
+
+    This provides ~50% model size reduction while maintaining CPU compatibility.
+
+    Args:
+        model_path: Path to the ONNX model to convert
+
+    Raises:
+        RuntimeError: If onnx or onnxruntime is not available
+    """
+    if not HAS_ONNX_QUANTIZATION or onnx is None:
+        raise RuntimeError("onnx and onnxruntime are required for FP16 conversion.")
+
+    logger.info("Converting ONNX model to FP16 (mixed precision)...")
+
+    model = onnx.load(str(model_path))
+
+    model_fp16 = convert_float_to_float16(
+        model,
+        keep_io_types=True,
+        op_block_list=FP16_OP_BLOCK_LIST,
+    )
+
+    onnx.save(model_fp16, str(model_path))
+    logger.info("FP16 conversion complete: %s", model_path)
+
+
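+# A minimal usage sketch (hypothetical path; assumes onnx and onnxruntime are
+# installed): convert an exported model in-place, then sanity-check the graph.
+#
+#     from pathlib import Path
+#     convert_onnx_to_fp16(Path("models/yolov8n.onnx"))  # hypothetical file
+#     onnx.checker.check_model(onnx.load("models/yolov8n.onnx"))
+
+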
 def export_yolo_to_onnx(
     yolo_path: Path,
     output_path: Path,
     opset: int = 18,
     imgsz: int = 640,
     simplify: bool = True,
+    half: bool = False,
 ) -> Path:
     """Export YOLO model to ONNX format.

@@ -108,25 +162,25 @@ def export_yolo_to_onnx(
         opset: ONNX opset version
         imgsz: Image size
         simplify: Whether to run ONNX simplifier
+        half: Apply FP16 conversion post-export for a smaller model

     Returns:
         Path to the exported ONNX model
     """
-    logger.info("Exporting YOLO model to ONNX...")
+    logger.info("Exporting YOLO model to ONNX (FP16=%s)...", half)
     try:
         if not yolo_path.exists():
             raise FileNotFoundError(f"YOLO model not found at {yolo_path}")

         model = YOLO(str(yolo_path))

-        # Ultralytics export saves to the same directory as the source model by default
-        # or we can specify 'project' and 'name' but it creates subdirs.
-        # Easiest is to let it export, then move if needed.
+        # Export to ONNX in FP32 first
         exported_filename = model.export(
             format="onnx",
             opset=opset,
             imgsz=imgsz,
             simplify=simplify,
+            half=False,
         )
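+        # Note: half=False is deliberate; the optional FP16 pass below uses
+        # keep_io_types=True, so graph inputs/outputs stay FP32 and callers
+        # need no changes.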

         exported_path = Path(exported_filename).resolve()
@@ -137,9 +191,12 @@ def export_yolo_to_onnx(
         if exported_path != output_path:
             shutil.move(str(exported_path), str(output_path))
             logger.info("Moved exported YOLO model to %s", output_path)
-        else:
-            logger.info("YOLO ONNX model ready at: %s", output_path)

+        # Apply FP16 conversion if requested
+        if half:
+            convert_onnx_to_fp16(output_path)
+
+        logger.info("YOLO ONNX model ready at: %s", output_path)
         return output_path

     except Exception as e:
@@ -208,6 +265,7 @@ def export_midas_to_onnx(
     model_repo: str = "intel-isl/MiDaS",
     opset: int = 18,
     input_size: Optional[int] = None,
+    half: bool = False,
 ) -> Path:
     """Export MiDaS model to ONNX format.

@@ -218,16 +276,18 @@ def export_midas_to_onnx(
         model_repo: torch.hub repository to load from
         opset: ONNX opset version
         input_size: Optional manual input size override
+        half: Apply FP16 conversion post-export for a smaller model

     Returns:
         Path to the exported ONNX model
     """
-    logger.info("Exporting %s model to ONNX...", model_type)
+    logger.info("Exporting %s model to ONNX (FP16=%s)...", model_type, half)
     try:
         torch.hub.set_dir(str(cache_dir))
         model = torch.hub.load(model_repo, model_type, trust_repo=True)
         model.eval()

+        # Always export in FP32 first, then convert to FP16 post-export
         default_size, _ = get_midas_onnx_config(model_type)
         size = input_size if input_size else default_size

@@ -246,6 +306,9 @@ def export_midas_to_onnx(
             output_names=["output"],
         )

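+        # Optional FP16 pass, using the same converter as the YOLO export above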
+        if half:
+            convert_onnx_to_fp16(output_path)
+
         logger.info("%s ONNX model ready at: %s", model_type, output_path)
         return output_path
     except Exception as e: