
Commit e2d3c87

Merge pull request #248 from amosproj/feat/178-quant

Feat/178 quantization

Signed-off-by: Felix Hilgers <felix.hilgers@fau.de>

2 parents: a14d93e + f3a2c56

5 files changed: 130 additions & 8 deletions

File tree

- README.md
- scripts/download_models.py
- src/backend/common/config.py
- src/backend/common/utils/model_downloader.py
- src/backend/tests/common/core/test_model_downloader.py

README.md

Lines changed: 8 additions & 0 deletions

````diff
@@ -84,6 +84,14 @@ make export-yolo-onnx
 make export-midas-onnx
 ```
 
+### FP16 Quantization (Optional)
+
+Export models with FP16 precision for ~50% size reduction:
+
+```bash
+ONNX_HALF_PRECISION=true make export-onnx
+```
+
 To start the analyzer service with ONNX backend:
 ```bash
 DETECTOR_BACKEND=onnx DEPTH_BACKEND=onnx make run-analyzer-local
````
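Because the converter keeps graph inputs and outputs in FP32 (see `keep_io_types=True` in the model_downloader diff below), existing callers need no changes. As a quick sanity check, here is a minimal sketch, not part of this commit, that confirms the weights were converted while the I/O stayed FP32; the model path is a placeholder:

```python
# Sketch: verify an FP16-exported model. The path is hypothetical; point it
# at whatever `make export-onnx` produced.
import onnx
from onnx import TensorProto

model = onnx.load("models/yolo.onnx")

# Count initializers stored as FP16 after conversion
fp16 = sum(t.data_type == TensorProto.FLOAT16 for t in model.graph.initializer)
print(f"FP16 initializers: {fp16}/{len(model.graph.initializer)}")

# With keep_io_types=True, graph I/O stays FLOAT (elem_type == 1)
print("input elem_type:", model.graph.input[0].type.tensor_type.elem_type)
```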

scripts/download_models.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -163,7 +163,8 @@ def main() -> None:
         yolo_path=yolo_final_path,
         output_path=yolo_onnx_target,
         opset=args.onnx_opset,
-        simplify=args.onnx_simplify
+        simplify=args.onnx_simplify,
+        half=config.ONNX_HALF_PRECISION,
     )
 
     # --- MiDaS Processing ---
@@ -197,6 +198,7 @@ def main() -> None:
         model_type=args.midas_type,
         model_repo=args.midas_repo,
         opset=args.onnx_opset,
+        half=config.ONNX_HALF_PRECISION,
     )
 
     # --- Depth Anything Processing ---
```
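For context, a hedged sketch of how the flag reaches an exporter when called directly; the import paths and file names here are assumptions, and only the `half=` keyword argument comes from the diff above:

```python
# Sketch (assumed module paths and model paths): calling the exporter with
# the config-driven flag, as scripts/download_models.py now does.
from pathlib import Path

from common.config import Config  # assumed import path for config.py
from common.utils.model_downloader import export_yolo_to_onnx

export_yolo_to_onnx(
    yolo_path=Path("models/yolo.pt"),      # hypothetical source weights
    output_path=Path("models/yolo.onnx"),  # hypothetical target
    half=Config.ONNX_HALF_PRECISION,       # False unless ONNX_HALF_PRECISION is set
)
```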

src/backend/common/config.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -110,6 +110,11 @@ class Config:
     DETECTOR_NUM_CLASSES: int = int(os.getenv("DETECTOR_NUM_CLASSES", "80"))
     TORCH_DEVICE: Optional[str] = os.getenv("TORCH_DEVICE")
     TORCH_HALF_PRECISION: str = os.getenv("TORCH_HALF_PRECISION", "auto")
+    ONNX_HALF_PRECISION: bool = os.getenv("ONNX_HALF_PRECISION", "false").lower() in (
+        "1",
+        "true",
+        "yes",
+    )
     ONNX_PROVIDERS: list[str] = [
         provider.strip()
         for provider in os.getenv("ONNX_PROVIDERS", "").split(",")
```
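A note on the parsing above: only the literal strings "1", "true", and "yes" (case-insensitive) enable the flag; anything else, including "on" or an unset variable, yields False. A self-contained sketch mirroring that logic:

```python
# Mirrors the Config parsing above; runnable standalone.
import os

def onnx_half_precision() -> bool:
    return os.getenv("ONNX_HALF_PRECISION", "false").lower() in ("1", "true", "yes")

os.environ["ONNX_HALF_PRECISION"] = "TRUE"
assert onnx_half_precision() is True

os.environ["ONNX_HALF_PRECISION"] = "on"  # not an accepted value
assert onnx_half_precision() is False
```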

src/backend/common/utils/model_downloader.py

Lines changed: 70 additions & 7 deletions

```diff
@@ -20,6 +20,19 @@
     AutoImageProcessor = None  # type: ignore
     AutoModelForDepthEstimation = None  # type: ignore
 
+try:
+    import onnx
+except ImportError:
+    onnx = None  # type: ignore
+
+try:
+    from onnxruntime.transformers.float16 import convert_float_to_float16  # type: ignore[import-untyped]
+
+    HAS_ONNX_QUANTIZATION = True
+except ImportError:
+    convert_float_to_float16 = None  # type: ignore
+    HAS_ONNX_QUANTIZATION = False
+
 
 logger = logging.getLogger(__name__)
 
@@ -93,12 +106,53 @@ def ensure_yolo_model_downloaded(
         raise RuntimeError(error_msg) from e
 
 
+# Ops that don't work well with FP16 on CPU (can be removed if on GPU)
+FP16_OP_BLOCK_LIST = [
+    "Resize",
+    "Upsample",
+]
+
+
+def quantize_onnx_dynamic(model_path: Path) -> None:
+    """Convert ONNX model to FP16 (mixed precision) in-place.
+
+    Uses ONNX Runtime's float16 converter, which properly handles:
+    - Keeping inputs/outputs as FP32 for compatibility
+    - Blocking problematic ops (Resize, Upsample) from FP16 conversion
+    - Inserting Cast nodes where needed
+
+    This provides ~50% model size reduction while maintaining CPU compatibility.
+
+    Args:
+        model_path: Path to the ONNX model to convert
+
+    Raises:
+        RuntimeError: If onnx or onnxruntime.transformers is not available
+    """
+    if not HAS_ONNX_QUANTIZATION or not onnx:
+        raise RuntimeError("onnx and onnxruntime are required for FP16 conversion.")
+
+    logger.info("Converting ONNX model to FP16 (mixed precision)...")
+
+    model = onnx.load(str(model_path))
+
+    model_fp16 = convert_float_to_float16(
+        model,
+        keep_io_types=True,
+        op_block_list=FP16_OP_BLOCK_LIST,
+    )
+
+    onnx.save(model_fp16, str(model_path))
+    logger.info("FP16 conversion complete: %s", model_path)
+
+
 def export_yolo_to_onnx(
     yolo_path: Path,
     output_path: Path,
     opset: int = 18,
     imgsz: int = 640,
     simplify: bool = True,
+    half: bool = False,
 ) -> Path:
     """Export YOLO model to ONNX format.
 
@@ -108,25 +162,25 @@ def export_yolo_to_onnx(
         opset: ONNX opset version
         imgsz: Image size
         simplify: Whether to run ONNX simplifier
+        half: Apply FP16 conversion post-export for smaller model size
 
     Returns:
         Path to the exported ONNX model
     """
-    logger.info("Exporting YOLO model to ONNX...")
+    logger.info("Exporting YOLO model to ONNX (quantize=%s)...", half)
     try:
         if not yolo_path.exists():
             raise FileNotFoundError(f"YOLO model not found at {yolo_path}")
 
         model = YOLO(str(yolo_path))
 
-        # Ultralytics export saves to the same directory as the source model by default
-        # or we can specify 'project' and 'name' but it creates subdirs.
-        # Easiest is to let it export, then move if needed.
+        # Export to ONNX in FP32 first
        exported_filename = model.export(
             format="onnx",
             opset=opset,
             imgsz=imgsz,
             simplify=simplify,
+            half=False,
         )
 
         exported_path = Path(exported_filename).resolve()
@@ -137,9 +191,12 @@ def export_yolo_to_onnx(
         if exported_path != output_path:
             shutil.move(str(exported_path), str(output_path))
             logger.info("Moved exported YOLO model to %s", output_path)
-        else:
-            logger.info("YOLO ONNX model ready at: %s", output_path)
 
+        # Apply FP16 conversion if requested
+        if half:
+            quantize_onnx_dynamic(output_path)
+
+        logger.info("YOLO ONNX model ready at: %s", output_path)
         return output_path
 
     except Exception as e:
@@ -208,6 +265,7 @@ def export_midas_to_onnx(
     model_repo: str = "intel-isl/MiDaS",
     opset: int = 18,
     input_size: Optional[int] = None,
+    half: bool = False,
 ) -> Path:
     """Export MiDaS model to ONNX format.
 
@@ -218,16 +276,18 @@ def export_midas_to_onnx(
         model_repo: Repo
         opset: ONNX opset version
         input_size: Optional manual input size override
+        half: Apply FP16 quantization for smaller model size
 
     Returns:
         Path to the exported ONNX model
     """
-    logger.info("Exporting %s model to ONNX...", model_type)
+    logger.info("Exporting %s model to ONNX (FP16=%s)...", model_type, half)
     try:
         torch.hub.set_dir(str(cache_dir))
         model = torch.hub.load(model_repo, model_type, trust_repo=True)
         model.eval()
 
+        # Always export in FP32 first, then quantize post-export
         default_size, _ = get_midas_onnx_config(model_type)
         size = input_size if input_size else default_size
 
@@ -246,6 +306,9 @@ def export_midas_to_onnx(
             output_names=["output"],
         )
 
+        if half:
+            quantize_onnx_dynamic(output_path)
+
         logger.info("%s ONNX model ready at: %s", model_type, output_path)
         return output_path
     except Exception as e:
```
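Because of `keep_io_types=True`, converted models still accept and return float32 tensors, so downstream inference code is unchanged. A minimal sketch, assuming onnxruntime is installed; the model path and the 640x640 input shape are assumptions taken from the exporter defaults:

```python
# Sketch: a converted model still presents float32 I/O to callers.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("models/yolo.onnx", providers=["CPUExecutionProvider"])
inp = sess.get_inputs()[0]
print(inp.type)  # "tensor(float)", not "tensor(float16)"

x = np.random.rand(1, 3, 640, 640).astype(np.float32)  # assumed YOLO input shape
outputs = sess.run(None, {inp.name: x})
print([o.shape for o in outputs])
```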

src/backend/tests/common/core/test_model_downloader.py

Lines changed: 44 additions & 0 deletions

```diff
@@ -4,14 +4,25 @@
 from pathlib import Path
 from unittest.mock import MagicMock, patch
 
+import numpy as np
 import pytest
 
 from common.utils.model_downloader import (
+    quantize_onnx_dynamic,
     ensure_midas_model_available,
     ensure_yolo_model_downloaded,
     get_midas_cache_dir,
+    HAS_ONNX_QUANTIZATION,
 )
 
+try:
+    import onnx
+    from onnx import TensorProto, helper, numpy_helper
+
+    ONNX_AVAILABLE = True
+except ImportError:
+    ONNX_AVAILABLE = False
+
 
 @pytest.fixture
 def tmp_models_dir(tmp_path):
@@ -145,3 +156,36 @@ def test_ensure_yolo_model_downloaded_creates_cache_directory(tmp_path, mock_yol
     assert result == model_path
     # The implementation passes Path objects to copy2
     mock_copy.assert_called_once_with(downloaded_path, model_path)
+
+
+@pytest.mark.skipif(
+    not ONNX_AVAILABLE or not HAS_ONNX_QUANTIZATION,
+    reason="onnx or onnxruntime.transformers.float16 not installed",
+)
+def test_quantize_onnx_dynamic(tmp_path):
+    """Test FP16 conversion reduces model size and keeps IO types as FP32."""
+    # Create a basic model with FP32 weights
+    weight_data = np.random.randn(100, 100).astype(np.float32)
+    weight_tensor = numpy_helper.from_array(weight_data, name="weight")
+    input_info = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 100])
+    output_info = helper.make_tensor_value_info("output", TensorProto.FLOAT, [1, 100])
+    node = helper.make_node("MatMul", ["input", "weight"], ["output"])
+    graph = helper.make_graph(
+        [node], "test", [input_info], [output_info], [weight_tensor]
+    )
+    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+
+    model_path = tmp_path / "model.onnx"
+    onnx.save(model, str(model_path))
+    fp32_size = model_path.stat().st_size
+    quantize_onnx_dynamic(model_path)
+    fp16_size = model_path.stat().st_size
+    assert fp16_size < fp32_size * 0.7
+
+    # Model can be loaded after conversion
+    converted_model = onnx.load(str(model_path))
+    assert converted_model is not None
+
+    # With keep_io_types=True, inputs/outputs should remain FP32
+    assert converted_model.graph.input[0].type.tensor_type.elem_type == TensorProto.FLOAT
+    assert converted_model.graph.output[0].type.tensor_type.elem_type == TensorProto.FLOAT
```
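A note on the 0.7 bound in the test: the dominant payload is the 100x100 weight initializer, which halves in FP16, so the expected ratio is about 0.5; the extra margin covers the inserted Cast nodes and protobuf overhead. The arithmetic, as a sketch:

```python
# Back-of-envelope for the size assertion above.
fp32_weight_bytes = 100 * 100 * 4  # 40_000 bytes of FP32 weights
fp16_weight_bytes = 100 * 100 * 2  # 20_000 bytes after conversion
assert fp16_weight_bytes / fp32_weight_bytes == 0.5  # well under the 0.7 bound
```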
