
Commit e2d3c87

Merge pull request #248 from amosproj/feat/178-quant

Feat/178 quantization

Signed-off-by: Felix Hilgers <felix.hilgers@fau.de>

2 parents: a14d93e + f3a2c56

5 files changed: 130 additions & 8 deletions

File tree

- README.md
- scripts/download_models.py
- src/backend/common/config.py
- src/backend/common/utils/model_downloader.py
- src/backend/tests/common/core/test_model_downloader.py

README.md

Lines changed: 8 additions & 0 deletions

````diff
@@ -84,6 +84,14 @@ make export-yolo-onnx
 make export-midas-onnx
 ```
 
+### FP16 Quantization (Optional)
+
+Export models with FP16 precision for ~50% size reduction:
+
+```bash
+ONNX_HALF_PRECISION=true make export-onnx
+```
+
 To start the analyzer service with ONNX backend:
 ```bash
 DETECTOR_BACKEND=onnx DEPTH_BACKEND=onnx make run-analyzer-local
````
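Because the converter keeps graph inputs and outputs in FP32 (see `keep_io_types=True` in the model_downloader diff below), existing callers need no changes. As a quick sanity check, here is a minimal sketch, not part of this commit, that confirms the weights were converted while the I/O stayed FP32; the model path is a placeholder:

```python
# Sketch: verify an FP16-exported model. The path is hypothetical; point it
# at whatever `make export-onnx` produced.
import onnx
from onnx import TensorProto

model = onnx.load("models/yolo.onnx")

# Count initializers stored as FP16 after conversion
fp16 = sum(t.data_type == TensorProto.FLOAT16 for t in model.graph.initializer)
print(f"FP16 initializers: {fp16}/{len(model.graph.initializer)}")

# With keep_io_types=True, graph I/O stays FLOAT (elem_type == 1)
print("input elem_type:", model.graph.input[0].type.tensor_type.elem_type)
```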

scripts/download_models.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -163,7 +163,8 @@ def main() -> None:
         yolo_path=yolo_final_path,
         output_path=yolo_onnx_target,
         opset=args.onnx_opset,
-        simplify=args.onnx_simplify
+        simplify=args.onnx_simplify,
+        half=config.ONNX_HALF_PRECISION,
     )
 
     # --- MiDaS Processing ---
@@ -197,6 +198,7 @@ def main() -> None:
         model_type=args.midas_type,
         model_repo=args.midas_repo,
         opset=args.onnx_opset,
+        half=config.ONNX_HALF_PRECISION,
     )
 
     # --- Depth Anything Processing ---
```
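For context, a hedged sketch of how the flag reaches an exporter when called directly; the import paths and file names here are assumptions, and only the `half=` keyword argument comes from the diff above:

```python
# Sketch (assumed module paths and model paths): calling the exporter with
# the config-driven flag, as scripts/download_models.py now does.
from pathlib import Path

from common.config import Config  # assumed import path for config.py
from common.utils.model_downloader import export_yolo_to_onnx

export_yolo_to_onnx(
    yolo_path=Path("models/yolo.pt"),      # hypothetical source weights
    output_path=Path("models/yolo.onnx"),  # hypothetical target
    half=Config.ONNX_HALF_PRECISION,       # False unless ONNX_HALF_PRECISION is set
)
```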

src/backend/common/config.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -110,6 +110,11 @@ class Config:
     DETECTOR_NUM_CLASSES: int = int(os.getenv("DETECTOR_NUM_CLASSES", "80"))
     TORCH_DEVICE: Optional[str] = os.getenv("TORCH_DEVICE")
     TORCH_HALF_PRECISION: str = os.getenv("TORCH_HALF_PRECISION", "auto")
+    ONNX_HALF_PRECISION: bool = os.getenv("ONNX_HALF_PRECISION", "false").lower() in (
+        "1",
+        "true",
+        "yes",
+    )
     ONNX_PROVIDERS: list[str] = [
         provider.strip()
         for provider in os.getenv("ONNX_PROVIDERS", "").split(",")
```
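A note on the parsing above: only the literal strings "1", "true", and "yes" (case-insensitive) enable the flag; anything else, including "on" or an unset variable, yields False. A self-contained sketch mirroring that logic:

```python
# Mirrors the Config parsing above; runnable standalone.
import os

def onnx_half_precision() -> bool:
    return os.getenv("ONNX_HALF_PRECISION", "false").lower() in ("1", "true", "yes")

os.environ["ONNX_HALF_PRECISION"] = "TRUE"
assert onnx_half_precision() is True

os.environ["ONNX_HALF_PRECISION"] = "on"  # not an accepted value
assert onnx_half_precision() is False
```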

src/backend/common/utils/model_downloader.py

Lines changed: 70 additions & 7 deletions

```diff
@@ -20,6 +20,19 @@
     AutoImageProcessor = None  # type: ignore
     AutoModelForDepthEstimation = None  # type: ignore
 
+try:
+    import onnx
+except ImportError:
+    onnx = None  # type: ignore
+
+try:
+    from onnxruntime.transformers.float16 import convert_float_to_float16  # type: ignore[import-untyped]
+
+    HAS_ONNX_QUANTIZATION = True
+except ImportError:
+    convert_float_to_float16 = None  # type: ignore
+    HAS_ONNX_QUANTIZATION = False
+
 
 logger = logging.getLogger(__name__)
 
@@ -93,12 +106,53 @@ def ensure_yolo_model_downloaded(
         raise RuntimeError(error_msg) from e
 
 
+# Ops that don't work well with FP16 on CPU (can be removed if on GPU)
+FP16_OP_BLOCK_LIST = [
+    "Resize",
+    "Upsample",
+]
+
+
+def quantize_onnx_dynamic(model_path: Path) -> None:
+    """Convert ONNX model to FP16 (mixed precision) in-place.
+
+    Uses ONNX Runtime's float16 converter, which properly handles:
+    - Keeping inputs/outputs as FP32 for compatibility
+    - Blocking problematic ops (Resize, Upsample) from FP16 conversion
+    - Inserting Cast nodes where needed
+
+    This provides ~50% model size reduction while maintaining CPU compatibility.
+
+    Args:
+        model_path: Path to the ONNX model to convert
+
+    Raises:
+        RuntimeError: If onnx or onnxruntime.transformers is not available
+    """
+    if not HAS_ONNX_QUANTIZATION or not onnx:
+        raise RuntimeError("onnx and onnxruntime are required for FP16 conversion.")
+
+    logger.info("Converting ONNX model to FP16 (mixed precision)...")
+
+    model = onnx.load(str(model_path))
+
+    model_fp16 = convert_float_to_float16(
+        model,
+        keep_io_types=True,
+        op_block_list=FP16_OP_BLOCK_LIST,
+    )
+
+    onnx.save(model_fp16, str(model_path))
+    logger.info("FP16 conversion complete: %s", model_path)
+
+
 def export_yolo_to_onnx(
     yolo_path: Path,
     output_path: Path,
     opset: int = 18,
     imgsz: int = 640,
     simplify: bool = True,
+    half: bool = False,
 ) -> Path:
     """Export YOLO model to ONNX format.
 
@@ -108,25 +162,25 @@ def export_yolo_to_onnx(
         opset: ONNX opset version
         imgsz: Image size
         simplify: Whether to run ONNX simplifier
+        half: Apply FP16 conversion post-export for smaller model size
 
     Returns:
         Path to the exported ONNX model
     """
-    logger.info("Exporting YOLO model to ONNX...")
+    logger.info("Exporting YOLO model to ONNX (quantize=%s)...", half)
     try:
         if not yolo_path.exists():
             raise FileNotFoundError(f"YOLO model not found at {yolo_path}")
 
         model = YOLO(str(yolo_path))
 
-        # Ultralytics export saves to the same directory as the source model by default
-        # or we can specify 'project' and 'name' but it creates subdirs.
-        # Easiest is to let it export, then move if needed.
+        # Export to ONNX in FP32 first
        exported_filename = model.export(
             format="onnx",
             opset=opset,
             imgsz=imgsz,
             simplify=simplify,
+            half=False,
         )
 
         exported_path = Path(exported_filename).resolve()
@@ -137,9 +191,12 @@ def export_yolo_to_onnx(
         if exported_path != output_path:
             shutil.move(str(exported_path), str(output_path))
             logger.info("Moved exported YOLO model to %s", output_path)
-        else:
-            logger.info("YOLO ONNX model ready at: %s", output_path)
 
+        # Apply FP16 conversion if requested
+        if half:
+            quantize_onnx_dynamic(output_path)
+
+        logger.info("YOLO ONNX model ready at: %s", output_path)
         return output_path
 
     except Exception as e:
@@ -208,6 +265,7 @@ def export_midas_to_onnx(
     model_repo: str = "intel-isl/MiDaS",
     opset: int = 18,
     input_size: Optional[int] = None,
+    half: bool = False,
 ) -> Path:
     """Export MiDaS model to ONNX format.
 
@@ -218,16 +276,18 @@ def export_midas_to_onnx(
         model_repo: Repo
         opset: ONNX opset version
         input_size: Optional manual input size override
+        half: Apply FP16 quantization for smaller model size
 
     Returns:
         Path to the exported ONNX model
     """
-    logger.info("Exporting %s model to ONNX...", model_type)
+    logger.info("Exporting %s model to ONNX (FP16=%s)...", model_type, half)
     try:
         torch.hub.set_dir(str(cache_dir))
         model = torch.hub.load(model_repo, model_type, trust_repo=True)
         model.eval()
 
+        # Always export in FP32 first, then quantize post-export
         default_size, _ = get_midas_onnx_config(model_type)
         size = input_size if input_size else default_size
 
@@ -246,6 +306,9 @@ def export_midas_to_onnx(
             output_names=["output"],
         )
 
+        if half:
+            quantize_onnx_dynamic(output_path)
+
         logger.info("%s ONNX model ready at: %s", model_type, output_path)
         return output_path
     except Exception as e:
```
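Because of `keep_io_types=True`, converted models still accept and return float32 tensors, so downstream inference code is unchanged. A minimal sketch, assuming onnxruntime is installed; the model path and the 640x640 input shape are assumptions taken from the exporter defaults:

```python
# Sketch: a converted model still presents float32 I/O to callers.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("models/yolo.onnx", providers=["CPUExecutionProvider"])
inp = sess.get_inputs()[0]
print(inp.type)  # "tensor(float)", not "tensor(float16)"

x = np.random.rand(1, 3, 640, 640).astype(np.float32)  # assumed YOLO input shape
outputs = sess.run(None, {inp.name: x})
print([o.shape for o in outputs])
```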

src/backend/tests/common/core/test_model_downloader.py

Lines changed: 44 additions & 0 deletions

```diff
@@ -4,14 +4,25 @@
 from pathlib import Path
 from unittest.mock import MagicMock, patch
 
+import numpy as np
 import pytest
 
 from common.utils.model_downloader import (
+    quantize_onnx_dynamic,
     ensure_midas_model_available,
     ensure_yolo_model_downloaded,
     get_midas_cache_dir,
+    HAS_ONNX_QUANTIZATION,
 )
 
+try:
+    import onnx
+    from onnx import TensorProto, helper, numpy_helper
+
+    ONNX_AVAILABLE = True
+except ImportError:
+    ONNX_AVAILABLE = False
+
 
 @pytest.fixture
 def tmp_models_dir(tmp_path):
@@ -145,3 +156,36 @@ def test_ensure_yolo_model_downloaded_creates_cache_directory(tmp_path, mock_yol
     assert result == model_path
     # The implementation passes Path objects to copy2
     mock_copy.assert_called_once_with(downloaded_path, model_path)
+
+
+@pytest.mark.skipif(
+    not ONNX_AVAILABLE or not HAS_ONNX_QUANTIZATION,
+    reason="onnx or onnxruntime.transformers.float16 not installed",
+)
+def test_quantize_onnx_dynamic(tmp_path):
+    """Test FP16 conversion reduces model size and keeps IO types as FP32."""
+    # Create a basic model with FP32 weights
+    weight_data = np.random.randn(100, 100).astype(np.float32)
+    weight_tensor = numpy_helper.from_array(weight_data, name="weight")
+    input_info = helper.make_tensor_value_info("input", TensorProto.FLOAT, [1, 100])
+    output_info = helper.make_tensor_value_info("output", TensorProto.FLOAT, [1, 100])
+    node = helper.make_node("MatMul", ["input", "weight"], ["output"])
+    graph = helper.make_graph(
+        [node], "test", [input_info], [output_info], [weight_tensor]
+    )
+    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+
+    model_path = tmp_path / "model.onnx"
+    onnx.save(model, str(model_path))
+    fp32_size = model_path.stat().st_size
+    quantize_onnx_dynamic(model_path)
+    fp16_size = model_path.stat().st_size
+    assert fp16_size < fp32_size * 0.7
+
+    # Model can be loaded after conversion
+    converted_model = onnx.load(str(model_path))
+    assert converted_model is not None
+
+    # With keep_io_types=True, inputs/outputs should remain FP32
+    assert converted_model.graph.input[0].type.tensor_type.elem_type == TensorProto.FLOAT
+    assert converted_model.graph.output[0].type.tensor_type.elem_type == TensorProto.FLOAT
```
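A note on the 0.7 bound in the test: the dominant payload is the 100x100 weight initializer, which halves in FP16, so the expected ratio is about 0.5; the extra margin covers the inserted Cast nodes and protobuf overhead. The arithmetic, as a sketch:

```python
# Back-of-envelope for the size assertion above.
fp32_weight_bytes = 100 * 100 * 4  # 40_000 bytes of FP32 weights
fp16_weight_bytes = 100 * 100 * 2  # 20_000 bytes after conversion
assert fp16_weight_bytes / fp32_weight_bytes == 0.5  # well under the 0.7 bound
```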
