
Commit 16de460

Merge pull request #246 from amosproj/feat/201-model-align
Feat/201 model align

Signed-off-by: Felix Hilgers <felix.hilgers@fau.de>

2 parents e2d3c87 + b483999

11 files changed: 283 additions & 19 deletions


README.md

Lines changed: 2 additions & 0 deletions
@@ -140,7 +140,9 @@ Optional environment variables:
 - `DEPTH_ANYTHING_MODEL` - Hugging Face model ID for Depth Anything V2 (default `depth-anything/Depth-Anything-V2-Small-hf`)
 - `DEPTH_ANYTHING_CACHE_DIR` - Depth Anything cache directory (default `models/depth_anything_cache`)
 - `MIDAS_ONNX_MODEL_PATH` - defaults to `models/midas_small.onnx`
+- `MIDAS_ONNX_INPUT_SIZE` - input size for MiDaS ONNX preprocessing (default: `384`)
 - `MIDAS_ONNX_PROVIDERS` - comma separated ONNX Runtime providers for depth (falls back to `ONNX_PROVIDERS`)
+- `ONNX_SHARED_PREPROCESSING` - reuse one resize step for ONNX detector + depth when sizes align (default: `true`)
 - `DETECTOR_BACKEND` - `torch` (default) or `onnx`
 - `TORCH_DEVICE` - force PyTorch to use `cuda:0`, `cpu`, etc. (defaults to best available)
 - `TORCH_HALF_PRECISION` - `auto` (default), `true`, or `false`
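As a quick illustration, a `.env` that opts into the shared resize path could look like the snippet below. Values are illustrative, and `DEPTH_BACKEND` is assumed to be the environment variable behind the depth backend selection, which this diff does not show:

```
# Illustrative values; with the new defaults the sizes already match.
DETECTOR_BACKEND=onnx
DEPTH_BACKEND=onnx
DETECTOR_IMAGE_SIZE=384
MIDAS_ONNX_INPUT_SIZE=384
ONNX_SHARED_PREPROCESSING=true
```

Shared preprocessing only kicks in when both backends are ONNX and the two sizes match, as the `_should_share_preprocess` check in `manager.py` below shows.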

scripts/download_models.py

Lines changed: 2 additions & 0 deletions
@@ -163,6 +163,7 @@ def main() -> None:
         yolo_path=yolo_final_path,
         output_path=yolo_onnx_target,
         opset=args.onnx_opset,
+        imgsz=config.DETECTOR_IMAGE_SIZE,
         simplify=args.onnx_simplify,
         half=config.ONNX_HALF_PRECISION,
     )
@@ -198,6 +199,7 @@ def main() -> None:
         model_type=args.midas_type,
         model_repo=args.midas_repo,
         opset=args.onnx_opset,
+        input_size=config.MIDAS_ONNX_INPUT_SIZE,
         half=config.ONNX_HALF_PRECISION,
     )
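Threading `imgsz` and `input_size` into the export helpers pins the exported ONNX graphs to the same resolution the runtime preprocessing uses. The script's own exporter functions are not shown in this diff; as a rough sketch of the YOLO side, an Ultralytics export with a fixed input size looks like this (checkpoint path and opset are illustrative placeholders):

```python
# Sketch of a fixed-size YOLO ONNX export via Ultralytics, roughly what the
# script's export helper is expected to do under the hood.
from ultralytics import YOLO

model = YOLO("models/yolo.pt")  # hypothetical checkpoint path
model.export(
    format="onnx",
    imgsz=384,      # pin the graph to DETECTOR_IMAGE_SIZE so runtime inputs match
    opset=17,       # illustrative; the script takes this from --onnx-opset
    simplify=True,
    half=False,
)
```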

src/backend/analyzer/manager.py

Lines changed: 53 additions & 9 deletions
@@ -36,6 +36,7 @@
 )
 from common.utils.transforms import (
     calculate_adaptive_scale,
+    letterbox,
     resize_frame,
 )

@@ -203,6 +204,19 @@ def _get_compute_intrinsics(
         )
         return self._intrinsics_cache[cache_key]

+    def _should_share_preprocess(
+        self, detector: ObjectDetector, estimator: DepthEstimator
+    ) -> bool:
+        """Compute whether shared ONNX preprocessing can be used."""
+        return (
+            config.ONNX_SHARED_PREPROCESSING
+            and config.DETECTOR_BACKEND == "onnx"
+            and config.DEPTH_BACKEND == "onnx"
+            and config.DETECTOR_IMAGE_SIZE == config.MIDAS_ONNX_INPUT_SIZE
+            and hasattr(detector, "infer_preprocessed")
+            and hasattr(estimator, "estimate_distance_m_preprocessed")
+        )
+
     async def shutdown(self) -> None:
         """Cleanup on service shutdown."""
         await self._stop_processing()
@@ -254,6 +268,7 @@ async def _process_frames(self, source_track: MediaStreamTrack) -> None:
         """Process frames from webcam and send metadata to all clients."""
         detector = get_detector()
         estimator = get_depth_estimator()
+        shared_preprocess = self._should_share_preprocess(detector, estimator)

         state = ProcessingState(
             target_scale=self.target_scale_init, source_track=source_track
@@ -301,7 +316,12 @@ async def frame_receiver() -> None:

             self._inference_task = asyncio.create_task(
                 self._run_inference_pipeline(
-                    frame_small, state, detector, estimator, current_time
+                    frame_small,
+                    state,
+                    detector,
+                    estimator,
+                    current_time,
+                    shared_preprocess,
                 )
             )

@@ -425,12 +445,25 @@ async def _process_detection(
         state: ProcessingState,
         detector: ObjectDetector,
         estimator: DepthEstimator,
+        shared_preprocess: bool,
     ) -> tuple[list[Detection], list[float], list[bool]]:
-        with self._measure_time(
-            self._detection_duration, labels={"backend": config.DETECTOR_BACKEND}
-        ):
-            # YOLO detection (async)
-            raw_detections = await detector.infer(frame_small)
+        if shared_preprocess:
+            resized, ratio, dwdh = letterbox(frame_small, config.DETECTOR_IMAGE_SIZE)
+            with self._measure_time(
+                self._detection_duration, labels={"backend": config.DETECTOR_BACKEND}
+            ):
+                raw_detections = await detector.infer_preprocessed(
+                    resized,
+                    ratio,
+                    dwdh,
+                    (frame_small.shape[0], frame_small.shape[1]),
+                )
+        else:
+            with self._measure_time(
+                self._detection_duration, labels={"backend": config.DETECTOR_BACKEND}
+            ):
+                # YOLO detection (async)
+                raw_detections = await detector.infer(frame_small)

         if not raw_detections:
             return [], [], []
@@ -440,9 +473,18 @@ async def _process_detection(
             labels={"model_type": estimator.model_type},
         ):
             # Distance estimation (sync) -> run in executor
-            raw_distances = await asyncio.get_running_loop().run_in_executor(
-                None, estimator.estimate_distance_m, frame_small, raw_detections
-            )
+            if shared_preprocess:
+                raw_distances = await asyncio.get_running_loop().run_in_executor(
+                    None,
+                    estimator.estimate_distance_m_preprocessed,
+                    resized,
+                    raw_detections,
+                    (frame_small.shape[0], frame_small.shape[1]),
+                )
+            else:
+                raw_distances = await asyncio.get_running_loop().run_in_executor(
+                    None, estimator.estimate_distance_m, frame_small, raw_detections
+                )

         # Tracking logic
         updated_track_ids, track_assignments = (
@@ -493,6 +535,7 @@ async def _run_inference_pipeline(
         detector: ObjectDetector,
         estimator: DepthEstimator,
         current_time: float,
+        shared_preprocess: bool,
     ) -> None:
         """Run ML inference detection and tracking pipeline in background."""

@@ -517,6 +560,7 @@ async def _run_inference_pipeline(
             state=state,
             detector=detector,
             estimator=estimator,
+            shared_preprocess=shared_preprocess,
         )

         if all_detections:
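With shared preprocessing on, the frame is letterboxed once and the same buffer feeds both the detector and the depth estimator. The repo's `letterbox` helper in `common.utils.transforms` is not shown in this diff; a minimal sketch that matches the `(resized, ratio, dwdh)` contract used above might look like this (padding color and rounding are assumptions):

```python
import cv2
import numpy as np


def letterbox(
    frame: np.ndarray, size: int
) -> tuple[np.ndarray, float, tuple[float, float]]:
    """YOLO-style letterbox: scale to fit, pad the remainder.

    Returns the padded image, the scale ratio, and the (dw, dh) padding
    offsets needed to map boxes back onto the original frame.
    """
    h, w = frame.shape[:2]
    ratio = min(size / h, size / w)
    new_w, new_h = round(w * ratio), round(h * ratio)
    dw, dh = (size - new_w) / 2, (size - new_h) / 2  # padding split over both sides
    resized = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    top, left = int(dh), int(dw)
    bottom, right = size - new_h - top, size - new_w - left
    padded = cv2.copyMakeBorder(
        resized, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
    )
    return padded, ratio, (dw, dh)
```

Boxes predicted on the padded image would map back via `(x - dw) / ratio`, which is presumably what `infer_preprocessed` uses `ratio` and `dwdh` for.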

src/backend/common/config.py

Lines changed: 5 additions & 1 deletion
@@ -61,6 +61,7 @@ class Config:
     MIDAS_ONNX_MODEL_PATH: Path = Path(
         os.getenv("MIDAS_ONNX_MODEL_PATH", "models/midas_small.onnx")
     ).resolve()
+    MIDAS_ONNX_INPUT_SIZE: int = int(os.getenv("MIDAS_ONNX_INPUT_SIZE", "384"))
     MIDAS_ONNX_PROVIDERS: list[str] = [
         provider.strip()
         for provider in os.getenv("MIDAS_ONNX_PROVIDERS", "").split(",")
@@ -103,7 +104,7 @@ class Config:
         os.getenv("ONNX_MODEL_PATH", str(MODEL_PATH.with_suffix(".onnx")))
     ).resolve()
     DETECTOR_BACKEND: str = os.getenv("DETECTOR_BACKEND", "torch").lower()
-    DETECTOR_IMAGE_SIZE: int = int(os.getenv("DETECTOR_IMAGE_SIZE", "640"))
+    DETECTOR_IMAGE_SIZE: int = int(os.getenv("DETECTOR_IMAGE_SIZE", "384"))
     DETECTOR_CONF_THRESHOLD: float = float(os.getenv("DETECTOR_CONF_THRESHOLD", "0.25"))
     DETECTOR_IOU_THRESHOLD: float = float(os.getenv("DETECTOR_IOU_THRESHOLD", "0.7"))
     DETECTOR_MAX_DETECTIONS: int = int(os.getenv("DETECTOR_MAX_DETECTIONS", "100"))
@@ -120,6 +121,9 @@ class Config:
         for provider in os.getenv("ONNX_PROVIDERS", "").split(",")
         if provider.strip()
     ]
+    ONNX_SHARED_PREPROCESSING: bool = os.getenv(
+        "ONNX_SHARED_PREPROCESSING", "true"
+    ).lower() in ("1", "true", "yes")
     ONNX_IO_BINDING: bool = os.getenv("ONNX_IO_BINDING", "false").lower() in (
         "1",
         "true",

src/backend/common/core/depth.py

Lines changed: 115 additions & 2 deletions
@@ -8,11 +8,9 @@
 import numpy as np
 import torch
 from PIL import Image
-
 from common.config import config
 from common.typing import Detection
 from common.protocols import DepthEstimator
-
 from common.utils.depth import calculate_distances, resize_to_frame


@@ -34,6 +32,58 @@
     AutoModelForDepthEstimation = None  # type: ignore


+def _build_midas_small_transform(
+    midas_transforms: object,
+    input_size: int,
+) -> Callable[[np.ndarray], torch.Tensor]:
+    """Create a MiDaS-small transform with a custom input size."""
+    import cv2
+    from torchvision.transforms import Compose  # type: ignore[import-untyped]
+
+    resize = getattr(midas_transforms, "Resize")
+    normalize = getattr(midas_transforms, "NormalizeImage")
+    prepare = getattr(midas_transforms, "PrepareForNet")
+
+    return Compose(
+        [
+            lambda img: {"image": img / 255.0},
+            resize(
+                input_size,
+                input_size,
+                resize_target=None,
+                keep_aspect_ratio=True,
+                ensure_multiple_of=32,
+                resize_method="upper_bound",
+                image_interpolation_method=cv2.INTER_CUBIC,
+            ),
+            normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+            prepare(),
+            lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
+        ]
+    )
+
+
+def _build_midas_no_resize_transform(
+    midas_transforms: object,
+    mean: list[float],
+    std: list[float],
+) -> Callable[[np.ndarray], torch.Tensor]:
+    """Create a MiDaS transform for inputs that are already resized,
+    skipping the resize step inside MiDaS."""
+    from torchvision.transforms import Compose  # type: ignore[import-untyped]
+
+    normalize = getattr(midas_transforms, "NormalizeImage")
+    prepare = getattr(midas_transforms, "PrepareForNet")
+
+    return Compose(
+        [
+            lambda img: {"image": img / 255.0},
+            normalize(mean=mean, std=std),
+            prepare(),
+            lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0),
+        ]
+    )
+
+
 # Factories let us swap depth estimation backends without changing call sites.
 DepthEstimatorFactory = Callable[[Optional[Path]], DepthEstimator]

@@ -227,13 +277,47 @@ def __init__(
         )
         self._input_name = self._session.get_inputs()[0].name
         self._output_name = self._session.get_outputs()[0].name
+        self._no_resize_transform: Optional[Callable[[np.ndarray], torch.Tensor]] = None

         super().__init__(
             midas_cache_directory=midas_cache_directory,
             model_type=model_type,
             midas_model=midas_model,
         )

+    def _load_transform(self) -> Callable[[np.ndarray], torch.Tensor]:
+        """Load MiDaS transform, aligned to the ONNX input size when needed."""
+        torch.hub.set_dir(str(self.midas_cache_directory))
+        midas_transforms = torch.hub.load(
+            self.midas_model, "transforms", trust_repo=True
+        )
+        if self.model_type in {"DPT_Large", "DPT_Hybrid"}:
+            if config.ONNX_SHARED_PREPROCESSING and all(
+                hasattr(midas_transforms, attr)
+                for attr in ("NormalizeImage", "PrepareForNet")
+            ):
+                self._no_resize_transform = _build_midas_no_resize_transform(
+                    midas_transforms, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]
+                )
+            return midas_transforms.dpt_transform
+        if self.model_type == "MiDaS_small":
+            if config.ONNX_SHARED_PREPROCESSING and all(
+                hasattr(midas_transforms, attr)
+                for attr in ("NormalizeImage", "PrepareForNet")
+            ):
+                self._no_resize_transform = _build_midas_no_resize_transform(
+                    midas_transforms,
+                    mean=[0.485, 0.456, 0.406],
+                    std=[0.229, 0.224, 0.225],
+                )
+            if config.MIDAS_ONNX_INPUT_SIZE != 256 and hasattr(
+                midas_transforms, "Resize"
+            ):
+                return _build_midas_small_transform(
+                    midas_transforms, config.MIDAS_ONNX_INPUT_SIZE
+                )
+            return midas_transforms.small_transform
+
     def _resolve_providers(self) -> list[str]:
         configured = config.MIDAS_ONNX_PROVIDERS or config.ONNX_PROVIDERS
         if configured:
@@ -255,6 +339,35 @@ def _predict_depth_map(
         self, frame_rgb: np.ndarray, output_shape: tuple[int, int]
     ) -> np.ndarray:
         input_batch = self.transform(frame_rgb)
+        return self._run_onnx_inference(input_batch, output_shape)
+
+    def estimate_distance_m_preprocessed(
+        self,
+        resized_rgb: np.ndarray,
+        dets: list[Detection],
+        output_shape: tuple[int, int],
+    ) -> list[float]:
+        """Estimate distances using a pre-resized ONNX input."""
+        self.update_id += 1
+        if self.update_id % self.update_freq != 0 and len(self.last_depths) == len(
+            dets
+        ):
+            return self.last_depths
+        depth_map = self._predict_depth_map_preprocessed(resized_rgb, output_shape)
+        distances = self._distances_from_depth_map(depth_map, dets)
+        self.last_depths = distances
+        return distances
+
+    def _predict_depth_map_preprocessed(
+        self, resized_rgb: np.ndarray, output_shape: tuple[int, int]
+    ) -> np.ndarray:
+        transform = self._no_resize_transform or self.transform
+        input_batch = transform(resized_rgb)
+        return self._run_onnx_inference(input_batch, output_shape)
+
+    def _run_onnx_inference(
+        self, input_batch: torch.Tensor, output_shape: tuple[int, int]
+    ) -> np.ndarray:
         _, _, h, w = input_batch.shape
         size = max(w, h)
         input_batch = torch.nn.functional.pad(input_batch, (0, size - w, 0, size - h))
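`_run_onnx_inference` pads the batch to a square before the session runs. `torch.nn.functional.pad` takes pads for the last dimension first, so `(0, size - w, 0, size - h)` adds columns on the right and rows on the bottom. A small self-contained check (shapes are illustrative):

```python
import torch
import torch.nn.functional as F

# F.pad pads the last dimension first: (0, size - w) is 0 left / (size - w)
# right on width, then (0, size - h) is 0 top / (size - h) bottom on height.
batch = torch.randn(1, 3, 384, 512)  # NCHW; h=384, w=512
_, _, h, w = batch.shape
size = max(w, h)
square = F.pad(batch, (0, size - w, 0, size - h))
assert square.shape == (1, 3, 512, 512)
```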
