WATonomous
diff --git a/deep_object_detection/README.md b/deep_object_detection/README.md
Lines changed: 143 additions & 165 deletions
diff --git a/deep_object_detection/config/generic_model_params.yaml b/deep_object_detection/config/generic_model_params.yaml
Lines changed: 4 additions & 7 deletions
diff --git a/deep_object_detection/include/deep_object_detection/detection_types.hpp b/deep_object_detection/include/deep_object_detection/detection_types.hpp
Lines changed: 3 additions & 7 deletions
diff --git a/deep_object_detection/include/deep_object_detection/generic_postprocessor.hpp b/deep_object_detection/include/deep_object_detection/generic_postprocessor.hpp
Lines changed: 9 additions & 33 deletions
diff --git a/deep_object_detection/package.xml b/deep_object_detection/package.xml
Lines changed: 1 addition & 1 deletion
diff --git a/deep_object_detection/src/deep_object_detection_node.cpp b/deep_object_detection/src/deep_object_detection_node.cpp
Lines changed: 25 additions & 81 deletions
@@ -7,6 +7,7 @@ deep_object_detection_node:
     Model:
       num_classes: 80
       bbox_format: "cxcywh"
+      output_shape: [1, 8400, 84]
 
     Preprocessing:
       input_width: 640
@@ -23,15 +24,11 @@ deep_object_detection_node:
       nms_iou_threshold: 0.45
       score_activation: "sigmoid"
       enable_nms: true
-      use_multi_output: false
-      output_boxes_idx: 0
-      output_scores_idx: 1
-      output_classes_idx: 2
       class_score_mode: "all_classes"
       class_score_start_idx: -1
       class_score_count: -1
+
       layout:
-        auto_detect: true
         batch_dim: 0
         detection_dim: 1
         feature_dim: 2
@@ -41,8 +38,8 @@ deep_object_detection_node:
         class_idx: 5
 
     Backend:
-      plugin: "onnxruntime_gpu"  # Options: "onnxruntime_cpu" or "onnxruntime_gpu"
-      execution_provider: "tensorrt"  # For onnxruntime_gpu: "cuda" or "tensorrt"
+      plugin: "onnxruntime_gpu"
+      execution_provider: "tensorrt"
       device_id: 0
       trt_engine_cache_enable: true
       trt_engine_cache_path: "/tmp/deep_ros_ort_trt_cache"
@@ -93,17 +93,17 @@ struct ModelMetadata
   int num_classes;  ///< Number of detection classes
   std::string class_names_file;  ///< Path to class names file (one per line, optional)
   std::string bbox_format;  ///< Bounding box format used by model ("cxcywh", "xyxy", "xywh")
+  std::vector<size_t> output_shape;  ///< Model output tensor shape (optional, for logging/validation)
 };
 
 /**
  * @brief Output tensor layout configuration
  *
- * Describes the structure of model output tensors for manual layout specification.
- * Only used when auto_detect is false.
+ * Describes the structure of model output tensors.
+ * All layout parameters must be manually specified in the config file.
  */
 struct OutputLayoutConfig
 {
-  bool auto_detect;  ///< True to auto-detect layout, false to use manual config
   int batch_dim;  ///< Batch dimension index
   int detection_dim;  ///< Detection dimension index
   int feature_dim;  ///< Feature dimension index
@@ -122,10 +122,6 @@ struct PostprocessingConfig
   float nms_iou_threshold;  ///< IoU threshold for Non-Maximum Suppression
   std::string score_activation;  ///< Score activation function ("sigmoid", "softmax", "none")
   bool enable_nms;  ///< Enable Non-Maximum Suppression
-  bool use_multi_output;  ///< True if model has separate outputs for boxes, scores, classes
-  int output_boxes_idx;  ///< Output index for bounding boxes (if use_multi_output)
-  int output_scores_idx;  ///< Output index for scores (if use_multi_output)
-  int output_classes_idx;  ///< Output index for class IDs (if use_multi_output)
   std::string class_score_mode;  ///< How class scores are extracted ("all_classes", "single_confidence")
   int class_score_start_idx;  ///< Start index for class scores (-1 = use all)
   int class_score_count;  ///< Count of class scores (-1 = use all)
 
@@ -17,7 +17,7 @@
  * @brief Generic postprocessor for object detection model outputs
  *
  * This header defines the GenericPostprocessor class which:
- * - Automatically detects output tensor layouts
+ * - Uses manually configured output tensor layouts (from YAML config)
  * - Applies score activation and thresholding
  * - Performs Non-Maximum Suppression (NMS)
  * - Transforms coordinates from preprocessed to original image space
@@ -43,7 +43,7 @@ namespace deep_object_detection
  * @brief Generic postprocessor for object detection models
  *
  * Handles postprocessing pipeline for various ONNX model output formats:
- * - Automatic layout detection (supports [batch, detections, features], [batch, features, detections], etc.)
+ * - Manual layout configuration (supports [batch, detections, features], [batch, features, detections], etc.)
  * - Score activation (sigmoid, softmax, or none)
  * - Score thresholding
  * - Non-maximum suppression (NMS)
@@ -52,6 +52,7 @@ namespace deep_object_detection
  *
  * Supports single-output models (boxes + scores + classes in one tensor); decode() takes one output tensor.
  * The dedicated multi-output decode path (decodeMultiOutput, separate boxes/scores/classes tensors) has been removed.
+ * Layout must be manually configured in the YAML config file.
  */
 class GenericPostprocessor
 {
@@ -76,7 +77,6 @@ class GenericPostprocessor
     size_t class_idx = 5;  ///< Index for class ID in feature dimension
     bool has_separate_class_output = false;  ///< True if class IDs are in separate output tensor
     size_t class_output_idx = 0;  ///< Output index for separate class tensor (if applicable)
-    bool auto_detect = true;  ///< True if layout should be auto-detected
   };
 
   /**
@@ -97,28 +97,16 @@ class GenericPostprocessor
     bool use_letterbox);
 
   /**
-   * @brief Automatically detect output tensor layout from shape
-   * @param output_shape Model output tensor shape
-   * @return Detected OutputLayout
-   *
-   * Analyzes tensor shape to determine layout:
-   * - [batch, detections, features] -> standard layout
-   * - [batch, features, detections] -> transposed layout
-   * - [batch, queries, 4+classes] -> query-based (DETR-style)
-   * - Other shapes -> heuristic-based detection
-   */
-  static OutputLayout detectLayout(const std::vector<size_t> & output_shape);
-
-  /**
-   * @brief Auto-configure output layout based on config and optional output shape
-   * @param output_shape Model output shape (can be empty for deferred detection)
+   * @brief Configure output layout from manual configuration
+   * @param output_shape Model output shape (optional, for validation/logging)
    * @param layout_config Layout configuration from parameters
    * @return Configured OutputLayout
    *
-   * Handles both manual and auto-detection modes. If auto_detect is true and output_shape
-   * is available, automatically detects layout. Otherwise uses manual config or defers detection.
+   * Creates OutputLayout from manual configuration parameters.
+   * All layout parameters must be specified in the config file.
    */
-  static OutputLayout autoConfigure(const std::vector<size_t> & output_shape, const OutputLayoutConfig & layout_config);
+  static OutputLayout configureLayout(
+    const std::vector<size_t> & output_shape, const OutputLayoutConfig & layout_config);
 
   /**
    * @brief Decode model output tensor to detections
@@ -133,18 +121,6 @@ class GenericPostprocessor
   std::vector<std::vector<SimpleDetection>> decode(
     const deep_ros::Tensor & output, const std::vector<ImageMeta> & metas) const;
 
-  /**
-   * @brief Decode multi-output model to detections
-   * @param outputs Vector of model output tensors (boxes, scores, classes separately)
-   * @param metas Image metadata for coordinate transformation
-   * @return Vector of detections per image in batch
-   *
-   * For models with separate outputs for boxes, scores, and classes.
-   * Uses output_boxes_idx, output_scores_idx, output_classes_idx from config.
-   */
-  std::vector<std::vector<SimpleDetection>> decodeMultiOutput(
-    const std::vector<deep_ros::Tensor> & outputs, const std::vector<ImageMeta> & metas) const;
-
   /**
    * @brief Fill ROS Detection2DArray message with detections
    * @param header ROS message header (timestamp and frame_id)
 
@@ -4,7 +4,7 @@
   <version>0.1.0</version>
   <description>
     Generic model-agnostic deep learning object detection node using ONNX Runtime with explicit provider selection (TensorRT/CUDA/CPU).
-    Works with any ONNX-compatible object detection model - automatically detects and adapts to model output format.
+    Works with any ONNX-compatible object detection model. The output tensor layout must be configured in the YAML config file.
     Features dynamic batching, multi-camera support, configurable preprocessing/postprocessing, and fail-fast error handling.
   </description>
 
 
@@ -61,6 +61,7 @@ void DeepObjectDetectionNode::declareParameters()
   this->declare_parameter<std::string>("class_names_path", "");
   this->declare_parameter<int>("Model.num_classes", 80);
   this->declare_parameter<std::string>("Model.bbox_format", "cxcywh");
+  this->declare_parameter<std::vector<int64_t>>("Model.output_shape", std::vector<int64_t>());
 
   this->declare_parameter<int>("Preprocessing.input_width", 640);
   this->declare_parameter<int>("Preprocessing.input_height", 640);
@@ -75,15 +76,10 @@ void DeepObjectDetectionNode::declareParameters()
   this->declare_parameter<double>("Postprocessing.nms_iou_threshold", 0.45);
   this->declare_parameter<std::string>("Postprocessing.score_activation", "sigmoid");
   this->declare_parameter<bool>("Postprocessing.enable_nms", true);
-  this->declare_parameter<bool>("Postprocessing.use_multi_output", false);
-  this->declare_parameter<int>("Postprocessing.output_boxes_idx", 0);
-  this->declare_parameter<int>("Postprocessing.output_scores_idx", 1);
-  this->declare_parameter<int>("Postprocessing.output_classes_idx", 2);
   this->declare_parameter<std::string>("Postprocessing.class_score_mode", "all_classes");
   this->declare_parameter<int>("Postprocessing.class_score_start_idx", -1);
   this->declare_parameter<int>("Postprocessing.class_score_count", -1);
 
-  this->declare_parameter<bool>("Postprocessing.layout.auto_detect", true);
   this->declare_parameter<int>("Postprocessing.layout.batch_dim", 0);
   this->declare_parameter<int>("Postprocessing.layout.detection_dim", 1);
   this->declare_parameter<int>("Postprocessing.layout.feature_dim", 2);
@@ -114,6 +110,11 @@ void DeepObjectDetectionNode::declareParameters()
   params_.model_metadata.num_classes = this->get_parameter("Model.num_classes").as_int();
   params_.model_metadata.class_names_file = this->get_parameter("class_names_path").as_string();
   params_.model_metadata.bbox_format = this->get_parameter("Model.bbox_format").as_string();
+  auto output_shape_int = this->get_parameter("Model.output_shape").as_integer_array();
+  params_.model_metadata.output_shape.clear();
+  for (auto dim : output_shape_int) {
+    params_.model_metadata.output_shape.push_back(static_cast<size_t>(dim));
+  }
 
   // Preprocessing parameters
   params_.preprocessing.input_width = this->get_parameter("Preprocessing.input_width").as_int();
@@ -136,15 +137,10 @@ void DeepObjectDetectionNode::declareParameters()
     static_cast<float>(this->get_parameter("Postprocessing.nms_iou_threshold").as_double());
   params_.postprocessing.score_activation = this->get_parameter("Postprocessing.score_activation").as_string();
   params_.postprocessing.enable_nms = this->get_parameter("Postprocessing.enable_nms").as_bool();
-  params_.postprocessing.use_multi_output = this->get_parameter("Postprocessing.use_multi_output").as_bool();
-  params_.postprocessing.output_boxes_idx = this->get_parameter("Postprocessing.output_boxes_idx").as_int();
-  params_.postprocessing.output_scores_idx = this->get_parameter("Postprocessing.output_scores_idx").as_int();
-  params_.postprocessing.output_classes_idx = this->get_parameter("Postprocessing.output_classes_idx").as_int();
   params_.postprocessing.class_score_mode = this->get_parameter("Postprocessing.class_score_mode").as_string();
   params_.postprocessing.class_score_start_idx = this->get_parameter("Postprocessing.class_score_start_idx").as_int();
   params_.postprocessing.class_score_count = this->get_parameter("Postprocessing.class_score_count").as_int();
 
-  params_.postprocessing.layout.auto_detect = this->get_parameter("Postprocessing.layout.auto_detect").as_bool();
   params_.postprocessing.layout.batch_dim = this->get_parameter("Postprocessing.layout.batch_dim").as_int();
   params_.postprocessing.layout.detection_dim = this->get_parameter("Postprocessing.layout.detection_dim").as_int();
   params_.postprocessing.layout.feature_dim = this->get_parameter("Postprocessing.layout.feature_dim").as_int();
@@ -182,42 +178,11 @@ deep_ros::CallbackReturn DeepObjectDetectionNode::on_configure_impl(const rclcpp
     loadClassNames();
     preprocessor_ = std::make_unique<ImagePreprocessor>(params_.preprocessing);
 
-    // Get allocator from base class
-    auto allocator = get_current_allocator();
-    if (!allocator) {
-      RCLCPP_ERROR(this->get_logger(), "Plugin did not provide allocator");
-      return deep_ros::CallbackReturn::FAILURE;
-    }
-
-    // dynamically get the output shape by running a dummy inference
-    std::vector<size_t> input_shape = {
-      1,
-      RGB_CHANNELS,
-      static_cast<size_t>(params_.preprocessing.input_height),
-      static_cast<size_t>(params_.preprocessing.input_width)};
-    std::vector<size_t> output_shape;
-    try {
-      PackedInput dummy;
-      dummy.shape = input_shape;
-      size_t total_elements = 1;
-      for (size_t dim : input_shape) {
-        total_elements *= dim;
-      }
-      dummy.data.assign(total_elements, 0.0f);
-
-      deep_ros::Tensor input_tensor(dummy.shape, deep_ros::DataType::FLOAT32, allocator);
-      const size_t bytes = dummy.data.size() * sizeof(float);
-      allocator->copy_from_host(input_tensor.data(), dummy.data.data(), bytes);
-
-      auto output_tensor = run_inference(input_tensor);
-      output_shape = output_tensor.shape();
-    } catch (const std::exception & e) {
-      RCLCPP_WARN(this->get_logger(), "Could not determine output shape: %s", e.what());
-      output_shape.clear();
-    }
+    // Get output shape from config (optional, for logging/validation)
+    const std::vector<size_t> & output_shape = params_.model_metadata.output_shape;
 
     auto formatShape = [](const std::vector<size_t> & shape) {
-      if (shape.empty()) return std::string("auto-detect");
+      if (shape.empty()) return std::string("not specified");
       std::string result;
       for (size_t i = 0; i < shape.size(); ++i) {
         result += std::to_string(shape[i]);
@@ -227,30 +192,19 @@ deep_ros::CallbackReturn DeepObjectDetectionNode::on_configure_impl(const rclcpp
     };
 
     if (!output_shape.empty()) {
-      RCLCPP_INFO(this->get_logger(), "Detected model output shape: [%s]", formatShape(output_shape).c_str());
+      RCLCPP_INFO(this->get_logger(), "Configured model output shape: [%s]", formatShape(output_shape).c_str());
     }
 
     const bool use_letterbox = (params_.preprocessing.resize_method == "letterbox");
 
     GenericPostprocessor::OutputLayout layout =
-      GenericPostprocessor::autoConfigure(output_shape, params_.postprocessing.layout);
-    if (layout.auto_detect && !output_shape.empty()) {
-      RCLCPP_INFO(
-        this->get_logger(),
-        "Auto-detected layout: batch_dim=%zu, detection_dim=%zu, feature_dim=%zu",
-        layout.batch_dim,
-        layout.detection_dim,
-        layout.feature_dim);
-    } else if (!layout.auto_detect) {
-      RCLCPP_INFO(
-        this->get_logger(),
-        "Using manual layout: batch_dim=%zu, detection_dim=%zu, feature_dim=%zu",
-        layout.batch_dim,
-        layout.detection_dim,
-        layout.feature_dim);
-    } else {
-      RCLCPP_INFO(this->get_logger(), "Layout will be auto-detected from first inference");
-    }
+      GenericPostprocessor::configureLayout(output_shape, params_.postprocessing.layout);
+    RCLCPP_INFO(
+      this->get_logger(),
+      "Using configured layout: batch_dim=%zu, detection_dim=%zu, feature_dim=%zu",
+      layout.batch_dim,
+      layout.detection_dim,
+      layout.feature_dim);
 
     postprocessor_ = std::make_unique<GenericPostprocessor>(
       params_.postprocessing,
@@ -516,24 +470,27 @@ void DeepObjectDetectionNode::processImages(
   auto start_time = std::chrono::steady_clock::now();
   std::vector<cv::Mat> processed;
   std::vector<ImageMeta> metas;
+  std::vector<std_msgs::msg::Header> processed_headers;
   processed.reserve(images.size());
   metas.reserve(images.size());
+  processed_headers.reserve(images.size());
 
   // Preprocess all images
-  for (const auto & img : images) {
-    if (img.empty()) {
+  for (size_t i = 0; i < images.size() && i < headers.size(); ++i) {
+    if (images[i].empty()) {
       RCLCPP_WARN(this->get_logger(), "Received empty image, skipping");
       continue;
     }
 
     ImageMeta meta;
-    cv::Mat preprocessed = preprocessor_->preprocess(img, meta);
+    cv::Mat preprocessed = preprocessor_->preprocess(images[i], meta);
     if (preprocessed.empty()) {
       RCLCPP_WARN(this->get_logger(), "Preprocessing returned empty image, skipping");
       continue;
     }
     processed.push_back(std::move(preprocessed));
     metas.push_back(meta);
+    processed_headers.push_back(headers[i]);
   }
 
   if (processed.empty()) {
@@ -553,14 +510,8 @@ void DeepObjectDetectionNode::processImages(
   allocator->copy_from_host(input_tensor.data(), packed_input.data.data(), bytes);
 
   // Run inference
-  std::vector<std::vector<SimpleDetection>> batch_detections;
-  if (params_.postprocessing.use_multi_output) {
-    auto output_tensor = run_inference(input_tensor);
-    batch_detections = postprocessor_->decodeMultiOutput({output_tensor}, metas);
-  } else {
-    auto output_tensor = run_inference(input_tensor);
-    batch_detections = postprocessor_->decode(output_tensor, metas);
-  }
+  auto output_tensor = run_inference(input_tensor);
+  std::vector<std::vector<SimpleDetection>> batch_detections = postprocessor_->decode(output_tensor, metas);
 
   auto end_time = std::chrono::steady_clock::now();
   auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
@@ -576,13 +527,6 @@ void DeepObjectDetectionNode::processImages(
       return sum + dets.size();
     }));
 
-  // Use headers that match the processed images (may be fewer if some were skipped)
-  std::vector<std_msgs::msg::Header> processed_headers;
-  processed_headers.reserve(processed.size());
-  for (size_t i = 0; i < processed.size() && i < headers.size(); ++i) {
-    processed_headers.push_back(headers[i]);
-  }
-
   publishDetections(batch_detections, processed_headers, metas);
 }