Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions deep_conversions/include/deep_conversions/image_conversions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,27 +59,28 @@ ImageEncoding get_image_encoding_info(const std::string & encoding);
/**
* @brief Convert sensor_msgs::msg::Image to Tensor
* @param image ROS Image message
* @param allocator Memory allocator to use (uses CPU allocator if nullptr)
* @param allocator Memory allocator to use (required)
* @param layout Tensor layout format (HWC or CHW)
* @return Tensor with shape [1, height, width, channels] (HWC) or [1, channels, height, width] (CHW)
* @throws std::runtime_error if image dimensions are invalid or data size mismatches
* @throws std::runtime_error if image dimensions are invalid, data size mismatches, or allocator is nullptr
*/
Tensor from_image(
const sensor_msgs::msg::Image & image,
std::shared_ptr<BackendMemoryAllocator> allocator = nullptr,
std::shared_ptr<BackendMemoryAllocator> allocator,
TensorLayout layout = TensorLayout::HWC);

/**
* @brief Convert vector of sensor_msgs::msg::Image to batched Tensor
* @param images Vector of ROS Image messages
* @param allocator Memory allocator to use (uses CPU allocator if nullptr)
* @param allocator Memory allocator to use (required)
* @param layout Tensor layout format (HWC or CHW)
* @return Tensor with shape [batch_size, height, width, channels] (HWC) or [batch_size, channels, height, width] (CHW)
* @throws std::invalid_argument if batch is empty or images have mismatched dimensions/encodings
* @throws std::runtime_error if allocator is nullptr
*/
Tensor from_image(
const std::vector<sensor_msgs::msg::Image> & images,
std::shared_ptr<BackendMemoryAllocator> allocator = nullptr,
std::shared_ptr<BackendMemoryAllocator> allocator,
TensorLayout layout = TensorLayout::HWC);

/**
Expand Down
152 changes: 78 additions & 74 deletions deep_conversions/src/image_conversions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,26 +127,17 @@ ImageEncoding get_image_encoding_info(const std::string & encoding)
Tensor from_image(
const sensor_msgs::msg::Image & image, std::shared_ptr<BackendMemoryAllocator> allocator, TensorLayout layout)
{
if (!allocator) {
throw std::runtime_error("Memory allocator is required for image conversion");
}

if (image.height == 0 || image.width == 0) {
throw std::runtime_error(
"Invalid image dimensions: height=" + std::to_string(image.height) + ", width=" + std::to_string(image.width));
}

auto encoding_info = get_image_encoding_info(image.encoding);

// Create tensor with proper shape based on layout
std::vector<size_t> shape;
if (layout == TensorLayout::CHW) {
// CHW: [batch, channels, height, width]
shape = {1, encoding_info.channels, image.height, image.width};
} else {
// HWC: [batch, height, width, channels]
shape = {1, image.height, image.width};
if (encoding_info.channels > 1) {
shape.push_back(encoding_info.channels);
}
}

// Validate step size (bytes per row)
size_t expected_step = image.width * encoding_info.channels * encoding_info.bytes_per_channel;
if (image.step != expected_step) {
Expand All @@ -162,30 +153,39 @@ Tensor from_image(
std::to_string(image.data.size()));
}

Tensor tensor(shape, encoding_info.dtype, allocator);

if (layout == TensorLayout::HWC) {
// Direct copy for HWC layout
if (allocator) {
allocator->copy_from_host(tensor.data(), image.data.data(), image.data.size());
} else {
std::memcpy(tensor.data(), image.data.data(), image.data.size());
// Create tensor with shape and copy data based on layout
std::vector<size_t> shape;
Tensor tensor;

switch (layout) {
case TensorLayout::CHW: {
// CHW: [batch, channels, height, width]
shape = {1, encoding_info.channels, image.height, image.width};
tensor = Tensor(shape, encoding_info.dtype, allocator);

// Use copy_from_host_permuted to copy and transpose in one operation
// Source is HWC: [1, height, width, channels]
// Permutation [0, 3, 1, 2] converts BHWC to BCHW
std::vector<size_t> src_shape = {1, image.height, image.width, encoding_info.channels};
std::vector<size_t> permutation = {0, 3, 1, 2};
allocator->copy_from_host_permuted(
tensor.data(), image.data.data(), src_shape, permutation, encoding_info.bytes_per_channel);
break;
}
} else {
// Transpose HWC to CHW for CHW layout
const auto * src = image.data.data();
auto * dst = static_cast<uint8_t *>(tensor.data());
size_t pixel_bytes = encoding_info.bytes_per_channel;

for (size_t c = 0; c < encoding_info.channels; ++c) {
for (size_t h = 0; h < image.height; ++h) {
for (size_t w = 0; w < image.width; ++w) {
size_t src_idx = ((h * image.width + w) * encoding_info.channels + c) * pixel_bytes;
size_t dst_idx = ((c * image.height + h) * image.width + w) * pixel_bytes;
std::memcpy(dst + dst_idx, src + src_idx, pixel_bytes);
}
case TensorLayout::HWC: {
// HWC: [batch, height, width, channels]
shape = {1, image.height, image.width};
if (encoding_info.channels > 1) {
shape.push_back(encoding_info.channels);
}
tensor = Tensor(shape, encoding_info.dtype, allocator);

// Direct copy for HWC layout
allocator->copy_from_host(tensor.data(), image.data.data(), image.data.size());
break;
}
default:
throw std::invalid_argument("Unsupported tensor layout");
}

return tensor;
Expand All @@ -196,26 +196,17 @@ Tensor from_image(
std::shared_ptr<BackendMemoryAllocator> allocator,
TensorLayout layout)
{
if (!allocator) {
throw std::runtime_error("Memory allocator is required for image conversion");
}

if (images.empty()) {
throw std::invalid_argument("Image batch is empty");
}

// Get encoding info from first image
auto encoding_info = get_image_encoding_info(images[0].encoding);

// Create batch shape based on layout
std::vector<size_t> shape;
if (layout == TensorLayout::CHW) {
// CHW: [batch_size, channels, height, width]
shape = {images.size(), encoding_info.channels, images[0].height, images[0].width};
} else {
// HWC: [batch_size, height, width, channels]
shape = {images.size(), images[0].height, images[0].width};
if (encoding_info.channels > 1) {
shape.push_back(encoding_info.channels);
}
}

// Validate all images have same dimensions and encoding
size_t expected_size = images[0].height * images[0].width * encoding_info.channels * encoding_info.bytes_per_channel;
for (size_t i = 0; i < images.size(); ++i) {
Expand All @@ -232,37 +223,50 @@ Tensor from_image(
}
}

Tensor tensor(shape, encoding_info.dtype, allocator);
auto * dst = static_cast<uint8_t *>(tensor.data());
size_t height = images[0].height;
size_t width = images[0].width;
size_t pixel_bytes = encoding_info.bytes_per_channel;

if (layout == TensorLayout::HWC) {
// Direct copy for HWC layout
for (size_t i = 0; i < images.size(); ++i) {
if (allocator) {
allocator->copy_from_host(dst + i * images[i].data.size(), images[i].data.data(), images[i].data.size());
} else {
std::memcpy(dst + i * images[i].data.size(), images[i].data.data(), images[i].data.size());
// Create batch tensor with shape and copy data based on layout
std::vector<size_t> shape;
Tensor tensor;

switch (layout) {
case TensorLayout::CHW: {
// CHW: [batch_size, channels, height, width]
shape = {images.size(), encoding_info.channels, images[0].height, images[0].width};
tensor = Tensor(shape, encoding_info.dtype, allocator);
auto * dst = static_cast<uint8_t *>(tensor.data());

// Use copy_from_host_permuted for each image
std::vector<size_t> src_shape = {1, images[0].height, images[0].width, encoding_info.channels};
std::vector<size_t> permutation = {0, 3, 1, 2};
size_t single_image_chw_size =
encoding_info.channels * images[0].height * images[0].width * encoding_info.bytes_per_channel;

for (size_t i = 0; i < images.size(); ++i) {
allocator->copy_from_host_permuted(
dst + i * single_image_chw_size,
images[i].data.data(),
src_shape,
permutation,
encoding_info.bytes_per_channel);
}
break;
}
} else {
// Transpose HWC to CHW for each image in batch
for (size_t b = 0; b < images.size(); ++b) {
const auto * src = images[b].data.data();
size_t batch_offset = b * encoding_info.channels * height * width * pixel_bytes;

for (size_t c = 0; c < encoding_info.channels; ++c) {
for (size_t h = 0; h < height; ++h) {
for (size_t w = 0; w < width; ++w) {
size_t src_idx = ((h * width + w) * encoding_info.channels + c) * pixel_bytes;
size_t dst_idx = batch_offset + ((c * height + h) * width + w) * pixel_bytes;
std::memcpy(dst + dst_idx, src + src_idx, pixel_bytes);
}
}
case TensorLayout::HWC: {
// HWC: [batch_size, height, width, channels]
shape = {images.size(), images[0].height, images[0].width};
if (encoding_info.channels > 1) {
shape.push_back(encoding_info.channels);
}
tensor = Tensor(shape, encoding_info.dtype, allocator);
auto * dst = static_cast<uint8_t *>(tensor.data());

// Direct copy for HWC layout
for (size_t i = 0; i < images.size(); ++i) {
allocator->copy_from_host(dst + i * images[i].data.size(), images[i].data.data(), images[i].data.size());
}
break;
}
default:
throw std::invalid_argument("Unsupported tensor layout");
}

return tensor;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <memory>
#include <string>
#include <vector>

namespace deep_ros
{
Expand Down Expand Up @@ -61,6 +62,25 @@ class BackendMemoryAllocator
*/
void copy_from_host(void * dst, const void * src, size_t bytes);

/**
* @brief Copy data from host memory to allocated memory with permutation
*
* Copies and transposes data in a single operation.
*
* @param dst Destination pointer (allocated by this allocator)
* @param src Source pointer (host memory)
* @param src_shape Shape of the source data
* @param permutation Dimension permutation to apply during copy
* @param elem_size Size of each element in bytes
* @throws std::invalid_argument if parameters are invalid
*/
void copy_from_host_permuted(
void * dst,
const void * src,
const std::vector<size_t> & src_shape,
const std::vector<size_t> & permutation,
size_t elem_size);

/**
* @brief Copy data from allocated memory to host (CPU) memory
* @param dst Destination pointer (host memory)
Expand All @@ -85,6 +105,16 @@ class BackendMemoryAllocator
*/
virtual void copy_from_host_impl(void * dst, const void * src, size_t bytes) = 0;

/**
* @brief Implementation of copy_from_host_permuted (to be overridden by backends)
*/
virtual void copy_from_host_permuted_impl(
void * dst,
const void * src,
const std::vector<size_t> & src_shape,
const std::vector<size_t> & permutation,
size_t elem_size) = 0;

/**
* @brief Implementation of copy_to_host (to be overridden by backends)
*/
Expand Down
20 changes: 20 additions & 0 deletions deep_core/src/backend_memory_allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "deep_core/plugin_interfaces/backend_memory_allocator.hpp"

#include <stdexcept>
#include <vector>

namespace deep_ros
{
Expand All @@ -27,6 +28,25 @@ void BackendMemoryAllocator::copy_from_host(void * dst, const void * src, size_t
copy_from_host_impl(dst, src, bytes);
}

// Validated public entry point for permuted host-to-backend copies.
// Performs all argument checking here so backend implementations
// (copy_from_host_permuted_impl) can assume well-formed inputs.
//
// Throws std::invalid_argument if:
//   - dst or src is null,
//   - src_shape or permutation is empty,
//   - their ranks differ,
//   - permutation is not a bijection over [0, rank).
void BackendMemoryAllocator::copy_from_host_permuted(
  void * dst,
  const void * src,
  const std::vector<size_t> & src_shape,
  const std::vector<size_t> & permutation,
  size_t elem_size)
{
  if (dst == nullptr || src == nullptr) {
    throw std::invalid_argument("Null pointer passed to copy_from_host_permuted");
  }
  if (src_shape.empty() || permutation.empty()) {
    throw std::invalid_argument("Empty shape or permutation passed to copy_from_host_permuted");
  }
  if (src_shape.size() != permutation.size()) {
    throw std::invalid_argument("Shape and permutation size mismatch in copy_from_host_permuted");
  }
  // Reject permutations that are not a bijection over [0, rank): an
  // out-of-range or repeated axis index would drive out-of-bounds
  // reads/writes inside the backend implementation.
  std::vector<bool> seen(permutation.size(), false);
  for (size_t axis : permutation) {
    if (axis >= permutation.size() || seen[axis]) {
      throw std::invalid_argument("Invalid permutation passed to copy_from_host_permuted");
    }
    seen[axis] = true;
  }
  copy_from_host_permuted_impl(dst, src, src_shape, permutation, elem_size);
}

void BackendMemoryAllocator::copy_to_host(void * dst, const void * src, size_t bytes)
{
if (bytes > 0 && (dst == nullptr || src == nullptr)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <memory>
#include <string>
#include <vector>

#include <deep_core/plugin_interfaces/backend_memory_allocator.hpp>

Expand Down Expand Up @@ -90,6 +91,21 @@ class OrtCpuMemoryAllocator : public deep_ros::BackendMemoryAllocator
*/
void copy_from_host_impl(void * dst, const void * src, size_t bytes) override;

/**
* @brief Copy from host memory with permutation
* @param dst Destination pointer
* @param src Source pointer
* @param src_shape Shape of source data
* @param permutation Dimension permutation
* @param elem_size Element size in bytes
*/
void copy_from_host_permuted_impl(
void * dst,
const void * src,
const std::vector<size_t> & src_shape,
const std::vector<size_t> & permutation,
size_t elem_size) override;

/**
* @brief Copy to host memory (same as device for CPU)
* @param dst Destination pointer
Expand Down
Loading