
Commit ca1af75

Functioning onnxruntime with CUDA as execution provider

1 parent 153a467

11 files changed: +425 -1028 lines

deep_core/include/deep_core/plugin_interfaces/backend_memory_allocator.hpp

Lines changed: 0 additions & 11 deletions

@@ -99,17 +99,6 @@ class BackendMemoryAllocator
    */
   void copy_device_to_device(void * dst, const void * src, size_t bytes);
 
-  /**
-   * @brief Copy data from host (CPU) to device memory - alias for copy_from_host
-   * @param dst Destination device memory pointer
-   * @param src Source host memory pointer
-   * @param bytes Number of bytes to copy
-   */
-  void copy_host_to_device(void * dst, const void * src, size_t bytes)
-  {
-    copy_from_host(dst, src, bytes);
-  }
-
 protected:
   /**
    * @brief Implementation of copy_from_host (to be overridden by backends)
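The removed copy_host_to_device was a thin alias, so callers migrate by invoking copy_from_host directly. A minimal sketch of the call-site change, assuming a BackendMemoryAllocator reference is at hand (the helper name upload_input is hypothetical):

```cpp
#include <cstddef>

#include "deep_core/plugin_interfaces/backend_memory_allocator.hpp"

// Hypothetical helper: stage host data into device memory via the allocator.
void upload_input(deep_ros::BackendMemoryAllocator & alloc,
                  void * device_dst, const void * host_src, std::size_t bytes)
{
  // Before this commit: alloc.copy_host_to_device(device_dst, host_src, bytes);
  alloc.copy_from_host(device_dst, host_src, bytes);
}
```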

deep_ort_gpu_backend_plugin/CMakeLists.txt

Lines changed: 10 additions & 18 deletions

@@ -162,6 +162,7 @@ target_link_libraries(${DEEP_ORT_LIB}
   onnxruntime_gpu_vendor::onnxruntime_gpu_lib
 )
 
+
 # CUDA runtime is required since code calls CUDA functions directly
 # Try multiple strategies to find and link CUDA runtime
 set(CUDA_RUNTIME_LINKED FALSE)

@@ -220,15 +221,6 @@ if(NOT CUDA_RUNTIME_LINKED)
   endif()
 endif()
 
-# Strategy 4: Try system-wide search as last resort
-if(NOT CUDA_RUNTIME_LINKED)
-  find_library(SYSTEM_CUDA_RUNTIME_LIBRARY NAMES cudart)
-  if(SYSTEM_CUDA_RUNTIME_LIBRARY)
-    target_link_libraries(${DEEP_ORT_LIB} PRIVATE ${SYSTEM_CUDA_RUNTIME_LIBRARY})
-    message(STATUS "Linking system CUDA runtime: ${SYSTEM_CUDA_RUNTIME_LIBRARY}")
-    set(CUDA_RUNTIME_LINKED TRUE)
-  endif()
-endif()
 
 if(NOT CUDA_RUNTIME_LINKED)
   message(FATAL_ERROR "No CUDA runtime found - required for GPU functionality")

@@ -265,17 +257,17 @@ install(FILES plugins.xml
 # Export plugin description file to ament index
 pluginlib_export_plugin_description_file(deep_core plugins.xml)
 
-# if(BUILD_TESTING)
-#   find_package(deep_test REQUIRED)
+if(BUILD_TESTING)
+  find_package(deep_test REQUIRED)
 
-# # add_deep_test(test_ort_gpu_backend test/test_ort_gpu_backend.cpp
-# #   LIBRARIES
-# #     ${DEEP_ORT_LIB}
-# #     deep_core::deep_core_lib
-# #     onnxruntime_gpu_vendor::onnxruntime_gpu_lib
-# # )
+  add_deep_test(test_ort_gpu_backend test/test_ort_gpu_backend.cpp
+    LIBRARIES
+      ${DEEP_ORT_LIB}
+      deep_core::deep_core_lib
+      onnxruntime_gpu_vendor::onnxruntime_gpu_lib
+  )
 
-# endif()
+endif()
 
 ament_export_targets(${PROJECT_NAME}Targets HAS_LIBRARY_TARGET)
 ament_export_libraries(${DEEP_ORT_LIB})
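The hard requirement on cudart exists because the plugin calls CUDA runtime APIs directly rather than relying on ONNX Runtime alone. A minimal sketch of the kind of call involved (the helper name is hypothetical; the real calls live in the plugin sources):

```cpp
#include <cuda_runtime_api.h>

#include <stdexcept>
#include <string>

// Hypothetical helper: pin the CUDA device context before creating a session.
inline void set_cuda_device_or_throw(int device_id)
{
  const cudaError_t err = cudaSetDevice(device_id);
  if (err != cudaSuccess) {
    throw std::runtime_error(
      std::string("cudaSetDevice failed: ") + cudaGetErrorString(err));
  }
}
```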

deep_ort_gpu_backend_plugin/README.md

Lines changed: 5 additions & 6 deletions

@@ -1,17 +1,16 @@
-# deep_ort_backend_plugin
+# deep_ort_gpu_backend_plugin
 
 ONNX Runtime GPU backend plugin for deep_core.
 
 ## Overview
 
 Provides:
-- GPU inference executor using ONNX Runtime with options for CUDA or TensorRT execution provider
+- GPU inference executor using ONNX Runtime with options for CUDA or TensorRT (untested) execution provider
 - Device context management for multi-GPU systems
-- Zero-copy inference with IO binding
 
 ## Plugin Name
 
-`onnxruntime_cpu`
+`onnxruntime_gpu`
 
 ## Supported Formats
 

@@ -22,7 +21,7 @@ ONNX models (.onnx files)
 Add to your `package.xml`:
 
 ```xml
-<exec_depend>deep_ort_backend_plugin</exec_depend>
+<exec_depend>deep_ort_gpu_backend_plugin</exec_depend>
 ```
 
 Configure your inference nodes to use this plugin:

@@ -37,4 +36,4 @@ inference_node:
 ## Dependencies
 
 - deep_core
-- onnxruntime_vendor
+- onnxruntime_gpu_vendor

deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_executor.hpp

Lines changed: 2 additions & 7 deletions

@@ -86,7 +86,7 @@ class OrtGpuBackendExecutor : public deep_ros::BackendInferenceExecutor
   bool load_model_impl(const std::filesystem::path & model_path) override;
 
   /**
-   * @brief Run inference using zero-copy IO binding with GPU acceleration
+   * @brief Run inference with GPU acceleration
    * @param input Input tensor (must be compatible with model input)
    * @return Output tensor with inference results
    * @throws std::runtime_error if inference fails or no model loaded

@@ -107,6 +107,7 @@ class OrtGpuBackendExecutor : public deep_ros::BackendInferenceExecutor
   std::unique_ptr<Ort::Session> session_;
   std::unique_ptr<Ort::SessionOptions> session_options_;
   Ort::MemoryInfo memory_info_;
+  std::shared_ptr<deep_ros::BackendMemoryAllocator> custom_allocator_;
 
   /**
    * @brief Initialize session options with GPU execution provider

@@ -155,12 +156,6 @@ class OrtGpuBackendExecutor : public deep_ros::BackendInferenceExecutor
   * @brief Set CUDA device context
   */
  void set_device() const;
-
-  /**
-   * @brief Check if all TensorRT dependencies are available
-   * @return true if TensorRT can be used, false otherwise
-   */
-  bool check_tensorrt_dependencies() const;
 };
 
 } // namespace deep_ort_gpu_backend
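For context on what "initialize session options with GPU execution provider" entails, here is a minimal sketch using the public ONNX Runtime C++ API (the actual implementation is in the plugin's .cpp and may differ):

```cpp
#include <memory>

#include <onnxruntime_cxx_api.h>

// Sketch: build SessionOptions with the CUDA execution provider appended.
std::unique_ptr<Ort::SessionOptions> make_cuda_session_options(int device_id)
{
  auto opts = std::make_unique<Ort::SessionOptions>();
  OrtCUDAProviderOptions cuda_opts{};
  cuda_opts.device_id = device_id;
  opts->AppendExecutionProvider_CUDA(cuda_opts);  // throws Ort::Exception on failure
  return opts;
}
```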

deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_plugin.hpp

Lines changed: 2 additions & 10 deletions

@@ -29,20 +29,12 @@ namespace deep_ort_gpu_backend
 // Forward declarations
 enum class GpuExecutionProvider;
 
-/**
- * @brief Get a simple CPU allocator for use in other packages
- * This function allows other packages to get a CPU allocator without
- * including CUDA headers or GPU-specific dependencies
- * @return Shared pointer to a simple CPU allocator
- */
-std::shared_ptr<deep_ros::BackendMemoryAllocator> create_simple_cpu_allocator();
-
 /**
  * @brief ONNX Runtime GPU backend plugin
  *
  * Combines ORT GPU memory allocator and inference executor into a single
- * backend plugin for use with pluginlib. Supports both CUDA and TensorRT
- * execution providers.
+ * backend plugin for use with pluginlib. Supports the CUDA
+ * execution provider, and possibly more in the future.
  */
 class OrtGpuBackendPlugin : public deep_ros::DeepBackendPlugin
 {
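Since the class is exported for pluginlib, a consumer typically loads it by name rather than linking the library directly. A sketch, assuming the base class deep_ros::DeepBackendPlugin is registered under the deep_core package via plugins.xml (the base-class header path is an assumption):

```cpp
#include <pluginlib/class_loader.hpp>

// Assumed header for the plugin base class; the actual path lives in deep_core.
#include "deep_core/plugin_interfaces/deep_backend_plugin.hpp"

int main()
{
  pluginlib::ClassLoader<deep_ros::DeepBackendPlugin> loader(
    "deep_core", "deep_ros::DeepBackendPlugin");
  // "onnxruntime_gpu" matches the plugin name documented in the README.
  auto plugin = loader.createSharedInstance("onnxruntime_gpu");
  return plugin ? 0 : 1;
}
```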

deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_memory_allocator.hpp

Lines changed: 46 additions & 46 deletions

@@ -42,63 +42,63 @@ namespace deep_ort_gpu_backend
 {
 
 /**
- * @brief Simple CPU memory allocator for output tensors
+ * @brief CPU memory allocator for GPU backend output tensors (similar to CPU backend)
  */
-class SimpleCpuAllocator : public deep_ros::BackendMemoryAllocator
+class OrtGpuCpuMemoryAllocator : public deep_ros::BackendMemoryAllocator
 {
 public:
-  void * allocate(size_t bytes) override
-  {
-    return std::malloc(bytes);
-  }
-
-  void deallocate(void * ptr) override
-  {
-    std::free(ptr);
-  }
-
-  bool is_device_memory() const override
-  {
-    return false;
-  }
-
-  std::string device_name() const override
-  {
-    return "cpu";
-  }
+  /**
+   * @brief Constructor - initializes ORT allocator integration
+   */
+  OrtGpuCpuMemoryAllocator();
+
+  /**
+   * @brief Destructor - cleans up ORT resources
+   */
+  ~OrtGpuCpuMemoryAllocator() override;
+
+  /**
+   * @brief Get the ORT allocator for integration
+   * @return Pointer to OrtAllocator
+   */
+  OrtAllocator * get_ort_allocator();
+
+  /**
+   * @brief Get the ORT memory info
+   * @return Pointer to OrtMemoryInfo
+   */
+  const OrtMemoryInfo * get_ort_memory_info() const;
+
+  // BackendMemoryAllocator interface implementation
+  void * allocate(size_t bytes) override;
+  void deallocate(void * ptr) override;
+  bool is_device_memory() const override;
+  std::string device_name() const override;
 
 protected:
-  void copy_from_host_impl(void * dst, const void * src, size_t bytes) override
-  {
-    memcpy(dst, src, bytes);
-  }
-
+  void copy_from_host_impl(void * dst, const void * src, size_t bytes) override;
   void copy_from_host_permuted_impl(
     void * dst,
     const void * src,
     const std::vector<size_t> & src_shape,
     const std::vector<size_t> & permutation,
-    size_t elem_size) override
-  {
-    // Simple implementation - just copy without permutation
-    size_t total_elements = 1;
-    for (size_t dim : src_shape) {
-      total_elements *= dim;
-    }
-    memcpy(dst, src, total_elements * elem_size);
-  }
-
-  void copy_to_host_impl(void * dst, const void * src, size_t bytes) override
-  {
-    memcpy(dst, src, bytes);
-  }
-
-  void copy_device_to_device_impl(void * dst, const void * src, size_t bytes) override
-  {
-    memcpy(dst, src, bytes);
-  }
+    size_t elem_size) override;
+  void copy_to_host_impl(void * dst, const void * src, size_t bytes) override;
+  void copy_device_to_device_impl(void * dst, const void * src, size_t bytes) override;
+
+private:
+  // ORT allocator integration (static instance for callbacks)
+  static OrtGpuCpuMemoryAllocator * instance_;
+  OrtAllocator ort_allocator_;
+  OrtMemoryInfo * ort_memory_info_;
+
+  // Static callback functions for ORT integration
+  static void * ORT_API_CALL ort_alloc(OrtAllocator * this_, size_t size);
+  static void ORT_API_CALL ort_free(OrtAllocator * this_, void * p);
+  static const OrtMemoryInfo * ORT_API_CALL ort_info(const OrtAllocator * this_);
+  static void * ORT_API_CALL ort_reserve(OrtAllocator * this_, size_t size);
 };
 
-std::shared_ptr<deep_ros::BackendMemoryAllocator> get_simple_cpu_allocator();
+std::shared_ptr<deep_ros::BackendMemoryAllocator> get_ort_gpu_cpu_allocator();
 
 } // namespace deep_ort_gpu_backend
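The new private section hints at how the class bridges into ORT's C allocator interface: a static instance pointer plus free-function callbacks populate an OrtAllocator struct. A plausible constructor wiring, shown as a sketch (the real definition is in the corresponding .cpp; details may differ):

```cpp
#include <onnxruntime_cxx_api.h>

OrtGpuCpuMemoryAllocator::OrtGpuCpuMemoryAllocator()
: ort_memory_info_(nullptr)
{
  instance_ = this;  // lets the static C callbacks reach this C++ object
  ort_allocator_.version = ORT_API_VERSION;
  ort_allocator_.Alloc = ort_alloc;      // forwards to allocate()
  ort_allocator_.Free = ort_free;        // forwards to deallocate()
  ort_allocator_.Info = ort_info;        // returns ort_memory_info_
  ort_allocator_.Reserve = ort_reserve;  // optional hook in the C API
  // Host-visible memory info, so ORT treats these buffers as CPU-accessible.
  Ort::ThrowOnError(Ort::GetApi().CreateCpuMemoryInfo(
    OrtDeviceAllocator, OrtMemTypeDefault, &ort_memory_info_));
}
```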
