
Commit ca1af75

Functioning onnxruntime with CUDA as execution provider

1 parent 153a467

11 files changed: +425 -1028 lines

deep_core/include/deep_core/plugin_interfaces/backend_memory_allocator.hpp

Lines changed: 0 additions & 11 deletions

@@ -99,17 +99,6 @@ class BackendMemoryAllocator
    */
   void copy_device_to_device(void * dst, const void * src, size_t bytes);
 
-  /**
-   * @brief Copy data from host (CPU) to device memory - alias for copy_from_host
-   * @param dst Destination device memory pointer
-   * @param src Source host memory pointer
-   * @param bytes Number of bytes to copy
-   */
-  void copy_host_to_device(void * dst, const void * src, size_t bytes)
-  {
-    copy_from_host(dst, src, bytes);
-  }
-
 protected:
   /**
    * @brief Implementation of copy_from_host (to be overridden by backends)
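The removed copy_host_to_device was a thin alias, so callers migrate by invoking copy_from_host directly. A minimal sketch of the call-site change, assuming a BackendMemoryAllocator reference is at hand (the helper name upload_input is hypothetical):

```cpp
#include <cstddef>

#include "deep_core/plugin_interfaces/backend_memory_allocator.hpp"

// Hypothetical helper: stage host data into device memory via the allocator.
void upload_input(deep_ros::BackendMemoryAllocator & alloc,
                  void * device_dst, const void * host_src, std::size_t bytes)
{
  // Before this commit: alloc.copy_host_to_device(device_dst, host_src, bytes);
  alloc.copy_from_host(device_dst, host_src, bytes);
}
```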

deep_ort_gpu_backend_plugin/CMakeLists.txt

Lines changed: 10 additions & 18 deletions

@@ -162,6 +162,7 @@ target_link_libraries(${DEEP_ORT_LIB}
   onnxruntime_gpu_vendor::onnxruntime_gpu_lib
 )
 
+
 # CUDA runtime is required since code calls CUDA functions directly
 # Try multiple strategies to find and link CUDA runtime
 set(CUDA_RUNTIME_LINKED FALSE)

@@ -220,15 +221,6 @@ if(NOT CUDA_RUNTIME_LINKED)
   endif()
 endif()
 
-# Strategy 4: Try system-wide search as last resort
-if(NOT CUDA_RUNTIME_LINKED)
-  find_library(SYSTEM_CUDA_RUNTIME_LIBRARY NAMES cudart)
-  if(SYSTEM_CUDA_RUNTIME_LIBRARY)
-    target_link_libraries(${DEEP_ORT_LIB} PRIVATE ${SYSTEM_CUDA_RUNTIME_LIBRARY})
-    message(STATUS "Linking system CUDA runtime: ${SYSTEM_CUDA_RUNTIME_LIBRARY}")
-    set(CUDA_RUNTIME_LINKED TRUE)
-  endif()
-endif()
 
 if(NOT CUDA_RUNTIME_LINKED)
   message(FATAL_ERROR "No CUDA runtime found - required for GPU functionality")

@@ -265,17 +257,17 @@ install(FILES plugins.xml
 # Export plugin description file to ament index
 pluginlib_export_plugin_description_file(deep_core plugins.xml)
 
-# if(BUILD_TESTING)
-#   find_package(deep_test REQUIRED)
+if(BUILD_TESTING)
+  find_package(deep_test REQUIRED)
 
-# # add_deep_test(test_ort_gpu_backend test/test_ort_gpu_backend.cpp
-# #   LIBRARIES
-# #     ${DEEP_ORT_LIB}
-# #     deep_core::deep_core_lib
-# #     onnxruntime_gpu_vendor::onnxruntime_gpu_lib
-# # )
+  add_deep_test(test_ort_gpu_backend test/test_ort_gpu_backend.cpp
+    LIBRARIES
+      ${DEEP_ORT_LIB}
+      deep_core::deep_core_lib
+      onnxruntime_gpu_vendor::onnxruntime_gpu_lib
+  )
 
-# endif()
+endif()
 
 ament_export_targets(${PROJECT_NAME}Targets HAS_LIBRARY_TARGET)
 ament_export_libraries(${DEEP_ORT_LIB})
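The hard requirement on cudart exists because the plugin calls CUDA runtime APIs directly rather than relying on ONNX Runtime alone. A minimal sketch of the kind of call involved (the helper name is hypothetical; the real calls live in the plugin sources):

```cpp
#include <cuda_runtime_api.h>

#include <stdexcept>
#include <string>

// Hypothetical helper: pin the CUDA device context before creating a session.
inline void set_cuda_device_or_throw(int device_id)
{
  const cudaError_t err = cudaSetDevice(device_id);
  if (err != cudaSuccess) {
    throw std::runtime_error(
      std::string("cudaSetDevice failed: ") + cudaGetErrorString(err));
  }
}
```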

deep_ort_gpu_backend_plugin/README.md

Lines changed: 5 additions & 6 deletions

@@ -1,17 +1,16 @@
-# deep_ort_backend_plugin
+# deep_ort_gpu_backend_plugin
 
 ONNX Runtime GPU backend plugin for deep_core.
 
 ## Overview
 
 Provides:
-- GPU inference executor using ONNX Runtime with options for CUDA or TensorRT execution provider
+- GPU inference executor using ONNX Runtime with options for CUDA or TensorRT (untested) execution provider
 - Device context management for multi-GPU systems
-- Zero-copy inference with IO binding
 
 ## Plugin Name
 
-`onnxruntime_cpu`
+`onnxruntime_gpu`
 
 ## Supported Formats
 

@@ -22,7 +21,7 @@ ONNX models (.onnx files)
 Add to your `package.xml`:
 
 ```xml
-<exec_depend>deep_ort_backend_plugin</exec_depend>
+<exec_depend>deep_ort_gpu_backend_plugin</exec_depend>
 ```
 
 Configure your inference nodes to use this plugin:

@@ -37,4 +36,4 @@ inference_node:
 ## Dependencies
 
 - deep_core
-- onnxruntime_vendor
+- onnxruntime_gpu_vendor

deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_executor.hpp

Lines changed: 2 additions & 7 deletions

@@ -86,7 +86,7 @@ class OrtGpuBackendExecutor : public deep_ros::BackendInferenceExecutor
   bool load_model_impl(const std::filesystem::path & model_path) override;
 
   /**
-   * @brief Run inference using zero-copy IO binding with GPU acceleration
+   * @brief Run inference with GPU acceleration
    * @param input Input tensor (must be compatible with model input)
    * @return Output tensor with inference results
    * @throws std::runtime_error if inference fails or no model loaded

@@ -107,6 +107,7 @@ class OrtGpuBackendExecutor : public deep_ros::BackendInferenceExecutor
   std::unique_ptr<Ort::Session> session_;
   std::unique_ptr<Ort::SessionOptions> session_options_;
   Ort::MemoryInfo memory_info_;
+  std::shared_ptr<deep_ros::BackendMemoryAllocator> custom_allocator_;
 
   /**
    * @brief Initialize session options with GPU execution provider

@@ -155,12 +156,6 @@ class OrtGpuBackendExecutor : public deep_ros::BackendInferenceExecutor
   * @brief Set CUDA device context
   */
  void set_device() const;
-
-  /**
-   * @brief Check if all TensorRT dependencies are available
-   * @return true if TensorRT can be used, false otherwise
-   */
-  bool check_tensorrt_dependencies() const;
 };
 
 } // namespace deep_ort_gpu_backend
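For context on what "initialize session options with GPU execution provider" entails, here is a minimal sketch using the public ONNX Runtime C++ API (the actual implementation is in the plugin's .cpp and may differ):

```cpp
#include <memory>

#include <onnxruntime_cxx_api.h>

// Sketch: build SessionOptions with the CUDA execution provider appended.
std::unique_ptr<Ort::SessionOptions> make_cuda_session_options(int device_id)
{
  auto opts = std::make_unique<Ort::SessionOptions>();
  OrtCUDAProviderOptions cuda_opts{};
  cuda_opts.device_id = device_id;
  opts->AppendExecutionProvider_CUDA(cuda_opts);  // throws Ort::Exception on failure
  return opts;
}
```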

deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_backend_plugin.hpp

Lines changed: 2 additions & 10 deletions

@@ -29,20 +29,12 @@ namespace deep_ort_gpu_backend
 // Forward declarations
 enum class GpuExecutionProvider;
 
-/**
- * @brief Get a simple CPU allocator for use in other packages
- * This function allows other packages to get a CPU allocator without
- * including CUDA headers or GPU-specific dependencies
- * @return Shared pointer to a simple CPU allocator
- */
-std::shared_ptr<deep_ros::BackendMemoryAllocator> create_simple_cpu_allocator();
-
 /**
  * @brief ONNX Runtime GPU backend plugin
  *
  * Combines ORT GPU memory allocator and inference executor into a single
- * backend plugin for use with pluginlib. Supports both CUDA and TensorRT
- * execution providers.
+ * backend plugin for use with pluginlib. Supports the CUDA
+ * execution provider, and possibly more in the future.
  */
 class OrtGpuBackendPlugin : public deep_ros::DeepBackendPlugin
 {
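Since the class is exported for pluginlib, a consumer typically loads it by name rather than linking the library directly. A sketch, assuming the base class deep_ros::DeepBackendPlugin is registered under the deep_core package via plugins.xml (the base-class header path is an assumption):

```cpp
#include <pluginlib/class_loader.hpp>

// Assumed header for the plugin base class; the actual path lives in deep_core.
#include "deep_core/plugin_interfaces/deep_backend_plugin.hpp"

int main()
{
  pluginlib::ClassLoader<deep_ros::DeepBackendPlugin> loader(
    "deep_core", "deep_ros::DeepBackendPlugin");
  // "onnxruntime_gpu" matches the plugin name documented in the README.
  auto plugin = loader.createSharedInstance("onnxruntime_gpu");
  return plugin ? 0 : 1;
}
```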

deep_ort_gpu_backend_plugin/include/deep_ort_gpu_backend_plugin/ort_gpu_memory_allocator.hpp

Lines changed: 46 additions & 46 deletions

@@ -42,63 +42,63 @@ namespace deep_ort_gpu_backend
 {
 
 /**
- * @brief Simple CPU memory allocator for output tensors
+ * @brief CPU memory allocator for GPU backend output tensors (similar to CPU backend)
  */
-class SimpleCpuAllocator : public deep_ros::BackendMemoryAllocator
+class OrtGpuCpuMemoryAllocator : public deep_ros::BackendMemoryAllocator
 {
 public:
-  void * allocate(size_t bytes) override
-  {
-    return std::malloc(bytes);
-  }
-
-  void deallocate(void * ptr) override
-  {
-    std::free(ptr);
-  }
-
-  bool is_device_memory() const override
-  {
-    return false;
-  }
-
-  std::string device_name() const override
-  {
-    return "cpu";
-  }
+  /**
+   * @brief Constructor - initializes ORT allocator integration
+   */
+  OrtGpuCpuMemoryAllocator();
+
+  /**
+   * @brief Destructor - cleans up ORT resources
+   */
+  ~OrtGpuCpuMemoryAllocator() override;
+
+  /**
+   * @brief Get the ORT allocator for integration
+   * @return Pointer to OrtAllocator
+   */
+  OrtAllocator * get_ort_allocator();
+
+  /**
+   * @brief Get the ORT memory info
+   * @return Pointer to OrtMemoryInfo
+   */
+  const OrtMemoryInfo * get_ort_memory_info() const;
+
+  // BackendMemoryAllocator interface implementation
+  void * allocate(size_t bytes) override;
+  void deallocate(void * ptr) override;
+  bool is_device_memory() const override;
+  std::string device_name() const override;
 
 protected:
-  void copy_from_host_impl(void * dst, const void * src, size_t bytes) override
-  {
-    memcpy(dst, src, bytes);
-  }
-
+  void copy_from_host_impl(void * dst, const void * src, size_t bytes) override;
   void copy_from_host_permuted_impl(
     void * dst,
     const void * src,
     const std::vector<size_t> & src_shape,
     const std::vector<size_t> & permutation,
-    size_t elem_size) override
-  {
-    // Simple implementation - just copy without permutation
-    size_t total_elements = 1;
-    for (size_t dim : src_shape) {
-      total_elements *= dim;
-    }
-    memcpy(dst, src, total_elements * elem_size);
-  }
-
-  void copy_to_host_impl(void * dst, const void * src, size_t bytes) override
-  {
-    memcpy(dst, src, bytes);
-  }
-
-  void copy_device_to_device_impl(void * dst, const void * src, size_t bytes) override
-  {
-    memcpy(dst, src, bytes);
-  }
+    size_t elem_size) override;
+  void copy_to_host_impl(void * dst, const void * src, size_t bytes) override;
+  void copy_device_to_device_impl(void * dst, const void * src, size_t bytes) override;
+
+private:
+  // ORT allocator integration (static instance for callbacks)
+  static OrtGpuCpuMemoryAllocator * instance_;
+  OrtAllocator ort_allocator_;
+  OrtMemoryInfo * ort_memory_info_;
+
+  // Static callback functions for ORT integration
+  static void * ORT_API_CALL ort_alloc(OrtAllocator * this_, size_t size);
+  static void ORT_API_CALL ort_free(OrtAllocator * this_, void * p);
+  static const OrtMemoryInfo * ORT_API_CALL ort_info(const OrtAllocator * this_);
+  static void * ORT_API_CALL ort_reserve(OrtAllocator * this_, size_t size);
 };
 
-std::shared_ptr<deep_ros::BackendMemoryAllocator> get_simple_cpu_allocator();
+std::shared_ptr<deep_ros::BackendMemoryAllocator> get_ort_gpu_cpu_allocator();
 
 } // namespace deep_ort_gpu_backend
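The new private section hints at how the class bridges into ORT's C allocator interface: a static instance pointer plus free-function callbacks populate an OrtAllocator struct. A plausible constructor wiring, shown as a sketch (the real definition is in the corresponding .cpp; details may differ):

```cpp
#include <onnxruntime_cxx_api.h>

OrtGpuCpuMemoryAllocator::OrtGpuCpuMemoryAllocator()
: ort_memory_info_(nullptr)
{
  instance_ = this;  // lets the static C callbacks reach this C++ object
  ort_allocator_.version = ORT_API_VERSION;
  ort_allocator_.Alloc = ort_alloc;      // forwards to allocate()
  ort_allocator_.Free = ort_free;        // forwards to deallocate()
  ort_allocator_.Info = ort_info;        // returns ort_memory_info_
  ort_allocator_.Reserve = ort_reserve;  // optional hook in the C API
  // Host-visible memory info, so ORT treats these buffers as CPU-accessible.
  Ort::ThrowOnError(Ort::GetApi().CreateCpuMemoryInfo(
    OrtDeviceAllocator, OrtMemTypeDefault, &ort_memory_info_));
}
```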
