From ceaeec25fbcfc697ededbfbe332704345645e812 Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:07:26 +0900 Subject: [PATCH 01/18] cuda ipc agnocast Signed-off-by: sykwer --- src/agnocast_cuda/CMakeLists.txt | 45 +++++++++++++ .../include/agnocast/cuda/types.hpp | 42 ++++++++++++ src/agnocast_cuda/package.xml | 20 ++++++ src/agnocast_cuda/src/cuda_ipc_backend.cpp | 57 ++++++++++++++++ src/agnocast_cuda/src/cuda_ipc_backend.hpp | 19 ++++++ src/agnocast_cuda/src/get_backend.cpp | 64 ++++++++++++++++++ src/agnocast_cuda/src/nvscibuf_backend.cpp | 30 +++++++++ src/agnocast_cuda/src/nvscibuf_backend.hpp | 21 ++++++ .../src/unified_memory_backend.cpp | 30 +++++++++ .../src/unified_memory_backend.hpp | 21 ++++++ src/agnocast_cuda/src/vmm_backend.cpp | 30 +++++++++ src/agnocast_cuda/src/vmm_backend.hpp | 22 ++++++ .../CMakeLists.txt | 28 ++++++++ src/agnocast_sample_application/package.xml | 7 ++ .../src/cuda_publisher.cpp | 67 +++++++++++++++++++ .../src/cuda_subscriber.cpp | 52 ++++++++++++++ .../agnocast/agnocast_callback_info.hpp | 45 ++++++++++++- .../include/agnocast/agnocast_publisher.hpp | 30 ++++++++- .../agnocast/agnocast_smart_pointer.hpp | 56 +++++++++++++++- .../agnocast/agnocast_subscription.hpp | 7 +- .../agnocast/bridge/agnocast_bridge_node.hpp | 21 ++++-- .../include/agnocast/cuda_message_tag.hpp | 26 +++++++ .../include/agnocast/gpu_metadata.hpp | 25 +++++++ .../include/agnocast/gpu_transfer_backend.hpp | 35 ++++++++++ 24 files changed, 787 insertions(+), 13 deletions(-) create mode 100644 src/agnocast_cuda/CMakeLists.txt create mode 100644 src/agnocast_cuda/include/agnocast/cuda/types.hpp create mode 100644 src/agnocast_cuda/package.xml create mode 100644 src/agnocast_cuda/src/cuda_ipc_backend.cpp create mode 100644 src/agnocast_cuda/src/cuda_ipc_backend.hpp create mode 100644 src/agnocast_cuda/src/get_backend.cpp create mode 100644 src/agnocast_cuda/src/nvscibuf_backend.cpp create mode 100644 src/agnocast_cuda/src/nvscibuf_backend.hpp create mode 100644 src/agnocast_cuda/src/unified_memory_backend.cpp create mode 100644 src/agnocast_cuda/src/unified_memory_backend.hpp create mode 100644 src/agnocast_cuda/src/vmm_backend.cpp create mode 100644 src/agnocast_cuda/src/vmm_backend.hpp create mode 100644 src/agnocast_sample_application/src/cuda_publisher.cpp create mode 100644 src/agnocast_sample_application/src/cuda_subscriber.cpp create mode 100644 src/agnocastlib/include/agnocast/cuda_message_tag.hpp create mode 100644 src/agnocastlib/include/agnocast/gpu_metadata.hpp create mode 100644 src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp diff --git a/src/agnocast_cuda/CMakeLists.txt b/src/agnocast_cuda/CMakeLists.txt new file mode 100644 index 000000000..9d28e687d --- /dev/null +++ b/src/agnocast_cuda/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.14) +project(agnocast_cuda LANGUAGES CXX CUDA) + +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic) +endif() + +find_package(ament_cmake REQUIRED) +find_package(agnocastlib REQUIRED) +find_package(sensor_msgs REQUIRED) +find_package(CUDAToolkit REQUIRED) + +add_library(agnocast_cuda SHARED + src/cuda_ipc_backend.cpp + src/vmm_backend.cpp + src/nvscibuf_backend.cpp + src/unified_memory_backend.cpp + src/get_backend.cpp) + +target_include_directories(agnocast_cuda PUBLIC + $ + $) + +ament_target_dependencies(agnocast_cuda agnocastlib sensor_msgs) 
+target_link_libraries(agnocast_cuda CUDA::cudart) + +install(TARGETS agnocast_cuda + EXPORT export_${PROJECT_NAME} + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin + INCLUDES DESTINATION include) + +install( + DIRECTORY include/ + DESTINATION include) + +ament_export_targets(export_${PROJECT_NAME} HAS_LIBRARY_TARGET) +ament_export_include_directories(include) +ament_export_dependencies(agnocastlib sensor_msgs) +ament_package() diff --git a/src/agnocast_cuda/include/agnocast/cuda/types.hpp b/src/agnocast_cuda/include/agnocast/cuda/types.hpp new file mode 100644 index 000000000..c1d8a90c8 --- /dev/null +++ b/src/agnocast_cuda/include/agnocast/cuda/types.hpp @@ -0,0 +1,42 @@ +#pragma once + +#include "agnocast/cuda_message_tag.hpp" + +#include +#include + +#include +#include + +namespace agnocast::cuda +{ + +struct PointCloud2 : public sensor_msgs::msg::PointCloud2, public agnocast::cuda_message_tag +{ + uint8_t * data = nullptr; // GPU device pointer (shadows base class std::vector data) +}; + +struct Image : public sensor_msgs::msg::Image, public agnocast::cuda_message_tag +{ + uint8_t * data = nullptr; // GPU device pointer (shadows base class std::vector data) +}; + +} // namespace agnocast::cuda + +namespace agnocast +{ + +template <> +inline size_t get_cuda_gpu_data_size(const agnocast::cuda::PointCloud2 & msg) +{ + return static_cast(msg.height) * static_cast(msg.width) * + static_cast(msg.point_step); +} + +template <> +inline size_t get_cuda_gpu_data_size(const agnocast::cuda::Image & msg) +{ + return static_cast(msg.height) * msg.step; +} + +} // namespace agnocast diff --git a/src/agnocast_cuda/package.xml b/src/agnocast_cuda/package.xml new file mode 100644 index 000000000..1b0245d25 --- /dev/null +++ b/src/agnocast_cuda/package.xml @@ -0,0 +1,20 @@ + + + + agnocast_cuda + 0.1.0 + + CUDA IPC support for Agnocast. Provides GPU message types and cross-process GPU buffer sharing. 
+ + Takahiro Ishikawa-Aso + Apache License 2.0 + + ament_cmake + + agnocastlib + sensor_msgs + + + ament_cmake + + diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.cpp b/src/agnocast_cuda/src/cuda_ipc_backend.cpp new file mode 100644 index 000000000..ab3ae8be2 --- /dev/null +++ b/src/agnocast_cuda/src/cuda_ipc_backend.cpp @@ -0,0 +1,57 @@ +#include "cuda_ipc_backend.hpp" + +#include + +#include +#include +#include + +namespace +{ + +void check_cuda_error(cudaError_t err, const char * operation) +{ + if (err != cudaSuccess) { + std::fprintf( + stderr, "[agnocast_cuda] FATAL: %s failed: %s\n", operation, cudaGetErrorString(err)); + std::abort(); + } +} + +} // namespace + +namespace agnocast::cuda +{ + +GpuHandle CudaIpcBackend::export_handle(void * device_ptr, size_t /*size*/) +{ + GpuHandle h{}; + static_assert(sizeof(cudaIpcMemHandle_t) <= sizeof(h.opaque)); + cudaIpcMemHandle_t ipc_handle; + check_cuda_error(cudaIpcGetMemHandle(&ipc_handle, device_ptr), "cudaIpcGetMemHandle"); + std::memcpy(h.opaque, &ipc_handle, sizeof(ipc_handle)); + return h; +} + +void CudaIpcBackend::free_device_memory(void * device_ptr) +{ + check_cuda_error(cudaFree(device_ptr), "cudaFree"); +} + +void * CudaIpcBackend::import_handle(const GpuHandle & handle, size_t /*size*/) +{ + cudaIpcMemHandle_t ipc_handle; + std::memcpy(&ipc_handle, handle.opaque, sizeof(ipc_handle)); + void * ptr = nullptr; + check_cuda_error( + cudaIpcOpenMemHandle(&ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), + "cudaIpcOpenMemHandle"); + return ptr; +} + +void CudaIpcBackend::release_handle(void * local_ptr) +{ + check_cuda_error(cudaIpcCloseMemHandle(local_ptr), "cudaIpcCloseMemHandle"); +} + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.hpp b/src/agnocast_cuda/src/cuda_ipc_backend.hpp new file mode 100644 index 000000000..fc262cea8 --- /dev/null +++ b/src/agnocast_cuda/src/cuda_ipc_backend.hpp @@ -0,0 +1,19 @@ +// Internal header — kept in src/ so it is NOT installed or visible to downstream packages. +// Only get_backend.cpp includes this to instantiate the singleton. 
+#pragma once + +#include "agnocast/gpu_transfer_backend.hpp" + +namespace agnocast::cuda +{ + +class CudaIpcBackend : public GpuTransferBackend +{ +public: + GpuHandle export_handle(void * device_ptr, size_t size) override; + void free_device_memory(void * device_ptr) override; + void * import_handle(const GpuHandle & handle, size_t size) override; + void release_handle(void * local_ptr) override; +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/get_backend.cpp b/src/agnocast_cuda/src/get_backend.cpp new file mode 100644 index 000000000..87068ff54 --- /dev/null +++ b/src/agnocast_cuda/src/get_backend.cpp @@ -0,0 +1,64 @@ +#include "agnocast/gpu_transfer_backend.hpp" +#include "cuda_ipc_backend.hpp" +#include "nvscibuf_backend.hpp" +#include "unified_memory_backend.hpp" +#include "vmm_backend.hpp" + +#include + +#include +#include +#include + +namespace agnocast::cuda +{ + +namespace +{ + +std::unique_ptr select_backend() +{ + int device = 0; + cudaError_t err = cudaGetDevice(&device); + if (err != cudaSuccess) { + std::fprintf( + stderr, "[agnocast_cuda] FATAL: cudaGetDevice failed: %s\n", cudaGetErrorString(err)); + std::abort(); + } + + int is_integrated = 0; + err = cudaDeviceGetAttribute(&is_integrated, cudaDevAttrIntegrated, device); + if (err != cudaSuccess) { + std::fprintf( + stderr, "[agnocast_cuda] FATAL: cudaDeviceGetAttribute failed: %s\n", + cudaGetErrorString(err)); + std::abort(); + } + + if (!is_integrated) { + // Discrete GPU (GeForce, Quadro, Tesla, A/H series) — CUDA IPC is supported. + std::fprintf(stderr, "[agnocast_cuda] Discrete GPU detected, using CudaIpcBackend.\n"); + return std::make_unique(); + } + + // Integrated GPU (Jetson Xavier/Orin/Thor, DRIVE). + // TODO(agnocast): Implement and select the appropriate backend. + // - Jetson Thor (CUDA 13.0+): CudaIpcBackend may work via OpenRM. + // - Jetson Xavier/Orin: NvSciBufBackend or UnifiedMemoryBackend. + // - DRIVE: NvSciBufBackend. + std::fprintf( + stderr, + "[agnocast_cuda] FATAL: Integrated GPU detected (Jetson/DRIVE). " + "No backend is implemented yet for this platform.\n"); + std::abort(); +} + +} // namespace + +GpuTransferBackend & get_backend() +{ + static auto instance = select_backend(); + return *instance; +} + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/nvscibuf_backend.cpp b/src/agnocast_cuda/src/nvscibuf_backend.cpp new file mode 100644 index 000000000..0e900cedf --- /dev/null +++ b/src/agnocast_cuda/src/nvscibuf_backend.cpp @@ -0,0 +1,30 @@ +#include "nvscibuf_backend.hpp" + +#include + +namespace agnocast::cuda +{ + +GpuHandle NvSciBufBackend::export_handle(void * /*device_ptr*/, size_t /*size*/) +{ + throw std::runtime_error( + "[agnocast_cuda] NvSciBufBackend is not yet implemented. 
" + "Requires NvSciBuf (Jetson Xavier/Orin, NVIDIA DRIVE)."); +} + +void NvSciBufBackend::free_device_memory(void * /*device_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented."); +} + +void * NvSciBufBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/) +{ + throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented."); +} + +void NvSciBufBackend::release_handle(void * /*local_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented."); +} + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/nvscibuf_backend.hpp b/src/agnocast_cuda/src/nvscibuf_backend.hpp new file mode 100644 index 000000000..b631d1ca4 --- /dev/null +++ b/src/agnocast_cuda/src/nvscibuf_backend.hpp @@ -0,0 +1,21 @@ +// Internal header — kept in src/ so it is NOT installed or visible to downstream packages. +// Only get_backend.cpp includes this to instantiate the singleton. +#pragma once + +#include "agnocast/gpu_transfer_backend.hpp" + +namespace agnocast::cuda +{ + +// Placeholder backend using NvSciBuf / NvSciSync. +// Target platforms: Jetson Xavier/Orin, NVIDIA DRIVE. +class NvSciBufBackend : public GpuTransferBackend +{ +public: + GpuHandle export_handle(void * device_ptr, size_t size) override; + void free_device_memory(void * device_ptr) override; + void * import_handle(const GpuHandle & handle, size_t size) override; + void release_handle(void * local_ptr) override; +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/unified_memory_backend.cpp b/src/agnocast_cuda/src/unified_memory_backend.cpp new file mode 100644 index 000000000..af97fff3b --- /dev/null +++ b/src/agnocast_cuda/src/unified_memory_backend.cpp @@ -0,0 +1,30 @@ +#include "unified_memory_backend.hpp" + +#include + +namespace agnocast::cuda +{ + +GpuHandle UnifiedMemoryBackend::export_handle(void * /*device_ptr*/, size_t /*size*/) +{ + throw std::runtime_error( + "[agnocast_cuda] UnifiedMemoryBackend is not yet implemented. " + "Requires POSIX shm + cudaHostRegister (Jetson unified memory)."); +} + +void UnifiedMemoryBackend::free_device_memory(void * /*device_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented."); +} + +void * UnifiedMemoryBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/) +{ + throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented."); +} + +void UnifiedMemoryBackend::release_handle(void * /*local_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented."); +} + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/unified_memory_backend.hpp b/src/agnocast_cuda/src/unified_memory_backend.hpp new file mode 100644 index 000000000..f9916f203 --- /dev/null +++ b/src/agnocast_cuda/src/unified_memory_backend.hpp @@ -0,0 +1,21 @@ +// Internal header — kept in src/ so it is NOT installed or visible to downstream packages. +// Only get_backend.cpp includes this to instantiate the singleton. +#pragma once + +#include "agnocast/gpu_transfer_backend.hpp" + +namespace agnocast::cuda +{ + +// Placeholder backend using POSIX shared memory + cudaHostRegister for Jetson platforms +// where CPU and GPU share the same physical memory (unified memory architecture). 
+class UnifiedMemoryBackend : public GpuTransferBackend +{ +public: + GpuHandle export_handle(void * device_ptr, size_t size) override; + void free_device_memory(void * device_ptr) override; + void * import_handle(const GpuHandle & handle, size_t size) override; + void release_handle(void * local_ptr) override; +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/vmm_backend.cpp b/src/agnocast_cuda/src/vmm_backend.cpp new file mode 100644 index 000000000..4c14cbcd9 --- /dev/null +++ b/src/agnocast_cuda/src/vmm_backend.cpp @@ -0,0 +1,30 @@ +#include "vmm_backend.hpp" + +#include + +namespace agnocast::cuda +{ + +GpuHandle VmmBackend::export_handle(void * /*device_ptr*/, size_t /*size*/) +{ + throw std::runtime_error( + "[agnocast_cuda] VmmBackend is not yet implemented. " + "Requires cuMemExportToShareableHandle (CUDA Driver API)."); +} + +void VmmBackend::free_device_memory(void * /*device_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented."); +} + +void * VmmBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/) +{ + throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented."); +} + +void VmmBackend::release_handle(void * /*local_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented."); +} + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/vmm_backend.hpp b/src/agnocast_cuda/src/vmm_backend.hpp new file mode 100644 index 000000000..491e5544a --- /dev/null +++ b/src/agnocast_cuda/src/vmm_backend.hpp @@ -0,0 +1,22 @@ +// Internal header — kept in src/ so it is NOT installed or visible to downstream packages. +// Only get_backend.cpp includes this to instantiate the singleton. +#pragma once + +#include "agnocast/gpu_transfer_backend.hpp" + +namespace agnocast::cuda +{ + +// Placeholder backend using CUDA Virtual Memory Management (VMM) API. +// Uses cuMemExportToShareableHandle / cuMemImportFromShareableHandle. +// Supported on Jetson Orin (newer CUDA) and discrete GPUs with CUDA 10.2+. 
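+// A possible (unimplemented) sketch: allocate with cuMemCreate, export a POSIX file descriptor via
+// cuMemExportToShareableHandle, pass the fd to the subscriber out-of-band (e.g. SCM_RIGHTS over a
+// Unix domain socket), then cuMemImportFromShareableHandle + cuMemMap on the receiving side. A raw
+// fd cannot simply be copied through GpuHandle/shared memory, so extra transport plumbing is needed.
+// Hypothetical design note, not part of the current implementation.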
+class VmmBackend : public GpuTransferBackend +{ +public: + GpuHandle export_handle(void * device_ptr, size_t size) override; + void free_device_memory(void * device_ptr) override; + void * import_handle(const GpuHandle & handle, size_t size) override; + void release_handle(void * local_ptr) override; +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_sample_application/CMakeLists.txt b/src/agnocast_sample_application/CMakeLists.txt index 887916d7f..945e03ab7 100644 --- a/src/agnocast_sample_application/CMakeLists.txt +++ b/src/agnocast_sample_application/CMakeLists.txt @@ -160,6 +160,34 @@ install(TARGETS no_rclcpp_client install(TARGETS sim_time_timer DESTINATION lib/${PROJECT_NAME}) +# CUDA sample applications (only built when CUDA and agnocast_cuda are available) +find_package(CUDAToolkit QUIET) +find_package(agnocast_cuda QUIET) +if(CUDAToolkit_FOUND AND agnocast_cuda_FOUND) + enable_language(CUDA) + + add_executable(cuda_talker src/cuda_publisher.cpp) + set_source_files_properties(src/cuda_publisher.cpp PROPERTIES LANGUAGE CUDA) + ament_target_dependencies(cuda_talker agnocastlib agnocast_cuda sensor_msgs) + target_link_libraries(cuda_talker CUDA::cudart) + target_include_directories(cuda_talker PRIVATE + ${agnocastlib_INCLUDE_DIRS} + ${agnocast_cuda_INCLUDE_DIRS} + ) + + add_executable(cuda_listener src/cuda_subscriber.cpp) + set_source_files_properties(src/cuda_subscriber.cpp PROPERTIES LANGUAGE CUDA) + ament_target_dependencies(cuda_listener agnocastlib agnocast_cuda sensor_msgs) + target_link_libraries(cuda_listener CUDA::cudart) + target_include_directories(cuda_listener PRIVATE + ${agnocastlib_INCLUDE_DIRS} + ${agnocast_cuda_INCLUDE_DIRS} + ) + + install(TARGETS cuda_talker cuda_listener + DESTINATION lib/${PROJECT_NAME}) +endif() + install(DIRECTORY launch DESTINATION share/${PROJECT_NAME}/ ) diff --git a/src/agnocast_sample_application/package.xml b/src/agnocast_sample_application/package.xml index dc9385840..f69e50068 100644 --- a/src/agnocast_sample_application/package.xml +++ b/src/agnocast_sample_application/package.xml @@ -31,6 +31,13 @@ agnocast_components agnocast_sample_interfaces + + agnocast_cuda + sensor_msgs + ament_cmake diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp new file mode 100644 index 000000000..e8c447290 --- /dev/null +++ b/src/agnocast_sample_application/src/cuda_publisher.cpp @@ -0,0 +1,67 @@ +#include "agnocast/agnocast.hpp" +#include "agnocast/cuda/types.hpp" + +#include + +using namespace std::chrono_literals; + +// Simple CUDA kernel: fills GPU buffer with incrementing values. 
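+// Launch configuration (see timer_callback below): fill_kernel<<<blocks, threads>>>(data, size, offset),
+// i.e. 256 threads per block and one thread per output byte.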
+__global__ void fill_kernel(uint8_t * data, size_t size, uint8_t offset) +{ + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + data[idx] = static_cast((idx + offset) % 256); + } +} + +class CudaPublisher : public agnocast::Node +{ + int64_t count_ = 0; + agnocast::Publisher::SharedPtr pub_; + agnocast::TimerBase::SharedPtr timer_; + + void timer_callback() + { + auto msg = pub_->borrow_loaned_message(); + + // Set CPU metadata + msg->header.stamp.sec = static_cast(count_); + msg->header.frame_id = "lidar"; + msg->height = 1; + msg->width = 1024; + msg->point_step = 16; + msg->row_step = msg->width * msg->point_step; + msg->is_dense = true; + + // Allocate and fill GPU data + const size_t gpu_size = msg->height * msg->width * msg->point_step; + cudaMalloc(&msg->data, gpu_size); + + const int threads = 256; + const int blocks = (gpu_size + threads - 1) / threads; + fill_kernel<<>>(msg->data, gpu_size, static_cast(count_)); + cudaStreamSynchronize(nullptr); + + pub_->publish(std::move(msg)); + RCLCPP_INFO(get_logger(), "published CUDA PointCloud2: seq=%ld, gpu_size=%zu", count_++, gpu_size); + } + +public: + CudaPublisher() : Node("cuda_publisher") + { + pub_ = this->create_publisher("/cuda_pointcloud", 1); + timer_ = agnocast::create_timer( + this, std::make_shared(RCL_STEADY_TIME), rclcpp::Duration(100ms), + std::bind(&CudaPublisher::timer_callback, this)); + } +}; + +int main(int argc, char ** argv) +{ + agnocast::init(argc, argv); + agnocast::AgnocastOnlySingleThreadedExecutor executor; + auto node = std::make_shared(); + executor.add_node(node); + executor.spin(); + return 0; +} diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp new file mode 100644 index 000000000..4c7b243d4 --- /dev/null +++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp @@ -0,0 +1,52 @@ +#include "agnocast/agnocast.hpp" +#include "agnocast/cuda/types.hpp" + +#include + +#include + +using std::placeholders::_1; + +class CudaSubscriber : public agnocast::Node +{ + agnocast::Subscription::SharedPtr sub_; + + void callback(agnocast::ipc_shared_ptr msg) + { + // Read CPU metadata from shared memory + const uint32_t width = msg->width; + const uint32_t point_step = msg->point_step; + const size_t gpu_size = msg->height * width * point_step; + + // Get subscriber-local GPU pointer (mapped via CUDA IPC) + auto * gpu_ptr = static_cast(msg.get_local_gpu_ptr()); + + // Read first few bytes from GPU to verify data + uint8_t host_buf[16]{}; + const size_t copy_size = std::min(gpu_size, sizeof(host_buf)); + cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost); + + RCLCPP_INFO( + get_logger(), + "received CUDA PointCloud2: width=%u, point_step=%u, gpu_size=%zu, " + "first_bytes=[%u,%u,%u,%u]", + width, point_step, gpu_size, host_buf[0], host_buf[1], host_buf[2], host_buf[3]); + } + +public: + CudaSubscriber() : Node("cuda_subscriber") + { + sub_ = this->create_subscription( + "/cuda_pointcloud", 1, std::bind(&CudaSubscriber::callback, this, _1)); + } +}; + +int main(int argc, char ** argv) +{ + agnocast::init(argc, argv); + agnocast::AgnocastOnlySingleThreadedExecutor executor; + auto node = std::make_shared(); + executor.add_node(node); + executor.spin(); + return 0; +} diff --git a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp index 13b66fbf7..18bc7c1a1 100644 --- a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp +++ 
b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp @@ -1,6 +1,9 @@ #pragma once #include "agnocast/agnocast_smart_pointer.hpp" +#include "agnocast/cuda_message_tag.hpp" +#include "agnocast/gpu_metadata.hpp" +#include "agnocast/gpu_transfer_backend.hpp" #include #include @@ -61,6 +64,42 @@ extern std::atomic need_epoll_updates; uint32_t allocate_callback_info_id(); +// Creates an ipc_shared_ptr for a subscriber-received message. +// For CUDA messages: imports the GPU handle, stores the subscriber-local GPU pointer in +// control_block->local_gpu_ptr, and registers a gpu_cleanup callback to release the mapping +// on last reference. The pointer is accessed via ipc_shared_ptr::get_local_gpu_ptr(). +// For non-CUDA messages: simply wraps the pointer. +template +agnocast::ipc_shared_ptr create_subscriber_ipc_ptr( + MessageT * msg, const std::string & topic_name, const topic_local_id_t subscriber_id, + const int64_t entry_id) +{ + if constexpr (is_cuda_message_v) { + auto * meta = static_cast(msg->gpu_metadata_); + if (!meta) { + std::fprintf( + stderr, + "[agnocast] FATAL: CUDA message on topic '%s' has null gpu_metadata_. " + "The publisher may have failed to set GpuMetadata during publish().\n", + topic_name.c_str()); + std::abort(); + } + void * local_gpu_ptr = + agnocast::cuda::get_backend().import_handle(meta->handle, meta->gpu_data_size); + // NOTE: If import_handle() fails, the backend aborts (fail-fast). If a future backend + // returns nullptr instead, the subscriber would get a null gpu pointer. Callers should + // check get_local_gpu_ptr() != nullptr before use. + + auto ipc_ptr = agnocast::ipc_shared_ptr(msg, topic_name, subscriber_id, entry_id); + ipc_ptr.set_local_gpu_ptr(local_gpu_ptr); + ipc_ptr.set_gpu_cleanup( + [local_gpu_ptr]() { agnocast::cuda::get_backend().release_handle(local_gpu_ptr); }); + return ipc_ptr; + } else { + return agnocast::ipc_shared_ptr(msg, topic_name, subscriber_id, entry_id); + } +} + template TypeErasedCallback get_erased_callback(Func && callback) { @@ -95,9 +134,9 @@ uint32_t register_callback( auto message_creator = []( const void * ptr, const std::string & topic_name, const topic_local_id_t subscriber_id, const int64_t entry_id) { - return std::make_unique>(agnocast::ipc_shared_ptr( - const_cast(static_cast(ptr)), topic_name, subscriber_id, - entry_id)); + auto * msg = const_cast(static_cast(ptr)); + return std::make_unique>( + create_subscriber_ipc_ptr(msg, topic_name, subscriber_id, entry_id)); }; uint32_t callback_info_id = allocate_callback_info_id(); diff --git a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp index 6ad96b690..faf146c08 100644 --- a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp @@ -5,6 +5,9 @@ #include "agnocast/agnocast_smart_pointer.hpp" #include "agnocast/agnocast_tracepoint_wrapper.h" #include "agnocast/agnocast_utils.hpp" +#include "agnocast/cuda_message_tag.hpp" +#include "agnocast/gpu_metadata.hpp" +#include "agnocast/gpu_transfer_backend.hpp" #include "rclcpp/detail/qos_parameters.hpp" #include "rclcpp/rclcpp.hpp" @@ -184,7 +187,22 @@ class BasicPublisher } // Capture raw pointer BEFORE invalidation (get() returns nullptr after invalidation). 
- const uint64_t msg_virtual_address = reinterpret_cast(message.get()); + MessageT * raw_ptr = message.get(); + const uint64_t msg_virtual_address = reinterpret_cast(raw_ptr); + + // CUDA publish hook: export GPU handle and allocate GpuMetadata in shared memory. + // Runs while heaphook is still active, so GpuMetadata lands in the publisher's shared memory. + // NOTE: Assumes MessageT has a public `data` member (uint8_t*) pointing to the GPU allocation. + // All CUDA message types must provide this by shadowing the base ROS message's data field. + if constexpr (is_cuda_message_v) { + auto & backend = agnocast::cuda::get_backend(); + const size_t gpu_size = get_cuda_gpu_data_size(*raw_ptr); + auto * meta = new GpuMetadata(); + meta->publisher_gpu_ptr = raw_ptr->data; + meta->gpu_data_size = gpu_size; + meta->handle = backend.export_handle(raw_ptr->data, gpu_size); + raw_ptr->gpu_metadata_ = meta; + } // Invalidate all references sharing this handle's control block. // Any remaining copies held elsewhere will fail-fast on dereference. @@ -197,6 +215,16 @@ class BasicPublisher for (uint32_t i = 0; i < publish_msg_args.ret_released_num; i++) { MessageT * release_ptr = reinterpret_cast(publish_msg_args.ret_released_addrs[i]); + // CUDA reclaim hook: free GPU buffer before deleting the message. + // On abnormal publisher exit, free_device_memory() is never called, but GPU device + // memory is reclaimed by the CUDA driver when the process exits. + if constexpr (is_cuda_message_v) { + if (release_ptr->gpu_metadata_) { + auto * meta = static_cast(release_ptr->gpu_metadata_); + agnocast::cuda::get_backend().free_device_memory(meta->publisher_gpu_ptr); + delete meta; + } + } delete release_ptr; } diff --git a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp index b20f31900..e2b51d4b1 100644 --- a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -40,10 +41,18 @@ extern int agnocast_fd; // Sentinel value indicating entry_id has not been assigned (publisher-side, before publish). constexpr int64_t ENTRY_ID_NOT_ASSIGNED = -1; -// Forward declaration for friend access +// Forward declarations for friend access +template +class ipc_shared_ptr; + template class BasicPublisher; +template +ipc_shared_ptr create_subscriber_ipc_ptr( + MessageT * msg, const std::string & topic_name, const topic_local_id_t subscriber_id, + const int64_t entry_id); + namespace detail { @@ -60,6 +69,16 @@ struct control_block topic_local_id_t pubsub_id; // 4-byte alignment std::atomic valid{true}; // 1-byte alignment + // Optional GPU cleanup callback. Null for non-CUDA messages. + // Called before bitmap release in reset() to ensure GPU mappings are released + // before the publisher can free the underlying GPU buffer. + std::function gpu_cleanup; + + // Subscriber-local GPU device pointer obtained via import_handle(). + // Stored here because the shared memory message is mapped read-only by the subscriber, + // so we cannot inject the local pointer into msg->data. 
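+  // Subscribers read it through ipc_shared_ptr::get_local_gpu_ptr(); msg->data still contains the
+  // publisher's device pointer and must not be dereferenced on the subscriber side.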
+ void * local_gpu_ptr = nullptr; + control_block(std::string topic, topic_local_id_t pubsub, int64_t entry) : topic_name(std::move(topic)), entry_id(entry), pubsub_id(pubsub) { @@ -92,6 +111,11 @@ class ipc_shared_ptr template friend class BasicPublisher; + // Allow create_subscriber_ipc_ptr to call set_gpu_cleanup() and set_local_gpu_ptr() + template + friend ipc_shared_ptr create_subscriber_ipc_ptr( + MessageT *, const std::string &, const topic_local_id_t, const int64_t); + // Allow converting constructors to access private members of ipc_shared_ptr template friend class ipc_shared_ptr; @@ -120,6 +144,24 @@ class ipc_shared_ptr } } + // Sets a GPU cleanup callback to be invoked when the last reference is released. + // Private: only create_subscriber_ipc_ptr() should call this. + void set_gpu_cleanup(std::function fn) + { + if (control_) { + control_->gpu_cleanup = std::move(fn); + } + } + + // Sets the subscriber-local GPU pointer (obtained via import_handle). + // Private: only create_subscriber_ipc_ptr() should call this. + void set_local_gpu_ptr(void * ptr) + { + if (control_) { + control_->local_gpu_ptr = ptr; + } + } + // Publisher-side constructor (entry_id not yet assigned). // Creates control block for reference counting and one-shot invalidation. // Private: users must call BasicPublisher::borrow_loaned_message() instead of constructing @@ -273,6 +315,12 @@ class ipc_shared_ptr T * get() const noexcept { return is_invalidated_() ? nullptr : ptr_; } + // Returns the subscriber-local GPU device pointer, or nullptr for non-CUDA messages. + void * get_local_gpu_ptr() const noexcept + { + return control_ ? control_->local_gpu_ptr : nullptr; + } + // Thread-safe: atomically decrements ref count and performs cleanup if last reference. void reset() { @@ -283,6 +331,12 @@ class ipc_shared_ptr const bool was_last = control_->decrement_and_check(); if (was_last) { + // GPU cleanup must run BEFORE bitmap release: unmapping the GPU buffer before + // the publisher is allowed to cudaFree the underlying allocation. + if (control_->gpu_cleanup) { + control_->gpu_cleanup(); + } + if (control_->entry_id != ENTRY_ID_NOT_ASSIGNED) { // Subscriber side: notify kmod that all references are released. 
release_subscriber_reference(control_->topic_name, control_->pubsub_id, control_->entry_id); diff --git a/src/agnocastlib/include/agnocast/agnocast_subscription.hpp b/src/agnocastlib/include/agnocast/agnocast_subscription.hpp index 2ad3728d3..80137e8e9 100644 --- a/src/agnocastlib/include/agnocast/agnocast_subscription.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_subscription.hpp @@ -338,8 +338,8 @@ class BasicTakeSubscription : public SubscriptionBase } MessageT * ptr = reinterpret_cast(take_args.ret_addr); - auto result = - agnocast::ipc_shared_ptr(ptr, topic_name_, id_, take_args.ret_entry_id); + auto result = create_subscriber_ipc_ptr( + ptr, topic_name_, id_, take_args.ret_entry_id); old_ptr = std::move(last_taken_ptr_); last_taken_ptr_ = result; return result; @@ -347,7 +347,8 @@ class BasicTakeSubscription : public SubscriptionBase } MessageT * ptr = reinterpret_cast(take_args.ret_addr); - return agnocast::ipc_shared_ptr(ptr, topic_name_, id_, take_args.ret_entry_id); + return create_subscriber_ipc_ptr( + ptr, topic_name_, id_, take_args.ret_entry_id); } }; diff --git a/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp b/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp index a103b10fe..3d0d5839b 100644 --- a/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp +++ b/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp @@ -4,6 +4,7 @@ #include "agnocast/agnocast_publisher.hpp" #include "agnocast/agnocast_subscription.hpp" #include "agnocast/bridge/agnocast_bridge_utils.hpp" +#include "agnocast/cuda_message_tag.hpp" #include "rclcpp/rclcpp.hpp" #include @@ -39,11 +40,21 @@ template void request_bridge_core( const std::string & topic_name, topic_local_id_t id, BridgeDirection direction) { - auto bridge_mode = get_bridge_mode(); - if (bridge_mode == BridgeMode::Standard) { - send_bridge_request(topic_name, id, direction); - } else if (bridge_mode == BridgeMode::Performance) { - send_performance_bridge_request(topic_name, id, direction); + // CUDA message types cannot be bridged to ROS 2 directly (GPU pointers are not serializable). + // Bridge support for CUDA types (via cudaMemcpy D2H) is future work. + // TODO(agnocast): Log a one-time warning so users know the bridge was skipped for this topic. + if constexpr (is_cuda_message_v) { + (void)topic_name; + (void)id; + (void)direction; + return; + } else { + auto bridge_mode = get_bridge_mode(); + if (bridge_mode == BridgeMode::Standard) { + send_bridge_request(topic_name, id, direction); + } else if (bridge_mode == BridgeMode::Performance) { + send_performance_bridge_request(topic_name, id, direction); + } } } diff --git a/src/agnocastlib/include/agnocast/cuda_message_tag.hpp b/src/agnocastlib/include/agnocast/cuda_message_tag.hpp new file mode 100644 index 000000000..0c5a7d685 --- /dev/null +++ b/src/agnocastlib/include/agnocast/cuda_message_tag.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include +#include + +namespace agnocast +{ + +// Base class for CUDA message types. Provides a pointer to GpuMetadata stored in shared memory. +// CUDA message types (e.g., agnocast::cuda::PointCloud2) inherit from both a ROS message type +// and this tag. The publish/receive machinery uses is_cuda_message_v to detect CUDA messages +// at compile time via if constexpr, so no CUDA headers are needed in agnocastlib. 
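+// Concrete examples live in the agnocast_cuda package (agnocast/cuda/types.hpp):
+// agnocast::cuda::PointCloud2 and agnocast::cuda::Image each inherit a sensor_msgs type and this tag.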
+struct cuda_message_tag +{ + void * gpu_metadata_ = nullptr; +}; + +template +inline constexpr bool is_cuda_message_v = std::is_base_of_v>; + +// Returns the GPU data size for a CUDA message. Must be specialized by each CUDA message type +// in the agnocast_cuda package (e.g., for PointCloud2: height * width * point_step). +template +size_t get_cuda_gpu_data_size(const T & msg); + +} // namespace agnocast diff --git a/src/agnocastlib/include/agnocast/gpu_metadata.hpp b/src/agnocastlib/include/agnocast/gpu_metadata.hpp new file mode 100644 index 000000000..f4c860cef --- /dev/null +++ b/src/agnocastlib/include/agnocast/gpu_metadata.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include +#include + +namespace agnocast +{ + +// Opaque handle large enough for cudaIpcMemHandle_t (64 bytes). +struct GpuHandle +{ + uint8_t opaque[64]; +}; + +// GPU sharing metadata stored in shared memory alongside the message. +// Allocated by the publish path (while heaphook is active) so it lands in the publisher's +// shared memory region and is readable by subscribers. +struct GpuMetadata +{ + GpuHandle handle; // backend-specific shareable handle + size_t gpu_data_size; // size of the GPU allocation in bytes + void * publisher_gpu_ptr; // original device pointer for publisher-side free on reclaim +}; + +} // namespace agnocast diff --git a/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp b/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp new file mode 100644 index 000000000..4f91aca30 --- /dev/null +++ b/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include "agnocast/gpu_metadata.hpp" + +#include + +namespace agnocast::cuda +{ + +// Abstract interface for GPU memory sharing backends. +// Concrete implementations (CudaIpcBackend, etc.) live in the agnocast_cuda package. +class GpuTransferBackend +{ +public: + virtual ~GpuTransferBackend() = default; + + // Publisher side: create a shareable handle from a device pointer. + virtual GpuHandle export_handle(void * device_ptr, size_t size) = 0; + + // Publisher side: free GPU buffer on reclaim. + virtual void free_device_memory(void * device_ptr) = 0; + + // Subscriber side: map GPU buffer into this process. + virtual void * import_handle(const GpuHandle & handle, size_t size) = 0; + + // Subscriber side: unmap GPU buffer from this process. + virtual void release_handle(void * local_ptr) = 0; +}; + +// Returns the singleton backend instance. Defined in the agnocast_cuda package and resolved +// at link time. Only called from if-constexpr branches guarded by is_cuda_message_v, +// so the symbol is never referenced unless a CUDA message type is actually used. 
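+// Practical consequence: executables that actually use CUDA message types must link against the
+// agnocast_cuda library, or they will fail at link time with an undefined reference to get_backend().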
+GpuTransferBackend & get_backend(); + +} // namespace agnocast::cuda From 9dd28ed7c7708c25248835e10624c3a168d26818 Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:16:25 +0900 Subject: [PATCH 02/18] fix Signed-off-by: sykwer --- src/agnocast_cuda/src/cuda_ipc_backend.cpp | 3 +-- src/agnocast_sample_application/src/cuda_publisher.cpp | 4 +++- .../include/agnocast/agnocast_smart_pointer.hpp | 5 +---- src/agnocastlib/include/agnocast/agnocast_subscription.hpp | 7 +++---- src/agnocastlib/include/agnocast/cuda_message_tag.hpp | 3 ++- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.cpp b/src/agnocast_cuda/src/cuda_ipc_backend.cpp index ab3ae8be2..3a2bbb8b4 100644 --- a/src/agnocast_cuda/src/cuda_ipc_backend.cpp +++ b/src/agnocast_cuda/src/cuda_ipc_backend.cpp @@ -44,8 +44,7 @@ void * CudaIpcBackend::import_handle(const GpuHandle & handle, size_t /*size*/) std::memcpy(&ipc_handle, handle.opaque, sizeof(ipc_handle)); void * ptr = nullptr; check_cuda_error( - cudaIpcOpenMemHandle(&ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), - "cudaIpcOpenMemHandle"); + cudaIpcOpenMemHandle(&ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), "cudaIpcOpenMemHandle"); return ptr; } diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp index e8c447290..7abe85d9b 100644 --- a/src/agnocast_sample_application/src/cuda_publisher.cpp +++ b/src/agnocast_sample_application/src/cuda_publisher.cpp @@ -39,11 +39,13 @@ class CudaPublisher : public agnocast::Node const int threads = 256; const int blocks = (gpu_size + threads - 1) / threads; + // cppcheck-suppress shiftTooManyBits // false positive: <<< >>> is CUDA kernel launch syntax fill_kernel<<>>(msg->data, gpu_size, static_cast(count_)); cudaStreamSynchronize(nullptr); pub_->publish(std::move(msg)); - RCLCPP_INFO(get_logger(), "published CUDA PointCloud2: seq=%ld, gpu_size=%zu", count_++, gpu_size); + RCLCPP_INFO( + get_logger(), "published CUDA PointCloud2: seq=%ld, gpu_size=%zu", count_++, gpu_size); } public: diff --git a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp index e2b51d4b1..d055535f7 100644 --- a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp @@ -316,10 +316,7 @@ class ipc_shared_ptr T * get() const noexcept { return is_invalidated_() ? nullptr : ptr_; } // Returns the subscriber-local GPU device pointer, or nullptr for non-CUDA messages. - void * get_local_gpu_ptr() const noexcept - { - return control_ ? control_->local_gpu_ptr : nullptr; - } + void * get_local_gpu_ptr() const noexcept { return control_ ? control_->local_gpu_ptr : nullptr; } // Thread-safe: atomically decrements ref count and performs cleanup if last reference. 
void reset() diff --git a/src/agnocastlib/include/agnocast/agnocast_subscription.hpp b/src/agnocastlib/include/agnocast/agnocast_subscription.hpp index 80137e8e9..8bb51e07f 100644 --- a/src/agnocastlib/include/agnocast/agnocast_subscription.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_subscription.hpp @@ -338,8 +338,8 @@ class BasicTakeSubscription : public SubscriptionBase } MessageT * ptr = reinterpret_cast(take_args.ret_addr); - auto result = create_subscriber_ipc_ptr( - ptr, topic_name_, id_, take_args.ret_entry_id); + auto result = + create_subscriber_ipc_ptr(ptr, topic_name_, id_, take_args.ret_entry_id); old_ptr = std::move(last_taken_ptr_); last_taken_ptr_ = result; return result; @@ -347,8 +347,7 @@ class BasicTakeSubscription : public SubscriptionBase } MessageT * ptr = reinterpret_cast(take_args.ret_addr); - return create_subscriber_ipc_ptr( - ptr, topic_name_, id_, take_args.ret_entry_id); + return create_subscriber_ipc_ptr(ptr, topic_name_, id_, take_args.ret_entry_id); } }; diff --git a/src/agnocastlib/include/agnocast/cuda_message_tag.hpp b/src/agnocastlib/include/agnocast/cuda_message_tag.hpp index 0c5a7d685..969221b01 100644 --- a/src/agnocastlib/include/agnocast/cuda_message_tag.hpp +++ b/src/agnocastlib/include/agnocast/cuda_message_tag.hpp @@ -16,7 +16,8 @@ struct cuda_message_tag }; template -inline constexpr bool is_cuda_message_v = std::is_base_of_v>; +inline constexpr bool is_cuda_message_v = + std::is_base_of_v>; // Returns the GPU data size for a CUDA message. Must be specialized by each CUDA message type // in the agnocast_cuda package (e.g., for PointCloud2: height * width * point_step). From 8e69df3f3981c62949b4c9437fab2043694d3d9c Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:23:57 +0900 Subject: [PATCH 03/18] make agnocast_cuda build optional when CUDA toolchain is unavailable Signed-off-by: sykwer --- src/agnocast_cuda/CMakeLists.txt | 61 +++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/src/agnocast_cuda/CMakeLists.txt b/src/agnocast_cuda/CMakeLists.txt index 9d28e687d..d9fa6dd6a 100644 --- a/src/agnocast_cuda/CMakeLists.txt +++ b/src/agnocast_cuda/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.14) -project(agnocast_cuda LANGUAGES CXX CUDA) +project(agnocast_cuda LANGUAGES CXX) if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) @@ -12,34 +12,53 @@ endif() find_package(ament_cmake REQUIRED) find_package(agnocastlib REQUIRED) find_package(sensor_msgs REQUIRED) -find_package(CUDAToolkit REQUIRED) -add_library(agnocast_cuda SHARED - src/cuda_ipc_backend.cpp - src/vmm_backend.cpp - src/nvscibuf_backend.cpp - src/unified_memory_backend.cpp - src/get_backend.cpp) +# CUDA is optional: if not available, headers are still installed but no library is built. +# Downstream packages that use CUDA message types must check if the agnocast_cuda library +# target exists (e.g., via find_package + if(TARGET agnocast_cuda::agnocast_cuda)). 
+include(CheckLanguage) +check_language(CUDA) -target_include_directories(agnocast_cuda PUBLIC - $ - $) +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + find_package(CUDAToolkit QUIET) +endif() + +if(CUDAToolkit_FOUND) + add_library(agnocast_cuda SHARED + src/cuda_ipc_backend.cpp + src/vmm_backend.cpp + src/nvscibuf_backend.cpp + src/unified_memory_backend.cpp + src/get_backend.cpp) + + target_include_directories(agnocast_cuda PUBLIC + $ + $) -ament_target_dependencies(agnocast_cuda agnocastlib sensor_msgs) -target_link_libraries(agnocast_cuda CUDA::cudart) + ament_target_dependencies(agnocast_cuda agnocastlib sensor_msgs) + target_link_libraries(agnocast_cuda CUDA::cudart) -install(TARGETS agnocast_cuda - EXPORT export_${PROJECT_NAME} - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib - RUNTIME DESTINATION bin - INCLUDES DESTINATION include) + install(TARGETS agnocast_cuda + EXPORT export_${PROJECT_NAME} + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin + INCLUDES DESTINATION include) + + ament_export_targets(export_${PROJECT_NAME} HAS_LIBRARY_TARGET) + ament_export_dependencies(agnocastlib sensor_msgs) + + message(STATUS "agnocast_cuda: CUDA found — building library and installing headers.") +else() + message(STATUS "agnocast_cuda: CUDA not found — installing headers only (no library).") +endif() +# Always install headers so downstream packages can use the type definitions +# (e.g., cuda_message_tag.hpp is in agnocastlib, but types.hpp is here). install( DIRECTORY include/ DESTINATION include) -ament_export_targets(export_${PROJECT_NAME} HAS_LIBRARY_TARGET) ament_export_include_directories(include) -ament_export_dependencies(agnocastlib sensor_msgs) ament_package() From 3c4e4e5572843749da85f0ca680fe28365f8dd79 Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:26:18 +0900 Subject: [PATCH 04/18] remove unconditional CUDA build_depend from package.xml Signed-off-by: sykwer --- src/agnocast_sample_application/package.xml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/agnocast_sample_application/package.xml b/src/agnocast_sample_application/package.xml index f69e50068..3efee0479 100644 --- a/src/agnocast_sample_application/package.xml +++ b/src/agnocast_sample_application/package.xml @@ -31,12 +31,9 @@ agnocast_components agnocast_sample_interfaces - - agnocast_cuda - sensor_msgs + ament_cmake From 0188770095daf083eb5853a494d993918a7b8e36 Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:30:14 +0900 Subject: [PATCH 05/18] log warning when bridge is skipped for CUDA message types Signed-off-by: sykwer --- .../include/agnocast/bridge/agnocast_bridge_node.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp b/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp index 3d0d5839b..7686e1fdc 100644 --- a/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp +++ b/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp @@ -42,9 +42,13 @@ void request_bridge_core( { // CUDA message types cannot be bridged to ROS 2 directly (GPU pointers are not serializable). // Bridge support for CUDA types (via cudaMemcpy D2H) is future work. - // TODO(agnocast): Log a one-time warning so users know the bridge was skipped for this topic. 
if constexpr (is_cuda_message_v) { - (void)topic_name; + static const auto logger = rclcpp::get_logger("agnocast_bridge_requester"); + RCLCPP_WARN( + logger, + "Bridge skipped for CUDA topic '%s': GPU message types cannot be bridged to ROS 2. " + "Use cudaMemcpy to a standard ROS message if DDS bridging is needed.", + topic_name.c_str()); (void)id; (void)direction; return; From d9d438ea1a3a0c4b2fdf37796b10b0abd538a55c Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:37:20 +0900 Subject: [PATCH 06/18] replace std::function with plain function pointer for GPU cleanup Signed-off-by: sykwer --- .../agnocast/agnocast_callback_info.hpp | 6 ++--- .../agnocast/agnocast_smart_pointer.hpp | 25 ++++++++++--------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp index 18bc7c1a1..154c454c6 100644 --- a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp @@ -66,7 +66,7 @@ uint32_t allocate_callback_info_id(); // Creates an ipc_shared_ptr for a subscriber-received message. // For CUDA messages: imports the GPU handle, stores the subscriber-local GPU pointer in -// control_block->local_gpu_ptr, and registers a gpu_cleanup callback to release the mapping +// control_block->local_gpu_ptr, and registers a gpu_release_fn to release the mapping // on last reference. The pointer is accessed via ipc_shared_ptr::get_local_gpu_ptr(). // For non-CUDA messages: simply wraps the pointer. template @@ -92,8 +92,8 @@ agnocast::ipc_shared_ptr create_subscriber_ipc_ptr( auto ipc_ptr = agnocast::ipc_shared_ptr(msg, topic_name, subscriber_id, entry_id); ipc_ptr.set_local_gpu_ptr(local_gpu_ptr); - ipc_ptr.set_gpu_cleanup( - [local_gpu_ptr]() { agnocast::cuda::get_backend().release_handle(local_gpu_ptr); }); + ipc_ptr.set_gpu_release_fn( + [](void * ptr) { agnocast::cuda::get_backend().release_handle(ptr); }); return ipc_ptr; } else { return agnocast::ipc_shared_ptr(msg, topic_name, subscriber_id, entry_id); diff --git a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp index d055535f7..a98381ca4 100644 --- a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -69,10 +68,12 @@ struct control_block topic_local_id_t pubsub_id; // 4-byte alignment std::atomic valid{true}; // 1-byte alignment - // Optional GPU cleanup callback. Null for non-CUDA messages. - // Called before bitmap release in reset() to ensure GPU mappings are released - // before the publisher can free the underlying GPU buffer. - std::function gpu_cleanup; + // Optional GPU cleanup function pointer. Null for non-CUDA messages. + // Called as gpu_release_fn(local_gpu_ptr) before bitmap release in reset() to ensure GPU + // mappings are released before the publisher can free the underlying GPU buffer. + // Uses a plain function pointer instead of std::function to avoid heap allocation and + // minimize overhead for non-CUDA messages (16 bytes for two pointers vs ~40+ bytes). + void (*gpu_release_fn)(void *) = nullptr; // Subscriber-local GPU device pointer obtained via import_handle(). 
// Stored here because the shared memory message is mapped read-only by the subscriber, @@ -111,7 +112,7 @@ class ipc_shared_ptr template friend class BasicPublisher; - // Allow create_subscriber_ipc_ptr to call set_gpu_cleanup() and set_local_gpu_ptr() + // Allow create_subscriber_ipc_ptr to call set_gpu_release_fn() and set_local_gpu_ptr() template friend ipc_shared_ptr create_subscriber_ipc_ptr( MessageT *, const std::string &, const topic_local_id_t, const int64_t); @@ -144,12 +145,12 @@ class ipc_shared_ptr } } - // Sets a GPU cleanup callback to be invoked when the last reference is released. - // Private: only create_subscriber_ipc_ptr() should call this. - void set_gpu_cleanup(std::function fn) + // Sets a GPU release function to be invoked as fn(local_gpu_ptr) when the last reference + // is released. Private: only create_subscriber_ipc_ptr() should call this. + void set_gpu_release_fn(void (*fn)(void *)) { if (control_) { - control_->gpu_cleanup = std::move(fn); + control_->gpu_release_fn = fn; } } @@ -330,8 +331,8 @@ class ipc_shared_ptr if (was_last) { // GPU cleanup must run BEFORE bitmap release: unmapping the GPU buffer before // the publisher is allowed to cudaFree the underlying allocation. - if (control_->gpu_cleanup) { - control_->gpu_cleanup(); + if (control_->gpu_release_fn) { + control_->gpu_release_fn(control_->local_gpu_ptr); } if (control_->entry_id != ENTRY_ID_NOT_ASSIGNED) { From e336b3361252a7547244b7639b4e926b662831e0 Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:41:50 +0900 Subject: [PATCH 07/18] validate data pointer and gpu_data_size before publish Signed-off-by: sykwer --- .../include/agnocast/agnocast_publisher.hpp | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp index faf146c08..04a9c3859 100644 --- a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp @@ -195,8 +195,24 @@ class BasicPublisher // NOTE: Assumes MessageT has a public `data` member (uint8_t*) pointing to the GPU allocation. // All CUDA message types must provide this by shadowing the base ROS message's data field. if constexpr (is_cuda_message_v) { - auto & backend = agnocast::cuda::get_backend(); + if (!raw_ptr->data) { + std::fprintf( + stderr, + "[agnocast] FATAL: CUDA message on topic '%s' has null data pointer. " + "Did you forget to cudaMalloc(&msg->data, size) before publish()?\n", + topic_name_.c_str()); + std::abort(); + } const size_t gpu_size = get_cuda_gpu_data_size(*raw_ptr); + if (gpu_size == 0) { + std::fprintf( + stderr, + "[agnocast] FATAL: CUDA message on topic '%s' has gpu_data_size == 0. " + "Ensure message fields (height, width, point_step, etc.) 
are set before publish().\n", + topic_name_.c_str()); + std::abort(); + } + auto & backend = agnocast::cuda::get_backend(); auto * meta = new GpuMetadata(); meta->publisher_gpu_ptr = raw_ptr->data; meta->gpu_data_size = gpu_size; From 1c1ea2a5f6853e442baab719cd1dafccb5a9d9ff Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:45:17 +0900 Subject: [PATCH 08/18] align message_creator entry_id type to int64_t Signed-off-by: sykwer --- src/agnocastlib/include/agnocast/agnocast_callback_info.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp index 154c454c6..5854eb692 100644 --- a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp @@ -47,7 +47,7 @@ struct CallbackInfo rclcpp::CallbackGroup::SharedPtr callback_group; TypeErasedCallback callback; std::function( - const void *, const std::string &, const topic_local_id_t, const uint64_t)> + const void *, const std::string &, const topic_local_id_t, const int64_t)> message_creator; bool need_epoll_update = true; }; From 1f5400b9f2cf54c94aca54d29278bed8a60bcc61 Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:53:32 +0900 Subject: [PATCH 09/18] rename get_local_gpu_ptr() to gpu_data() Signed-off-by: sykwer --- .../src/cuda_subscriber.cpp | 2 +- .../include/agnocast/agnocast_callback_info.hpp | 10 +++++----- .../include/agnocast/agnocast_smart_pointer.hpp | 16 ++++++++-------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp index 4c7b243d4..42b5294b8 100644 --- a/src/agnocast_sample_application/src/cuda_subscriber.cpp +++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp @@ -19,7 +19,7 @@ class CudaSubscriber : public agnocast::Node const size_t gpu_size = msg->height * width * point_step; // Get subscriber-local GPU pointer (mapped via CUDA IPC) - auto * gpu_ptr = static_cast(msg.get_local_gpu_ptr()); + auto * gpu_ptr = static_cast(msg.gpu_data()); // Read first few bytes from GPU to verify data uint8_t host_buf[16]{}; diff --git a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp index 5854eb692..08e101447 100644 --- a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp @@ -66,8 +66,8 @@ uint32_t allocate_callback_info_id(); // Creates an ipc_shared_ptr for a subscriber-received message. // For CUDA messages: imports the GPU handle, stores the subscriber-local GPU pointer in -// control_block->local_gpu_ptr, and registers a gpu_release_fn to release the mapping -// on last reference. The pointer is accessed via ipc_shared_ptr::get_local_gpu_ptr(). +// control_block->gpu_data_ptr, and registers a gpu_release_fn to release the mapping +// on last reference. The pointer is accessed via ipc_shared_ptr::gpu_data(). // For non-CUDA messages: simply wraps the pointer. template agnocast::ipc_shared_ptr create_subscriber_ipc_ptr( @@ -84,14 +84,14 @@ agnocast::ipc_shared_ptr create_subscriber_ipc_ptr( topic_name.c_str()); std::abort(); } - void * local_gpu_ptr = + void * gpu_data_ptr = agnocast::cuda::get_backend().import_handle(meta->handle, meta->gpu_data_size); // NOTE: If import_handle() fails, the backend aborts (fail-fast). 
If a future backend // returns nullptr instead, the subscriber would get a null gpu pointer. Callers should - // check get_local_gpu_ptr() != nullptr before use. + // check gpu_data() != nullptr before use. auto ipc_ptr = agnocast::ipc_shared_ptr(msg, topic_name, subscriber_id, entry_id); - ipc_ptr.set_local_gpu_ptr(local_gpu_ptr); + ipc_ptr.set_gpu_data_ptr(gpu_data_ptr); ipc_ptr.set_gpu_release_fn( [](void * ptr) { agnocast::cuda::get_backend().release_handle(ptr); }); return ipc_ptr; diff --git a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp index a98381ca4..fbc571898 100644 --- a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp @@ -69,7 +69,7 @@ struct control_block std::atomic valid{true}; // 1-byte alignment // Optional GPU cleanup function pointer. Null for non-CUDA messages. - // Called as gpu_release_fn(local_gpu_ptr) before bitmap release in reset() to ensure GPU + // Called as gpu_release_fn(gpu_data_ptr) before bitmap release in reset() to ensure GPU // mappings are released before the publisher can free the underlying GPU buffer. // Uses a plain function pointer instead of std::function to avoid heap allocation and // minimize overhead for non-CUDA messages (16 bytes for two pointers vs ~40+ bytes). @@ -78,7 +78,7 @@ struct control_block // Subscriber-local GPU device pointer obtained via import_handle(). // Stored here because the shared memory message is mapped read-only by the subscriber, // so we cannot inject the local pointer into msg->data. - void * local_gpu_ptr = nullptr; + void * gpu_data_ptr = nullptr; control_block(std::string topic, topic_local_id_t pubsub, int64_t entry) : topic_name(std::move(topic)), entry_id(entry), pubsub_id(pubsub) @@ -112,7 +112,7 @@ class ipc_shared_ptr template friend class BasicPublisher; - // Allow create_subscriber_ipc_ptr to call set_gpu_release_fn() and set_local_gpu_ptr() + // Allow create_subscriber_ipc_ptr to call set_gpu_release_fn() and set_gpu_data_ptr() template friend ipc_shared_ptr create_subscriber_ipc_ptr( MessageT *, const std::string &, const topic_local_id_t, const int64_t); @@ -145,7 +145,7 @@ class ipc_shared_ptr } } - // Sets a GPU release function to be invoked as fn(local_gpu_ptr) when the last reference + // Sets a GPU release function to be invoked as fn(gpu_data_ptr) when the last reference // is released. Private: only create_subscriber_ipc_ptr() should call this. void set_gpu_release_fn(void (*fn)(void *)) { @@ -156,10 +156,10 @@ class ipc_shared_ptr // Sets the subscriber-local GPU pointer (obtained via import_handle). // Private: only create_subscriber_ipc_ptr() should call this. - void set_local_gpu_ptr(void * ptr) + void set_gpu_data_ptr(void * ptr) { if (control_) { - control_->local_gpu_ptr = ptr; + control_->gpu_data_ptr = ptr; } } @@ -317,7 +317,7 @@ class ipc_shared_ptr T * get() const noexcept { return is_invalidated_() ? nullptr : ptr_; } // Returns the subscriber-local GPU device pointer, or nullptr for non-CUDA messages. - void * get_local_gpu_ptr() const noexcept { return control_ ? control_->local_gpu_ptr : nullptr; } + void * gpu_data() const noexcept { return control_ ? control_->gpu_data_ptr : nullptr; } // Thread-safe: atomically decrements ref count and performs cleanup if last reference. 
void reset() @@ -332,7 +332,7 @@ class ipc_shared_ptr // GPU cleanup must run BEFORE bitmap release: unmapping the GPU buffer before // the publisher is allowed to cudaFree the underlying allocation. if (control_->gpu_release_fn) { - control_->gpu_release_fn(control_->local_gpu_ptr); + control_->gpu_release_fn(control_->gpu_data_ptr); } if (control_->entry_id != ENTRY_ID_NOT_ASSIGNED) { From 8933d200ad829a68ff32ec902e1c607854115c1c Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 05:01:25 +0900 Subject: [PATCH 10/18] add error handling Signed-off-by: sykwer --- .../src/cuda_publisher.cpp | 14 ++++++++++++-- .../src/cuda_subscriber.cpp | 10 +++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp index 7abe85d9b..a6efbec7e 100644 --- a/src/agnocast_sample_application/src/cuda_publisher.cpp +++ b/src/agnocast_sample_application/src/cuda_publisher.cpp @@ -35,13 +35,23 @@ class CudaPublisher : public agnocast::Node // Allocate and fill GPU data const size_t gpu_size = msg->height * msg->width * msg->point_step; - cudaMalloc(&msg->data, gpu_size); + if (cudaMalloc(&msg->data, gpu_size) != cudaSuccess) { + RCLCPP_ERROR(get_logger(), "cudaMalloc failed: %s", cudaGetErrorString(cudaGetLastError())); + return; + } const int threads = 256; const int blocks = (gpu_size + threads - 1) / threads; // cppcheck-suppress shiftTooManyBits // false positive: <<< >>> is CUDA kernel launch syntax fill_kernel<<>>(msg->data, gpu_size, static_cast(count_)); - cudaStreamSynchronize(nullptr); + + if (cudaStreamSynchronize(nullptr) != cudaSuccess) { + RCLCPP_ERROR( + get_logger(), "kernel launch failed: %s", cudaGetErrorString(cudaGetLastError())); + cudaFree(msg->data); + msg->data = nullptr; + return; + } pub_->publish(std::move(msg)); RCLCPP_INFO( diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp index 42b5294b8..262c593d5 100644 --- a/src/agnocast_sample_application/src/cuda_subscriber.cpp +++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp @@ -20,11 +20,19 @@ class CudaSubscriber : public agnocast::Node // Get subscriber-local GPU pointer (mapped via CUDA IPC) auto * gpu_ptr = static_cast(msg.gpu_data()); + if (!gpu_ptr) { + RCLCPP_ERROR(get_logger(), "gpu_data() returned nullptr"); + return; + } // Read first few bytes from GPU to verify data uint8_t host_buf[16]{}; const size_t copy_size = std::min(gpu_size, sizeof(host_buf)); - cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost); + if (cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost) != cudaSuccess) { + RCLCPP_ERROR( + get_logger(), "cudaMemcpy failed: %s", cudaGetErrorString(cudaGetLastError())); + return; + } RCLCPP_INFO( get_logger(), From ec3226b6794324829521c7b8530c3f2995690c14 Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 05:05:44 +0900 Subject: [PATCH 11/18] add scripts Signed-off-by: sykwer --- scripts/sample_application/run_cuda_listener.bash | 4 ++++ scripts/sample_application/run_cuda_talker.bash | 4 ++++ .../launch/cuda_listener.launch.xml | 5 +++++ .../launch/cuda_talker.launch.xml | 5 +++++ 4 files changed, 18 insertions(+) create mode 100755 scripts/sample_application/run_cuda_listener.bash create mode 100755 scripts/sample_application/run_cuda_talker.bash create mode 100644 src/agnocast_sample_application/launch/cuda_listener.launch.xml create mode 100644 
src/agnocast_sample_application/launch/cuda_talker.launch.xml diff --git a/scripts/sample_application/run_cuda_listener.bash b/scripts/sample_application/run_cuda_listener.bash new file mode 100755 index 000000000..296600719 --- /dev/null +++ b/scripts/sample_application/run_cuda_listener.bash @@ -0,0 +1,4 @@ +#!/bin/bash + +source install/setup.bash +ros2 launch agnocast_sample_application cuda_listener.launch.xml diff --git a/scripts/sample_application/run_cuda_talker.bash b/scripts/sample_application/run_cuda_talker.bash new file mode 100755 index 000000000..3c8318bad --- /dev/null +++ b/scripts/sample_application/run_cuda_talker.bash @@ -0,0 +1,4 @@ +#!/bin/bash + +source install/setup.bash +ros2 launch agnocast_sample_application cuda_talker.launch.xml diff --git a/src/agnocast_sample_application/launch/cuda_listener.launch.xml b/src/agnocast_sample_application/launch/cuda_listener.launch.xml new file mode 100644 index 000000000..4136b9649 --- /dev/null +++ b/src/agnocast_sample_application/launch/cuda_listener.launch.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/src/agnocast_sample_application/launch/cuda_talker.launch.xml b/src/agnocast_sample_application/launch/cuda_talker.launch.xml new file mode 100644 index 000000000..ffad40cb2 --- /dev/null +++ b/src/agnocast_sample_application/launch/cuda_talker.launch.xml @@ -0,0 +1,5 @@ + + + + + From 34f93f7c3815548ee161cc009ce7c82f072a7f3c Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 05:08:09 +0900 Subject: [PATCH 12/18] fix Signed-off-by: sykwer --- src/agnocast_sample_application/src/cuda_subscriber.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp index 262c593d5..939b36c30 100644 --- a/src/agnocast_sample_application/src/cuda_subscriber.cpp +++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp @@ -29,8 +29,7 @@ class CudaSubscriber : public agnocast::Node uint8_t host_buf[16]{}; const size_t copy_size = std::min(gpu_size, sizeof(host_buf)); if (cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost) != cudaSuccess) { - RCLCPP_ERROR( - get_logger(), "cudaMemcpy failed: %s", cudaGetErrorString(cudaGetLastError())); + RCLCPP_ERROR(get_logger(), "cudaMemcpy failed: %s", cudaGetErrorString(cudaGetLastError())); return; } From ef83ec7b9372ac3d063c10ab7cd06e02697f3aac Mon Sep 17 00:00:00 2001 From: sykwer Date: Thu, 9 Apr 2026 16:37:25 +0900 Subject: [PATCH 13/18] update Signed-off-by: sykwer --- src/agnocast/package.xml | 1 + src/agnocast_cuda/CMakeLists.txt | 77 ++++---- src/agnocast_cuda/package.xml | 7 +- src/agnocast_cuda/src/cuda_ipc_backend.cpp | 22 ++- src/agnocast_cuda/src/cudart_loader.hpp | 184 ++++++++++++++++++ src/agnocast_cuda/src/get_backend.cpp | 13 +- .../CMakeLists.txt | 11 +- src/agnocast_sample_application/package.xml | 7 +- 8 files changed, 264 insertions(+), 58 deletions(-) create mode 100644 src/agnocast_cuda/src/cudart_loader.hpp diff --git a/src/agnocast/package.xml b/src/agnocast/package.xml index ae223d158..2332f1dea 100644 --- a/src/agnocast/package.xml +++ b/src/agnocast/package.xml @@ -18,6 +18,7 @@ agnocast_cie_config_msgs agnocast_cie_thread_configurator agnocast_components + agnocast_cuda agnocast_ioctl_wrapper agnocast_sample_application agnocast_sample_interfaces diff --git a/src/agnocast_cuda/CMakeLists.txt b/src/agnocast_cuda/CMakeLists.txt index d9fa6dd6a..dba9e71f1 100644 --- a/src/agnocast_cuda/CMakeLists.txt +++ 
b/src/agnocast_cuda/CMakeLists.txt @@ -13,46 +13,51 @@ find_package(ament_cmake REQUIRED) find_package(agnocastlib REQUIRED) find_package(sensor_msgs REQUIRED) -# CUDA is optional: if not available, headers are still installed but no library is built. -# Downstream packages that use CUDA message types must check if the agnocast_cuda library -# target exists (e.g., via find_package + if(TARGET agnocast_cuda::agnocast_cuda)). -include(CheckLanguage) -check_language(CUDA) +# ---- Runtime CUDA loading (no build-time CUDA dependency) ---- +# +# This package has ZERO build-time dependency on the CUDA toolkit. It does NOT use +# find_package(CUDAToolkit), enable_language(CUDA), or link against CUDA::cudart. +# +# Instead, libcudart.so is loaded at runtime via dlopen() (see src/cudart_loader.hpp). +# This allows the package to be built as a pre-built .deb on the ROS build farm, which +# has no CUDA toolchain. Users install CUDA separately on their target machines. +# +# The only system library we link is libdl (via ${CMAKE_DL_LIBS}) for dlopen/dlsym. +# +# IMPORTANT FOR DOWNSTREAM PACKAGES: +# Since this library always builds (even without CUDA), find_package(agnocast_cuda) and +# TARGET agnocast_cuda::agnocast_cuda are NOT reliable indicators that CUDA is available. +# Downstream packages that compile .cu files or call CUDA APIs directly (e.g., cudaMalloc) +# must ALSO check find_package(CUDAToolkit) independently. See the sample application's +# CMakeLists.txt for the correct pattern: +# find_package(CUDAToolkit QUIET) +# find_package(agnocast_cuda QUIET) +# if(CUDAToolkit_FOUND AND agnocast_cuda_FOUND) +# ... +# endif() +add_library(agnocast_cuda SHARED + src/cuda_ipc_backend.cpp + src/vmm_backend.cpp + src/nvscibuf_backend.cpp + src/unified_memory_backend.cpp + src/get_backend.cpp) -if(CMAKE_CUDA_COMPILER) - enable_language(CUDA) - find_package(CUDAToolkit QUIET) -endif() - -if(CUDAToolkit_FOUND) - add_library(agnocast_cuda SHARED - src/cuda_ipc_backend.cpp - src/vmm_backend.cpp - src/nvscibuf_backend.cpp - src/unified_memory_backend.cpp - src/get_backend.cpp) - - target_include_directories(agnocast_cuda PUBLIC - $ - $) +target_include_directories(agnocast_cuda PUBLIC + $ + $) - ament_target_dependencies(agnocast_cuda agnocastlib sensor_msgs) - target_link_libraries(agnocast_cuda CUDA::cudart) +ament_target_dependencies(agnocast_cuda agnocastlib sensor_msgs) +target_link_libraries(agnocast_cuda ${CMAKE_DL_LIBS}) - install(TARGETS agnocast_cuda - EXPORT export_${PROJECT_NAME} - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib - RUNTIME DESTINATION bin - INCLUDES DESTINATION include) +install(TARGETS agnocast_cuda + EXPORT export_${PROJECT_NAME} + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin + INCLUDES DESTINATION include) - ament_export_targets(export_${PROJECT_NAME} HAS_LIBRARY_TARGET) - ament_export_dependencies(agnocastlib sensor_msgs) - - message(STATUS "agnocast_cuda: CUDA found — building library and installing headers.") -else() - message(STATUS "agnocast_cuda: CUDA not found — installing headers only (no library).") -endif() +ament_export_targets(export_${PROJECT_NAME} HAS_LIBRARY_TARGET) +ament_export_dependencies(agnocastlib sensor_msgs) # Always install headers so downstream packages can use the type definitions # (e.g., cuda_message_tag.hpp is in agnocastlib, but types.hpp is here). 
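[Editor's note] Before the backend source diffs below, here is a minimal sketch of the lifecycle that CudaIpcBackend implements, added only to orient the reading; it is an illustration, not part of the patch. It reuses the names that appear in this series (agnocast::cuda::get_backend(), GpuMetadata, export_handle(), import_handle(), release_handle()), but the shared-memory transport of GpuMetadata and the ipc_shared_ptr reference counting that the real publisher/subscriber paths perform are elided, and the namespace qualifications shown are assumptions based on the header locations in these diffs.

// Hedged sketch: cross-process GPU buffer sharing as used by the publisher and
// subscriber code paths in this series. Not part of the patch.
#include "agnocast/gpu_metadata.hpp"
#include "agnocast/gpu_transfer_backend.hpp"

#include <cstddef>

// Publisher process: publish() fills GpuMetadata placed next to the message in shared memory.
void publisher_side(void * device_ptr, std::size_t gpu_size, agnocast::GpuMetadata & meta)
{
  auto & backend = agnocast::cuda::get_backend();
  meta.publisher_gpu_ptr = device_ptr;
  meta.gpu_data_size = gpu_size;
  // CudaIpcBackend: cudaIpcGetMemHandle on the device pointer.
  meta.handle = backend.export_handle(device_ptr, gpu_size);
}

// Subscriber process: create_subscriber_ipc_ptr() imports the handle and stores the
// subscriber-local pointer in the control block; release runs on the last reference.
void subscriber_side(const agnocast::GpuMetadata & meta)
{
  auto & backend = agnocast::cuda::get_backend();
  // CudaIpcBackend: cudaIpcOpenMemHandle with cudaIpcMemLazyEnablePeerAccess.
  void * local_ptr = backend.import_handle(meta.handle, meta.gpu_data_size);
  // ... launch kernels or cudaMemcpy using local_ptr ...
  // CudaIpcBackend: cudaIpcCloseMemHandle.
  backend.release_handle(local_ptr);
}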
diff --git a/src/agnocast_cuda/package.xml b/src/agnocast_cuda/package.xml index 1b0245d25..b01f3bc15 100644 --- a/src/agnocast_cuda/package.xml +++ b/src/agnocast_cuda/package.xml @@ -2,13 +2,18 @@ agnocast_cuda - 0.1.0 + 2.3.3 CUDA IPC support for Agnocast. Provides GPU message types and cross-process GPU buffer sharing. Takahiro Ishikawa-Aso + Koichi Imai + Masaki Nishikawa Apache License 2.0 + + Takahiro Ishikawa-Aso + ament_cmake agnocastlib diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.cpp b/src/agnocast_cuda/src/cuda_ipc_backend.cpp index 3a2bbb8b4..6f924db9e 100644 --- a/src/agnocast_cuda/src/cuda_ipc_backend.cpp +++ b/src/agnocast_cuda/src/cuda_ipc_backend.cpp @@ -1,6 +1,5 @@ #include "cuda_ipc_backend.hpp" - -#include +#include "cudart_loader.hpp" #include #include @@ -9,11 +8,12 @@ namespace { -void check_cuda_error(cudaError_t err, const char * operation) +void check_cuda_error(agnocast::cuda::cudaError_t err, const char * operation) { - if (err != cudaSuccess) { + if (err != agnocast::cuda::cudaSuccess) { std::fprintf( - stderr, "[agnocast_cuda] FATAL: %s failed: %s\n", operation, cudaGetErrorString(err)); + stderr, "[agnocast_cuda] FATAL: %s failed: %s\n", operation, + agnocast::cuda::CudartLoader::instance().cudaGetErrorString(err)); std::abort(); } } @@ -28,14 +28,15 @@ GpuHandle CudaIpcBackend::export_handle(void * device_ptr, size_t /*size*/) GpuHandle h{}; static_assert(sizeof(cudaIpcMemHandle_t) <= sizeof(h.opaque)); cudaIpcMemHandle_t ipc_handle; - check_cuda_error(cudaIpcGetMemHandle(&ipc_handle, device_ptr), "cudaIpcGetMemHandle"); + check_cuda_error( + CudartLoader::instance().cudaIpcGetMemHandle(&ipc_handle, device_ptr), "cudaIpcGetMemHandle"); std::memcpy(h.opaque, &ipc_handle, sizeof(ipc_handle)); return h; } void CudaIpcBackend::free_device_memory(void * device_ptr) { - check_cuda_error(cudaFree(device_ptr), "cudaFree"); + check_cuda_error(CudartLoader::instance().cudaFree(device_ptr), "cudaFree"); } void * CudaIpcBackend::import_handle(const GpuHandle & handle, size_t /*size*/) @@ -44,13 +45,16 @@ void * CudaIpcBackend::import_handle(const GpuHandle & handle, size_t /*size*/) std::memcpy(&ipc_handle, handle.opaque, sizeof(ipc_handle)); void * ptr = nullptr; check_cuda_error( - cudaIpcOpenMemHandle(&ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), "cudaIpcOpenMemHandle"); + CudartLoader::instance().cudaIpcOpenMemHandle( + &ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), + "cudaIpcOpenMemHandle"); return ptr; } void CudaIpcBackend::release_handle(void * local_ptr) { - check_cuda_error(cudaIpcCloseMemHandle(local_ptr), "cudaIpcCloseMemHandle"); + check_cuda_error( + CudartLoader::instance().cudaIpcCloseMemHandle(local_ptr), "cudaIpcCloseMemHandle"); } } // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/cudart_loader.hpp b/src/agnocast_cuda/src/cudart_loader.hpp new file mode 100644 index 000000000..af31b8967 --- /dev/null +++ b/src/agnocast_cuda/src/cudart_loader.hpp @@ -0,0 +1,184 @@ +// ============================================================================ +// Runtime loader for the CUDA Runtime API (libcudart.so) via dlopen/dlsym. +// +// WHY THIS EXISTS +// --------------- +// agnocast_cuda is distributed as a pre-built .deb from the ROS build farm, which has no +// CUDA toolchain. To eliminate the build-time dependency on and -lcudart, +// we replicate the small set of CUDA types/constants we need (stable ABI) and load the +// actual CUDA runtime library at runtime via dlopen. 
This means: +// - Build time: pure C++17, no CUDA headers or compiler required. +// - Run time: libcudart.so must be present on the target machine (user installs CUDA). +// +// SAFE ON NON-CUDA SYSTEMS +// ------------------------ +// Users who do not use CUDA message types are completely unaffected by this library. +// CudartLoader is lazy: dlopen("libcudart.so") only happens on the first call to +// CudartLoader::instance(), which is only reachable through get_backend(), which is +// only called from if-constexpr branches guarded by is_cuda_message_v. If no CUDA +// message type is ever instantiated, this loader is never constructed and no CUDA symbols +// are ever resolved. The library can be safely installed and loaded on machines without +// a GPU or CUDA runtime. +// +// ABI STABILITY +// ------------- +// The types and constants below are part of NVIDIA's stable C ABI for the CUDA Runtime API. +// They have not changed across CUDA 10.x, 11.x, and 12.x. Specifically: +// - cudaError_t is a C enum (int-sized). cudaSuccess has been 0 since CUDA 1.0. +// - cudaIpcMemHandle_t is a 64-byte opaque struct (CUDA_IPC_HANDLE_SIZE = 64). +// - cudaIpcMemLazyEnablePeerAccess is a flag constant (0x01). +// - cudaDevAttrIntegrated is enum value 73 in cudaDeviceAttr (append-only enum). +// If NVIDIA ever breaks this ABI (extremely unlikely), the static_assert in +// cuda_ipc_backend.cpp will catch size mismatches at compile time. +// +// ADDING NEW CUDA API CALLS +// ------------------------- +// When a new backend or feature needs an additional CUDA runtime function: +// 1. Add the function pointer typedef below (e.g., using cudaMalloc_t = ...). +// 2. Add a public member to CudartLoader (e.g., cudaMalloc_t cudaMalloc;). +// 3. Add a load_symbol() call in the constructor. +// Keep this file as the single place that touches dlopen/dlsym for CUDA. +// +// LIBRARY NAME FALLBACK CHAIN +// --------------------------- +// The constructor tries these names in order: +// 1. "libcudart.so" — unversioned symlink, present when the CUDA dev package is +// installed (e.g., cuda-cudart-dev-12-*) or with the runfile +// installer. +// 2. "libcudart.so.12" — SONAME for CUDA 12.x. Present with the runtime-only deb +// package (cuda-cudart-12-*) even without the dev package. +// 3. "libcudart.so.11.0"— SONAME for CUDA 11.x (NVIDIA used major.minor for the SONAME +// in the 11.x series; all 11.0–11.8 share this SONAME). +// If a future CUDA major version (e.g., 13.x) is released, add its SONAME to the list. +// ============================================================================ +#pragma once + +#include + +#include +#include + +namespace agnocast::cuda +{ + +// --------------------------------------------------------------------------- +// ABI-compatible CUDA Runtime API type definitions. +// These replace #include so that no CUDA headers are needed at build time. +// See "ABI STABILITY" section in the file header for rationale. +// --------------------------------------------------------------------------- +using cudaError_t = int; +constexpr cudaError_t cudaSuccess = 0; + +struct cudaIpcMemHandle_t +{ + char reserved[64]; // CUDA_IPC_HANDLE_SIZE = 64 +}; + +constexpr unsigned int cudaIpcMemLazyEnablePeerAccess = 0x01; + +using cudaDeviceAttr = int; +constexpr cudaDeviceAttr cudaDevAttrIntegrated = 18; // driver_types.h: cudaDevAttrIntegrated = 18 + +// --------------------------------------------------------------------------- +// Function pointer types matching CUDA Runtime API signatures. 
+// --------------------------------------------------------------------------- +using cudaGetDevice_t = cudaError_t (*)(int *); +using cudaDeviceGetAttribute_t = cudaError_t (*)(int *, cudaDeviceAttr, int); +using cudaIpcGetMemHandle_t = cudaError_t (*)(cudaIpcMemHandle_t *, void *); +using cudaIpcOpenMemHandle_t = cudaError_t (*)(void **, cudaIpcMemHandle_t, unsigned int); +using cudaIpcCloseMemHandle_t = cudaError_t (*)(void *); +using cudaFree_t = cudaError_t (*)(void *); +using cudaGetErrorString_t = const char * (*)(cudaError_t); + +// --------------------------------------------------------------------------- +// CudartLoader — lazy-loading singleton for the CUDA runtime. +// +// Thread safety: the function-local static in instance() is guaranteed to be +// initialized exactly once (C++11 "magic statics"). After construction, all +// function pointers are immutable and safe to read from any thread. +// +// Lifetime: the singleton is never destroyed during normal execution. We +// intentionally do NOT call dlclose() — see destructor comment. +// --------------------------------------------------------------------------- +class CudartLoader +{ +public: + static CudartLoader & instance() + { + static CudartLoader loader; + return loader; + } + + // Public function pointers — valid immediately after instance() returns. + // Callers use these like: CudartLoader::instance().cudaFree(ptr) + cudaGetDevice_t cudaGetDevice; + cudaDeviceGetAttribute_t cudaDeviceGetAttribute; + cudaIpcGetMemHandle_t cudaIpcGetMemHandle; + cudaIpcOpenMemHandle_t cudaIpcOpenMemHandle; + cudaIpcCloseMemHandle_t cudaIpcCloseMemHandle; + cudaFree_t cudaFree; + cudaGetErrorString_t cudaGetErrorString; + +private: + void * handle_ = nullptr; + + CudartLoader() + { + // Try library names in order. See "LIBRARY NAME FALLBACK CHAIN" in the file header. + const char * names[] = {"libcudart.so", "libcudart.so.12", "libcudart.so.11.0"}; + for (const char * name : names) { + handle_ = dlopen(name, RTLD_NOW | RTLD_LOCAL); + if (handle_ != nullptr) { + break; + } + } + if (handle_ == nullptr) { + std::fprintf( + stderr, + "[agnocast_cuda] FATAL: Could not load libcudart.so.\n" + " Your code uses a CUDA message type (e.g., agnocast::cuda::PointCloud2 or\n" + " agnocast::cuda::Image), which requires the NVIDIA CUDA runtime to be installed\n" + " on this machine. If you do not need GPU-accelerated message passing, use the\n" + " standard (non-CUDA) message types instead.\n" + " To fix: install the CUDA runtime (e.g., 'sudo apt install cuda-cudart-12-*')\n" + " and ensure libcudart.so is on LD_LIBRARY_PATH.\n" + " dlopen error: %s\n", + dlerror()); + std::abort(); + } + + load_symbol(cudaGetDevice, "cudaGetDevice"); + load_symbol(cudaDeviceGetAttribute, "cudaDeviceGetAttribute"); + load_symbol(cudaIpcGetMemHandle, "cudaIpcGetMemHandle"); + load_symbol(cudaIpcOpenMemHandle, "cudaIpcOpenMemHandle"); + load_symbol(cudaIpcCloseMemHandle, "cudaIpcCloseMemHandle"); + load_symbol(cudaFree, "cudaFree"); + load_symbol(cudaGetErrorString, "cudaGetErrorString"); + } + + // Intentionally never close the handle. The CUDA runtime maintains process-global state + // (device context, allocations, etc.). Calling dlclose() during static destruction can + // race with other CUDA cleanup and cause segfaults. The OS reclaims everything at exit. 
+ ~CudartLoader() = default; + + CudartLoader(const CudartLoader &) = delete; + CudartLoader & operator=(const CudartLoader &) = delete; + + template + void load_symbol(T & func_ptr, const char * name) + { + // POSIX requires clearing dlerror() before calling dlsym(), then checking dlerror() + // after, because dlsym() can legitimately return NULL for some symbols. + dlerror(); + func_ptr = reinterpret_cast(dlsym(handle_, name)); + const char * err = dlerror(); + if (err != nullptr) { + std::fprintf( + stderr, "[agnocast_cuda] FATAL: Could not load symbol '%s' from libcudart.so: %s\n", name, + err); + std::abort(); + } + } +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/get_backend.cpp b/src/agnocast_cuda/src/get_backend.cpp index 87068ff54..12b428099 100644 --- a/src/agnocast_cuda/src/get_backend.cpp +++ b/src/agnocast_cuda/src/get_backend.cpp @@ -1,11 +1,10 @@ #include "agnocast/gpu_transfer_backend.hpp" #include "cuda_ipc_backend.hpp" +#include "cudart_loader.hpp" #include "nvscibuf_backend.hpp" #include "unified_memory_backend.hpp" #include "vmm_backend.hpp" -#include - #include #include #include @@ -18,20 +17,22 @@ namespace std::unique_ptr select_backend() { + const auto & api = CudartLoader::instance(); + int device = 0; - cudaError_t err = cudaGetDevice(&device); + cudaError_t err = api.cudaGetDevice(&device); if (err != cudaSuccess) { std::fprintf( - stderr, "[agnocast_cuda] FATAL: cudaGetDevice failed: %s\n", cudaGetErrorString(err)); + stderr, "[agnocast_cuda] FATAL: cudaGetDevice failed: %s\n", api.cudaGetErrorString(err)); std::abort(); } int is_integrated = 0; - err = cudaDeviceGetAttribute(&is_integrated, cudaDevAttrIntegrated, device); + err = api.cudaDeviceGetAttribute(&is_integrated, cudaDevAttrIntegrated, device); if (err != cudaSuccess) { std::fprintf( stderr, "[agnocast_cuda] FATAL: cudaDeviceGetAttribute failed: %s\n", - cudaGetErrorString(err)); + api.cudaGetErrorString(err)); std::abort(); } diff --git a/src/agnocast_sample_application/CMakeLists.txt b/src/agnocast_sample_application/CMakeLists.txt index 52dd3153c..c3631df79 100644 --- a/src/agnocast_sample_application/CMakeLists.txt +++ b/src/agnocast_sample_application/CMakeLists.txt @@ -165,11 +165,16 @@ install(TARGETS no_rclcpp_client install(TARGETS sim_time_timer DESTINATION lib/${PROJECT_NAME}) -# CUDA sample applications (only built when CUDA and agnocast_cuda are available) -find_package(CUDAToolkit QUIET) +# CUDA sample applications (only built when the CUDA compiler, CUDA toolkit, and +# agnocast_cuda are all available). Note: agnocast_cuda itself has no build-time CUDA +# dependency (it dlopen's libcudart.so at runtime), but these sample apps compile .cu +# files with CUDA kernels, so they need the full CUDA toolchain. 
+include(CheckLanguage) +check_language(CUDA) find_package(agnocast_cuda QUIET) -if(CUDAToolkit_FOUND AND agnocast_cuda_FOUND) +if(CMAKE_CUDA_COMPILER AND agnocast_cuda_FOUND) enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) add_executable(cuda_talker src/cuda_publisher.cpp) set_source_files_properties(src/cuda_publisher.cpp PROPERTIES LANGUAGE CUDA) diff --git a/src/agnocast_sample_application/package.xml b/src/agnocast_sample_application/package.xml index 86596a893..5818580c4 100644 --- a/src/agnocast_sample_application/package.xml +++ b/src/agnocast_sample_application/package.xml @@ -31,9 +31,10 @@ ament_lint_auto ament_lint_common - + + agnocast_cuda ament_cmake From 31a6fb8fe82a81e8a7f1e94dcf93a49b920e2eea Mon Sep 17 00:00:00 2001 From: sykwer Date: Thu, 9 Apr 2026 16:45:21 +0900 Subject: [PATCH 14/18] fix Signed-off-by: sykwer --- src/agnocast/package.xml | 2 ++ src/agnocast_cuda/src/cudart_loader.hpp | 3 ++- src/agnocast_sample_application/package.xml | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/agnocast/package.xml b/src/agnocast/package.xml index 2332f1dea..8e7a132eb 100644 --- a/src/agnocast/package.xml +++ b/src/agnocast/package.xml @@ -18,6 +18,8 @@ agnocast_cie_config_msgs agnocast_cie_thread_configurator agnocast_components + agnocast_cuda agnocast_ioctl_wrapper agnocast_sample_application diff --git a/src/agnocast_cuda/src/cudart_loader.hpp b/src/agnocast_cuda/src/cudart_loader.hpp index af31b8967..7ebc1ab7d 100644 --- a/src/agnocast_cuda/src/cudart_loader.hpp +++ b/src/agnocast_cuda/src/cudart_loader.hpp @@ -27,7 +27,7 @@ // - cudaError_t is a C enum (int-sized). cudaSuccess has been 0 since CUDA 1.0. // - cudaIpcMemHandle_t is a 64-byte opaque struct (CUDA_IPC_HANDLE_SIZE = 64). // - cudaIpcMemLazyEnablePeerAccess is a flag constant (0x01). -// - cudaDevAttrIntegrated is enum value 73 in cudaDeviceAttr (append-only enum). +// - cudaDevAttrIntegrated is enum value 18 in cudaDeviceAttr (append-only enum). // If NVIDIA ever breaks this ABI (extremely unlikely), the static_assert in // cuda_ipc_backend.cpp will catch size mismatches at compile time. // @@ -125,6 +125,7 @@ class CudartLoader CudartLoader() { // Try library names in order. See "LIBRARY NAME FALLBACK CHAIN" in the file header. + dlerror(); // Clear any stale error before the dlopen loop. 
const char * names[] = {"libcudart.so", "libcudart.so.12", "libcudart.so.11.0"}; for (const char * name : names) { handle_ = dlopen(name, RTLD_NOW | RTLD_LOCAL); diff --git a/src/agnocast_sample_application/package.xml b/src/agnocast_sample_application/package.xml index 5818580c4..2a6cf3d7f 100644 --- a/src/agnocast_sample_application/package.xml +++ b/src/agnocast_sample_application/package.xml @@ -33,6 +33,8 @@ agnocast_cuda From a6531484ce6f90ca9043040aad2c15bd60568c83 Mon Sep 17 00:00:00 2001 From: sykwer Date: Thu, 9 Apr 2026 16:49:48 +0900 Subject: [PATCH 15/18] fix Signed-off-by: sykwer --- src/agnocast_cuda/src/cuda_ipc_backend.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.cpp b/src/agnocast_cuda/src/cuda_ipc_backend.cpp index 6f924db9e..36548dc32 100644 --- a/src/agnocast_cuda/src/cuda_ipc_backend.cpp +++ b/src/agnocast_cuda/src/cuda_ipc_backend.cpp @@ -1,4 +1,5 @@ #include "cuda_ipc_backend.hpp" + #include "cudart_loader.hpp" #include @@ -45,8 +46,7 @@ void * CudaIpcBackend::import_handle(const GpuHandle & handle, size_t /*size*/) std::memcpy(&ipc_handle, handle.opaque, sizeof(ipc_handle)); void * ptr = nullptr; check_cuda_error( - CudartLoader::instance().cudaIpcOpenMemHandle( - &ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), + CudartLoader::instance().cudaIpcOpenMemHandle(&ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), "cudaIpcOpenMemHandle"); return ptr; } From fc4f87ab9fc453ad05b6e30ff0fbd78d0d8f46d3 Mon Sep 17 00:00:00 2001 From: sykwer Date: Thu, 9 Apr 2026 17:13:54 +0900 Subject: [PATCH 16/18] rename Signed-off-by: sykwer --- .../include/agnocast/cuda/{types.hpp => message_types.hpp} | 0 src/agnocast_sample_application/src/cuda_publisher.cpp | 2 +- src/agnocast_sample_application/src/cuda_subscriber.cpp | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename src/agnocast_cuda/include/agnocast/cuda/{types.hpp => message_types.hpp} (100%) diff --git a/src/agnocast_cuda/include/agnocast/cuda/types.hpp b/src/agnocast_cuda/include/agnocast/cuda/message_types.hpp similarity index 100% rename from src/agnocast_cuda/include/agnocast/cuda/types.hpp rename to src/agnocast_cuda/include/agnocast/cuda/message_types.hpp diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp index a6efbec7e..edb661c64 100644 --- a/src/agnocast_sample_application/src/cuda_publisher.cpp +++ b/src/agnocast_sample_application/src/cuda_publisher.cpp @@ -1,5 +1,5 @@ #include "agnocast/agnocast.hpp" -#include "agnocast/cuda/types.hpp" +#include "agnocast/cuda/message_types.hpp" #include diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp index 939b36c30..2a5940412 100644 --- a/src/agnocast_sample_application/src/cuda_subscriber.cpp +++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp @@ -1,5 +1,5 @@ #include "agnocast/agnocast.hpp" -#include "agnocast/cuda/types.hpp" +#include "agnocast/cuda/message_types.hpp" #include From ce5f5f04a855154b31bec69a2701ce5e8672c6fd Mon Sep 17 00:00:00 2001 From: sykwer Date: Thu, 9 Apr 2026 17:40:23 +0900 Subject: [PATCH 17/18] fix Signed-off-by: sykwer --- src/agnocast_sample_application/CMakeLists.txt | 2 ++ src/agnocast_sample_application/package.xml | 8 ++++---- .../src/cuda_publisher.cpp | 18 +++++++++++------- .../src/cuda_subscriber.cpp | 9 ++++++--- .../agnocast/agnocast_callback_info.hpp | 10 ++++++---- 
.../include/agnocast/agnocast_publisher.hpp | 17 +++++++++-------- 6 files changed, 38 insertions(+), 26 deletions(-) diff --git a/src/agnocast_sample_application/CMakeLists.txt b/src/agnocast_sample_application/CMakeLists.txt index c3631df79..7195e2e5a 100644 --- a/src/agnocast_sample_application/CMakeLists.txt +++ b/src/agnocast_sample_application/CMakeLists.txt @@ -169,6 +169,8 @@ install(TARGETS sim_time_timer # agnocast_cuda are all available). Note: agnocast_cuda itself has no build-time CUDA # dependency (it dlopen's libcudart.so at runtime), but these sample apps compile .cu # files with CUDA kernels, so they need the full CUDA toolchain. +# These binaries are NOT included in the pre-built deb from the ROS build farm (which +# has no CUDA). Users who want to try the CUDA samples must build from source with CUDA. include(CheckLanguage) check_language(CUDA) find_package(agnocast_cuda QUIET) diff --git a/src/agnocast_sample_application/package.xml b/src/agnocast_sample_application/package.xml index 2a6cf3d7f..1315e81e3 100644 --- a/src/agnocast_sample_application/package.xml +++ b/src/agnocast_sample_application/package.xml @@ -32,11 +32,11 @@ ament_lint_common - agnocast_cuda + Packages that compile .cu files must also check for CUDAToolkit in CMakeLists.txt. --> + agnocast_cuda + sensor_msgs ament_cmake diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp index edb661c64..62a0312cc 100644 --- a/src/agnocast_sample_application/src/cuda_publisher.cpp +++ b/src/agnocast_sample_application/src/cuda_publisher.cpp @@ -34,9 +34,12 @@ class CudaPublisher : public agnocast::Node msg->is_dense = true; // Allocate and fill GPU data - const size_t gpu_size = msg->height * msg->width * msg->point_step; - if (cudaMalloc(&msg->data, gpu_size) != cudaSuccess) { - RCLCPP_ERROR(get_logger(), "cudaMalloc failed: %s", cudaGetErrorString(cudaGetLastError())); + const size_t gpu_size = + static_cast(msg->height) * static_cast(msg->width) * + static_cast(msg->point_step); + const cudaError_t malloc_result = cudaMalloc(&msg->data, gpu_size); + if (malloc_result != cudaSuccess) { + RCLCPP_ERROR(get_logger(), "cudaMalloc failed: %s", cudaGetErrorString(malloc_result)); return; } @@ -45,9 +48,9 @@ class CudaPublisher : public agnocast::Node // cppcheck-suppress shiftTooManyBits // false positive: <<< >>> is CUDA kernel launch syntax fill_kernel<<>>(msg->data, gpu_size, static_cast(count_)); - if (cudaStreamSynchronize(nullptr) != cudaSuccess) { - RCLCPP_ERROR( - get_logger(), "kernel launch failed: %s", cudaGetErrorString(cudaGetLastError())); + const cudaError_t sync_result = cudaStreamSynchronize(nullptr); + if (sync_result != cudaSuccess) { + RCLCPP_ERROR(get_logger(), "kernel launch failed: %s", cudaGetErrorString(sync_result)); cudaFree(msg->data); msg->data = nullptr; return; @@ -55,7 +58,8 @@ class CudaPublisher : public agnocast::Node pub_->publish(std::move(msg)); RCLCPP_INFO( - get_logger(), "published CUDA PointCloud2: seq=%ld, gpu_size=%zu", count_++, gpu_size); + get_logger(), "published CUDA PointCloud2: seq=%lld, gpu_size=%zu", + static_cast(count_++), gpu_size); } public: diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp index 2a5940412..f63eb0047 100644 --- a/src/agnocast_sample_application/src/cuda_subscriber.cpp +++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp @@ -16,7 +16,8 @@ class CudaSubscriber : public agnocast::Node // Read 
CPU metadata from shared memory const uint32_t width = msg->width; const uint32_t point_step = msg->point_step; - const size_t gpu_size = msg->height * width * point_step; + const size_t gpu_size = + static_cast(msg->height) * static_cast(width) * static_cast(point_step); // Get subscriber-local GPU pointer (mapped via CUDA IPC) auto * gpu_ptr = static_cast(msg.gpu_data()); @@ -28,8 +29,10 @@ class CudaSubscriber : public agnocast::Node // Read first few bytes from GPU to verify data uint8_t host_buf[16]{}; const size_t copy_size = std::min(gpu_size, sizeof(host_buf)); - if (cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost) != cudaSuccess) { - RCLCPP_ERROR(get_logger(), "cudaMemcpy failed: %s", cudaGetErrorString(cudaGetLastError())); + const cudaError_t memcpy_result = + cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost); + if (memcpy_result != cudaSuccess) { + RCLCPP_ERROR(get_logger(), "cudaMemcpy failed: %s", cudaGetErrorString(memcpy_result)); return; } diff --git a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp index 08e101447..ff7b183de 100644 --- a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp @@ -1,10 +1,12 @@ #pragma once #include "agnocast/agnocast_smart_pointer.hpp" +#include "agnocast/agnocast_utils.hpp" #include "agnocast/cuda_message_tag.hpp" #include "agnocast/gpu_metadata.hpp" #include "agnocast/gpu_transfer_backend.hpp" +#include #include #include @@ -77,10 +79,10 @@ agnocast::ipc_shared_ptr create_subscriber_ipc_ptr( if constexpr (is_cuda_message_v) { auto * meta = static_cast(msg->gpu_metadata_); if (!meta) { - std::fprintf( - stderr, - "[agnocast] FATAL: CUDA message on topic '%s' has null gpu_metadata_. " - "The publisher may have failed to set GpuMetadata during publish().\n", + RCLCPP_ERROR( + logger, + "CUDA message on topic '%s' has null gpu_metadata_. " + "The publisher may have failed to set GpuMetadata during publish().", topic_name.c_str()); std::abort(); } diff --git a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp index 035436e74..720f14da7 100644 --- a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -219,19 +220,19 @@ class BasicPublisher // All CUDA message types must provide this by shadowing the base ROS message's data field. if constexpr (is_cuda_message_v) { if (!raw_ptr->data) { - std::fprintf( - stderr, - "[agnocast] FATAL: CUDA message on topic '%s' has null data pointer. " - "Did you forget to cudaMalloc(&msg->data, size) before publish()?\n", + RCLCPP_ERROR( + logger, + "CUDA message on topic '%s' has null data pointer. " + "Did you forget to cudaMalloc(&msg->data, size) before publish()?", topic_name_.c_str()); std::abort(); } const size_t gpu_size = get_cuda_gpu_data_size(*raw_ptr); if (gpu_size == 0) { - std::fprintf( - stderr, - "[agnocast] FATAL: CUDA message on topic '%s' has gpu_data_size == 0. " - "Ensure message fields (height, width, point_step, etc.) are set before publish().\n", + RCLCPP_ERROR( + logger, + "CUDA message on topic '%s' has gpu_data_size == 0. " + "Ensure message fields (height, width, point_step, etc.) 
are set before publish().",
+          topic_name_.c_str());
+        std::abort();
+      }

From 2336546e03b84803374397128dc53d6aae1bec60 Mon Sep 17 00:00:00 2001
From: sykwer
Date: Thu, 9 Apr 2026 17:42:37 +0900
Subject: [PATCH 18/18] fix

Signed-off-by: sykwer
---
 src/agnocast_sample_application/src/cuda_publisher.cpp  | 5 ++---
 src/agnocast_sample_application/src/cuda_subscriber.cpp | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp
index 62a0312cc..43775e30c 100644
--- a/src/agnocast_sample_application/src/cuda_publisher.cpp
+++ b/src/agnocast_sample_application/src/cuda_publisher.cpp
@@ -34,9 +34,8 @@ class CudaPublisher : public agnocast::Node
     msg->is_dense = true;
 
     // Allocate and fill GPU data
-    const size_t gpu_size =
-      static_cast<size_t>(msg->height) * static_cast<size_t>(msg->width) *
-      static_cast<size_t>(msg->point_step);
+    const size_t gpu_size = static_cast<size_t>(msg->height) * static_cast<size_t>(msg->width) *
+                            static_cast<size_t>(msg->point_step);
     const cudaError_t malloc_result = cudaMalloc(&msg->data, gpu_size);
     if (malloc_result != cudaSuccess) {
       RCLCPP_ERROR(get_logger(), "cudaMalloc failed: %s", cudaGetErrorString(malloc_result));
diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp
index f63eb0047..9f1be14e9 100644
--- a/src/agnocast_sample_application/src/cuda_subscriber.cpp
+++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp
@@ -16,8 +16,8 @@ class CudaSubscriber : public agnocast::Node
     // Read CPU metadata from shared memory
     const uint32_t width = msg->width;
     const uint32_t point_step = msg->point_step;
-    const size_t gpu_size =
-      static_cast<size_t>(msg->height) * static_cast<size_t>(width) * static_cast<size_t>(point_step);
+    const size_t gpu_size = static_cast<size_t>(msg->height) * static_cast<size_t>(width) *
+                            static_cast<size_t>(point_step);
 
     // Get subscriber-local GPU pointer (mapped via CUDA IPC)
     auto * gpu_ptr = static_cast<uint8_t *>(msg.gpu_data());
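
[Editor's note] The cudart_loader.hpp introduced in PATCH 13 documents a three-step recipe under "ADDING NEW CUDA API CALLS". As a concrete illustration, the hedged sketch below shows what exposing cudaMemcpy through CudartLoader could look like; the lines are meant to sit inside cudart_loader.hpp next to the existing typedefs and are not part of this patch series. The cudaMemcpy signature and the cudaMemcpyKind constant follow NVIDIA's public CUDA Runtime C API (cudaMemcpyDeviceToHost has value 2), mirroring the ABI-replication style the loader already uses.

// Hedged sketch, placed inside cudart_loader.hpp (not part of the patch series).
// Step 1: ABI-compatible types and the function pointer typedef.
using cudaMemcpyKind = int;
constexpr cudaMemcpyKind cudaMemcpyDeviceToHost = 2;
using cudaMemcpy_t = cudaError_t (*)(void *, const void *, size_t, cudaMemcpyKind);

// Step 2: a public member on CudartLoader, next to the existing pointers:
//   cudaMemcpy_t cudaMemcpy;
// Step 3: one more load_symbol() call in the CudartLoader constructor:
//   load_symbol(cudaMemcpy, "cudaMemcpy");

// A caller (for example, a future backend) would then write:
//   auto err = agnocast::cuda::CudartLoader::instance().cudaMemcpy(
//     dst, src, n, agnocast::cuda::cudaMemcpyDeviceToHost);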