diff --git a/scripts/sample_application/run_cuda_listener.bash b/scripts/sample_application/run_cuda_listener.bash new file mode 100755 index 000000000..296600719 --- /dev/null +++ b/scripts/sample_application/run_cuda_listener.bash @@ -0,0 +1,4 @@ +#!/bin/bash + +source install/setup.bash +ros2 launch agnocast_sample_application cuda_listener.launch.xml diff --git a/scripts/sample_application/run_cuda_talker.bash b/scripts/sample_application/run_cuda_talker.bash new file mode 100755 index 000000000..3c8318bad --- /dev/null +++ b/scripts/sample_application/run_cuda_talker.bash @@ -0,0 +1,4 @@ +#!/bin/bash + +source install/setup.bash +ros2 launch agnocast_sample_application cuda_talker.launch.xml diff --git a/src/agnocast/package.xml b/src/agnocast/package.xml index ae223d158..8e7a132eb 100644 --- a/src/agnocast/package.xml +++ b/src/agnocast/package.xml @@ -18,6 +18,9 @@ agnocast_cie_config_msgs agnocast_cie_thread_configurator agnocast_components + + agnocast_cuda agnocast_ioctl_wrapper agnocast_sample_application agnocast_sample_interfaces diff --git a/src/agnocast_cuda/CMakeLists.txt b/src/agnocast_cuda/CMakeLists.txt new file mode 100644 index 000000000..dba9e71f1 --- /dev/null +++ b/src/agnocast_cuda/CMakeLists.txt @@ -0,0 +1,69 @@ +cmake_minimum_required(VERSION 3.14) +project(agnocast_cuda LANGUAGES CXX) + +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic) +endif() + +find_package(ament_cmake REQUIRED) +find_package(agnocastlib REQUIRED) +find_package(sensor_msgs REQUIRED) + +# ---- Runtime CUDA loading (no build-time CUDA dependency) ---- +# +# This package has ZERO build-time dependency on the CUDA toolkit. It does NOT use +# find_package(CUDAToolkit), enable_language(CUDA), or link against CUDA::cudart. +# +# Instead, libcudart.so is loaded at runtime via dlopen() (see src/cudart_loader.hpp). 
+# This allows the package to be built as a pre-built .deb on the ROS build farm, which +# has no CUDA toolchain. Users install CUDA separately on their target machines. +# +# The only system library we link is libdl (via ${CMAKE_DL_LIBS}) for dlopen/dlsym. +# +# IMPORTANT FOR DOWNSTREAM PACKAGES: +# Since this library always builds (even without CUDA), find_package(agnocast_cuda) and +# TARGET agnocast_cuda::agnocast_cuda are NOT reliable indicators that CUDA is available. +# Downstream packages that compile .cu files or call CUDA APIs directly (e.g., cudaMalloc) +# must ALSO check find_package(CUDAToolkit) independently. See the sample application's +# CMakeLists.txt for the correct pattern: +# find_package(CUDAToolkit QUIET) +# find_package(agnocast_cuda QUIET) +# if(CUDAToolkit_FOUND AND agnocast_cuda_FOUND) +# ... +# endif() +add_library(agnocast_cuda SHARED + src/cuda_ipc_backend.cpp + src/vmm_backend.cpp + src/nvscibuf_backend.cpp + src/unified_memory_backend.cpp + src/get_backend.cpp) + +target_include_directories(agnocast_cuda PUBLIC + $ + $) + +ament_target_dependencies(agnocast_cuda agnocastlib sensor_msgs) +target_link_libraries(agnocast_cuda ${CMAKE_DL_LIBS}) + +install(TARGETS agnocast_cuda + EXPORT export_${PROJECT_NAME} + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin + INCLUDES DESTINATION include) + +ament_export_targets(export_${PROJECT_NAME} HAS_LIBRARY_TARGET) +ament_export_dependencies(agnocastlib sensor_msgs) + +# Always install headers so downstream packages can use the type definitions +# (e.g., cuda_message_tag.hpp is in agnocastlib, but types.hpp is here). 
+install( + DIRECTORY include/ + DESTINATION include) + +ament_export_include_directories(include) +ament_package() diff --git a/src/agnocast_cuda/include/agnocast/cuda/message_types.hpp b/src/agnocast_cuda/include/agnocast/cuda/message_types.hpp new file mode 100644 index 000000000..c1d8a90c8 --- /dev/null +++ b/src/agnocast_cuda/include/agnocast/cuda/message_types.hpp @@ -0,0 +1,42 @@ +#pragma once + +#include "agnocast/cuda_message_tag.hpp" + +#include +#include + +#include +#include + +namespace agnocast::cuda +{ + +struct PointCloud2 : public sensor_msgs::msg::PointCloud2, public agnocast::cuda_message_tag +{ + uint8_t * data = nullptr; // GPU device pointer (shadows base class std::vector data) +}; + +struct Image : public sensor_msgs::msg::Image, public agnocast::cuda_message_tag +{ + uint8_t * data = nullptr; // GPU device pointer (shadows base class std::vector data) +}; + +} // namespace agnocast::cuda + +namespace agnocast +{ + +template <> +inline size_t get_cuda_gpu_data_size(const agnocast::cuda::PointCloud2 & msg) +{ + return static_cast(msg.height) * static_cast(msg.width) * + static_cast(msg.point_step); +} + +template <> +inline size_t get_cuda_gpu_data_size(const agnocast::cuda::Image & msg) +{ + return static_cast(msg.height) * msg.step; +} + +} // namespace agnocast diff --git a/src/agnocast_cuda/package.xml b/src/agnocast_cuda/package.xml new file mode 100644 index 000000000..b01f3bc15 --- /dev/null +++ b/src/agnocast_cuda/package.xml @@ -0,0 +1,25 @@ + + + + agnocast_cuda + 2.3.3 + + CUDA IPC support for Agnocast. Provides GPU message types and cross-process GPU buffer sharing. 
+ + Takahiro Ishikawa-Aso + Koichi Imai + Masaki Nishikawa + Apache License 2.0 + + + Takahiro Ishikawa-Aso + + ament_cmake + + agnocastlib + sensor_msgs + + + ament_cmake + + diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.cpp b/src/agnocast_cuda/src/cuda_ipc_backend.cpp new file mode 100644 index 000000000..36548dc32 --- /dev/null +++ b/src/agnocast_cuda/src/cuda_ipc_backend.cpp @@ -0,0 +1,60 @@ +#include "cuda_ipc_backend.hpp" + +#include "cudart_loader.hpp" + +#include +#include +#include + +namespace +{ + +void check_cuda_error(agnocast::cuda::cudaError_t err, const char * operation) +{ + if (err != agnocast::cuda::cudaSuccess) { + std::fprintf( + stderr, "[agnocast_cuda] FATAL: %s failed: %s\n", operation, + agnocast::cuda::CudartLoader::instance().cudaGetErrorString(err)); + std::abort(); + } +} + +} // namespace + +namespace agnocast::cuda +{ + +GpuHandle CudaIpcBackend::export_handle(void * device_ptr, size_t /*size*/) +{ + GpuHandle h{}; + static_assert(sizeof(cudaIpcMemHandle_t) <= sizeof(h.opaque)); + cudaIpcMemHandle_t ipc_handle; + check_cuda_error( + CudartLoader::instance().cudaIpcGetMemHandle(&ipc_handle, device_ptr), "cudaIpcGetMemHandle"); + std::memcpy(h.opaque, &ipc_handle, sizeof(ipc_handle)); + return h; +} + +void CudaIpcBackend::free_device_memory(void * device_ptr) +{ + check_cuda_error(CudartLoader::instance().cudaFree(device_ptr), "cudaFree"); +} + +void * CudaIpcBackend::import_handle(const GpuHandle & handle, size_t /*size*/) +{ + cudaIpcMemHandle_t ipc_handle; + std::memcpy(&ipc_handle, handle.opaque, sizeof(ipc_handle)); + void * ptr = nullptr; + check_cuda_error( + CudartLoader::instance().cudaIpcOpenMemHandle(&ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), + "cudaIpcOpenMemHandle"); + return ptr; +} + +void CudaIpcBackend::release_handle(void * local_ptr) +{ + check_cuda_error( + CudartLoader::instance().cudaIpcCloseMemHandle(local_ptr), "cudaIpcCloseMemHandle"); +} + +} // namespace agnocast::cuda diff --git 
a/src/agnocast_cuda/src/cuda_ipc_backend.hpp b/src/agnocast_cuda/src/cuda_ipc_backend.hpp new file mode 100644 index 000000000..fc262cea8 --- /dev/null +++ b/src/agnocast_cuda/src/cuda_ipc_backend.hpp @@ -0,0 +1,19 @@ +// Internal header — kept in src/ so it is NOT installed or visible to downstream packages. +// Only get_backend.cpp includes this to instantiate the singleton. +#pragma once + +#include "agnocast/gpu_transfer_backend.hpp" + +namespace agnocast::cuda +{ + +class CudaIpcBackend : public GpuTransferBackend +{ +public: + GpuHandle export_handle(void * device_ptr, size_t size) override; + void free_device_memory(void * device_ptr) override; + void * import_handle(const GpuHandle & handle, size_t size) override; + void release_handle(void * local_ptr) override; +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/cudart_loader.hpp b/src/agnocast_cuda/src/cudart_loader.hpp new file mode 100644 index 000000000..7ebc1ab7d --- /dev/null +++ b/src/agnocast_cuda/src/cudart_loader.hpp @@ -0,0 +1,185 @@ +// ============================================================================ +// Runtime loader for the CUDA Runtime API (libcudart.so) via dlopen/dlsym. +// +// WHY THIS EXISTS +// --------------- +// agnocast_cuda is distributed as a pre-built .deb from the ROS build farm, which has no +// CUDA toolchain. To eliminate the build-time dependency on and -lcudart, +// we replicate the small set of CUDA types/constants we need (stable ABI) and load the +// actual CUDA runtime library at runtime via dlopen. This means: +// - Build time: pure C++17, no CUDA headers or compiler required. +// - Run time: libcudart.so must be present on the target machine (user installs CUDA). +// +// SAFE ON NON-CUDA SYSTEMS +// ------------------------ +// Users who do not use CUDA message types are completely unaffected by this library. 
+// CudartLoader is lazy: dlopen("libcudart.so") only happens on the first call to +// CudartLoader::instance(), which is only reachable through get_backend(), which is +// only called from if-constexpr branches guarded by is_cuda_message_v. If no CUDA +// message type is ever instantiated, this loader is never constructed and no CUDA symbols +// are ever resolved. The library can be safely installed and loaded on machines without +// a GPU or CUDA runtime. +// +// ABI STABILITY +// ------------- +// The types and constants below are part of NVIDIA's stable C ABI for the CUDA Runtime API. +// They have not changed across CUDA 10.x, 11.x, and 12.x. Specifically: +// - cudaError_t is a C enum (int-sized). cudaSuccess has been 0 since CUDA 1.0. +// - cudaIpcMemHandle_t is a 64-byte opaque struct (CUDA_IPC_HANDLE_SIZE = 64). +// - cudaIpcMemLazyEnablePeerAccess is a flag constant (0x01). +// - cudaDevAttrIntegrated is enum value 18 in cudaDeviceAttr (append-only enum). +// If NVIDIA ever breaks this ABI (extremely unlikely), the static_assert in +// cuda_ipc_backend.cpp will catch size mismatches at compile time. +// +// ADDING NEW CUDA API CALLS +// ------------------------- +// When a new backend or feature needs an additional CUDA runtime function: +// 1. Add the function pointer typedef below (e.g., using cudaMalloc_t = ...). +// 2. Add a public member to CudartLoader (e.g., cudaMalloc_t cudaMalloc;). +// 3. Add a load_symbol() call in the constructor. +// Keep this file as the single place that touches dlopen/dlsym for CUDA. +// +// LIBRARY NAME FALLBACK CHAIN +// --------------------------- +// The constructor tries these names in order: +// 1. "libcudart.so" — unversioned symlink, present when the CUDA dev package is +// installed (e.g., cuda-cudart-dev-12-*) or with the runfile +// installer. +// 2. "libcudart.so.12" — SONAME for CUDA 12.x. Present with the runtime-only deb +// package (cuda-cudart-12-*) even without the dev package. +// 3. 
"libcudart.so.11.0"— SONAME for CUDA 11.x (NVIDIA used major.minor for the SONAME +// in the 11.x series; all 11.0–11.8 share this SONAME). +// If a future CUDA major version (e.g., 13.x) is released, add its SONAME to the list. +// ============================================================================ +#pragma once + +#include + +#include +#include + +namespace agnocast::cuda +{ + +// --------------------------------------------------------------------------- +// ABI-compatible CUDA Runtime API type definitions. +// These replace #include so that no CUDA headers are needed at build time. +// See "ABI STABILITY" section in the file header for rationale. +// --------------------------------------------------------------------------- +using cudaError_t = int; +constexpr cudaError_t cudaSuccess = 0; + +struct cudaIpcMemHandle_t +{ + char reserved[64]; // CUDA_IPC_HANDLE_SIZE = 64 +}; + +constexpr unsigned int cudaIpcMemLazyEnablePeerAccess = 0x01; + +using cudaDeviceAttr = int; +constexpr cudaDeviceAttr cudaDevAttrIntegrated = 18; // driver_types.h: cudaDevAttrIntegrated = 18 + +// --------------------------------------------------------------------------- +// Function pointer types matching CUDA Runtime API signatures. +// --------------------------------------------------------------------------- +using cudaGetDevice_t = cudaError_t (*)(int *); +using cudaDeviceGetAttribute_t = cudaError_t (*)(int *, cudaDeviceAttr, int); +using cudaIpcGetMemHandle_t = cudaError_t (*)(cudaIpcMemHandle_t *, void *); +using cudaIpcOpenMemHandle_t = cudaError_t (*)(void **, cudaIpcMemHandle_t, unsigned int); +using cudaIpcCloseMemHandle_t = cudaError_t (*)(void *); +using cudaFree_t = cudaError_t (*)(void *); +using cudaGetErrorString_t = const char * (*)(cudaError_t); + +// --------------------------------------------------------------------------- +// CudartLoader — lazy-loading singleton for the CUDA runtime. 
+// +// Thread safety: the function-local static in instance() is guaranteed to be +// initialized exactly once (C++11 "magic statics"). After construction, all +// function pointers are immutable and safe to read from any thread. +// +// Lifetime: the singleton is never destroyed during normal execution. We +// intentionally do NOT call dlclose() — see destructor comment. +// --------------------------------------------------------------------------- +class CudartLoader +{ +public: + static CudartLoader & instance() + { + static CudartLoader loader; + return loader; + } + + // Public function pointers — valid immediately after instance() returns. + // Callers use these like: CudartLoader::instance().cudaFree(ptr) + cudaGetDevice_t cudaGetDevice; + cudaDeviceGetAttribute_t cudaDeviceGetAttribute; + cudaIpcGetMemHandle_t cudaIpcGetMemHandle; + cudaIpcOpenMemHandle_t cudaIpcOpenMemHandle; + cudaIpcCloseMemHandle_t cudaIpcCloseMemHandle; + cudaFree_t cudaFree; + cudaGetErrorString_t cudaGetErrorString; + +private: + void * handle_ = nullptr; + + CudartLoader() + { + // Try library names in order. See "LIBRARY NAME FALLBACK CHAIN" in the file header. + dlerror(); // Clear any stale error before the dlopen loop. + const char * names[] = {"libcudart.so", "libcudart.so.12", "libcudart.so.11.0"}; + for (const char * name : names) { + handle_ = dlopen(name, RTLD_NOW | RTLD_LOCAL); + if (handle_ != nullptr) { + break; + } + } + if (handle_ == nullptr) { + std::fprintf( + stderr, + "[agnocast_cuda] FATAL: Could not load libcudart.so.\n" + " Your code uses a CUDA message type (e.g., agnocast::cuda::PointCloud2 or\n" + " agnocast::cuda::Image), which requires the NVIDIA CUDA runtime to be installed\n" + " on this machine. 
If you do not need GPU-accelerated message passing, use the\n" + " standard (non-CUDA) message types instead.\n" + " To fix: install the CUDA runtime (e.g., 'sudo apt install cuda-cudart-12-*')\n" + " and ensure libcudart.so is on LD_LIBRARY_PATH.\n" + " dlopen error: %s\n", + dlerror()); + std::abort(); + } + + load_symbol(cudaGetDevice, "cudaGetDevice"); + load_symbol(cudaDeviceGetAttribute, "cudaDeviceGetAttribute"); + load_symbol(cudaIpcGetMemHandle, "cudaIpcGetMemHandle"); + load_symbol(cudaIpcOpenMemHandle, "cudaIpcOpenMemHandle"); + load_symbol(cudaIpcCloseMemHandle, "cudaIpcCloseMemHandle"); + load_symbol(cudaFree, "cudaFree"); + load_symbol(cudaGetErrorString, "cudaGetErrorString"); + } + + // Intentionally never close the handle. The CUDA runtime maintains process-global state + // (device context, allocations, etc.). Calling dlclose() during static destruction can + // race with other CUDA cleanup and cause segfaults. The OS reclaims everything at exit. + ~CudartLoader() = default; + + CudartLoader(const CudartLoader &) = delete; + CudartLoader & operator=(const CudartLoader &) = delete; + + template + void load_symbol(T & func_ptr, const char * name) + { + // POSIX requires clearing dlerror() before calling dlsym(), then checking dlerror() + // after, because dlsym() can legitimately return NULL for some symbols. 
+ dlerror(); + func_ptr = reinterpret_cast(dlsym(handle_, name)); + const char * err = dlerror(); + if (err != nullptr) { + std::fprintf( + stderr, "[agnocast_cuda] FATAL: Could not load symbol '%s' from libcudart.so: %s\n", name, + err); + std::abort(); + } + } +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/get_backend.cpp b/src/agnocast_cuda/src/get_backend.cpp new file mode 100644 index 000000000..12b428099 --- /dev/null +++ b/src/agnocast_cuda/src/get_backend.cpp @@ -0,0 +1,65 @@ +#include "agnocast/gpu_transfer_backend.hpp" +#include "cuda_ipc_backend.hpp" +#include "cudart_loader.hpp" +#include "nvscibuf_backend.hpp" +#include "unified_memory_backend.hpp" +#include "vmm_backend.hpp" + +#include +#include +#include + +namespace agnocast::cuda +{ + +namespace +{ + +std::unique_ptr select_backend() +{ + const auto & api = CudartLoader::instance(); + + int device = 0; + cudaError_t err = api.cudaGetDevice(&device); + if (err != cudaSuccess) { + std::fprintf( + stderr, "[agnocast_cuda] FATAL: cudaGetDevice failed: %s\n", api.cudaGetErrorString(err)); + std::abort(); + } + + int is_integrated = 0; + err = api.cudaDeviceGetAttribute(&is_integrated, cudaDevAttrIntegrated, device); + if (err != cudaSuccess) { + std::fprintf( + stderr, "[agnocast_cuda] FATAL: cudaDeviceGetAttribute failed: %s\n", + api.cudaGetErrorString(err)); + std::abort(); + } + + if (!is_integrated) { + // Discrete GPU (GeForce, Quadro, Tesla, A/H series) — CUDA IPC is supported. + std::fprintf(stderr, "[agnocast_cuda] Discrete GPU detected, using CudaIpcBackend.\n"); + return std::make_unique(); + } + + // Integrated GPU (Jetson Xavier/Orin/Thor, DRIVE). + // TODO(agnocast): Implement and select the appropriate backend. + // - Jetson Thor (CUDA 13.0+): CudaIpcBackend may work via OpenRM. + // - Jetson Xavier/Orin: NvSciBufBackend or UnifiedMemoryBackend. + // - DRIVE: NvSciBufBackend. 
+ std::fprintf( + stderr, + "[agnocast_cuda] FATAL: Integrated GPU detected (Jetson/DRIVE). " + "No backend is implemented yet for this platform.\n"); + std::abort(); +} + +} // namespace + +GpuTransferBackend & get_backend() +{ + static auto instance = select_backend(); + return *instance; +} + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/nvscibuf_backend.cpp b/src/agnocast_cuda/src/nvscibuf_backend.cpp new file mode 100644 index 000000000..0e900cedf --- /dev/null +++ b/src/agnocast_cuda/src/nvscibuf_backend.cpp @@ -0,0 +1,30 @@ +#include "nvscibuf_backend.hpp" + +#include + +namespace agnocast::cuda +{ + +GpuHandle NvSciBufBackend::export_handle(void * /*device_ptr*/, size_t /*size*/) +{ + throw std::runtime_error( + "[agnocast_cuda] NvSciBufBackend is not yet implemented. " + "Requires NvSciBuf (Jetson Xavier/Orin, NVIDIA DRIVE)."); +} + +void NvSciBufBackend::free_device_memory(void * /*device_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented."); +} + +void * NvSciBufBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/) +{ + throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented."); +} + +void NvSciBufBackend::release_handle(void * /*local_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented."); +} + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/nvscibuf_backend.hpp b/src/agnocast_cuda/src/nvscibuf_backend.hpp new file mode 100644 index 000000000..b631d1ca4 --- /dev/null +++ b/src/agnocast_cuda/src/nvscibuf_backend.hpp @@ -0,0 +1,21 @@ +// Internal header — kept in src/ so it is NOT installed or visible to downstream packages. +// Only get_backend.cpp includes this to instantiate the singleton. +#pragma once + +#include "agnocast/gpu_transfer_backend.hpp" + +namespace agnocast::cuda +{ + +// Placeholder backend using NvSciBuf / NvSciSync. +// Target platforms: Jetson Xavier/Orin, NVIDIA DRIVE. 
+class NvSciBufBackend : public GpuTransferBackend +{ +public: + GpuHandle export_handle(void * device_ptr, size_t size) override; + void free_device_memory(void * device_ptr) override; + void * import_handle(const GpuHandle & handle, size_t size) override; + void release_handle(void * local_ptr) override; +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/unified_memory_backend.cpp b/src/agnocast_cuda/src/unified_memory_backend.cpp new file mode 100644 index 000000000..af97fff3b --- /dev/null +++ b/src/agnocast_cuda/src/unified_memory_backend.cpp @@ -0,0 +1,30 @@ +#include "unified_memory_backend.hpp" + +#include + +namespace agnocast::cuda +{ + +GpuHandle UnifiedMemoryBackend::export_handle(void * /*device_ptr*/, size_t /*size*/) +{ + throw std::runtime_error( + "[agnocast_cuda] UnifiedMemoryBackend is not yet implemented. " + "Requires POSIX shm + cudaHostRegister (Jetson unified memory)."); +} + +void UnifiedMemoryBackend::free_device_memory(void * /*device_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented."); +} + +void * UnifiedMemoryBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/) +{ + throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented."); +} + +void UnifiedMemoryBackend::release_handle(void * /*local_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented."); +} + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/unified_memory_backend.hpp b/src/agnocast_cuda/src/unified_memory_backend.hpp new file mode 100644 index 000000000..f9916f203 --- /dev/null +++ b/src/agnocast_cuda/src/unified_memory_backend.hpp @@ -0,0 +1,21 @@ +// Internal header — kept in src/ so it is NOT installed or visible to downstream packages. +// Only get_backend.cpp includes this to instantiate the singleton. 
+#pragma once + +#include "agnocast/gpu_transfer_backend.hpp" + +namespace agnocast::cuda +{ + +// Placeholder backend using POSIX shared memory + cudaHostRegister for Jetson platforms +// where CPU and GPU share the same physical memory (unified memory architecture). +class UnifiedMemoryBackend : public GpuTransferBackend +{ +public: + GpuHandle export_handle(void * device_ptr, size_t size) override; + void free_device_memory(void * device_ptr) override; + void * import_handle(const GpuHandle & handle, size_t size) override; + void release_handle(void * local_ptr) override; +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/vmm_backend.cpp b/src/agnocast_cuda/src/vmm_backend.cpp new file mode 100644 index 000000000..4c14cbcd9 --- /dev/null +++ b/src/agnocast_cuda/src/vmm_backend.cpp @@ -0,0 +1,30 @@ +#include "vmm_backend.hpp" + +#include + +namespace agnocast::cuda +{ + +GpuHandle VmmBackend::export_handle(void * /*device_ptr*/, size_t /*size*/) +{ + throw std::runtime_error( + "[agnocast_cuda] VmmBackend is not yet implemented. " + "Requires cuMemExportToShareableHandle (CUDA Driver API)."); +} + +void VmmBackend::free_device_memory(void * /*device_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented."); +} + +void * VmmBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/) +{ + throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented."); +} + +void VmmBackend::release_handle(void * /*local_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented."); +} + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/vmm_backend.hpp b/src/agnocast_cuda/src/vmm_backend.hpp new file mode 100644 index 000000000..491e5544a --- /dev/null +++ b/src/agnocast_cuda/src/vmm_backend.hpp @@ -0,0 +1,22 @@ +// Internal header — kept in src/ so it is NOT installed or visible to downstream packages. 
+// Only get_backend.cpp includes this to instantiate the singleton. +#pragma once + +#include "agnocast/gpu_transfer_backend.hpp" + +namespace agnocast::cuda +{ + +// Placeholder backend using CUDA Virtual Memory Management (VMM) API. +// Uses cuMemExportToShareableHandle / cuMemImportFromShareableHandle. +// Supported on Jetson Orin (newer CUDA) and discrete GPUs with CUDA 10.2+. +class VmmBackend : public GpuTransferBackend +{ +public: + GpuHandle export_handle(void * device_ptr, size_t size) override; + void free_device_memory(void * device_ptr) override; + void * import_handle(const GpuHandle & handle, size_t size) override; + void release_handle(void * local_ptr) override; +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_sample_application/CMakeLists.txt b/src/agnocast_sample_application/CMakeLists.txt index 41d2aa423..7195e2e5a 100644 --- a/src/agnocast_sample_application/CMakeLists.txt +++ b/src/agnocast_sample_application/CMakeLists.txt @@ -165,6 +165,41 @@ install(TARGETS no_rclcpp_client install(TARGETS sim_time_timer DESTINATION lib/${PROJECT_NAME}) +# CUDA sample applications (only built when the CUDA compiler, CUDA toolkit, and +# agnocast_cuda are all available). Note: agnocast_cuda itself has no build-time CUDA +# dependency (it dlopen's libcudart.so at runtime), but these sample apps compile .cu +# files with CUDA kernels, so they need the full CUDA toolchain. +# These binaries are NOT included in the pre-built deb from the ROS build farm (which +# has no CUDA). Users who want to try the CUDA samples must build from source with CUDA. 
+include(CheckLanguage) +check_language(CUDA) +find_package(agnocast_cuda QUIET) +if(CMAKE_CUDA_COMPILER AND agnocast_cuda_FOUND) + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) + + add_executable(cuda_talker src/cuda_publisher.cpp) + set_source_files_properties(src/cuda_publisher.cpp PROPERTIES LANGUAGE CUDA) + ament_target_dependencies(cuda_talker agnocastlib agnocast_cuda sensor_msgs) + target_link_libraries(cuda_talker CUDA::cudart) + target_include_directories(cuda_talker PRIVATE + ${agnocastlib_INCLUDE_DIRS} + ${agnocast_cuda_INCLUDE_DIRS} + ) + + add_executable(cuda_listener src/cuda_subscriber.cpp) + set_source_files_properties(src/cuda_subscriber.cpp PROPERTIES LANGUAGE CUDA) + ament_target_dependencies(cuda_listener agnocastlib agnocast_cuda sensor_msgs) + target_link_libraries(cuda_listener CUDA::cudart) + target_include_directories(cuda_listener PRIVATE + ${agnocastlib_INCLUDE_DIRS} + ${agnocast_cuda_INCLUDE_DIRS} + ) + + install(TARGETS cuda_talker cuda_listener + DESTINATION lib/${PROJECT_NAME}) +endif() + install(DIRECTORY launch DESTINATION share/${PROJECT_NAME}/ ) diff --git a/src/agnocast_sample_application/launch/cuda_listener.launch.xml b/src/agnocast_sample_application/launch/cuda_listener.launch.xml new file mode 100644 index 000000000..4136b9649 --- /dev/null +++ b/src/agnocast_sample_application/launch/cuda_listener.launch.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/src/agnocast_sample_application/launch/cuda_talker.launch.xml b/src/agnocast_sample_application/launch/cuda_talker.launch.xml new file mode 100644 index 000000000..ffad40cb2 --- /dev/null +++ b/src/agnocast_sample_application/launch/cuda_talker.launch.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/src/agnocast_sample_application/package.xml b/src/agnocast_sample_application/package.xml index cf534190d..1315e81e3 100644 --- a/src/agnocast_sample_application/package.xml +++ b/src/agnocast_sample_application/package.xml @@ -31,6 +31,13 @@ ament_lint_auto ament_lint_common + 
+ agnocast_cuda + sensor_msgs + ament_cmake diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp new file mode 100644 index 000000000..43775e30c --- /dev/null +++ b/src/agnocast_sample_application/src/cuda_publisher.cpp @@ -0,0 +1,82 @@ +#include "agnocast/agnocast.hpp" +#include "agnocast/cuda/message_types.hpp" + +#include + +using namespace std::chrono_literals; + +// Simple CUDA kernel: fills GPU buffer with incrementing values. +__global__ void fill_kernel(uint8_t * data, size_t size, uint8_t offset) +{ + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + data[idx] = static_cast((idx + offset) % 256); + } +} + +class CudaPublisher : public agnocast::Node +{ + int64_t count_ = 0; + agnocast::Publisher::SharedPtr pub_; + agnocast::TimerBase::SharedPtr timer_; + + void timer_callback() + { + auto msg = pub_->borrow_loaned_message(); + + // Set CPU metadata + msg->header.stamp.sec = static_cast(count_); + msg->header.frame_id = "lidar"; + msg->height = 1; + msg->width = 1024; + msg->point_step = 16; + msg->row_step = msg->width * msg->point_step; + msg->is_dense = true; + + // Allocate and fill GPU data + const size_t gpu_size = static_cast(msg->height) * static_cast(msg->width) * + static_cast(msg->point_step); + const cudaError_t malloc_result = cudaMalloc(&msg->data, gpu_size); + if (malloc_result != cudaSuccess) { + RCLCPP_ERROR(get_logger(), "cudaMalloc failed: %s", cudaGetErrorString(malloc_result)); + return; + } + + const int threads = 256; + const int blocks = (gpu_size + threads - 1) / threads; + // cppcheck-suppress shiftTooManyBits // false positive: <<< >>> is CUDA kernel launch syntax + fill_kernel<<>>(msg->data, gpu_size, static_cast(count_)); + + const cudaError_t sync_result = cudaStreamSynchronize(nullptr); + if (sync_result != cudaSuccess) { + RCLCPP_ERROR(get_logger(), "kernel launch failed: %s", cudaGetErrorString(sync_result)); + 
cudaFree(msg->data); + msg->data = nullptr; + return; + } + + pub_->publish(std::move(msg)); + RCLCPP_INFO( + get_logger(), "published CUDA PointCloud2: seq=%lld, gpu_size=%zu", + static_cast(count_++), gpu_size); + } + +public: + CudaPublisher() : Node("cuda_publisher") + { + pub_ = this->create_publisher("/cuda_pointcloud", 1); + timer_ = agnocast::create_timer( + this, std::make_shared(RCL_STEADY_TIME), rclcpp::Duration(100ms), + std::bind(&CudaPublisher::timer_callback, this)); + } +}; + +int main(int argc, char ** argv) +{ + agnocast::init(argc, argv); + agnocast::AgnocastOnlySingleThreadedExecutor executor; + auto node = std::make_shared(); + executor.add_node(node); + executor.spin(); + return 0; +} diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp new file mode 100644 index 000000000..9f1be14e9 --- /dev/null +++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp @@ -0,0 +1,62 @@ +#include "agnocast/agnocast.hpp" +#include "agnocast/cuda/message_types.hpp" + +#include + +#include + +using std::placeholders::_1; + +class CudaSubscriber : public agnocast::Node +{ + agnocast::Subscription::SharedPtr sub_; + + void callback(agnocast::ipc_shared_ptr msg) + { + // Read CPU metadata from shared memory + const uint32_t width = msg->width; + const uint32_t point_step = msg->point_step; + const size_t gpu_size = static_cast(msg->height) * static_cast(width) * + static_cast(point_step); + + // Get subscriber-local GPU pointer (mapped via CUDA IPC) + auto * gpu_ptr = static_cast(msg.gpu_data()); + if (!gpu_ptr) { + RCLCPP_ERROR(get_logger(), "gpu_data() returned nullptr"); + return; + } + + // Read first few bytes from GPU to verify data + uint8_t host_buf[16]{}; + const size_t copy_size = std::min(gpu_size, sizeof(host_buf)); + const cudaError_t memcpy_result = + cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost); + if (memcpy_result != cudaSuccess) { + 
RCLCPP_ERROR(get_logger(), "cudaMemcpy failed: %s", cudaGetErrorString(memcpy_result)); + return; + } + + RCLCPP_INFO( + get_logger(), + "received CUDA PointCloud2: width=%u, point_step=%u, gpu_size=%zu, " + "first_bytes=[%u,%u,%u,%u]", + width, point_step, gpu_size, host_buf[0], host_buf[1], host_buf[2], host_buf[3]); + } + +public: + CudaSubscriber() : Node("cuda_subscriber") + { + sub_ = this->create_subscription( + "/cuda_pointcloud", 1, std::bind(&CudaSubscriber::callback, this, _1)); + } +}; + +int main(int argc, char ** argv) +{ + agnocast::init(argc, argv); + agnocast::AgnocastOnlySingleThreadedExecutor executor; + auto node = std::make_shared(); + executor.add_node(node); + executor.spin(); + return 0; +} diff --git a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp index 13b66fbf7..ff7b183de 100644 --- a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp @@ -1,7 +1,12 @@ #pragma once #include "agnocast/agnocast_smart_pointer.hpp" +#include "agnocast/agnocast_utils.hpp" +#include "agnocast/cuda_message_tag.hpp" +#include "agnocast/gpu_metadata.hpp" +#include "agnocast/gpu_transfer_backend.hpp" +#include #include #include @@ -44,7 +49,7 @@ struct CallbackInfo rclcpp::CallbackGroup::SharedPtr callback_group; TypeErasedCallback callback; std::function( - const void *, const std::string &, const topic_local_id_t, const uint64_t)> + const void *, const std::string &, const topic_local_id_t, const int64_t)> message_creator; bool need_epoll_update = true; }; @@ -61,6 +66,42 @@ extern std::atomic need_epoll_updates; uint32_t allocate_callback_info_id(); +// Creates an ipc_shared_ptr for a subscriber-received message. +// For CUDA messages: imports the GPU handle, stores the subscriber-local GPU pointer in +// control_block->gpu_data_ptr, and registers a gpu_release_fn to release the mapping +// on last reference. 
The pointer is accessed via ipc_shared_ptr::gpu_data(). +// For non-CUDA messages: simply wraps the pointer. +template <typename MessageT> +agnocast::ipc_shared_ptr<MessageT> create_subscriber_ipc_ptr( + MessageT * msg, const std::string & topic_name, const topic_local_id_t subscriber_id, + const int64_t entry_id) +{ + if constexpr (is_cuda_message_v<MessageT>) { + auto * meta = static_cast<GpuMetadata *>(msg->gpu_metadata_); + if (!meta) { + RCLCPP_ERROR( + logger, + "CUDA message on topic '%s' has null gpu_metadata_. " + "The publisher may have failed to set GpuMetadata during publish().", + topic_name.c_str()); + std::abort(); + } + void * gpu_data_ptr = + agnocast::cuda::get_backend().import_handle(meta->handle, meta->gpu_data_size); + // NOTE: If import_handle() fails, the backend aborts (fail-fast). If a future backend + // returns nullptr instead, the subscriber would get a null gpu pointer. Callers should + // check gpu_data() != nullptr before use. + + auto ipc_ptr = agnocast::ipc_shared_ptr<MessageT>(msg, topic_name, subscriber_id, entry_id); + ipc_ptr.set_gpu_data_ptr(gpu_data_ptr); + ipc_ptr.set_gpu_release_fn( + [](void * ptr) { agnocast::cuda::get_backend().release_handle(ptr); }); + return ipc_ptr; + } else { + return agnocast::ipc_shared_ptr<MessageT>(msg, topic_name, subscriber_id, entry_id); + } +} + template <typename Func> TypeErasedCallback get_erased_callback(Func && callback) { @@ -95,9 +136,9 @@ uint32_t register_callback( auto message_creator = []( const void * ptr, const std::string & topic_name, const topic_local_id_t subscriber_id, const int64_t entry_id) { - return std::make_unique>(agnocast::ipc_shared_ptr<MessageT>( - const_cast<MessageT *>(static_cast<const MessageT *>(ptr)), topic_name, subscriber_id, - entry_id)); + auto * msg = const_cast<MessageT *>(static_cast<const MessageT *>(ptr)); + return std::make_unique>( + create_subscriber_ipc_ptr(msg, topic_name, subscriber_id, entry_id)); }; uint32_t callback_info_id = allocate_callback_info_id(); diff --git a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp index
4e08394a0..720f14da7 100644 --- a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp @@ -6,6 +6,9 @@ #include "agnocast/agnocast_smart_pointer.hpp" #include "agnocast/agnocast_tracepoint_wrapper.h" #include "agnocast/agnocast_utils.hpp" +#include "agnocast/cuda_message_tag.hpp" +#include "agnocast/gpu_metadata.hpp" +#include "agnocast/gpu_transfer_backend.hpp" #include "rclcpp/detail/qos_parameters.hpp" #include "rclcpp/rclcpp.hpp" @@ -19,6 +22,7 @@ #include #include +#include #include #include #include @@ -207,7 +211,38 @@ class BasicPublisher } // Capture raw pointer BEFORE invalidation (get() returns nullptr after invalidation). - const uint64_t msg_virtual_address = reinterpret_cast(message.get()); + MessageT * raw_ptr = message.get(); + const uint64_t msg_virtual_address = reinterpret_cast(raw_ptr); + + // CUDA publish hook: export GPU handle and allocate GpuMetadata in shared memory. + // Runs while heaphook is still active, so GpuMetadata lands in the publisher's shared memory. + // NOTE: Assumes MessageT has a public `data` member (uint8_t*) pointing to the GPU allocation. + // All CUDA message types must provide this by shadowing the base ROS message's data field. + if constexpr (is_cuda_message_v) { + if (!raw_ptr->data) { + RCLCPP_ERROR( + logger, + "CUDA message on topic '%s' has null data pointer. " + "Did you forget to cudaMalloc(&msg->data, size) before publish()?", + topic_name_.c_str()); + std::abort(); + } + const size_t gpu_size = get_cuda_gpu_data_size(*raw_ptr); + if (gpu_size == 0) { + RCLCPP_ERROR( + logger, + "CUDA message on topic '%s' has gpu_data_size == 0. " + "Ensure message fields (height, width, point_step, etc.) 
are set before publish().", + topic_name_.c_str()); + std::abort(); + } + auto & backend = agnocast::cuda::get_backend(); + auto * meta = new GpuMetadata(); + meta->publisher_gpu_ptr = raw_ptr->data; + meta->gpu_data_size = gpu_size; + meta->handle = backend.export_handle(raw_ptr->data, gpu_size); + raw_ptr->gpu_metadata_ = meta; + } // Invalidate all references sharing this handle's control block. // Any remaining copies held elsewhere will fail-fast on dereference. @@ -220,6 +255,16 @@ for (uint32_t i = 0; i < publish_msg_args.ret_released_num; i++) { MessageT * release_ptr = reinterpret_cast<MessageT *>(publish_msg_args.ret_released_addrs[i]); + // CUDA reclaim hook: free GPU buffer before deleting the message. + // On abnormal publisher exit, free_device_memory() is never called, but GPU device + // memory is reclaimed by the CUDA driver when the process exits. + if constexpr (is_cuda_message_v<MessageT>) { + if (release_ptr->gpu_metadata_) { + auto * meta = static_cast<GpuMetadata *>(release_ptr->gpu_metadata_); + agnocast::cuda::get_backend().free_device_memory(meta->publisher_gpu_ptr); + delete meta; + } + } delete release_ptr; } diff --git a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp index c0181ceda..b3ffafe8f 100644 --- a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp @@ -40,10 +40,18 @@ extern int agnocast_fd; // Sentinel value indicating entry_id has not been assigned (publisher-side, before publish).
constexpr int64_t ENTRY_ID_NOT_ASSIGNED = -1; -// Forward declaration for friend access +// Forward declarations for friend access +template +class ipc_shared_ptr; + template class BasicPublisher; +template +ipc_shared_ptr create_subscriber_ipc_ptr( + MessageT * msg, const std::string & topic_name, const topic_local_id_t subscriber_id, + const int64_t entry_id); + namespace detail { @@ -60,6 +68,18 @@ struct control_block topic_local_id_t pubsub_id; // 4-byte alignment std::atomic valid{true}; // 1-byte alignment + // Optional GPU cleanup function pointer. Null for non-CUDA messages. + // Called as gpu_release_fn(gpu_data_ptr) before bitmap release in reset() to ensure GPU + // mappings are released before the publisher can free the underlying GPU buffer. + // Uses a plain function pointer instead of std::function to avoid heap allocation and + // minimize overhead for non-CUDA messages (16 bytes for two pointers vs ~40+ bytes). + void (*gpu_release_fn)(void *) = nullptr; + + // Subscriber-local GPU device pointer obtained via import_handle(). + // Stored here because the shared memory message is mapped read-only by the subscriber, + // so we cannot inject the local pointer into msg->data. + void * gpu_data_ptr = nullptr; + control_block(std::string topic, topic_local_id_t pubsub, int64_t entry) : topic_name(std::move(topic)), entry_id(entry), pubsub_id(pubsub) { @@ -105,6 +125,11 @@ class ipc_shared_ptr template friend class BasicPublisher; + // Allow create_subscriber_ipc_ptr to call set_gpu_release_fn() and set_gpu_data_ptr() + template + friend ipc_shared_ptr create_subscriber_ipc_ptr( + MessageT *, const std::string &, const topic_local_id_t, const int64_t); + // Allow converting constructors to access private members of ipc_shared_ptr template friend class ipc_shared_ptr; @@ -133,6 +158,24 @@ class ipc_shared_ptr } } + // Sets a GPU release function to be invoked as fn(gpu_data_ptr) when the last reference + // is released. 
Private: only create_subscriber_ipc_ptr() should call this. + void set_gpu_release_fn(void (*fn)(void *)) + { + if (control_) { + control_->gpu_release_fn = fn; + } + } + + // Sets the subscriber-local GPU pointer (obtained via import_handle). + // Private: only create_subscriber_ipc_ptr() should call this. + void set_gpu_data_ptr(void * ptr) + { + if (control_) { + control_->gpu_data_ptr = ptr; + } + } + // Publisher-side constructor (entry_id not yet assigned). // Creates control block for reference counting and one-shot invalidation. // Private: users must call BasicPublisher::borrow_loaned_message() instead of constructing @@ -339,6 +382,9 @@ class ipc_shared_ptr AGNOCAST_PUBLIC T * get() const noexcept { return is_invalidated_() ? nullptr : ptr_; } + // Returns the subscriber-local GPU device pointer, or nullptr for non-CUDA messages. + void * gpu_data() const noexcept { return control_ ? control_->gpu_data_ptr : nullptr; } + /** * @brief Release ownership of the managed message. If this is the last reference: on the * subscriber side, notifies the kernel module that the message can be reclaimed; on the @@ -354,6 +400,12 @@ class ipc_shared_ptr const bool was_last = control_->decrement_and_check(); if (was_last) { + // GPU cleanup must run BEFORE bitmap release: unmapping the GPU buffer before + // the publisher is allowed to cudaFree the underlying allocation. + if (control_->gpu_release_fn) { + control_->gpu_release_fn(control_->gpu_data_ptr); + } + if (control_->entry_id != ENTRY_ID_NOT_ASSIGNED) { // Subscriber side: notify kmod that all references are released. 
release_subscriber_reference(control_->topic_name, control_->pubsub_id, control_->entry_id); diff --git a/src/agnocastlib/include/agnocast/agnocast_subscription.hpp b/src/agnocastlib/include/agnocast/agnocast_subscription.hpp index f2d296485..9c06196c9 100644 --- a/src/agnocastlib/include/agnocast/agnocast_subscription.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_subscription.hpp @@ -356,7 +356,7 @@ class BasicTakeSubscription : public SubscriptionBase MessageT * ptr = reinterpret_cast(take_args.ret_addr); auto result = - agnocast::ipc_shared_ptr(ptr, topic_name_, id_, take_args.ret_entry_id); + create_subscriber_ipc_ptr(ptr, topic_name_, id_, take_args.ret_entry_id); old_ptr = std::move(last_taken_ptr_); last_taken_ptr_ = result; return result; @@ -364,7 +364,7 @@ class BasicTakeSubscription : public SubscriptionBase } MessageT * ptr = reinterpret_cast(take_args.ret_addr); - return agnocast::ipc_shared_ptr(ptr, topic_name_, id_, take_args.ret_entry_id); + return create_subscriber_ipc_ptr(ptr, topic_name_, id_, take_args.ret_entry_id); } }; diff --git a/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp b/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp index 984930f6e..942ad7978 100644 --- a/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp +++ b/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp @@ -4,6 +4,7 @@ #include "agnocast/agnocast_publisher.hpp" #include "agnocast/agnocast_subscription.hpp" #include "agnocast/bridge/agnocast_bridge_utils.hpp" +#include "agnocast/cuda_message_tag.hpp" #include "rclcpp/rclcpp.hpp" #include @@ -39,11 +40,25 @@ template void request_bridge_core( const std::string & topic_name, topic_local_id_t id, BridgeDirection direction) { - auto bridge_mode = get_bridge_mode(); - if (bridge_mode == BridgeMode::Standard) { - send_bridge_request(topic_name, id, direction); - } else if (bridge_mode == BridgeMode::Performance) { - send_performance_bridge_request(topic_name, id, 
direction); + // CUDA message types cannot be bridged to ROS 2 directly (GPU pointers are not serializable). + // Bridge support for CUDA types (via cudaMemcpy D2H) is future work. + if constexpr (is_cuda_message_v<MessageT>) { + static const auto logger = rclcpp::get_logger("agnocast_bridge_requester"); + RCLCPP_WARN( + logger, + "Bridge skipped for CUDA topic '%s': GPU message types cannot be bridged to ROS 2. " + "Use cudaMemcpy to a standard ROS message if DDS bridging is needed.", + topic_name.c_str()); + (void)id; + (void)direction; + return; + } else { + auto bridge_mode = get_bridge_mode(); + if (bridge_mode == BridgeMode::Standard) { + send_bridge_request(topic_name, id, direction); + } else if (bridge_mode == BridgeMode::Performance) { + send_performance_bridge_request(topic_name, id, direction); + } } } diff --git a/src/agnocastlib/include/agnocast/cuda_message_tag.hpp b/src/agnocastlib/include/agnocast/cuda_message_tag.hpp new file mode 100644 index 000000000..969221b01 --- /dev/null +++ b/src/agnocastlib/include/agnocast/cuda_message_tag.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include <cstddef> +#include <type_traits> + +namespace agnocast +{ + +// Base class for CUDA message types. Provides a pointer to GpuMetadata stored in shared memory. +// CUDA message types (e.g., agnocast::cuda::PointCloud2) inherit from both a ROS message type +// and this tag. The publish/receive machinery uses is_cuda_message_v to detect CUDA messages +// at compile time via if constexpr, so no CUDA headers are needed in agnocastlib. +struct cuda_message_tag +{ + void * gpu_metadata_ = nullptr; +}; + +template <typename T> +inline constexpr bool is_cuda_message_v = + std::is_base_of_v<cuda_message_tag, std::remove_cv_t<T>>; + +// Returns the GPU data size for a CUDA message. Must be specialized by each CUDA message type +// in the agnocast_cuda package (e.g., for PointCloud2: height * width * point_step).
+template <typename T> +size_t get_cuda_gpu_data_size(const T & msg); + +} // namespace agnocast diff --git a/src/agnocastlib/include/agnocast/gpu_metadata.hpp b/src/agnocastlib/include/agnocast/gpu_metadata.hpp new file mode 100644 index 000000000..f4c860cef --- /dev/null +++ b/src/agnocastlib/include/agnocast/gpu_metadata.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include <cstddef> +#include <cstdint> + +namespace agnocast +{ + +// Opaque handle large enough for cudaIpcMemHandle_t (64 bytes). +struct GpuHandle +{ + uint8_t opaque[64]; +}; + +// GPU sharing metadata stored in shared memory alongside the message. +// Allocated by the publish path (while heaphook is active) so it lands in the publisher's +// shared memory region and is readable by subscribers. +struct GpuMetadata +{ + GpuHandle handle; // backend-specific shareable handle + size_t gpu_data_size; // size of the GPU allocation in bytes + void * publisher_gpu_ptr; // original device pointer for publisher-side free on reclaim +}; + +} // namespace agnocast diff --git a/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp b/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp new file mode 100644 index 000000000..4f91aca30 --- /dev/null +++ b/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include "agnocast/gpu_metadata.hpp" + +#include <cstddef> + +namespace agnocast::cuda +{ + +// Abstract interface for GPU memory sharing backends. +// Concrete implementations (CudaIpcBackend, etc.) live in the agnocast_cuda package. +class GpuTransferBackend +{ +public: + virtual ~GpuTransferBackend() = default; + + // Publisher side: create a shareable handle from a device pointer. + virtual GpuHandle export_handle(void * device_ptr, size_t size) = 0; + + // Publisher side: free GPU buffer on reclaim. + virtual void free_device_memory(void * device_ptr) = 0; + + // Subscriber side: map GPU buffer into this process.
+ virtual void * import_handle(const GpuHandle & handle, size_t size) = 0; + + // Subscriber side: unmap GPU buffer from this process. + virtual void release_handle(void * local_ptr) = 0; +}; + +// Returns the singleton backend instance. Defined in the agnocast_cuda package and resolved +// at link time. Only called from if-constexpr branches guarded by is_cuda_message_v, +// so the symbol is never referenced unless a CUDA message type is actually used. +GpuTransferBackend & get_backend(); + +} // namespace agnocast::cuda