diff --git a/scripts/sample_application/run_cuda_listener.bash b/scripts/sample_application/run_cuda_listener.bash
new file mode 100755
index 000000000..296600719
--- /dev/null
+++ b/scripts/sample_application/run_cuda_listener.bash
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+source install/setup.bash
+ros2 launch agnocast_sample_application cuda_listener.launch.xml
diff --git a/scripts/sample_application/run_cuda_talker.bash b/scripts/sample_application/run_cuda_talker.bash
new file mode 100755
index 000000000..3c8318bad
--- /dev/null
+++ b/scripts/sample_application/run_cuda_talker.bash
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+source install/setup.bash
+ros2 launch agnocast_sample_application cuda_talker.launch.xml
diff --git a/src/agnocast/package.xml b/src/agnocast/package.xml
index ae223d158..8e7a132eb 100644
--- a/src/agnocast/package.xml
+++ b/src/agnocast/package.xml
@@ -18,6 +18,9 @@
agnocast_cie_config_msgs
agnocast_cie_thread_configurator
agnocast_components
+
+  <exec_depend>agnocast_cuda</exec_depend>
agnocast_ioctl_wrapper
agnocast_sample_application
agnocast_sample_interfaces
diff --git a/src/agnocast_cuda/CMakeLists.txt b/src/agnocast_cuda/CMakeLists.txt
new file mode 100644
index 000000000..dba9e71f1
--- /dev/null
+++ b/src/agnocast_cuda/CMakeLists.txt
@@ -0,0 +1,69 @@
+cmake_minimum_required(VERSION 3.14)
+project(agnocast_cuda LANGUAGES CXX)
+
+if(NOT CMAKE_CXX_STANDARD)
+ set(CMAKE_CXX_STANDARD 17)
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ add_compile_options(-Wall -Wextra -Wpedantic)
+endif()
+
+find_package(ament_cmake REQUIRED)
+find_package(agnocastlib REQUIRED)
+find_package(sensor_msgs REQUIRED)
+
+# ---- Runtime CUDA loading (no build-time CUDA dependency) ----
+#
+# This package has ZERO build-time dependency on the CUDA toolkit. It does NOT use
+# find_package(CUDAToolkit), enable_language(CUDA), or link against CUDA::cudart.
+#
+# Instead, libcudart.so is loaded at runtime via dlopen() (see src/cudart_loader.hpp).
+# This allows the package to be built as a pre-built .deb on the ROS build farm, which
+# has no CUDA toolchain. Users install CUDA separately on their target machines.
+#
+# The only system library we link is libdl (via ${CMAKE_DL_LIBS}) for dlopen/dlsym.
+#
+# IMPORTANT FOR DOWNSTREAM PACKAGES:
+# Since this library always builds (even without CUDA), find_package(agnocast_cuda) and
+# TARGET agnocast_cuda::agnocast_cuda are NOT reliable indicators that CUDA is available.
+# Downstream packages that compile .cu files or call CUDA APIs directly (e.g., cudaMalloc)
+# must ALSO check find_package(CUDAToolkit) independently. See the sample application's
+# CMakeLists.txt for the correct pattern:
+# find_package(CUDAToolkit QUIET)
+# find_package(agnocast_cuda QUIET)
+# if(CUDAToolkit_FOUND AND agnocast_cuda_FOUND)
+# ...
+# endif()
+add_library(agnocast_cuda SHARED
+ src/cuda_ipc_backend.cpp
+ src/vmm_backend.cpp
+ src/nvscibuf_backend.cpp
+ src/unified_memory_backend.cpp
+ src/get_backend.cpp)
+
+target_include_directories(agnocast_cuda PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  $<INSTALL_INTERFACE:include>)
+
+ament_target_dependencies(agnocast_cuda agnocastlib sensor_msgs)
+target_link_libraries(agnocast_cuda ${CMAKE_DL_LIBS})
+
+install(TARGETS agnocast_cuda
+ EXPORT export_${PROJECT_NAME}
+ LIBRARY DESTINATION lib
+ ARCHIVE DESTINATION lib
+ RUNTIME DESTINATION bin
+ INCLUDES DESTINATION include)
+
+ament_export_targets(export_${PROJECT_NAME} HAS_LIBRARY_TARGET)
+ament_export_dependencies(agnocastlib sensor_msgs)
+
+# Always install headers so downstream packages can use the type definitions
+# (e.g., cuda_message_tag.hpp is in agnocastlib, but types.hpp is here).
+install(
+ DIRECTORY include/
+ DESTINATION include)
+
+ament_export_include_directories(include)
+ament_package()
diff --git a/src/agnocast_cuda/include/agnocast/cuda/message_types.hpp b/src/agnocast_cuda/include/agnocast/cuda/message_types.hpp
new file mode 100644
index 000000000..c1d8a90c8
--- /dev/null
+++ b/src/agnocast_cuda/include/agnocast/cuda/message_types.hpp
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "agnocast/cuda_message_tag.hpp"
+
+#include <sensor_msgs/msg/image.hpp>
+#include <sensor_msgs/msg/point_cloud2.hpp>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace agnocast::cuda
+{
+
+struct PointCloud2 : public sensor_msgs::msg::PointCloud2, public agnocast::cuda_message_tag
+{
+ uint8_t * data = nullptr; // GPU device pointer (shadows base class std::vector data)
+};
+
+struct Image : public sensor_msgs::msg::Image, public agnocast::cuda_message_tag
+{
+ uint8_t * data = nullptr; // GPU device pointer (shadows base class std::vector data)
+};
+
+} // namespace agnocast::cuda
+
+namespace agnocast
+{
+
+template <>
+inline size_t get_cuda_gpu_data_size(const agnocast::cuda::PointCloud2 & msg)
+{
+  return static_cast<size_t>(msg.height) * static_cast<size_t>(msg.width) *
+         static_cast<size_t>(msg.point_step);
+}
+
+template <>
+inline size_t get_cuda_gpu_data_size(const agnocast::cuda::Image & msg)
+{
+  return static_cast<size_t>(msg.height) * msg.step;
+}
+
+} // namespace agnocast
diff --git a/src/agnocast_cuda/package.xml b/src/agnocast_cuda/package.xml
new file mode 100644
index 000000000..b01f3bc15
--- /dev/null
+++ b/src/agnocast_cuda/package.xml
@@ -0,0 +1,25 @@
+
+
+
+ agnocast_cuda
+ 2.3.3
+
+ CUDA IPC support for Agnocast. Provides GPU message types and cross-process GPU buffer sharing.
+
+ Takahiro Ishikawa-Aso
+ Koichi Imai
+ Masaki Nishikawa
+ Apache License 2.0
+
+
+ Takahiro Ishikawa-Aso
+
+ ament_cmake
+
+ agnocastlib
+ sensor_msgs
+
+
+ ament_cmake
+
+
diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.cpp b/src/agnocast_cuda/src/cuda_ipc_backend.cpp
new file mode 100644
index 000000000..36548dc32
--- /dev/null
+++ b/src/agnocast_cuda/src/cuda_ipc_backend.cpp
@@ -0,0 +1,60 @@
+#include "cuda_ipc_backend.hpp"
+
+#include "cudart_loader.hpp"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+namespace
+{
+
+void check_cuda_error(agnocast::cuda::cudaError_t err, const char * operation)
+{
+ if (err != agnocast::cuda::cudaSuccess) {
+ std::fprintf(
+ stderr, "[agnocast_cuda] FATAL: %s failed: %s\n", operation,
+ agnocast::cuda::CudartLoader::instance().cudaGetErrorString(err));
+ std::abort();
+ }
+}
+
+} // namespace
+
+namespace agnocast::cuda
+{
+
+GpuHandle CudaIpcBackend::export_handle(void * device_ptr, size_t /*size*/)
+{
+ GpuHandle h{};
+ static_assert(sizeof(cudaIpcMemHandle_t) <= sizeof(h.opaque));
+ cudaIpcMemHandle_t ipc_handle;
+ check_cuda_error(
+ CudartLoader::instance().cudaIpcGetMemHandle(&ipc_handle, device_ptr), "cudaIpcGetMemHandle");
+ std::memcpy(h.opaque, &ipc_handle, sizeof(ipc_handle));
+ return h;
+}
+
+void CudaIpcBackend::free_device_memory(void * device_ptr)
+{
+ check_cuda_error(CudartLoader::instance().cudaFree(device_ptr), "cudaFree");
+}
+
+void * CudaIpcBackend::import_handle(const GpuHandle & handle, size_t /*size*/)
+{
+ cudaIpcMemHandle_t ipc_handle;
+ std::memcpy(&ipc_handle, handle.opaque, sizeof(ipc_handle));
+ void * ptr = nullptr;
+ check_cuda_error(
+ CudartLoader::instance().cudaIpcOpenMemHandle(&ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess),
+ "cudaIpcOpenMemHandle");
+ return ptr;
+}
+
+void CudaIpcBackend::release_handle(void * local_ptr)
+{
+ check_cuda_error(
+ CudartLoader::instance().cudaIpcCloseMemHandle(local_ptr), "cudaIpcCloseMemHandle");
+}
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.hpp b/src/agnocast_cuda/src/cuda_ipc_backend.hpp
new file mode 100644
index 000000000..fc262cea8
--- /dev/null
+++ b/src/agnocast_cuda/src/cuda_ipc_backend.hpp
@@ -0,0 +1,19 @@
+// Internal header — kept in src/ so it is NOT installed or visible to downstream packages.
+// Only get_backend.cpp includes this to instantiate the singleton.
+#pragma once
+
+#include "agnocast/gpu_transfer_backend.hpp"
+
+namespace agnocast::cuda
+{
+
+class CudaIpcBackend : public GpuTransferBackend
+{
+public:
+ GpuHandle export_handle(void * device_ptr, size_t size) override;
+ void free_device_memory(void * device_ptr) override;
+ void * import_handle(const GpuHandle & handle, size_t size) override;
+ void release_handle(void * local_ptr) override;
+};
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/cudart_loader.hpp b/src/agnocast_cuda/src/cudart_loader.hpp
new file mode 100644
index 000000000..7ebc1ab7d
--- /dev/null
+++ b/src/agnocast_cuda/src/cudart_loader.hpp
@@ -0,0 +1,185 @@
+// ============================================================================
+// Runtime loader for the CUDA Runtime API (libcudart.so) via dlopen/dlsym.
+//
+// WHY THIS EXISTS
+// ---------------
+// agnocast_cuda is distributed as a pre-built .deb from the ROS build farm, which has no
+// CUDA toolchain. To eliminate the build-time dependency on <cuda_runtime.h> and -lcudart,
+// we replicate the small set of CUDA types/constants we need (stable ABI) and load the
+// actual CUDA runtime library at runtime via dlopen. This means:
+// - Build time: pure C++17, no CUDA headers or compiler required.
+// - Run time: libcudart.so must be present on the target machine (user installs CUDA).
+//
+// SAFE ON NON-CUDA SYSTEMS
+// ------------------------
+// Users who do not use CUDA message types are completely unaffected by this library.
+// CudartLoader is lazy: dlopen("libcudart.so") only happens on the first call to
+// CudartLoader::instance(), which is only reachable through get_backend(), which is
+// only called from if-constexpr branches guarded by is_cuda_message_v. If no CUDA
+// message type is ever instantiated, this loader is never constructed and no CUDA symbols
+// are ever resolved. The library can be safely installed and loaded on machines without
+// a GPU or CUDA runtime.
+//
+// ABI STABILITY
+// -------------
+// The types and constants below are part of NVIDIA's stable C ABI for the CUDA Runtime API.
+// They have not changed across CUDA 10.x, 11.x, and 12.x. Specifically:
+// - cudaError_t is a C enum (int-sized). cudaSuccess has been 0 since CUDA 1.0.
+// - cudaIpcMemHandle_t is a 64-byte opaque struct (CUDA_IPC_HANDLE_SIZE = 64).
+// - cudaIpcMemLazyEnablePeerAccess is a flag constant (0x01).
+// - cudaDevAttrIntegrated is enum value 18 in cudaDeviceAttr (append-only enum).
+// If NVIDIA ever breaks this ABI (extremely unlikely), the static_assert in
+// cuda_ipc_backend.cpp will catch size mismatches at compile time.
+//
+// ADDING NEW CUDA API CALLS
+// -------------------------
+// When a new backend or feature needs an additional CUDA runtime function:
+// 1. Add the function pointer typedef below (e.g., using cudaMalloc_t = ...).
+// 2. Add a public member to CudartLoader (e.g., cudaMalloc_t cudaMalloc;).
+// 3. Add a load_symbol() call in the constructor.
+// Keep this file as the single place that touches dlopen/dlsym for CUDA.
+//
+// LIBRARY NAME FALLBACK CHAIN
+// ---------------------------
+// The constructor tries these names in order:
+// 1. "libcudart.so" — unversioned symlink, present when the CUDA dev package is
+// installed (e.g., cuda-cudart-dev-12-*) or with the runfile
+// installer.
+// 2. "libcudart.so.12" — SONAME for CUDA 12.x. Present with the runtime-only deb
+// package (cuda-cudart-12-*) even without the dev package.
+// 3. "libcudart.so.11.0"— SONAME for CUDA 11.x (NVIDIA used major.minor for the SONAME
+// in the 11.x series; all 11.0–11.8 share this SONAME).
+// If a future CUDA major version (e.g., 13.x) is released, add its SONAME to the list.
+// ============================================================================
+#pragma once
+
+#include <dlfcn.h>
+
+#include <cstdio>
+#include <cstdlib>
+
+namespace agnocast::cuda
+{
+
+// ---------------------------------------------------------------------------
+// ABI-compatible CUDA Runtime API type definitions.
+// These replace #include <cuda_runtime.h> so that no CUDA headers are needed at build time.
+// See "ABI STABILITY" section in the file header for rationale.
+// ---------------------------------------------------------------------------
+using cudaError_t = int;
+constexpr cudaError_t cudaSuccess = 0;
+
+struct cudaIpcMemHandle_t
+{
+ char reserved[64]; // CUDA_IPC_HANDLE_SIZE = 64
+};
+
+constexpr unsigned int cudaIpcMemLazyEnablePeerAccess = 0x01;
+
+using cudaDeviceAttr = int;
+constexpr cudaDeviceAttr cudaDevAttrIntegrated = 18; // driver_types.h: cudaDevAttrIntegrated = 18
+
+// ---------------------------------------------------------------------------
+// Function pointer types matching CUDA Runtime API signatures.
+// ---------------------------------------------------------------------------
+using cudaGetDevice_t = cudaError_t (*)(int *);
+using cudaDeviceGetAttribute_t = cudaError_t (*)(int *, cudaDeviceAttr, int);
+using cudaIpcGetMemHandle_t = cudaError_t (*)(cudaIpcMemHandle_t *, void *);
+using cudaIpcOpenMemHandle_t = cudaError_t (*)(void **, cudaIpcMemHandle_t, unsigned int);
+using cudaIpcCloseMemHandle_t = cudaError_t (*)(void *);
+using cudaFree_t = cudaError_t (*)(void *);
+using cudaGetErrorString_t = const char * (*)(cudaError_t);
+
+// ---------------------------------------------------------------------------
+// CudartLoader — lazy-loading singleton for the CUDA runtime.
+//
+// Thread safety: the function-local static in instance() is guaranteed to be
+// initialized exactly once (C++11 "magic statics"). After construction, all
+// function pointers are immutable and safe to read from any thread.
+//
+// Lifetime: the singleton is never destroyed during normal execution. We
+// intentionally do NOT call dlclose() — see destructor comment.
+// ---------------------------------------------------------------------------
+class CudartLoader
+{
+public:
+ static CudartLoader & instance()
+ {
+ static CudartLoader loader;
+ return loader;
+ }
+
+ // Public function pointers — valid immediately after instance() returns.
+ // Callers use these like: CudartLoader::instance().cudaFree(ptr)
+ cudaGetDevice_t cudaGetDevice;
+ cudaDeviceGetAttribute_t cudaDeviceGetAttribute;
+ cudaIpcGetMemHandle_t cudaIpcGetMemHandle;
+ cudaIpcOpenMemHandle_t cudaIpcOpenMemHandle;
+ cudaIpcCloseMemHandle_t cudaIpcCloseMemHandle;
+ cudaFree_t cudaFree;
+ cudaGetErrorString_t cudaGetErrorString;
+
+private:
+ void * handle_ = nullptr;
+
+ CudartLoader()
+ {
+ // Try library names in order. See "LIBRARY NAME FALLBACK CHAIN" in the file header.
+ dlerror(); // Clear any stale error before the dlopen loop.
+ const char * names[] = {"libcudart.so", "libcudart.so.12", "libcudart.so.11.0"};
+ for (const char * name : names) {
+ handle_ = dlopen(name, RTLD_NOW | RTLD_LOCAL);
+ if (handle_ != nullptr) {
+ break;
+ }
+ }
+ if (handle_ == nullptr) {
+ std::fprintf(
+ stderr,
+ "[agnocast_cuda] FATAL: Could not load libcudart.so.\n"
+ " Your code uses a CUDA message type (e.g., agnocast::cuda::PointCloud2 or\n"
+ " agnocast::cuda::Image), which requires the NVIDIA CUDA runtime to be installed\n"
+ " on this machine. If you do not need GPU-accelerated message passing, use the\n"
+ " standard (non-CUDA) message types instead.\n"
+ " To fix: install the CUDA runtime (e.g., 'sudo apt install cuda-cudart-12-*')\n"
+ " and ensure libcudart.so is on LD_LIBRARY_PATH.\n"
+ " dlopen error: %s\n",
+ dlerror());
+ std::abort();
+ }
+
+ load_symbol(cudaGetDevice, "cudaGetDevice");
+ load_symbol(cudaDeviceGetAttribute, "cudaDeviceGetAttribute");
+ load_symbol(cudaIpcGetMemHandle, "cudaIpcGetMemHandle");
+ load_symbol(cudaIpcOpenMemHandle, "cudaIpcOpenMemHandle");
+ load_symbol(cudaIpcCloseMemHandle, "cudaIpcCloseMemHandle");
+ load_symbol(cudaFree, "cudaFree");
+ load_symbol(cudaGetErrorString, "cudaGetErrorString");
+ }
+
+ // Intentionally never close the handle. The CUDA runtime maintains process-global state
+ // (device context, allocations, etc.). Calling dlclose() during static destruction can
+ // race with other CUDA cleanup and cause segfaults. The OS reclaims everything at exit.
+ ~CudartLoader() = default;
+
+ CudartLoader(const CudartLoader &) = delete;
+ CudartLoader & operator=(const CudartLoader &) = delete;
+
+  template <typename T>
+ void load_symbol(T & func_ptr, const char * name)
+ {
+ // POSIX requires clearing dlerror() before calling dlsym(), then checking dlerror()
+ // after, because dlsym() can legitimately return NULL for some symbols.
+ dlerror();
+    func_ptr = reinterpret_cast<T>(dlsym(handle_, name));
+ const char * err = dlerror();
+ if (err != nullptr) {
+ std::fprintf(
+ stderr, "[agnocast_cuda] FATAL: Could not load symbol '%s' from libcudart.so: %s\n", name,
+ err);
+ std::abort();
+ }
+ }
+};
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/get_backend.cpp b/src/agnocast_cuda/src/get_backend.cpp
new file mode 100644
index 000000000..12b428099
--- /dev/null
+++ b/src/agnocast_cuda/src/get_backend.cpp
@@ -0,0 +1,65 @@
+#include "agnocast/gpu_transfer_backend.hpp"
+#include "cuda_ipc_backend.hpp"
+#include "cudart_loader.hpp"
+#include "nvscibuf_backend.hpp"
+#include "unified_memory_backend.hpp"
+#include "vmm_backend.hpp"
+
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+
+namespace agnocast::cuda
+{
+
+namespace
+{
+
+std::unique_ptr<GpuTransferBackend> select_backend()
+{
+ const auto & api = CudartLoader::instance();
+
+ int device = 0;
+ cudaError_t err = api.cudaGetDevice(&device);
+ if (err != cudaSuccess) {
+ std::fprintf(
+ stderr, "[agnocast_cuda] FATAL: cudaGetDevice failed: %s\n", api.cudaGetErrorString(err));
+ std::abort();
+ }
+
+ int is_integrated = 0;
+ err = api.cudaDeviceGetAttribute(&is_integrated, cudaDevAttrIntegrated, device);
+ if (err != cudaSuccess) {
+ std::fprintf(
+ stderr, "[agnocast_cuda] FATAL: cudaDeviceGetAttribute failed: %s\n",
+ api.cudaGetErrorString(err));
+ std::abort();
+ }
+
+ if (!is_integrated) {
+ // Discrete GPU (GeForce, Quadro, Tesla, A/H series) — CUDA IPC is supported.
+ std::fprintf(stderr, "[agnocast_cuda] Discrete GPU detected, using CudaIpcBackend.\n");
+    return std::make_unique<CudaIpcBackend>();
+ }
+
+ // Integrated GPU (Jetson Xavier/Orin/Thor, DRIVE).
+ // TODO(agnocast): Implement and select the appropriate backend.
+ // - Jetson Thor (CUDA 13.0+): CudaIpcBackend may work via OpenRM.
+ // - Jetson Xavier/Orin: NvSciBufBackend or UnifiedMemoryBackend.
+ // - DRIVE: NvSciBufBackend.
+ std::fprintf(
+ stderr,
+ "[agnocast_cuda] FATAL: Integrated GPU detected (Jetson/DRIVE). "
+ "No backend is implemented yet for this platform.\n");
+ std::abort();
+}
+
+} // namespace
+
+GpuTransferBackend & get_backend()
+{
+ static auto instance = select_backend();
+ return *instance;
+}
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/nvscibuf_backend.cpp b/src/agnocast_cuda/src/nvscibuf_backend.cpp
new file mode 100644
index 000000000..0e900cedf
--- /dev/null
+++ b/src/agnocast_cuda/src/nvscibuf_backend.cpp
@@ -0,0 +1,30 @@
+#include "nvscibuf_backend.hpp"
+
+#include <stdexcept>
+
+namespace agnocast::cuda
+{
+
+GpuHandle NvSciBufBackend::export_handle(void * /*device_ptr*/, size_t /*size*/)
+{
+ throw std::runtime_error(
+ "[agnocast_cuda] NvSciBufBackend is not yet implemented. "
+ "Requires NvSciBuf (Jetson Xavier/Orin, NVIDIA DRIVE).");
+}
+
+void NvSciBufBackend::free_device_memory(void * /*device_ptr*/)
+{
+ throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented.");
+}
+
+void * NvSciBufBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/)
+{
+ throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented.");
+}
+
+void NvSciBufBackend::release_handle(void * /*local_ptr*/)
+{
+ throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented.");
+}
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/nvscibuf_backend.hpp b/src/agnocast_cuda/src/nvscibuf_backend.hpp
new file mode 100644
index 000000000..b631d1ca4
--- /dev/null
+++ b/src/agnocast_cuda/src/nvscibuf_backend.hpp
@@ -0,0 +1,21 @@
+// Internal header — kept in src/ so it is NOT installed or visible to downstream packages.
+// Only get_backend.cpp includes this to instantiate the singleton.
+#pragma once
+
+#include "agnocast/gpu_transfer_backend.hpp"
+
+namespace agnocast::cuda
+{
+
+// Placeholder backend using NvSciBuf / NvSciSync.
+// Target platforms: Jetson Xavier/Orin, NVIDIA DRIVE.
+class NvSciBufBackend : public GpuTransferBackend
+{
+public:
+ GpuHandle export_handle(void * device_ptr, size_t size) override;
+ void free_device_memory(void * device_ptr) override;
+ void * import_handle(const GpuHandle & handle, size_t size) override;
+ void release_handle(void * local_ptr) override;
+};
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/unified_memory_backend.cpp b/src/agnocast_cuda/src/unified_memory_backend.cpp
new file mode 100644
index 000000000..af97fff3b
--- /dev/null
+++ b/src/agnocast_cuda/src/unified_memory_backend.cpp
@@ -0,0 +1,30 @@
+#include "unified_memory_backend.hpp"
+
+#include <stdexcept>
+
+namespace agnocast::cuda
+{
+
+GpuHandle UnifiedMemoryBackend::export_handle(void * /*device_ptr*/, size_t /*size*/)
+{
+ throw std::runtime_error(
+ "[agnocast_cuda] UnifiedMemoryBackend is not yet implemented. "
+ "Requires POSIX shm + cudaHostRegister (Jetson unified memory).");
+}
+
+void UnifiedMemoryBackend::free_device_memory(void * /*device_ptr*/)
+{
+ throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented.");
+}
+
+void * UnifiedMemoryBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/)
+{
+ throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented.");
+}
+
+void UnifiedMemoryBackend::release_handle(void * /*local_ptr*/)
+{
+ throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented.");
+}
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/unified_memory_backend.hpp b/src/agnocast_cuda/src/unified_memory_backend.hpp
new file mode 100644
index 000000000..f9916f203
--- /dev/null
+++ b/src/agnocast_cuda/src/unified_memory_backend.hpp
@@ -0,0 +1,21 @@
+// Internal header — kept in src/ so it is NOT installed or visible to downstream packages.
+// Only get_backend.cpp includes this to instantiate the singleton.
+#pragma once
+
+#include "agnocast/gpu_transfer_backend.hpp"
+
+namespace agnocast::cuda
+{
+
+// Placeholder backend using POSIX shared memory + cudaHostRegister for Jetson platforms
+// where CPU and GPU share the same physical memory (unified memory architecture).
+class UnifiedMemoryBackend : public GpuTransferBackend
+{
+public:
+ GpuHandle export_handle(void * device_ptr, size_t size) override;
+ void free_device_memory(void * device_ptr) override;
+ void * import_handle(const GpuHandle & handle, size_t size) override;
+ void release_handle(void * local_ptr) override;
+};
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/vmm_backend.cpp b/src/agnocast_cuda/src/vmm_backend.cpp
new file mode 100644
index 000000000..4c14cbcd9
--- /dev/null
+++ b/src/agnocast_cuda/src/vmm_backend.cpp
@@ -0,0 +1,30 @@
+#include "vmm_backend.hpp"
+
+#include <stdexcept>
+
+namespace agnocast::cuda
+{
+
+GpuHandle VmmBackend::export_handle(void * /*device_ptr*/, size_t /*size*/)
+{
+ throw std::runtime_error(
+ "[agnocast_cuda] VmmBackend is not yet implemented. "
+ "Requires cuMemExportToShareableHandle (CUDA Driver API).");
+}
+
+void VmmBackend::free_device_memory(void * /*device_ptr*/)
+{
+ throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented.");
+}
+
+void * VmmBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/)
+{
+ throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented.");
+}
+
+void VmmBackend::release_handle(void * /*local_ptr*/)
+{
+ throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented.");
+}
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/vmm_backend.hpp b/src/agnocast_cuda/src/vmm_backend.hpp
new file mode 100644
index 000000000..491e5544a
--- /dev/null
+++ b/src/agnocast_cuda/src/vmm_backend.hpp
@@ -0,0 +1,22 @@
+// Internal header — kept in src/ so it is NOT installed or visible to downstream packages.
+// Only get_backend.cpp includes this to instantiate the singleton.
+#pragma once
+
+#include "agnocast/gpu_transfer_backend.hpp"
+
+namespace agnocast::cuda
+{
+
+// Placeholder backend using CUDA Virtual Memory Management (VMM) API.
+// Uses cuMemExportToShareableHandle / cuMemImportFromShareableHandle.
+// Supported on Jetson Orin (newer CUDA) and discrete GPUs with CUDA 10.2+.
+class VmmBackend : public GpuTransferBackend
+{
+public:
+ GpuHandle export_handle(void * device_ptr, size_t size) override;
+ void free_device_memory(void * device_ptr) override;
+ void * import_handle(const GpuHandle & handle, size_t size) override;
+ void release_handle(void * local_ptr) override;
+};
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_sample_application/CMakeLists.txt b/src/agnocast_sample_application/CMakeLists.txt
index 41d2aa423..7195e2e5a 100644
--- a/src/agnocast_sample_application/CMakeLists.txt
+++ b/src/agnocast_sample_application/CMakeLists.txt
@@ -165,6 +165,41 @@ install(TARGETS no_rclcpp_client
install(TARGETS sim_time_timer
DESTINATION lib/${PROJECT_NAME})
+# CUDA sample applications (only built when the CUDA compiler, CUDA toolkit, and
+# agnocast_cuda are all available). Note: agnocast_cuda itself has no build-time CUDA
+# dependency (it dlopen's libcudart.so at runtime), but these sample apps compile .cu
+# files with CUDA kernels, so they need the full CUDA toolchain.
+# These binaries are NOT included in the pre-built deb from the ROS build farm (which
+# has no CUDA). Users who want to try the CUDA samples must build from source with CUDA.
+include(CheckLanguage)
+check_language(CUDA)
+find_package(agnocast_cuda QUIET)
+if(CMAKE_CUDA_COMPILER AND agnocast_cuda_FOUND)
+ enable_language(CUDA)
+ find_package(CUDAToolkit REQUIRED)
+
+ add_executable(cuda_talker src/cuda_publisher.cpp)
+ set_source_files_properties(src/cuda_publisher.cpp PROPERTIES LANGUAGE CUDA)
+ ament_target_dependencies(cuda_talker agnocastlib agnocast_cuda sensor_msgs)
+ target_link_libraries(cuda_talker CUDA::cudart)
+ target_include_directories(cuda_talker PRIVATE
+ ${agnocastlib_INCLUDE_DIRS}
+ ${agnocast_cuda_INCLUDE_DIRS}
+ )
+
+ add_executable(cuda_listener src/cuda_subscriber.cpp)
+ set_source_files_properties(src/cuda_subscriber.cpp PROPERTIES LANGUAGE CUDA)
+ ament_target_dependencies(cuda_listener agnocastlib agnocast_cuda sensor_msgs)
+ target_link_libraries(cuda_listener CUDA::cudart)
+ target_include_directories(cuda_listener PRIVATE
+ ${agnocastlib_INCLUDE_DIRS}
+ ${agnocast_cuda_INCLUDE_DIRS}
+ )
+
+ install(TARGETS cuda_talker cuda_listener
+ DESTINATION lib/${PROJECT_NAME})
+endif()
+
install(DIRECTORY launch
DESTINATION share/${PROJECT_NAME}/
)
diff --git a/src/agnocast_sample_application/launch/cuda_listener.launch.xml b/src/agnocast_sample_application/launch/cuda_listener.launch.xml
new file mode 100644
index 000000000..4136b9649
--- /dev/null
+++ b/src/agnocast_sample_application/launch/cuda_listener.launch.xml
@@ -0,0 +1,5 @@
+<launch>
+  <node pkg="agnocast_sample_application" exec="cuda_listener" output="screen">
+    <env name="LD_PRELOAD" value="libagnocast_heaphook.so"/>
+  </node>
+</launch>
diff --git a/src/agnocast_sample_application/launch/cuda_talker.launch.xml b/src/agnocast_sample_application/launch/cuda_talker.launch.xml
new file mode 100644
index 000000000..ffad40cb2
--- /dev/null
+++ b/src/agnocast_sample_application/launch/cuda_talker.launch.xml
@@ -0,0 +1,5 @@
+<launch>
+  <node pkg="agnocast_sample_application" exec="cuda_talker" output="screen">
+    <env name="LD_PRELOAD" value="libagnocast_heaphook.so"/>
+  </node>
+</launch>
diff --git a/src/agnocast_sample_application/package.xml b/src/agnocast_sample_application/package.xml
index cf534190d..1315e81e3 100644
--- a/src/agnocast_sample_application/package.xml
+++ b/src/agnocast_sample_application/package.xml
@@ -31,6 +31,13 @@
ament_lint_auto
ament_lint_common
+
+  <depend>agnocast_cuda</depend>
+  <depend>sensor_msgs</depend>
+
ament_cmake
diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp
new file mode 100644
index 000000000..43775e30c
--- /dev/null
+++ b/src/agnocast_sample_application/src/cuda_publisher.cpp
@@ -0,0 +1,82 @@
+#include "agnocast/agnocast.hpp"
+#include "agnocast/cuda/message_types.hpp"
+
+#include <cuda_runtime.h>
+
+using namespace std::chrono_literals;
+
+// Simple CUDA kernel: fills GPU buffer with incrementing values.
+__global__ void fill_kernel(uint8_t * data, size_t size, uint8_t offset)
+{
+ size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx < size) {
+ data[idx] = static_cast((idx + offset) % 256);
+ }
+}
+
+class CudaPublisher : public agnocast::Node
+{
+ int64_t count_ = 0;
+  agnocast::Publisher<agnocast::cuda::PointCloud2>::SharedPtr pub_;
+ agnocast::TimerBase::SharedPtr timer_;
+
+ void timer_callback()
+ {
+ auto msg = pub_->borrow_loaned_message();
+
+ // Set CPU metadata
+    msg->header.stamp.sec = static_cast<int32_t>(count_);
+ msg->header.frame_id = "lidar";
+ msg->height = 1;
+ msg->width = 1024;
+ msg->point_step = 16;
+ msg->row_step = msg->width * msg->point_step;
+ msg->is_dense = true;
+
+ // Allocate and fill GPU data
+    const size_t gpu_size = static_cast<size_t>(msg->height) * static_cast<size_t>(msg->width) *
+                            static_cast<size_t>(msg->point_step);
+ const cudaError_t malloc_result = cudaMalloc(&msg->data, gpu_size);
+ if (malloc_result != cudaSuccess) {
+ RCLCPP_ERROR(get_logger(), "cudaMalloc failed: %s", cudaGetErrorString(malloc_result));
+ return;
+ }
+
+ const int threads = 256;
+ const int blocks = (gpu_size + threads - 1) / threads;
+ // cppcheck-suppress shiftTooManyBits // false positive: <<< >>> is CUDA kernel launch syntax
+    fill_kernel<<<blocks, threads>>>(msg->data, gpu_size, static_cast<uint8_t>(count_));
+
+ const cudaError_t sync_result = cudaStreamSynchronize(nullptr);
+ if (sync_result != cudaSuccess) {
+ RCLCPP_ERROR(get_logger(), "kernel launch failed: %s", cudaGetErrorString(sync_result));
+ cudaFree(msg->data);
+ msg->data = nullptr;
+ return;
+ }
+
+ pub_->publish(std::move(msg));
+ RCLCPP_INFO(
+ get_logger(), "published CUDA PointCloud2: seq=%lld, gpu_size=%zu",
+      static_cast<long long>(count_++), gpu_size);
+ }
+
+public:
+ CudaPublisher() : Node("cuda_publisher")
+ {
+    pub_ = this->create_publisher<agnocast::cuda::PointCloud2>("/cuda_pointcloud", 1);
+ timer_ = agnocast::create_timer(
+      this, std::make_shared<rclcpp::Clock>(RCL_STEADY_TIME), rclcpp::Duration(100ms),
+ std::bind(&CudaPublisher::timer_callback, this));
+ }
+};
+
+int main(int argc, char ** argv)
+{
+ agnocast::init(argc, argv);
+ agnocast::AgnocastOnlySingleThreadedExecutor executor;
+  auto node = std::make_shared<CudaPublisher>();
+ executor.add_node(node);
+ executor.spin();
+ return 0;
+}
diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp
new file mode 100644
index 000000000..9f1be14e9
--- /dev/null
+++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp
@@ -0,0 +1,62 @@
+#include "agnocast/agnocast.hpp"
+#include "agnocast/cuda/message_types.hpp"
+
+#include <cuda_runtime.h>
+
+#include <algorithm>
+
+using std::placeholders::_1;
+
+class CudaSubscriber : public agnocast::Node
+{
+  agnocast::Subscription<agnocast::cuda::PointCloud2>::SharedPtr sub_;
+
+  void callback(agnocast::ipc_shared_ptr<agnocast::cuda::PointCloud2> msg)
+ {
+ // Read CPU metadata from shared memory
+ const uint32_t width = msg->width;
+ const uint32_t point_step = msg->point_step;
+    const size_t gpu_size = static_cast<size_t>(msg->height) * static_cast<size_t>(width) *
+                            static_cast<size_t>(point_step);
+
+ // Get subscriber-local GPU pointer (mapped via CUDA IPC)
+    auto * gpu_ptr = static_cast<uint8_t *>(msg.gpu_data());
+ if (!gpu_ptr) {
+ RCLCPP_ERROR(get_logger(), "gpu_data() returned nullptr");
+ return;
+ }
+
+ // Read first few bytes from GPU to verify data
+ uint8_t host_buf[16]{};
+ const size_t copy_size = std::min(gpu_size, sizeof(host_buf));
+ const cudaError_t memcpy_result =
+ cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost);
+ if (memcpy_result != cudaSuccess) {
+ RCLCPP_ERROR(get_logger(), "cudaMemcpy failed: %s", cudaGetErrorString(memcpy_result));
+ return;
+ }
+
+ RCLCPP_INFO(
+ get_logger(),
+ "received CUDA PointCloud2: width=%u, point_step=%u, gpu_size=%zu, "
+ "first_bytes=[%u,%u,%u,%u]",
+ width, point_step, gpu_size, host_buf[0], host_buf[1], host_buf[2], host_buf[3]);
+ }
+
+public:
+ CudaSubscriber() : Node("cuda_subscriber")
+ {
+    sub_ = this->create_subscription<agnocast::cuda::PointCloud2>(
+ "/cuda_pointcloud", 1, std::bind(&CudaSubscriber::callback, this, _1));
+ }
+};
+
+int main(int argc, char ** argv)
+{
+ agnocast::init(argc, argv);
+ agnocast::AgnocastOnlySingleThreadedExecutor executor;
+  auto node = std::make_shared<CudaSubscriber>();
+ executor.add_node(node);
+ executor.spin();
+ return 0;
+}
diff --git a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp
index 13b66fbf7..ff7b183de 100644
--- a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp
+++ b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp
@@ -1,7 +1,12 @@
#pragma once
#include "agnocast/agnocast_smart_pointer.hpp"
+#include "agnocast/agnocast_utils.hpp"
+#include "agnocast/cuda_message_tag.hpp"
+#include "agnocast/gpu_metadata.hpp"
+#include "agnocast/gpu_transfer_backend.hpp"
+#include <cstdlib>
#include
#include
@@ -44,7 +49,7 @@ struct CallbackInfo
rclcpp::CallbackGroup::SharedPtr callback_group;
TypeErasedCallback callback;
std::function(
- const void *, const std::string &, const topic_local_id_t, const uint64_t)>
+ const void *, const std::string &, const topic_local_id_t, const int64_t)>
message_creator;
bool need_epoll_update = true;
};
@@ -61,6 +66,42 @@ extern std::atomic need_epoll_updates;
uint32_t allocate_callback_info_id();
+// Creates an ipc_shared_ptr for a subscriber-received message.
+// For CUDA messages: imports the GPU handle, stores the subscriber-local GPU pointer in
+// control_block->gpu_data_ptr, and registers a gpu_release_fn to release the mapping
+// on last reference. The pointer is accessed via ipc_shared_ptr::gpu_data().
+// For non-CUDA messages: simply wraps the pointer.
+template <typename MessageT>
+agnocast::ipc_shared_ptr<MessageT> create_subscriber_ipc_ptr(
+ MessageT * msg, const std::string & topic_name, const topic_local_id_t subscriber_id,
+ const int64_t entry_id)
+{
+ if constexpr (is_cuda_message_v<MessageT>) {
+ auto * meta = static_cast<const GpuMetadata *>(msg->gpu_metadata_);
+ if (!meta) {
+ RCLCPP_ERROR(
+ logger,
+ "CUDA message on topic '%s' has null gpu_metadata_. "
+ "The publisher may have failed to set GpuMetadata during publish().",
+ topic_name.c_str());
+ std::abort();
+ }
+ void * gpu_data_ptr =
+ agnocast::cuda::get_backend().import_handle(meta->handle, meta->gpu_data_size);
+ // NOTE: If import_handle() fails, the backend aborts (fail-fast). If a future backend
+ // returns nullptr instead, the subscriber would get a null gpu pointer. Callers should
+ // check gpu_data() != nullptr before use.
+
+ auto ipc_ptr = agnocast::ipc_shared_ptr<MessageT>(msg, topic_name, subscriber_id, entry_id);
+ ipc_ptr.set_gpu_data_ptr(gpu_data_ptr);
+ ipc_ptr.set_gpu_release_fn(
+ [](void * ptr) { agnocast::cuda::get_backend().release_handle(ptr); });
+ return ipc_ptr;
+ } else {
+ return agnocast::ipc_shared_ptr<MessageT>(msg, topic_name, subscriber_id, entry_id);
+ }
+}
+
template
TypeErasedCallback get_erased_callback(Func && callback)
{
@@ -95,9 +136,9 @@ uint32_t register_callback(
auto message_creator = [](
const void * ptr, const std::string & topic_name,
const topic_local_id_t subscriber_id, const int64_t entry_id) {
- return std::make_unique>(agnocast::ipc_shared_ptr(
- const_cast(static_cast(ptr)), topic_name, subscriber_id,
- entry_id));
+ auto * msg = const_cast<MessageT *>(static_cast<const MessageT *>(ptr));
+ return std::make_unique>(
+ create_subscriber_ipc_ptr(msg, topic_name, subscriber_id, entry_id));
};
uint32_t callback_info_id = allocate_callback_info_id();
diff --git a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp
index 4e08394a0..720f14da7 100644
--- a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp
+++ b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp
@@ -6,6 +6,9 @@
#include "agnocast/agnocast_smart_pointer.hpp"
#include "agnocast/agnocast_tracepoint_wrapper.h"
#include "agnocast/agnocast_utils.hpp"
+#include "agnocast/cuda_message_tag.hpp"
+#include "agnocast/gpu_metadata.hpp"
+#include "agnocast/gpu_transfer_backend.hpp"
#include "rclcpp/detail/qos_parameters.hpp"
#include "rclcpp/rclcpp.hpp"
@@ -19,6 +22,7 @@
#include
#include
+#include
#include
#include
#include
@@ -207,7 +211,38 @@ class BasicPublisher
}
// Capture raw pointer BEFORE invalidation (get() returns nullptr after invalidation).
- const uint64_t msg_virtual_address = reinterpret_cast(message.get());
+ MessageT * raw_ptr = message.get();
+ const uint64_t msg_virtual_address = reinterpret_cast(raw_ptr);
+
+ // CUDA publish hook: export GPU handle and allocate GpuMetadata in shared memory.
+ // Runs while heaphook is still active, so GpuMetadata lands in the publisher's shared memory.
+ // NOTE: Assumes MessageT has a public `data` member (uint8_t*) pointing to the GPU allocation.
+ // All CUDA message types must provide this by shadowing the base ROS message's data field.
+ if constexpr (is_cuda_message_v<MessageT>) {
+ if (!raw_ptr->data) {
+ RCLCPP_ERROR(
+ logger,
+ "CUDA message on topic '%s' has null data pointer. "
+ "Did you forget to cudaMalloc(&msg->data, size) before publish()?",
+ topic_name_.c_str());
+ std::abort();
+ }
+ const size_t gpu_size = get_cuda_gpu_data_size(*raw_ptr);
+ if (gpu_size == 0) {
+ RCLCPP_ERROR(
+ logger,
+ "CUDA message on topic '%s' has gpu_data_size == 0. "
+ "Ensure message fields (height, width, point_step, etc.) are set before publish().",
+ topic_name_.c_str());
+ std::abort();
+ }
+ auto & backend = agnocast::cuda::get_backend();
+ auto * meta = new GpuMetadata();
+ meta->publisher_gpu_ptr = raw_ptr->data;
+ meta->gpu_data_size = gpu_size;
+ meta->handle = backend.export_handle(raw_ptr->data, gpu_size);
+ raw_ptr->gpu_metadata_ = meta;
+ }
// Invalidate all references sharing this handle's control block.
// Any remaining copies held elsewhere will fail-fast on dereference.
@@ -220,6 +255,16 @@ class BasicPublisher
for (uint32_t i = 0; i < publish_msg_args.ret_released_num; i++) {
MessageT * release_ptr = reinterpret_cast(publish_msg_args.ret_released_addrs[i]);
+ // CUDA reclaim hook: free GPU buffer before deleting the message.
+ // On abnormal publisher exit, free_device_memory() is never called, but GPU device
+ // memory is reclaimed by the CUDA driver when the process exits.
+ if constexpr (is_cuda_message_v<MessageT>) {
+ if (release_ptr->gpu_metadata_) {
+ auto * meta = static_cast<GpuMetadata *>(release_ptr->gpu_metadata_);
+ agnocast::cuda::get_backend().free_device_memory(meta->publisher_gpu_ptr);
+ delete meta;
+ }
+ }
delete release_ptr;
}
diff --git a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp
index c0181ceda..b3ffafe8f 100644
--- a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp
+++ b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp
@@ -40,10 +40,18 @@ extern int agnocast_fd;
// Sentinel value indicating entry_id has not been assigned (publisher-side, before publish).
constexpr int64_t ENTRY_ID_NOT_ASSIGNED = -1;
-// Forward declaration for friend access
+// Forward declarations for friend access
+template <typename T>
+class ipc_shared_ptr;
+
template
class BasicPublisher;
+template <typename MessageT>
+ipc_shared_ptr<MessageT> create_subscriber_ipc_ptr(
+ MessageT * msg, const std::string & topic_name, const topic_local_id_t subscriber_id,
+ const int64_t entry_id);
+
namespace detail
{
@@ -60,6 +68,18 @@ struct control_block
topic_local_id_t pubsub_id; // 4-byte alignment
std::atomic valid{true}; // 1-byte alignment
+ // Optional GPU cleanup function pointer. Null for non-CUDA messages.
+ // Called as gpu_release_fn(gpu_data_ptr) before bitmap release in reset() to ensure GPU
+ // mappings are released before the publisher can free the underlying GPU buffer.
+ // Uses a plain function pointer instead of std::function to avoid heap allocation and
+ // minimize overhead for non-CUDA messages (16 bytes for two pointers vs ~40+ bytes).
+ void (*gpu_release_fn)(void *) = nullptr;
+
+ // Subscriber-local GPU device pointer obtained via import_handle().
+ // Stored here because the shared memory message is mapped read-only by the subscriber,
+ // so we cannot inject the local pointer into msg->data.
+ void * gpu_data_ptr = nullptr;
+
control_block(std::string topic, topic_local_id_t pubsub, int64_t entry)
: topic_name(std::move(topic)), entry_id(entry), pubsub_id(pubsub)
{
@@ -105,6 +125,11 @@ class ipc_shared_ptr
template
friend class BasicPublisher;
+ // Allow create_subscriber_ipc_ptr to call set_gpu_release_fn() and set_gpu_data_ptr()
+ template <typename MessageT>
+ friend ipc_shared_ptr<MessageT> create_subscriber_ipc_ptr(
+ MessageT *, const std::string &, const topic_local_id_t, const int64_t);
+
// Allow converting constructors to access private members of ipc_shared_ptr
template
friend class ipc_shared_ptr;
@@ -133,6 +158,24 @@ class ipc_shared_ptr
}
}
+ // Sets a GPU release function to be invoked as fn(gpu_data_ptr) when the last reference
+ // is released. Private: only create_subscriber_ipc_ptr() should call this.
+ void set_gpu_release_fn(void (*fn)(void *))
+ {
+ if (control_) {
+ control_->gpu_release_fn = fn;
+ }
+ }
+
+ // Sets the subscriber-local GPU pointer (obtained via import_handle).
+ // Private: only create_subscriber_ipc_ptr() should call this.
+ void set_gpu_data_ptr(void * ptr)
+ {
+ if (control_) {
+ control_->gpu_data_ptr = ptr;
+ }
+ }
+
// Publisher-side constructor (entry_id not yet assigned).
// Creates control block for reference counting and one-shot invalidation.
// Private: users must call BasicPublisher::borrow_loaned_message() instead of constructing
@@ -339,6 +382,9 @@ class ipc_shared_ptr
AGNOCAST_PUBLIC
T * get() const noexcept { return is_invalidated_() ? nullptr : ptr_; }
+ // Returns the subscriber-local GPU device pointer, or nullptr for non-CUDA messages.
+ void * gpu_data() const noexcept { return control_ ? control_->gpu_data_ptr : nullptr; }
+
/**
* @brief Release ownership of the managed message. If this is the last reference: on the
* subscriber side, notifies the kernel module that the message can be reclaimed; on the
@@ -354,6 +400,12 @@ class ipc_shared_ptr
const bool was_last = control_->decrement_and_check();
if (was_last) {
+ // GPU cleanup must run BEFORE bitmap release: unmapping the GPU buffer before
+ // the publisher is allowed to cudaFree the underlying allocation.
+ if (control_->gpu_release_fn) {
+ control_->gpu_release_fn(control_->gpu_data_ptr);
+ }
+
if (control_->entry_id != ENTRY_ID_NOT_ASSIGNED) {
// Subscriber side: notify kmod that all references are released.
release_subscriber_reference(control_->topic_name, control_->pubsub_id, control_->entry_id);
diff --git a/src/agnocastlib/include/agnocast/agnocast_subscription.hpp b/src/agnocastlib/include/agnocast/agnocast_subscription.hpp
index f2d296485..9c06196c9 100644
--- a/src/agnocastlib/include/agnocast/agnocast_subscription.hpp
+++ b/src/agnocastlib/include/agnocast/agnocast_subscription.hpp
@@ -356,7 +356,7 @@ class BasicTakeSubscription : public SubscriptionBase
MessageT * ptr = reinterpret_cast(take_args.ret_addr);
auto result =
- agnocast::ipc_shared_ptr(ptr, topic_name_, id_, take_args.ret_entry_id);
+ create_subscriber_ipc_ptr(ptr, topic_name_, id_, take_args.ret_entry_id);
old_ptr = std::move(last_taken_ptr_);
last_taken_ptr_ = result;
return result;
@@ -364,7 +364,7 @@ class BasicTakeSubscription : public SubscriptionBase
}
MessageT * ptr = reinterpret_cast(take_args.ret_addr);
- return agnocast::ipc_shared_ptr(ptr, topic_name_, id_, take_args.ret_entry_id);
+ return create_subscriber_ipc_ptr(ptr, topic_name_, id_, take_args.ret_entry_id);
}
};
diff --git a/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp b/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp
index 984930f6e..942ad7978 100644
--- a/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp
+++ b/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp
@@ -4,6 +4,7 @@
#include "agnocast/agnocast_publisher.hpp"
#include "agnocast/agnocast_subscription.hpp"
#include "agnocast/bridge/agnocast_bridge_utils.hpp"
+#include "agnocast/cuda_message_tag.hpp"
#include "rclcpp/rclcpp.hpp"
#include
@@ -39,11 +40,25 @@ template
void request_bridge_core(
const std::string & topic_name, topic_local_id_t id, BridgeDirection direction)
{
- auto bridge_mode = get_bridge_mode();
- if (bridge_mode == BridgeMode::Standard) {
- send_bridge_request(topic_name, id, direction);
- } else if (bridge_mode == BridgeMode::Performance) {
- send_performance_bridge_request(topic_name, id, direction);
+ // CUDA message types cannot be bridged to ROS 2 directly (GPU pointers are not serializable).
+ // Bridge support for CUDA types (via cudaMemcpy D2H) is future work.
+ if constexpr (is_cuda_message_v) {
+ static const auto logger = rclcpp::get_logger("agnocast_bridge_requester");
+ RCLCPP_WARN(
+ logger,
+ "Bridge skipped for CUDA topic '%s': GPU message types cannot be bridged to ROS 2. "
+ "Use cudaMemcpy to a standard ROS message if DDS bridging is needed.",
+ topic_name.c_str());
+ (void)id;
+ (void)direction;
+ return;
+ } else {
+ auto bridge_mode = get_bridge_mode();
+ if (bridge_mode == BridgeMode::Standard) {
+ send_bridge_request(topic_name, id, direction);
+ } else if (bridge_mode == BridgeMode::Performance) {
+ send_performance_bridge_request(topic_name, id, direction);
+ }
}
}
diff --git a/src/agnocastlib/include/agnocast/cuda_message_tag.hpp b/src/agnocastlib/include/agnocast/cuda_message_tag.hpp
new file mode 100644
index 000000000..969221b01
--- /dev/null
+++ b/src/agnocastlib/include/agnocast/cuda_message_tag.hpp
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <cstddef>
+#include <type_traits>
+
+namespace agnocast
+{
+
+// Base class for CUDA message types. Provides a pointer to GpuMetadata stored in shared memory.
+// CUDA message types (e.g., agnocast::cuda::PointCloud2) inherit from both a ROS message type
+// and this tag. The publish/receive machinery uses is_cuda_message_v to detect CUDA messages
+// at compile time via if constexpr, so no CUDA headers are needed in agnocastlib.
+struct cuda_message_tag
+{
+ void * gpu_metadata_ = nullptr;
+};
+
+template <typename T>
+inline constexpr bool is_cuda_message_v =
+ std::is_base_of_v<cuda_message_tag, std::remove_cv_t<T>>;
+
+// Returns the GPU data size for a CUDA message. Must be specialized by each CUDA message type
+// in the agnocast_cuda package (e.g., for PointCloud2: height * width * point_step).
+template <typename T>
+size_t get_cuda_gpu_data_size(const T & msg);
+
+} // namespace agnocast
diff --git a/src/agnocastlib/include/agnocast/gpu_metadata.hpp b/src/agnocastlib/include/agnocast/gpu_metadata.hpp
new file mode 100644
index 000000000..f4c860cef
--- /dev/null
+++ b/src/agnocastlib/include/agnocast/gpu_metadata.hpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace agnocast
+{
+
+// Opaque handle large enough for cudaIpcMemHandle_t (64 bytes).
+struct GpuHandle
+{
+ uint8_t opaque[64];
+};
+
+// GPU sharing metadata stored in shared memory alongside the message.
+// Allocated by the publish path (while heaphook is active) so it lands in the publisher's
+// shared memory region and is readable by subscribers.
+struct GpuMetadata
+{
+ GpuHandle handle; // backend-specific shareable handle
+ size_t gpu_data_size; // size of the GPU allocation in bytes
+ void * publisher_gpu_ptr; // original device pointer for publisher-side free on reclaim
+};
+
+} // namespace agnocast
diff --git a/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp b/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp
new file mode 100644
index 000000000..4f91aca30
--- /dev/null
+++ b/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "agnocast/gpu_metadata.hpp"
+
+#include <cstddef>
+
+namespace agnocast::cuda
+{
+
+// Abstract interface for GPU memory sharing backends.
+// Concrete implementations (CudaIpcBackend, etc.) live in the agnocast_cuda package.
+class GpuTransferBackend
+{
+public:
+ virtual ~GpuTransferBackend() = default;
+
+ // Publisher side: create a shareable handle from a device pointer.
+ virtual GpuHandle export_handle(void * device_ptr, size_t size) = 0;
+
+ // Publisher side: free GPU buffer on reclaim.
+ virtual void free_device_memory(void * device_ptr) = 0;
+
+ // Subscriber side: map GPU buffer into this process.
+ virtual void * import_handle(const GpuHandle & handle, size_t size) = 0;
+
+ // Subscriber side: unmap GPU buffer from this process.
+ virtual void release_handle(void * local_ptr) = 0;
+};
+
+// Returns the singleton backend instance. Defined in the agnocast_cuda package and resolved
+// at link time. Only called from if-constexpr branches guarded by is_cuda_message_v,
+// so the symbol is never referenced unless a CUDA message type is actually used.
+GpuTransferBackend & get_backend();
+
+} // namespace agnocast::cuda