From ceaeec25fbcfc697ededbfbe332704345645e812 Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:07:26 +0900 Subject: [PATCH 01/18] cuda ipc agnocast Signed-off-by: sykwer --- src/agnocast_cuda/CMakeLists.txt | 45 +++++++++++++ .../include/agnocast/cuda/types.hpp | 42 ++++++++++++ src/agnocast_cuda/package.xml | 20 ++++++ src/agnocast_cuda/src/cuda_ipc_backend.cpp | 57 ++++++++++++++++ src/agnocast_cuda/src/cuda_ipc_backend.hpp | 19 ++++++ src/agnocast_cuda/src/get_backend.cpp | 64 ++++++++++++++++++ src/agnocast_cuda/src/nvscibuf_backend.cpp | 30 +++++++++ src/agnocast_cuda/src/nvscibuf_backend.hpp | 21 ++++++ .../src/unified_memory_backend.cpp | 30 +++++++++ .../src/unified_memory_backend.hpp | 21 ++++++ src/agnocast_cuda/src/vmm_backend.cpp | 30 +++++++++ src/agnocast_cuda/src/vmm_backend.hpp | 22 ++++++ .../CMakeLists.txt | 28 ++++++++ src/agnocast_sample_application/package.xml | 7 ++ .../src/cuda_publisher.cpp | 67 +++++++++++++++++++ .../src/cuda_subscriber.cpp | 52 ++++++++++++++ .../agnocast/agnocast_callback_info.hpp | 45 ++++++++++++- .../include/agnocast/agnocast_publisher.hpp | 30 ++++++++- .../agnocast/agnocast_smart_pointer.hpp | 56 +++++++++++++++- .../agnocast/agnocast_subscription.hpp | 7 +- .../agnocast/bridge/agnocast_bridge_node.hpp | 21 ++++-- .../include/agnocast/cuda_message_tag.hpp | 26 +++++++ .../include/agnocast/gpu_metadata.hpp | 25 +++++++ .../include/agnocast/gpu_transfer_backend.hpp | 35 ++++++++++ 24 files changed, 787 insertions(+), 13 deletions(-) create mode 100644 src/agnocast_cuda/CMakeLists.txt create mode 100644 src/agnocast_cuda/include/agnocast/cuda/types.hpp create mode 100644 src/agnocast_cuda/package.xml create mode 100644 src/agnocast_cuda/src/cuda_ipc_backend.cpp create mode 100644 src/agnocast_cuda/src/cuda_ipc_backend.hpp create mode 100644 src/agnocast_cuda/src/get_backend.cpp create mode 100644 src/agnocast_cuda/src/nvscibuf_backend.cpp create mode 100644 src/agnocast_cuda/src/nvscibuf_backend.hpp create mode 100644 src/agnocast_cuda/src/unified_memory_backend.cpp create mode 100644 src/agnocast_cuda/src/unified_memory_backend.hpp create mode 100644 src/agnocast_cuda/src/vmm_backend.cpp create mode 100644 src/agnocast_cuda/src/vmm_backend.hpp create mode 100644 src/agnocast_sample_application/src/cuda_publisher.cpp create mode 100644 src/agnocast_sample_application/src/cuda_subscriber.cpp create mode 100644 src/agnocastlib/include/agnocast/cuda_message_tag.hpp create mode 100644 src/agnocastlib/include/agnocast/gpu_metadata.hpp create mode 100644 src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp diff --git a/src/agnocast_cuda/CMakeLists.txt b/src/agnocast_cuda/CMakeLists.txt new file mode 100644 index 000000000..9d28e687d --- /dev/null +++ b/src/agnocast_cuda/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.14) +project(agnocast_cuda LANGUAGES CXX CUDA) + +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() + +if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-Wall -Wextra -Wpedantic) +endif() + +find_package(ament_cmake REQUIRED) +find_package(agnocastlib REQUIRED) +find_package(sensor_msgs REQUIRED) +find_package(CUDAToolkit REQUIRED) + +add_library(agnocast_cuda SHARED + src/cuda_ipc_backend.cpp + src/vmm_backend.cpp + src/nvscibuf_backend.cpp + src/unified_memory_backend.cpp + src/get_backend.cpp) + +target_include_directories(agnocast_cuda PUBLIC + $ + $) + +ament_target_dependencies(agnocast_cuda agnocastlib sensor_msgs) 
+target_link_libraries(agnocast_cuda CUDA::cudart) + +install(TARGETS agnocast_cuda + EXPORT export_${PROJECT_NAME} + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin + INCLUDES DESTINATION include) + +install( + DIRECTORY include/ + DESTINATION include) + +ament_export_targets(export_${PROJECT_NAME} HAS_LIBRARY_TARGET) +ament_export_include_directories(include) +ament_export_dependencies(agnocastlib sensor_msgs) +ament_package() diff --git a/src/agnocast_cuda/include/agnocast/cuda/types.hpp b/src/agnocast_cuda/include/agnocast/cuda/types.hpp new file mode 100644 index 000000000..c1d8a90c8 --- /dev/null +++ b/src/agnocast_cuda/include/agnocast/cuda/types.hpp @@ -0,0 +1,42 @@ +#pragma once + +#include "agnocast/cuda_message_tag.hpp" + +#include +#include + +#include +#include + +namespace agnocast::cuda +{ + +struct PointCloud2 : public sensor_msgs::msg::PointCloud2, public agnocast::cuda_message_tag +{ + uint8_t * data = nullptr; // GPU device pointer (shadows base class std::vector data) +}; + +struct Image : public sensor_msgs::msg::Image, public agnocast::cuda_message_tag +{ + uint8_t * data = nullptr; // GPU device pointer (shadows base class std::vector data) +}; + +} // namespace agnocast::cuda + +namespace agnocast +{ + +template <> +inline size_t get_cuda_gpu_data_size(const agnocast::cuda::PointCloud2 & msg) +{ + return static_cast(msg.height) * static_cast(msg.width) * + static_cast(msg.point_step); +} + +template <> +inline size_t get_cuda_gpu_data_size(const agnocast::cuda::Image & msg) +{ + return static_cast(msg.height) * msg.step; +} + +} // namespace agnocast diff --git a/src/agnocast_cuda/package.xml b/src/agnocast_cuda/package.xml new file mode 100644 index 000000000..1b0245d25 --- /dev/null +++ b/src/agnocast_cuda/package.xml @@ -0,0 +1,20 @@ + + + + agnocast_cuda + 0.1.0 + + CUDA IPC support for Agnocast. Provides GPU message types and cross-process GPU buffer sharing. 
+ + Takahiro Ishikawa-Aso + Apache License 2.0 + + ament_cmake + + agnocastlib + sensor_msgs + + + ament_cmake + + diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.cpp b/src/agnocast_cuda/src/cuda_ipc_backend.cpp new file mode 100644 index 000000000..ab3ae8be2 --- /dev/null +++ b/src/agnocast_cuda/src/cuda_ipc_backend.cpp @@ -0,0 +1,57 @@ +#include "cuda_ipc_backend.hpp" + +#include + +#include +#include +#include + +namespace +{ + +void check_cuda_error(cudaError_t err, const char * operation) +{ + if (err != cudaSuccess) { + std::fprintf( + stderr, "[agnocast_cuda] FATAL: %s failed: %s\n", operation, cudaGetErrorString(err)); + std::abort(); + } +} + +} // namespace + +namespace agnocast::cuda +{ + +GpuHandle CudaIpcBackend::export_handle(void * device_ptr, size_t /*size*/) +{ + GpuHandle h{}; + static_assert(sizeof(cudaIpcMemHandle_t) <= sizeof(h.opaque)); + cudaIpcMemHandle_t ipc_handle; + check_cuda_error(cudaIpcGetMemHandle(&ipc_handle, device_ptr), "cudaIpcGetMemHandle"); + std::memcpy(h.opaque, &ipc_handle, sizeof(ipc_handle)); + return h; +} + +void CudaIpcBackend::free_device_memory(void * device_ptr) +{ + check_cuda_error(cudaFree(device_ptr), "cudaFree"); +} + +void * CudaIpcBackend::import_handle(const GpuHandle & handle, size_t /*size*/) +{ + cudaIpcMemHandle_t ipc_handle; + std::memcpy(&ipc_handle, handle.opaque, sizeof(ipc_handle)); + void * ptr = nullptr; + check_cuda_error( + cudaIpcOpenMemHandle(&ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), + "cudaIpcOpenMemHandle"); + return ptr; +} + +void CudaIpcBackend::release_handle(void * local_ptr) +{ + check_cuda_error(cudaIpcCloseMemHandle(local_ptr), "cudaIpcCloseMemHandle"); +} + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.hpp b/src/agnocast_cuda/src/cuda_ipc_backend.hpp new file mode 100644 index 000000000..fc262cea8 --- /dev/null +++ b/src/agnocast_cuda/src/cuda_ipc_backend.hpp @@ -0,0 +1,19 @@ +// Internal header — kept in src/ so it is NOT installed or visible to downstream packages. +// Only get_backend.cpp includes this to instantiate the singleton. 
+#pragma once + +#include "agnocast/gpu_transfer_backend.hpp" + +namespace agnocast::cuda +{ + +class CudaIpcBackend : public GpuTransferBackend +{ +public: + GpuHandle export_handle(void * device_ptr, size_t size) override; + void free_device_memory(void * device_ptr) override; + void * import_handle(const GpuHandle & handle, size_t size) override; + void release_handle(void * local_ptr) override; +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/get_backend.cpp b/src/agnocast_cuda/src/get_backend.cpp new file mode 100644 index 000000000..87068ff54 --- /dev/null +++ b/src/agnocast_cuda/src/get_backend.cpp @@ -0,0 +1,64 @@ +#include "agnocast/gpu_transfer_backend.hpp" +#include "cuda_ipc_backend.hpp" +#include "nvscibuf_backend.hpp" +#include "unified_memory_backend.hpp" +#include "vmm_backend.hpp" + +#include + +#include +#include +#include + +namespace agnocast::cuda +{ + +namespace +{ + +std::unique_ptr select_backend() +{ + int device = 0; + cudaError_t err = cudaGetDevice(&device); + if (err != cudaSuccess) { + std::fprintf( + stderr, "[agnocast_cuda] FATAL: cudaGetDevice failed: %s\n", cudaGetErrorString(err)); + std::abort(); + } + + int is_integrated = 0; + err = cudaDeviceGetAttribute(&is_integrated, cudaDevAttrIntegrated, device); + if (err != cudaSuccess) { + std::fprintf( + stderr, "[agnocast_cuda] FATAL: cudaDeviceGetAttribute failed: %s\n", + cudaGetErrorString(err)); + std::abort(); + } + + if (!is_integrated) { + // Discrete GPU (GeForce, Quadro, Tesla, A/H series) — CUDA IPC is supported. + std::fprintf(stderr, "[agnocast_cuda] Discrete GPU detected, using CudaIpcBackend.\n"); + return std::make_unique(); + } + + // Integrated GPU (Jetson Xavier/Orin/Thor, DRIVE). + // TODO(agnocast): Implement and select the appropriate backend. + // - Jetson Thor (CUDA 13.0+): CudaIpcBackend may work via OpenRM. + // - Jetson Xavier/Orin: NvSciBufBackend or UnifiedMemoryBackend. + // - DRIVE: NvSciBufBackend. + std::fprintf( + stderr, + "[agnocast_cuda] FATAL: Integrated GPU detected (Jetson/DRIVE). " + "No backend is implemented yet for this platform.\n"); + std::abort(); +} + +} // namespace + +GpuTransferBackend & get_backend() +{ + static auto instance = select_backend(); + return *instance; +} + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/nvscibuf_backend.cpp b/src/agnocast_cuda/src/nvscibuf_backend.cpp new file mode 100644 index 000000000..0e900cedf --- /dev/null +++ b/src/agnocast_cuda/src/nvscibuf_backend.cpp @@ -0,0 +1,30 @@ +#include "nvscibuf_backend.hpp" + +#include + +namespace agnocast::cuda +{ + +GpuHandle NvSciBufBackend::export_handle(void * /*device_ptr*/, size_t /*size*/) +{ + throw std::runtime_error( + "[agnocast_cuda] NvSciBufBackend is not yet implemented. 
" + "Requires NvSciBuf (Jetson Xavier/Orin, NVIDIA DRIVE)."); +} + +void NvSciBufBackend::free_device_memory(void * /*device_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented."); +} + +void * NvSciBufBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/) +{ + throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented."); +} + +void NvSciBufBackend::release_handle(void * /*local_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented."); +} + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/nvscibuf_backend.hpp b/src/agnocast_cuda/src/nvscibuf_backend.hpp new file mode 100644 index 000000000..b631d1ca4 --- /dev/null +++ b/src/agnocast_cuda/src/nvscibuf_backend.hpp @@ -0,0 +1,21 @@ +// Internal header — kept in src/ so it is NOT installed or visible to downstream packages. +// Only get_backend.cpp includes this to instantiate the singleton. +#pragma once + +#include "agnocast/gpu_transfer_backend.hpp" + +namespace agnocast::cuda +{ + +// Placeholder backend using NvSciBuf / NvSciSync. +// Target platforms: Jetson Xavier/Orin, NVIDIA DRIVE. +class NvSciBufBackend : public GpuTransferBackend +{ +public: + GpuHandle export_handle(void * device_ptr, size_t size) override; + void free_device_memory(void * device_ptr) override; + void * import_handle(const GpuHandle & handle, size_t size) override; + void release_handle(void * local_ptr) override; +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/unified_memory_backend.cpp b/src/agnocast_cuda/src/unified_memory_backend.cpp new file mode 100644 index 000000000..af97fff3b --- /dev/null +++ b/src/agnocast_cuda/src/unified_memory_backend.cpp @@ -0,0 +1,30 @@ +#include "unified_memory_backend.hpp" + +#include + +namespace agnocast::cuda +{ + +GpuHandle UnifiedMemoryBackend::export_handle(void * /*device_ptr*/, size_t /*size*/) +{ + throw std::runtime_error( + "[agnocast_cuda] UnifiedMemoryBackend is not yet implemented. " + "Requires POSIX shm + cudaHostRegister (Jetson unified memory)."); +} + +void UnifiedMemoryBackend::free_device_memory(void * /*device_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented."); +} + +void * UnifiedMemoryBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/) +{ + throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented."); +} + +void UnifiedMemoryBackend::release_handle(void * /*local_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented."); +} + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/unified_memory_backend.hpp b/src/agnocast_cuda/src/unified_memory_backend.hpp new file mode 100644 index 000000000..f9916f203 --- /dev/null +++ b/src/agnocast_cuda/src/unified_memory_backend.hpp @@ -0,0 +1,21 @@ +// Internal header — kept in src/ so it is NOT installed or visible to downstream packages. +// Only get_backend.cpp includes this to instantiate the singleton. +#pragma once + +#include "agnocast/gpu_transfer_backend.hpp" + +namespace agnocast::cuda +{ + +// Placeholder backend using POSIX shared memory + cudaHostRegister for Jetson platforms +// where CPU and GPU share the same physical memory (unified memory architecture). 
+class UnifiedMemoryBackend : public GpuTransferBackend +{ +public: + GpuHandle export_handle(void * device_ptr, size_t size) override; + void free_device_memory(void * device_ptr) override; + void * import_handle(const GpuHandle & handle, size_t size) override; + void release_handle(void * local_ptr) override; +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/vmm_backend.cpp b/src/agnocast_cuda/src/vmm_backend.cpp new file mode 100644 index 000000000..4c14cbcd9 --- /dev/null +++ b/src/agnocast_cuda/src/vmm_backend.cpp @@ -0,0 +1,30 @@ +#include "vmm_backend.hpp" + +#include + +namespace agnocast::cuda +{ + +GpuHandle VmmBackend::export_handle(void * /*device_ptr*/, size_t /*size*/) +{ + throw std::runtime_error( + "[agnocast_cuda] VmmBackend is not yet implemented. " + "Requires cuMemExportToShareableHandle (CUDA Driver API)."); +} + +void VmmBackend::free_device_memory(void * /*device_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented."); +} + +void * VmmBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/) +{ + throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented."); +} + +void VmmBackend::release_handle(void * /*local_ptr*/) +{ + throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented."); +} + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/vmm_backend.hpp b/src/agnocast_cuda/src/vmm_backend.hpp new file mode 100644 index 000000000..491e5544a --- /dev/null +++ b/src/agnocast_cuda/src/vmm_backend.hpp @@ -0,0 +1,22 @@ +// Internal header — kept in src/ so it is NOT installed or visible to downstream packages. +// Only get_backend.cpp includes this to instantiate the singleton. +#pragma once + +#include "agnocast/gpu_transfer_backend.hpp" + +namespace agnocast::cuda +{ + +// Placeholder backend using CUDA Virtual Memory Management (VMM) API. +// Uses cuMemExportToShareableHandle / cuMemImportFromShareableHandle. +// Supported on Jetson Orin (newer CUDA) and discrete GPUs with CUDA 10.2+. 
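+// A possible (unimplemented) sketch: allocate with cuMemCreate, export a POSIX file descriptor via
+// cuMemExportToShareableHandle, pass the fd to the subscriber out-of-band (e.g. SCM_RIGHTS over a
+// Unix domain socket), then cuMemImportFromShareableHandle + cuMemMap on the receiving side. A raw
+// fd cannot simply be copied through GpuHandle/shared memory, so extra transport plumbing is needed.
+// Hypothetical design note, not part of the current implementation.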
+class VmmBackend : public GpuTransferBackend +{ +public: + GpuHandle export_handle(void * device_ptr, size_t size) override; + void free_device_memory(void * device_ptr) override; + void * import_handle(const GpuHandle & handle, size_t size) override; + void release_handle(void * local_ptr) override; +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_sample_application/CMakeLists.txt b/src/agnocast_sample_application/CMakeLists.txt index 887916d7f..945e03ab7 100644 --- a/src/agnocast_sample_application/CMakeLists.txt +++ b/src/agnocast_sample_application/CMakeLists.txt @@ -160,6 +160,34 @@ install(TARGETS no_rclcpp_client install(TARGETS sim_time_timer DESTINATION lib/${PROJECT_NAME}) +# CUDA sample applications (only built when CUDA and agnocast_cuda are available) +find_package(CUDAToolkit QUIET) +find_package(agnocast_cuda QUIET) +if(CUDAToolkit_FOUND AND agnocast_cuda_FOUND) + enable_language(CUDA) + + add_executable(cuda_talker src/cuda_publisher.cpp) + set_source_files_properties(src/cuda_publisher.cpp PROPERTIES LANGUAGE CUDA) + ament_target_dependencies(cuda_talker agnocastlib agnocast_cuda sensor_msgs) + target_link_libraries(cuda_talker CUDA::cudart) + target_include_directories(cuda_talker PRIVATE + ${agnocastlib_INCLUDE_DIRS} + ${agnocast_cuda_INCLUDE_DIRS} + ) + + add_executable(cuda_listener src/cuda_subscriber.cpp) + set_source_files_properties(src/cuda_subscriber.cpp PROPERTIES LANGUAGE CUDA) + ament_target_dependencies(cuda_listener agnocastlib agnocast_cuda sensor_msgs) + target_link_libraries(cuda_listener CUDA::cudart) + target_include_directories(cuda_listener PRIVATE + ${agnocastlib_INCLUDE_DIRS} + ${agnocast_cuda_INCLUDE_DIRS} + ) + + install(TARGETS cuda_talker cuda_listener + DESTINATION lib/${PROJECT_NAME}) +endif() + install(DIRECTORY launch DESTINATION share/${PROJECT_NAME}/ ) diff --git a/src/agnocast_sample_application/package.xml b/src/agnocast_sample_application/package.xml index dc9385840..f69e50068 100644 --- a/src/agnocast_sample_application/package.xml +++ b/src/agnocast_sample_application/package.xml @@ -31,6 +31,13 @@ agnocast_components agnocast_sample_interfaces + + agnocast_cuda + sensor_msgs + ament_cmake diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp new file mode 100644 index 000000000..e8c447290 --- /dev/null +++ b/src/agnocast_sample_application/src/cuda_publisher.cpp @@ -0,0 +1,67 @@ +#include "agnocast/agnocast.hpp" +#include "agnocast/cuda/types.hpp" + +#include + +using namespace std::chrono_literals; + +// Simple CUDA kernel: fills GPU buffer with incrementing values. 
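+// Launch configuration (see timer_callback below): fill_kernel<<<blocks, threads>>>(data, size, offset),
+// i.e. 256 threads per block and one thread per output byte.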
+__global__ void fill_kernel(uint8_t * data, size_t size, uint8_t offset) +{ + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + data[idx] = static_cast((idx + offset) % 256); + } +} + +class CudaPublisher : public agnocast::Node +{ + int64_t count_ = 0; + agnocast::Publisher::SharedPtr pub_; + agnocast::TimerBase::SharedPtr timer_; + + void timer_callback() + { + auto msg = pub_->borrow_loaned_message(); + + // Set CPU metadata + msg->header.stamp.sec = static_cast(count_); + msg->header.frame_id = "lidar"; + msg->height = 1; + msg->width = 1024; + msg->point_step = 16; + msg->row_step = msg->width * msg->point_step; + msg->is_dense = true; + + // Allocate and fill GPU data + const size_t gpu_size = msg->height * msg->width * msg->point_step; + cudaMalloc(&msg->data, gpu_size); + + const int threads = 256; + const int blocks = (gpu_size + threads - 1) / threads; + fill_kernel<<>>(msg->data, gpu_size, static_cast(count_)); + cudaStreamSynchronize(nullptr); + + pub_->publish(std::move(msg)); + RCLCPP_INFO(get_logger(), "published CUDA PointCloud2: seq=%ld, gpu_size=%zu", count_++, gpu_size); + } + +public: + CudaPublisher() : Node("cuda_publisher") + { + pub_ = this->create_publisher("/cuda_pointcloud", 1); + timer_ = agnocast::create_timer( + this, std::make_shared(RCL_STEADY_TIME), rclcpp::Duration(100ms), + std::bind(&CudaPublisher::timer_callback, this)); + } +}; + +int main(int argc, char ** argv) +{ + agnocast::init(argc, argv); + agnocast::AgnocastOnlySingleThreadedExecutor executor; + auto node = std::make_shared(); + executor.add_node(node); + executor.spin(); + return 0; +} diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp new file mode 100644 index 000000000..4c7b243d4 --- /dev/null +++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp @@ -0,0 +1,52 @@ +#include "agnocast/agnocast.hpp" +#include "agnocast/cuda/types.hpp" + +#include + +#include + +using std::placeholders::_1; + +class CudaSubscriber : public agnocast::Node +{ + agnocast::Subscription::SharedPtr sub_; + + void callback(agnocast::ipc_shared_ptr msg) + { + // Read CPU metadata from shared memory + const uint32_t width = msg->width; + const uint32_t point_step = msg->point_step; + const size_t gpu_size = msg->height * width * point_step; + + // Get subscriber-local GPU pointer (mapped via CUDA IPC) + auto * gpu_ptr = static_cast(msg.get_local_gpu_ptr()); + + // Read first few bytes from GPU to verify data + uint8_t host_buf[16]{}; + const size_t copy_size = std::min(gpu_size, sizeof(host_buf)); + cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost); + + RCLCPP_INFO( + get_logger(), + "received CUDA PointCloud2: width=%u, point_step=%u, gpu_size=%zu, " + "first_bytes=[%u,%u,%u,%u]", + width, point_step, gpu_size, host_buf[0], host_buf[1], host_buf[2], host_buf[3]); + } + +public: + CudaSubscriber() : Node("cuda_subscriber") + { + sub_ = this->create_subscription( + "/cuda_pointcloud", 1, std::bind(&CudaSubscriber::callback, this, _1)); + } +}; + +int main(int argc, char ** argv) +{ + agnocast::init(argc, argv); + agnocast::AgnocastOnlySingleThreadedExecutor executor; + auto node = std::make_shared(); + executor.add_node(node); + executor.spin(); + return 0; +} diff --git a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp index 13b66fbf7..18bc7c1a1 100644 --- a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp +++ 
b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp @@ -1,6 +1,9 @@ #pragma once #include "agnocast/agnocast_smart_pointer.hpp" +#include "agnocast/cuda_message_tag.hpp" +#include "agnocast/gpu_metadata.hpp" +#include "agnocast/gpu_transfer_backend.hpp" #include #include @@ -61,6 +64,42 @@ extern std::atomic need_epoll_updates; uint32_t allocate_callback_info_id(); +// Creates an ipc_shared_ptr for a subscriber-received message. +// For CUDA messages: imports the GPU handle, stores the subscriber-local GPU pointer in +// control_block->local_gpu_ptr, and registers a gpu_cleanup callback to release the mapping +// on last reference. The pointer is accessed via ipc_shared_ptr::get_local_gpu_ptr(). +// For non-CUDA messages: simply wraps the pointer. +template +agnocast::ipc_shared_ptr create_subscriber_ipc_ptr( + MessageT * msg, const std::string & topic_name, const topic_local_id_t subscriber_id, + const int64_t entry_id) +{ + if constexpr (is_cuda_message_v) { + auto * meta = static_cast(msg->gpu_metadata_); + if (!meta) { + std::fprintf( + stderr, + "[agnocast] FATAL: CUDA message on topic '%s' has null gpu_metadata_. " + "The publisher may have failed to set GpuMetadata during publish().\n", + topic_name.c_str()); + std::abort(); + } + void * local_gpu_ptr = + agnocast::cuda::get_backend().import_handle(meta->handle, meta->gpu_data_size); + // NOTE: If import_handle() fails, the backend aborts (fail-fast). If a future backend + // returns nullptr instead, the subscriber would get a null gpu pointer. Callers should + // check get_local_gpu_ptr() != nullptr before use. + + auto ipc_ptr = agnocast::ipc_shared_ptr(msg, topic_name, subscriber_id, entry_id); + ipc_ptr.set_local_gpu_ptr(local_gpu_ptr); + ipc_ptr.set_gpu_cleanup( + [local_gpu_ptr]() { agnocast::cuda::get_backend().release_handle(local_gpu_ptr); }); + return ipc_ptr; + } else { + return agnocast::ipc_shared_ptr(msg, topic_name, subscriber_id, entry_id); + } +} + template TypeErasedCallback get_erased_callback(Func && callback) { @@ -95,9 +134,9 @@ uint32_t register_callback( auto message_creator = []( const void * ptr, const std::string & topic_name, const topic_local_id_t subscriber_id, const int64_t entry_id) { - return std::make_unique>(agnocast::ipc_shared_ptr( - const_cast(static_cast(ptr)), topic_name, subscriber_id, - entry_id)); + auto * msg = const_cast(static_cast(ptr)); + return std::make_unique>( + create_subscriber_ipc_ptr(msg, topic_name, subscriber_id, entry_id)); }; uint32_t callback_info_id = allocate_callback_info_id(); diff --git a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp index 6ad96b690..faf146c08 100644 --- a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp @@ -5,6 +5,9 @@ #include "agnocast/agnocast_smart_pointer.hpp" #include "agnocast/agnocast_tracepoint_wrapper.h" #include "agnocast/agnocast_utils.hpp" +#include "agnocast/cuda_message_tag.hpp" +#include "agnocast/gpu_metadata.hpp" +#include "agnocast/gpu_transfer_backend.hpp" #include "rclcpp/detail/qos_parameters.hpp" #include "rclcpp/rclcpp.hpp" @@ -184,7 +187,22 @@ class BasicPublisher } // Capture raw pointer BEFORE invalidation (get() returns nullptr after invalidation). 
- const uint64_t msg_virtual_address = reinterpret_cast(message.get()); + MessageT * raw_ptr = message.get(); + const uint64_t msg_virtual_address = reinterpret_cast(raw_ptr); + + // CUDA publish hook: export GPU handle and allocate GpuMetadata in shared memory. + // Runs while heaphook is still active, so GpuMetadata lands in the publisher's shared memory. + // NOTE: Assumes MessageT has a public `data` member (uint8_t*) pointing to the GPU allocation. + // All CUDA message types must provide this by shadowing the base ROS message's data field. + if constexpr (is_cuda_message_v) { + auto & backend = agnocast::cuda::get_backend(); + const size_t gpu_size = get_cuda_gpu_data_size(*raw_ptr); + auto * meta = new GpuMetadata(); + meta->publisher_gpu_ptr = raw_ptr->data; + meta->gpu_data_size = gpu_size; + meta->handle = backend.export_handle(raw_ptr->data, gpu_size); + raw_ptr->gpu_metadata_ = meta; + } // Invalidate all references sharing this handle's control block. // Any remaining copies held elsewhere will fail-fast on dereference. @@ -197,6 +215,16 @@ class BasicPublisher for (uint32_t i = 0; i < publish_msg_args.ret_released_num; i++) { MessageT * release_ptr = reinterpret_cast(publish_msg_args.ret_released_addrs[i]); + // CUDA reclaim hook: free GPU buffer before deleting the message. + // On abnormal publisher exit, free_device_memory() is never called, but GPU device + // memory is reclaimed by the CUDA driver when the process exits. + if constexpr (is_cuda_message_v) { + if (release_ptr->gpu_metadata_) { + auto * meta = static_cast(release_ptr->gpu_metadata_); + agnocast::cuda::get_backend().free_device_memory(meta->publisher_gpu_ptr); + delete meta; + } + } delete release_ptr; } diff --git a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp index b20f31900..e2b51d4b1 100644 --- a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -40,10 +41,18 @@ extern int agnocast_fd; // Sentinel value indicating entry_id has not been assigned (publisher-side, before publish). constexpr int64_t ENTRY_ID_NOT_ASSIGNED = -1; -// Forward declaration for friend access +// Forward declarations for friend access +template +class ipc_shared_ptr; + template class BasicPublisher; +template +ipc_shared_ptr create_subscriber_ipc_ptr( + MessageT * msg, const std::string & topic_name, const topic_local_id_t subscriber_id, + const int64_t entry_id); + namespace detail { @@ -60,6 +69,16 @@ struct control_block topic_local_id_t pubsub_id; // 4-byte alignment std::atomic valid{true}; // 1-byte alignment + // Optional GPU cleanup callback. Null for non-CUDA messages. + // Called before bitmap release in reset() to ensure GPU mappings are released + // before the publisher can free the underlying GPU buffer. + std::function gpu_cleanup; + + // Subscriber-local GPU device pointer obtained via import_handle(). + // Stored here because the shared memory message is mapped read-only by the subscriber, + // so we cannot inject the local pointer into msg->data. 
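+  // Subscribers read it through ipc_shared_ptr::get_local_gpu_ptr(); msg->data still contains the
+  // publisher's device pointer and must not be dereferenced on the subscriber side.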
+ void * local_gpu_ptr = nullptr; + control_block(std::string topic, topic_local_id_t pubsub, int64_t entry) : topic_name(std::move(topic)), entry_id(entry), pubsub_id(pubsub) { @@ -92,6 +111,11 @@ class ipc_shared_ptr template friend class BasicPublisher; + // Allow create_subscriber_ipc_ptr to call set_gpu_cleanup() and set_local_gpu_ptr() + template + friend ipc_shared_ptr create_subscriber_ipc_ptr( + MessageT *, const std::string &, const topic_local_id_t, const int64_t); + // Allow converting constructors to access private members of ipc_shared_ptr template friend class ipc_shared_ptr; @@ -120,6 +144,24 @@ class ipc_shared_ptr } } + // Sets a GPU cleanup callback to be invoked when the last reference is released. + // Private: only create_subscriber_ipc_ptr() should call this. + void set_gpu_cleanup(std::function fn) + { + if (control_) { + control_->gpu_cleanup = std::move(fn); + } + } + + // Sets the subscriber-local GPU pointer (obtained via import_handle). + // Private: only create_subscriber_ipc_ptr() should call this. + void set_local_gpu_ptr(void * ptr) + { + if (control_) { + control_->local_gpu_ptr = ptr; + } + } + // Publisher-side constructor (entry_id not yet assigned). // Creates control block for reference counting and one-shot invalidation. // Private: users must call BasicPublisher::borrow_loaned_message() instead of constructing @@ -273,6 +315,12 @@ class ipc_shared_ptr T * get() const noexcept { return is_invalidated_() ? nullptr : ptr_; } + // Returns the subscriber-local GPU device pointer, or nullptr for non-CUDA messages. + void * get_local_gpu_ptr() const noexcept + { + return control_ ? control_->local_gpu_ptr : nullptr; + } + // Thread-safe: atomically decrements ref count and performs cleanup if last reference. void reset() { @@ -283,6 +331,12 @@ class ipc_shared_ptr const bool was_last = control_->decrement_and_check(); if (was_last) { + // GPU cleanup must run BEFORE bitmap release: unmapping the GPU buffer before + // the publisher is allowed to cudaFree the underlying allocation. + if (control_->gpu_cleanup) { + control_->gpu_cleanup(); + } + if (control_->entry_id != ENTRY_ID_NOT_ASSIGNED) { // Subscriber side: notify kmod that all references are released. 
release_subscriber_reference(control_->topic_name, control_->pubsub_id, control_->entry_id); diff --git a/src/agnocastlib/include/agnocast/agnocast_subscription.hpp b/src/agnocastlib/include/agnocast/agnocast_subscription.hpp index 2ad3728d3..80137e8e9 100644 --- a/src/agnocastlib/include/agnocast/agnocast_subscription.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_subscription.hpp @@ -338,8 +338,8 @@ class BasicTakeSubscription : public SubscriptionBase } MessageT * ptr = reinterpret_cast(take_args.ret_addr); - auto result = - agnocast::ipc_shared_ptr(ptr, topic_name_, id_, take_args.ret_entry_id); + auto result = create_subscriber_ipc_ptr( + ptr, topic_name_, id_, take_args.ret_entry_id); old_ptr = std::move(last_taken_ptr_); last_taken_ptr_ = result; return result; @@ -347,7 +347,8 @@ class BasicTakeSubscription : public SubscriptionBase } MessageT * ptr = reinterpret_cast(take_args.ret_addr); - return agnocast::ipc_shared_ptr(ptr, topic_name_, id_, take_args.ret_entry_id); + return create_subscriber_ipc_ptr( + ptr, topic_name_, id_, take_args.ret_entry_id); } }; diff --git a/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp b/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp index a103b10fe..3d0d5839b 100644 --- a/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp +++ b/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp @@ -4,6 +4,7 @@ #include "agnocast/agnocast_publisher.hpp" #include "agnocast/agnocast_subscription.hpp" #include "agnocast/bridge/agnocast_bridge_utils.hpp" +#include "agnocast/cuda_message_tag.hpp" #include "rclcpp/rclcpp.hpp" #include @@ -39,11 +40,21 @@ template void request_bridge_core( const std::string & topic_name, topic_local_id_t id, BridgeDirection direction) { - auto bridge_mode = get_bridge_mode(); - if (bridge_mode == BridgeMode::Standard) { - send_bridge_request(topic_name, id, direction); - } else if (bridge_mode == BridgeMode::Performance) { - send_performance_bridge_request(topic_name, id, direction); + // CUDA message types cannot be bridged to ROS 2 directly (GPU pointers are not serializable). + // Bridge support for CUDA types (via cudaMemcpy D2H) is future work. + // TODO(agnocast): Log a one-time warning so users know the bridge was skipped for this topic. + if constexpr (is_cuda_message_v) { + (void)topic_name; + (void)id; + (void)direction; + return; + } else { + auto bridge_mode = get_bridge_mode(); + if (bridge_mode == BridgeMode::Standard) { + send_bridge_request(topic_name, id, direction); + } else if (bridge_mode == BridgeMode::Performance) { + send_performance_bridge_request(topic_name, id, direction); + } } } diff --git a/src/agnocastlib/include/agnocast/cuda_message_tag.hpp b/src/agnocastlib/include/agnocast/cuda_message_tag.hpp new file mode 100644 index 000000000..0c5a7d685 --- /dev/null +++ b/src/agnocastlib/include/agnocast/cuda_message_tag.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include +#include + +namespace agnocast +{ + +// Base class for CUDA message types. Provides a pointer to GpuMetadata stored in shared memory. +// CUDA message types (e.g., agnocast::cuda::PointCloud2) inherit from both a ROS message type +// and this tag. The publish/receive machinery uses is_cuda_message_v to detect CUDA messages +// at compile time via if constexpr, so no CUDA headers are needed in agnocastlib. 
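+// Concrete examples live in the agnocast_cuda package (agnocast/cuda/types.hpp):
+// agnocast::cuda::PointCloud2 and agnocast::cuda::Image each inherit a sensor_msgs type and this tag.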
+struct cuda_message_tag +{ + void * gpu_metadata_ = nullptr; +}; + +template +inline constexpr bool is_cuda_message_v = std::is_base_of_v>; + +// Returns the GPU data size for a CUDA message. Must be specialized by each CUDA message type +// in the agnocast_cuda package (e.g., for PointCloud2: height * width * point_step). +template +size_t get_cuda_gpu_data_size(const T & msg); + +} // namespace agnocast diff --git a/src/agnocastlib/include/agnocast/gpu_metadata.hpp b/src/agnocastlib/include/agnocast/gpu_metadata.hpp new file mode 100644 index 000000000..f4c860cef --- /dev/null +++ b/src/agnocastlib/include/agnocast/gpu_metadata.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include +#include + +namespace agnocast +{ + +// Opaque handle large enough for cudaIpcMemHandle_t (64 bytes). +struct GpuHandle +{ + uint8_t opaque[64]; +}; + +// GPU sharing metadata stored in shared memory alongside the message. +// Allocated by the publish path (while heaphook is active) so it lands in the publisher's +// shared memory region and is readable by subscribers. +struct GpuMetadata +{ + GpuHandle handle; // backend-specific shareable handle + size_t gpu_data_size; // size of the GPU allocation in bytes + void * publisher_gpu_ptr; // original device pointer for publisher-side free on reclaim +}; + +} // namespace agnocast diff --git a/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp b/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp new file mode 100644 index 000000000..4f91aca30 --- /dev/null +++ b/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include "agnocast/gpu_metadata.hpp" + +#include + +namespace agnocast::cuda +{ + +// Abstract interface for GPU memory sharing backends. +// Concrete implementations (CudaIpcBackend, etc.) live in the agnocast_cuda package. +class GpuTransferBackend +{ +public: + virtual ~GpuTransferBackend() = default; + + // Publisher side: create a shareable handle from a device pointer. + virtual GpuHandle export_handle(void * device_ptr, size_t size) = 0; + + // Publisher side: free GPU buffer on reclaim. + virtual void free_device_memory(void * device_ptr) = 0; + + // Subscriber side: map GPU buffer into this process. + virtual void * import_handle(const GpuHandle & handle, size_t size) = 0; + + // Subscriber side: unmap GPU buffer from this process. + virtual void release_handle(void * local_ptr) = 0; +}; + +// Returns the singleton backend instance. Defined in the agnocast_cuda package and resolved +// at link time. Only called from if-constexpr branches guarded by is_cuda_message_v, +// so the symbol is never referenced unless a CUDA message type is actually used. 
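+// Practical consequence: executables that actually use CUDA message types must link against the
+// agnocast_cuda library, or they will fail at link time with an undefined reference to get_backend().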
+GpuTransferBackend & get_backend(); + +} // namespace agnocast::cuda From 9dd28ed7c7708c25248835e10624c3a168d26818 Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:16:25 +0900 Subject: [PATCH 02/18] fix Signed-off-by: sykwer --- src/agnocast_cuda/src/cuda_ipc_backend.cpp | 3 +-- src/agnocast_sample_application/src/cuda_publisher.cpp | 4 +++- .../include/agnocast/agnocast_smart_pointer.hpp | 5 +---- src/agnocastlib/include/agnocast/agnocast_subscription.hpp | 7 +++---- src/agnocastlib/include/agnocast/cuda_message_tag.hpp | 3 ++- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.cpp b/src/agnocast_cuda/src/cuda_ipc_backend.cpp index ab3ae8be2..3a2bbb8b4 100644 --- a/src/agnocast_cuda/src/cuda_ipc_backend.cpp +++ b/src/agnocast_cuda/src/cuda_ipc_backend.cpp @@ -44,8 +44,7 @@ void * CudaIpcBackend::import_handle(const GpuHandle & handle, size_t /*size*/) std::memcpy(&ipc_handle, handle.opaque, sizeof(ipc_handle)); void * ptr = nullptr; check_cuda_error( - cudaIpcOpenMemHandle(&ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), - "cudaIpcOpenMemHandle"); + cudaIpcOpenMemHandle(&ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), "cudaIpcOpenMemHandle"); return ptr; } diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp index e8c447290..7abe85d9b 100644 --- a/src/agnocast_sample_application/src/cuda_publisher.cpp +++ b/src/agnocast_sample_application/src/cuda_publisher.cpp @@ -39,11 +39,13 @@ class CudaPublisher : public agnocast::Node const int threads = 256; const int blocks = (gpu_size + threads - 1) / threads; + // cppcheck-suppress shiftTooManyBits // false positive: <<< >>> is CUDA kernel launch syntax fill_kernel<<>>(msg->data, gpu_size, static_cast(count_)); cudaStreamSynchronize(nullptr); pub_->publish(std::move(msg)); - RCLCPP_INFO(get_logger(), "published CUDA PointCloud2: seq=%ld, gpu_size=%zu", count_++, gpu_size); + RCLCPP_INFO( + get_logger(), "published CUDA PointCloud2: seq=%ld, gpu_size=%zu", count_++, gpu_size); } public: diff --git a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp index e2b51d4b1..d055535f7 100644 --- a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp @@ -316,10 +316,7 @@ class ipc_shared_ptr T * get() const noexcept { return is_invalidated_() ? nullptr : ptr_; } // Returns the subscriber-local GPU device pointer, or nullptr for non-CUDA messages. - void * get_local_gpu_ptr() const noexcept - { - return control_ ? control_->local_gpu_ptr : nullptr; - } + void * get_local_gpu_ptr() const noexcept { return control_ ? control_->local_gpu_ptr : nullptr; } // Thread-safe: atomically decrements ref count and performs cleanup if last reference. 
void reset() diff --git a/src/agnocastlib/include/agnocast/agnocast_subscription.hpp b/src/agnocastlib/include/agnocast/agnocast_subscription.hpp index 80137e8e9..8bb51e07f 100644 --- a/src/agnocastlib/include/agnocast/agnocast_subscription.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_subscription.hpp @@ -338,8 +338,8 @@ class BasicTakeSubscription : public SubscriptionBase } MessageT * ptr = reinterpret_cast(take_args.ret_addr); - auto result = create_subscriber_ipc_ptr( - ptr, topic_name_, id_, take_args.ret_entry_id); + auto result = + create_subscriber_ipc_ptr(ptr, topic_name_, id_, take_args.ret_entry_id); old_ptr = std::move(last_taken_ptr_); last_taken_ptr_ = result; return result; @@ -347,8 +347,7 @@ class BasicTakeSubscription : public SubscriptionBase } MessageT * ptr = reinterpret_cast(take_args.ret_addr); - return create_subscriber_ipc_ptr( - ptr, topic_name_, id_, take_args.ret_entry_id); + return create_subscriber_ipc_ptr(ptr, topic_name_, id_, take_args.ret_entry_id); } }; diff --git a/src/agnocastlib/include/agnocast/cuda_message_tag.hpp b/src/agnocastlib/include/agnocast/cuda_message_tag.hpp index 0c5a7d685..969221b01 100644 --- a/src/agnocastlib/include/agnocast/cuda_message_tag.hpp +++ b/src/agnocastlib/include/agnocast/cuda_message_tag.hpp @@ -16,7 +16,8 @@ struct cuda_message_tag }; template -inline constexpr bool is_cuda_message_v = std::is_base_of_v>; +inline constexpr bool is_cuda_message_v = + std::is_base_of_v>; // Returns the GPU data size for a CUDA message. Must be specialized by each CUDA message type // in the agnocast_cuda package (e.g., for PointCloud2: height * width * point_step). From 8e69df3f3981c62949b4c9437fab2043694d3d9c Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:23:57 +0900 Subject: [PATCH 03/18] make agnocast_cuda build optional when CUDA toolchain is unavailable Signed-off-by: sykwer --- src/agnocast_cuda/CMakeLists.txt | 61 +++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/src/agnocast_cuda/CMakeLists.txt b/src/agnocast_cuda/CMakeLists.txt index 9d28e687d..d9fa6dd6a 100644 --- a/src/agnocast_cuda/CMakeLists.txt +++ b/src/agnocast_cuda/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.14) -project(agnocast_cuda LANGUAGES CXX CUDA) +project(agnocast_cuda LANGUAGES CXX) if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 17) @@ -12,34 +12,53 @@ endif() find_package(ament_cmake REQUIRED) find_package(agnocastlib REQUIRED) find_package(sensor_msgs REQUIRED) -find_package(CUDAToolkit REQUIRED) -add_library(agnocast_cuda SHARED - src/cuda_ipc_backend.cpp - src/vmm_backend.cpp - src/nvscibuf_backend.cpp - src/unified_memory_backend.cpp - src/get_backend.cpp) +# CUDA is optional: if not available, headers are still installed but no library is built. +# Downstream packages that use CUDA message types must check if the agnocast_cuda library +# target exists (e.g., via find_package + if(TARGET agnocast_cuda::agnocast_cuda)). 
+include(CheckLanguage) +check_language(CUDA) -target_include_directories(agnocast_cuda PUBLIC - $ - $) +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + find_package(CUDAToolkit QUIET) +endif() + +if(CUDAToolkit_FOUND) + add_library(agnocast_cuda SHARED + src/cuda_ipc_backend.cpp + src/vmm_backend.cpp + src/nvscibuf_backend.cpp + src/unified_memory_backend.cpp + src/get_backend.cpp) + + target_include_directories(agnocast_cuda PUBLIC + $ + $) -ament_target_dependencies(agnocast_cuda agnocastlib sensor_msgs) -target_link_libraries(agnocast_cuda CUDA::cudart) + ament_target_dependencies(agnocast_cuda agnocastlib sensor_msgs) + target_link_libraries(agnocast_cuda CUDA::cudart) -install(TARGETS agnocast_cuda - EXPORT export_${PROJECT_NAME} - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib - RUNTIME DESTINATION bin - INCLUDES DESTINATION include) + install(TARGETS agnocast_cuda + EXPORT export_${PROJECT_NAME} + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin + INCLUDES DESTINATION include) + + ament_export_targets(export_${PROJECT_NAME} HAS_LIBRARY_TARGET) + ament_export_dependencies(agnocastlib sensor_msgs) + + message(STATUS "agnocast_cuda: CUDA found — building library and installing headers.") +else() + message(STATUS "agnocast_cuda: CUDA not found — installing headers only (no library).") +endif() +# Always install headers so downstream packages can use the type definitions +# (e.g., cuda_message_tag.hpp is in agnocastlib, but types.hpp is here). install( DIRECTORY include/ DESTINATION include) -ament_export_targets(export_${PROJECT_NAME} HAS_LIBRARY_TARGET) ament_export_include_directories(include) -ament_export_dependencies(agnocastlib sensor_msgs) ament_package() From 3c4e4e5572843749da85f0ca680fe28365f8dd79 Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:26:18 +0900 Subject: [PATCH 04/18] remove unconditional CUDA build_depend from package.xml Signed-off-by: sykwer --- src/agnocast_sample_application/package.xml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/agnocast_sample_application/package.xml b/src/agnocast_sample_application/package.xml index f69e50068..3efee0479 100644 --- a/src/agnocast_sample_application/package.xml +++ b/src/agnocast_sample_application/package.xml @@ -31,12 +31,9 @@ agnocast_components agnocast_sample_interfaces - - agnocast_cuda - sensor_msgs + ament_cmake From 0188770095daf083eb5853a494d993918a7b8e36 Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:30:14 +0900 Subject: [PATCH 05/18] log warning when bridge is skipped for CUDA message types Signed-off-by: sykwer --- .../include/agnocast/bridge/agnocast_bridge_node.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp b/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp index 3d0d5839b..7686e1fdc 100644 --- a/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp +++ b/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp @@ -42,9 +42,13 @@ void request_bridge_core( { // CUDA message types cannot be bridged to ROS 2 directly (GPU pointers are not serializable). // Bridge support for CUDA types (via cudaMemcpy D2H) is future work. - // TODO(agnocast): Log a one-time warning so users know the bridge was skipped for this topic. 
if constexpr (is_cuda_message_v) { - (void)topic_name; + static const auto logger = rclcpp::get_logger("agnocast_bridge_requester"); + RCLCPP_WARN( + logger, + "Bridge skipped for CUDA topic '%s': GPU message types cannot be bridged to ROS 2. " + "Use cudaMemcpy to a standard ROS message if DDS bridging is needed.", + topic_name.c_str()); (void)id; (void)direction; return; From d9d438ea1a3a0c4b2fdf37796b10b0abd538a55c Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:37:20 +0900 Subject: [PATCH 06/18] replace std::function with plain function pointer for GPU cleanup Signed-off-by: sykwer --- .../agnocast/agnocast_callback_info.hpp | 6 ++--- .../agnocast/agnocast_smart_pointer.hpp | 25 ++++++++++--------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp index 18bc7c1a1..154c454c6 100644 --- a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp @@ -66,7 +66,7 @@ uint32_t allocate_callback_info_id(); // Creates an ipc_shared_ptr for a subscriber-received message. // For CUDA messages: imports the GPU handle, stores the subscriber-local GPU pointer in -// control_block->local_gpu_ptr, and registers a gpu_cleanup callback to release the mapping +// control_block->local_gpu_ptr, and registers a gpu_release_fn to release the mapping // on last reference. The pointer is accessed via ipc_shared_ptr::get_local_gpu_ptr(). // For non-CUDA messages: simply wraps the pointer. template @@ -92,8 +92,8 @@ agnocast::ipc_shared_ptr create_subscriber_ipc_ptr( auto ipc_ptr = agnocast::ipc_shared_ptr(msg, topic_name, subscriber_id, entry_id); ipc_ptr.set_local_gpu_ptr(local_gpu_ptr); - ipc_ptr.set_gpu_cleanup( - [local_gpu_ptr]() { agnocast::cuda::get_backend().release_handle(local_gpu_ptr); }); + ipc_ptr.set_gpu_release_fn( + [](void * ptr) { agnocast::cuda::get_backend().release_handle(ptr); }); return ipc_ptr; } else { return agnocast::ipc_shared_ptr(msg, topic_name, subscriber_id, entry_id); diff --git a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp index d055535f7..a98381ca4 100644 --- a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -69,10 +68,12 @@ struct control_block topic_local_id_t pubsub_id; // 4-byte alignment std::atomic valid{true}; // 1-byte alignment - // Optional GPU cleanup callback. Null for non-CUDA messages. - // Called before bitmap release in reset() to ensure GPU mappings are released - // before the publisher can free the underlying GPU buffer. - std::function gpu_cleanup; + // Optional GPU cleanup function pointer. Null for non-CUDA messages. + // Called as gpu_release_fn(local_gpu_ptr) before bitmap release in reset() to ensure GPU + // mappings are released before the publisher can free the underlying GPU buffer. + // Uses a plain function pointer instead of std::function to avoid heap allocation and + // minimize overhead for non-CUDA messages (16 bytes for two pointers vs ~40+ bytes). + void (*gpu_release_fn)(void *) = nullptr; // Subscriber-local GPU device pointer obtained via import_handle(). 
// Stored here because the shared memory message is mapped read-only by the subscriber, @@ -111,7 +112,7 @@ class ipc_shared_ptr template friend class BasicPublisher; - // Allow create_subscriber_ipc_ptr to call set_gpu_cleanup() and set_local_gpu_ptr() + // Allow create_subscriber_ipc_ptr to call set_gpu_release_fn() and set_local_gpu_ptr() template friend ipc_shared_ptr create_subscriber_ipc_ptr( MessageT *, const std::string &, const topic_local_id_t, const int64_t); @@ -144,12 +145,12 @@ class ipc_shared_ptr } } - // Sets a GPU cleanup callback to be invoked when the last reference is released. - // Private: only create_subscriber_ipc_ptr() should call this. - void set_gpu_cleanup(std::function fn) + // Sets a GPU release function to be invoked as fn(local_gpu_ptr) when the last reference + // is released. Private: only create_subscriber_ipc_ptr() should call this. + void set_gpu_release_fn(void (*fn)(void *)) { if (control_) { - control_->gpu_cleanup = std::move(fn); + control_->gpu_release_fn = fn; } } @@ -330,8 +331,8 @@ class ipc_shared_ptr if (was_last) { // GPU cleanup must run BEFORE bitmap release: unmapping the GPU buffer before // the publisher is allowed to cudaFree the underlying allocation. - if (control_->gpu_cleanup) { - control_->gpu_cleanup(); + if (control_->gpu_release_fn) { + control_->gpu_release_fn(control_->local_gpu_ptr); } if (control_->entry_id != ENTRY_ID_NOT_ASSIGNED) { From e336b3361252a7547244b7639b4e926b662831e0 Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:41:50 +0900 Subject: [PATCH 07/18] validate data pointer and gpu_data_size before publish Signed-off-by: sykwer --- .../include/agnocast/agnocast_publisher.hpp | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp index faf146c08..04a9c3859 100644 --- a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp @@ -195,8 +195,24 @@ class BasicPublisher // NOTE: Assumes MessageT has a public `data` member (uint8_t*) pointing to the GPU allocation. // All CUDA message types must provide this by shadowing the base ROS message's data field. if constexpr (is_cuda_message_v) { - auto & backend = agnocast::cuda::get_backend(); + if (!raw_ptr->data) { + std::fprintf( + stderr, + "[agnocast] FATAL: CUDA message on topic '%s' has null data pointer. " + "Did you forget to cudaMalloc(&msg->data, size) before publish()?\n", + topic_name_.c_str()); + std::abort(); + } const size_t gpu_size = get_cuda_gpu_data_size(*raw_ptr); + if (gpu_size == 0) { + std::fprintf( + stderr, + "[agnocast] FATAL: CUDA message on topic '%s' has gpu_data_size == 0. " + "Ensure message fields (height, width, point_step, etc.) 
are set before publish().\n", + topic_name_.c_str()); + std::abort(); + } + auto & backend = agnocast::cuda::get_backend(); auto * meta = new GpuMetadata(); meta->publisher_gpu_ptr = raw_ptr->data; meta->gpu_data_size = gpu_size; From 1c1ea2a5f6853e442baab719cd1dafccb5a9d9ff Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:45:17 +0900 Subject: [PATCH 08/18] align message_creator entry_id type to int64_t Signed-off-by: sykwer --- src/agnocastlib/include/agnocast/agnocast_callback_info.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp index 154c454c6..5854eb692 100644 --- a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp @@ -47,7 +47,7 @@ struct CallbackInfo rclcpp::CallbackGroup::SharedPtr callback_group; TypeErasedCallback callback; std::function( - const void *, const std::string &, const topic_local_id_t, const uint64_t)> + const void *, const std::string &, const topic_local_id_t, const int64_t)> message_creator; bool need_epoll_update = true; }; From 1f5400b9f2cf54c94aca54d29278bed8a60bcc61 Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 04:53:32 +0900 Subject: [PATCH 09/18] rename get_local_gpu_ptr() to gpu_data() Signed-off-by: sykwer --- .../src/cuda_subscriber.cpp | 2 +- .../include/agnocast/agnocast_callback_info.hpp | 10 +++++----- .../include/agnocast/agnocast_smart_pointer.hpp | 16 ++++++++-------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp index 4c7b243d4..42b5294b8 100644 --- a/src/agnocast_sample_application/src/cuda_subscriber.cpp +++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp @@ -19,7 +19,7 @@ class CudaSubscriber : public agnocast::Node const size_t gpu_size = msg->height * width * point_step; // Get subscriber-local GPU pointer (mapped via CUDA IPC) - auto * gpu_ptr = static_cast(msg.get_local_gpu_ptr()); + auto * gpu_ptr = static_cast(msg.gpu_data()); // Read first few bytes from GPU to verify data uint8_t host_buf[16]{}; diff --git a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp index 5854eb692..08e101447 100644 --- a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp @@ -66,8 +66,8 @@ uint32_t allocate_callback_info_id(); // Creates an ipc_shared_ptr for a subscriber-received message. // For CUDA messages: imports the GPU handle, stores the subscriber-local GPU pointer in -// control_block->local_gpu_ptr, and registers a gpu_release_fn to release the mapping -// on last reference. The pointer is accessed via ipc_shared_ptr::get_local_gpu_ptr(). +// control_block->gpu_data_ptr, and registers a gpu_release_fn to release the mapping +// on last reference. The pointer is accessed via ipc_shared_ptr::gpu_data(). // For non-CUDA messages: simply wraps the pointer. template agnocast::ipc_shared_ptr create_subscriber_ipc_ptr( @@ -84,14 +84,14 @@ agnocast::ipc_shared_ptr create_subscriber_ipc_ptr( topic_name.c_str()); std::abort(); } - void * local_gpu_ptr = + void * gpu_data_ptr = agnocast::cuda::get_backend().import_handle(meta->handle, meta->gpu_data_size); // NOTE: If import_handle() fails, the backend aborts (fail-fast). 
If a future backend // returns nullptr instead, the subscriber would get a null gpu pointer. Callers should - // check get_local_gpu_ptr() != nullptr before use. + // check gpu_data() != nullptr before use. auto ipc_ptr = agnocast::ipc_shared_ptr(msg, topic_name, subscriber_id, entry_id); - ipc_ptr.set_local_gpu_ptr(local_gpu_ptr); + ipc_ptr.set_gpu_data_ptr(gpu_data_ptr); ipc_ptr.set_gpu_release_fn( [](void * ptr) { agnocast::cuda::get_backend().release_handle(ptr); }); return ipc_ptr; diff --git a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp index a98381ca4..fbc571898 100644 --- a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp @@ -69,7 +69,7 @@ struct control_block std::atomic valid{true}; // 1-byte alignment // Optional GPU cleanup function pointer. Null for non-CUDA messages. - // Called as gpu_release_fn(local_gpu_ptr) before bitmap release in reset() to ensure GPU + // Called as gpu_release_fn(gpu_data_ptr) before bitmap release in reset() to ensure GPU // mappings are released before the publisher can free the underlying GPU buffer. // Uses a plain function pointer instead of std::function to avoid heap allocation and // minimize overhead for non-CUDA messages (16 bytes for two pointers vs ~40+ bytes). @@ -78,7 +78,7 @@ struct control_block // Subscriber-local GPU device pointer obtained via import_handle(). // Stored here because the shared memory message is mapped read-only by the subscriber, // so we cannot inject the local pointer into msg->data. - void * local_gpu_ptr = nullptr; + void * gpu_data_ptr = nullptr; control_block(std::string topic, topic_local_id_t pubsub, int64_t entry) : topic_name(std::move(topic)), entry_id(entry), pubsub_id(pubsub) @@ -112,7 +112,7 @@ class ipc_shared_ptr template friend class BasicPublisher; - // Allow create_subscriber_ipc_ptr to call set_gpu_release_fn() and set_local_gpu_ptr() + // Allow create_subscriber_ipc_ptr to call set_gpu_release_fn() and set_gpu_data_ptr() template friend ipc_shared_ptr create_subscriber_ipc_ptr( MessageT *, const std::string &, const topic_local_id_t, const int64_t); @@ -145,7 +145,7 @@ class ipc_shared_ptr } } - // Sets a GPU release function to be invoked as fn(local_gpu_ptr) when the last reference + // Sets a GPU release function to be invoked as fn(gpu_data_ptr) when the last reference // is released. Private: only create_subscriber_ipc_ptr() should call this. void set_gpu_release_fn(void (*fn)(void *)) { @@ -156,10 +156,10 @@ class ipc_shared_ptr // Sets the subscriber-local GPU pointer (obtained via import_handle). // Private: only create_subscriber_ipc_ptr() should call this. - void set_local_gpu_ptr(void * ptr) + void set_gpu_data_ptr(void * ptr) { if (control_) { - control_->local_gpu_ptr = ptr; + control_->gpu_data_ptr = ptr; } } @@ -317,7 +317,7 @@ class ipc_shared_ptr T * get() const noexcept { return is_invalidated_() ? nullptr : ptr_; } // Returns the subscriber-local GPU device pointer, or nullptr for non-CUDA messages. - void * get_local_gpu_ptr() const noexcept { return control_ ? control_->local_gpu_ptr : nullptr; } + void * gpu_data() const noexcept { return control_ ? control_->gpu_data_ptr : nullptr; } // Thread-safe: atomically decrements ref count and performs cleanup if last reference. 
void reset() @@ -332,7 +332,7 @@ class ipc_shared_ptr // GPU cleanup must run BEFORE bitmap release: unmapping the GPU buffer before // the publisher is allowed to cudaFree the underlying allocation. if (control_->gpu_release_fn) { - control_->gpu_release_fn(control_->local_gpu_ptr); + control_->gpu_release_fn(control_->gpu_data_ptr); } if (control_->entry_id != ENTRY_ID_NOT_ASSIGNED) { From 8933d200ad829a68ff32ec902e1c607854115c1c Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 05:01:25 +0900 Subject: [PATCH 10/18] add error handling Signed-off-by: sykwer --- .../src/cuda_publisher.cpp | 14 ++++++++++++-- .../src/cuda_subscriber.cpp | 10 +++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp index 7abe85d9b..a6efbec7e 100644 --- a/src/agnocast_sample_application/src/cuda_publisher.cpp +++ b/src/agnocast_sample_application/src/cuda_publisher.cpp @@ -35,13 +35,23 @@ class CudaPublisher : public agnocast::Node // Allocate and fill GPU data const size_t gpu_size = msg->height * msg->width * msg->point_step; - cudaMalloc(&msg->data, gpu_size); + if (cudaMalloc(&msg->data, gpu_size) != cudaSuccess) { + RCLCPP_ERROR(get_logger(), "cudaMalloc failed: %s", cudaGetErrorString(cudaGetLastError())); + return; + } const int threads = 256; const int blocks = (gpu_size + threads - 1) / threads; // cppcheck-suppress shiftTooManyBits // false positive: <<< >>> is CUDA kernel launch syntax fill_kernel<<>>(msg->data, gpu_size, static_cast(count_)); - cudaStreamSynchronize(nullptr); + + if (cudaStreamSynchronize(nullptr) != cudaSuccess) { + RCLCPP_ERROR( + get_logger(), "kernel launch failed: %s", cudaGetErrorString(cudaGetLastError())); + cudaFree(msg->data); + msg->data = nullptr; + return; + } pub_->publish(std::move(msg)); RCLCPP_INFO( diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp index 42b5294b8..262c593d5 100644 --- a/src/agnocast_sample_application/src/cuda_subscriber.cpp +++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp @@ -20,11 +20,19 @@ class CudaSubscriber : public agnocast::Node // Get subscriber-local GPU pointer (mapped via CUDA IPC) auto * gpu_ptr = static_cast(msg.gpu_data()); + if (!gpu_ptr) { + RCLCPP_ERROR(get_logger(), "gpu_data() returned nullptr"); + return; + } // Read first few bytes from GPU to verify data uint8_t host_buf[16]{}; const size_t copy_size = std::min(gpu_size, sizeof(host_buf)); - cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost); + if (cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost) != cudaSuccess) { + RCLCPP_ERROR( + get_logger(), "cudaMemcpy failed: %s", cudaGetErrorString(cudaGetLastError())); + return; + } RCLCPP_INFO( get_logger(), From ec3226b6794324829521c7b8530c3f2995690c14 Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 05:05:44 +0900 Subject: [PATCH 11/18] add scripts Signed-off-by: sykwer --- scripts/sample_application/run_cuda_listener.bash | 4 ++++ scripts/sample_application/run_cuda_talker.bash | 4 ++++ .../launch/cuda_listener.launch.xml | 5 +++++ .../launch/cuda_talker.launch.xml | 5 +++++ 4 files changed, 18 insertions(+) create mode 100755 scripts/sample_application/run_cuda_listener.bash create mode 100755 scripts/sample_application/run_cuda_talker.bash create mode 100644 src/agnocast_sample_application/launch/cuda_listener.launch.xml create mode 100644 
src/agnocast_sample_application/launch/cuda_talker.launch.xml diff --git a/scripts/sample_application/run_cuda_listener.bash b/scripts/sample_application/run_cuda_listener.bash new file mode 100755 index 000000000..296600719 --- /dev/null +++ b/scripts/sample_application/run_cuda_listener.bash @@ -0,0 +1,4 @@ +#!/bin/bash + +source install/setup.bash +ros2 launch agnocast_sample_application cuda_listener.launch.xml diff --git a/scripts/sample_application/run_cuda_talker.bash b/scripts/sample_application/run_cuda_talker.bash new file mode 100755 index 000000000..3c8318bad --- /dev/null +++ b/scripts/sample_application/run_cuda_talker.bash @@ -0,0 +1,4 @@ +#!/bin/bash + +source install/setup.bash +ros2 launch agnocast_sample_application cuda_talker.launch.xml diff --git a/src/agnocast_sample_application/launch/cuda_listener.launch.xml b/src/agnocast_sample_application/launch/cuda_listener.launch.xml new file mode 100644 index 000000000..4136b9649 --- /dev/null +++ b/src/agnocast_sample_application/launch/cuda_listener.launch.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/src/agnocast_sample_application/launch/cuda_talker.launch.xml b/src/agnocast_sample_application/launch/cuda_talker.launch.xml new file mode 100644 index 000000000..ffad40cb2 --- /dev/null +++ b/src/agnocast_sample_application/launch/cuda_talker.launch.xml @@ -0,0 +1,5 @@ + + + + + From 34f93f7c3815548ee161cc009ce7c82f072a7f3c Mon Sep 17 00:00:00 2001 From: sykwer Date: Sat, 14 Mar 2026 05:08:09 +0900 Subject: [PATCH 12/18] fix Signed-off-by: sykwer --- src/agnocast_sample_application/src/cuda_subscriber.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp index 262c593d5..939b36c30 100644 --- a/src/agnocast_sample_application/src/cuda_subscriber.cpp +++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp @@ -29,8 +29,7 @@ class CudaSubscriber : public agnocast::Node uint8_t host_buf[16]{}; const size_t copy_size = std::min(gpu_size, sizeof(host_buf)); if (cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost) != cudaSuccess) { - RCLCPP_ERROR( - get_logger(), "cudaMemcpy failed: %s", cudaGetErrorString(cudaGetLastError())); + RCLCPP_ERROR(get_logger(), "cudaMemcpy failed: %s", cudaGetErrorString(cudaGetLastError())); return; } From ef83ec7b9372ac3d063c10ab7cd06e02697f3aac Mon Sep 17 00:00:00 2001 From: sykwer Date: Thu, 9 Apr 2026 16:37:25 +0900 Subject: [PATCH 13/18] update Signed-off-by: sykwer --- src/agnocast/package.xml | 1 + src/agnocast_cuda/CMakeLists.txt | 77 ++++---- src/agnocast_cuda/package.xml | 7 +- src/agnocast_cuda/src/cuda_ipc_backend.cpp | 22 ++- src/agnocast_cuda/src/cudart_loader.hpp | 184 ++++++++++++++++++ src/agnocast_cuda/src/get_backend.cpp | 13 +- .../CMakeLists.txt | 11 +- src/agnocast_sample_application/package.xml | 7 +- 8 files changed, 264 insertions(+), 58 deletions(-) create mode 100644 src/agnocast_cuda/src/cudart_loader.hpp diff --git a/src/agnocast/package.xml b/src/agnocast/package.xml index ae223d158..2332f1dea 100644 --- a/src/agnocast/package.xml +++ b/src/agnocast/package.xml @@ -18,6 +18,7 @@ agnocast_cie_config_msgs agnocast_cie_thread_configurator agnocast_components + agnocast_cuda agnocast_ioctl_wrapper agnocast_sample_application agnocast_sample_interfaces diff --git a/src/agnocast_cuda/CMakeLists.txt b/src/agnocast_cuda/CMakeLists.txt index d9fa6dd6a..dba9e71f1 100644 --- a/src/agnocast_cuda/CMakeLists.txt +++ 
b/src/agnocast_cuda/CMakeLists.txt @@ -13,46 +13,51 @@ find_package(ament_cmake REQUIRED) find_package(agnocastlib REQUIRED) find_package(sensor_msgs REQUIRED) -# CUDA is optional: if not available, headers are still installed but no library is built. -# Downstream packages that use CUDA message types must check if the agnocast_cuda library -# target exists (e.g., via find_package + if(TARGET agnocast_cuda::agnocast_cuda)). -include(CheckLanguage) -check_language(CUDA) +# ---- Runtime CUDA loading (no build-time CUDA dependency) ---- +# +# This package has ZERO build-time dependency on the CUDA toolkit. It does NOT use +# find_package(CUDAToolkit), enable_language(CUDA), or link against CUDA::cudart. +# +# Instead, libcudart.so is loaded at runtime via dlopen() (see src/cudart_loader.hpp). +# This allows the package to be built as a pre-built .deb on the ROS build farm, which +# has no CUDA toolchain. Users install CUDA separately on their target machines. +# +# The only system library we link is libdl (via ${CMAKE_DL_LIBS}) for dlopen/dlsym. +# +# IMPORTANT FOR DOWNSTREAM PACKAGES: +# Since this library always builds (even without CUDA), find_package(agnocast_cuda) and +# TARGET agnocast_cuda::agnocast_cuda are NOT reliable indicators that CUDA is available. +# Downstream packages that compile .cu files or call CUDA APIs directly (e.g., cudaMalloc) +# must ALSO check find_package(CUDAToolkit) independently. See the sample application's +# CMakeLists.txt for the correct pattern: +# find_package(CUDAToolkit QUIET) +# find_package(agnocast_cuda QUIET) +# if(CUDAToolkit_FOUND AND agnocast_cuda_FOUND) +# ... +# endif() +add_library(agnocast_cuda SHARED + src/cuda_ipc_backend.cpp + src/vmm_backend.cpp + src/nvscibuf_backend.cpp + src/unified_memory_backend.cpp + src/get_backend.cpp) -if(CMAKE_CUDA_COMPILER) - enable_language(CUDA) - find_package(CUDAToolkit QUIET) -endif() - -if(CUDAToolkit_FOUND) - add_library(agnocast_cuda SHARED - src/cuda_ipc_backend.cpp - src/vmm_backend.cpp - src/nvscibuf_backend.cpp - src/unified_memory_backend.cpp - src/get_backend.cpp) - - target_include_directories(agnocast_cuda PUBLIC - $ - $) +target_include_directories(agnocast_cuda PUBLIC + $ + $) - ament_target_dependencies(agnocast_cuda agnocastlib sensor_msgs) - target_link_libraries(agnocast_cuda CUDA::cudart) +ament_target_dependencies(agnocast_cuda agnocastlib sensor_msgs) +target_link_libraries(agnocast_cuda ${CMAKE_DL_LIBS}) - install(TARGETS agnocast_cuda - EXPORT export_${PROJECT_NAME} - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib - RUNTIME DESTINATION bin - INCLUDES DESTINATION include) +install(TARGETS agnocast_cuda + EXPORT export_${PROJECT_NAME} + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin + INCLUDES DESTINATION include) - ament_export_targets(export_${PROJECT_NAME} HAS_LIBRARY_TARGET) - ament_export_dependencies(agnocastlib sensor_msgs) - - message(STATUS "agnocast_cuda: CUDA found — building library and installing headers.") -else() - message(STATUS "agnocast_cuda: CUDA not found — installing headers only (no library).") -endif() +ament_export_targets(export_${PROJECT_NAME} HAS_LIBRARY_TARGET) +ament_export_dependencies(agnocastlib sensor_msgs) # Always install headers so downstream packages can use the type definitions # (e.g., cuda_message_tag.hpp is in agnocastlib, but types.hpp is here). 
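[Editor's note] Before the backend source diffs below, here is a minimal sketch of the lifecycle that CudaIpcBackend implements, added only to orient the reading; it is an illustration, not part of the patch. It reuses the names that appear in this series (agnocast::cuda::get_backend(), GpuMetadata, export_handle(), import_handle(), release_handle()), but the shared-memory transport of GpuMetadata and the ipc_shared_ptr reference counting that the real publisher/subscriber paths perform are elided, and the namespace qualifications shown are assumptions based on the header locations in these diffs.

// Hedged sketch: cross-process GPU buffer sharing as used by the publisher and
// subscriber code paths in this series. Not part of the patch.
#include "agnocast/gpu_metadata.hpp"
#include "agnocast/gpu_transfer_backend.hpp"

#include <cstddef>

// Publisher process: publish() fills GpuMetadata placed next to the message in shared memory.
void publisher_side(void * device_ptr, std::size_t gpu_size, agnocast::GpuMetadata & meta)
{
  auto & backend = agnocast::cuda::get_backend();
  meta.publisher_gpu_ptr = device_ptr;
  meta.gpu_data_size = gpu_size;
  // CudaIpcBackend: cudaIpcGetMemHandle on the device pointer.
  meta.handle = backend.export_handle(device_ptr, gpu_size);
}

// Subscriber process: create_subscriber_ipc_ptr() imports the handle and stores the
// subscriber-local pointer in the control block; release runs on the last reference.
void subscriber_side(const agnocast::GpuMetadata & meta)
{
  auto & backend = agnocast::cuda::get_backend();
  // CudaIpcBackend: cudaIpcOpenMemHandle with cudaIpcMemLazyEnablePeerAccess.
  void * local_ptr = backend.import_handle(meta.handle, meta.gpu_data_size);
  // ... launch kernels or cudaMemcpy using local_ptr ...
  // CudaIpcBackend: cudaIpcCloseMemHandle.
  backend.release_handle(local_ptr);
}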
diff --git a/src/agnocast_cuda/package.xml b/src/agnocast_cuda/package.xml index 1b0245d25..b01f3bc15 100644 --- a/src/agnocast_cuda/package.xml +++ b/src/agnocast_cuda/package.xml @@ -2,13 +2,18 @@ agnocast_cuda - 0.1.0 + 2.3.3 CUDA IPC support for Agnocast. Provides GPU message types and cross-process GPU buffer sharing. Takahiro Ishikawa-Aso + Koichi Imai + Masaki Nishikawa Apache License 2.0 + + Takahiro Ishikawa-Aso + ament_cmake agnocastlib diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.cpp b/src/agnocast_cuda/src/cuda_ipc_backend.cpp index 3a2bbb8b4..6f924db9e 100644 --- a/src/agnocast_cuda/src/cuda_ipc_backend.cpp +++ b/src/agnocast_cuda/src/cuda_ipc_backend.cpp @@ -1,6 +1,5 @@ #include "cuda_ipc_backend.hpp" - -#include +#include "cudart_loader.hpp" #include #include @@ -9,11 +8,12 @@ namespace { -void check_cuda_error(cudaError_t err, const char * operation) +void check_cuda_error(agnocast::cuda::cudaError_t err, const char * operation) { - if (err != cudaSuccess) { + if (err != agnocast::cuda::cudaSuccess) { std::fprintf( - stderr, "[agnocast_cuda] FATAL: %s failed: %s\n", operation, cudaGetErrorString(err)); + stderr, "[agnocast_cuda] FATAL: %s failed: %s\n", operation, + agnocast::cuda::CudartLoader::instance().cudaGetErrorString(err)); std::abort(); } } @@ -28,14 +28,15 @@ GpuHandle CudaIpcBackend::export_handle(void * device_ptr, size_t /*size*/) GpuHandle h{}; static_assert(sizeof(cudaIpcMemHandle_t) <= sizeof(h.opaque)); cudaIpcMemHandle_t ipc_handle; - check_cuda_error(cudaIpcGetMemHandle(&ipc_handle, device_ptr), "cudaIpcGetMemHandle"); + check_cuda_error( + CudartLoader::instance().cudaIpcGetMemHandle(&ipc_handle, device_ptr), "cudaIpcGetMemHandle"); std::memcpy(h.opaque, &ipc_handle, sizeof(ipc_handle)); return h; } void CudaIpcBackend::free_device_memory(void * device_ptr) { - check_cuda_error(cudaFree(device_ptr), "cudaFree"); + check_cuda_error(CudartLoader::instance().cudaFree(device_ptr), "cudaFree"); } void * CudaIpcBackend::import_handle(const GpuHandle & handle, size_t /*size*/) @@ -44,13 +45,16 @@ void * CudaIpcBackend::import_handle(const GpuHandle & handle, size_t /*size*/) std::memcpy(&ipc_handle, handle.opaque, sizeof(ipc_handle)); void * ptr = nullptr; check_cuda_error( - cudaIpcOpenMemHandle(&ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), "cudaIpcOpenMemHandle"); + CudartLoader::instance().cudaIpcOpenMemHandle( + &ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), + "cudaIpcOpenMemHandle"); return ptr; } void CudaIpcBackend::release_handle(void * local_ptr) { - check_cuda_error(cudaIpcCloseMemHandle(local_ptr), "cudaIpcCloseMemHandle"); + check_cuda_error( + CudartLoader::instance().cudaIpcCloseMemHandle(local_ptr), "cudaIpcCloseMemHandle"); } } // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/cudart_loader.hpp b/src/agnocast_cuda/src/cudart_loader.hpp new file mode 100644 index 000000000..af31b8967 --- /dev/null +++ b/src/agnocast_cuda/src/cudart_loader.hpp @@ -0,0 +1,184 @@ +// ============================================================================ +// Runtime loader for the CUDA Runtime API (libcudart.so) via dlopen/dlsym. +// +// WHY THIS EXISTS +// --------------- +// agnocast_cuda is distributed as a pre-built .deb from the ROS build farm, which has no +// CUDA toolchain. To eliminate the build-time dependency on and -lcudart, +// we replicate the small set of CUDA types/constants we need (stable ABI) and load the +// actual CUDA runtime library at runtime via dlopen. 
This means: +// - Build time: pure C++17, no CUDA headers or compiler required. +// - Run time: libcudart.so must be present on the target machine (user installs CUDA). +// +// SAFE ON NON-CUDA SYSTEMS +// ------------------------ +// Users who do not use CUDA message types are completely unaffected by this library. +// CudartLoader is lazy: dlopen("libcudart.so") only happens on the first call to +// CudartLoader::instance(), which is only reachable through get_backend(), which is +// only called from if-constexpr branches guarded by is_cuda_message_v. If no CUDA +// message type is ever instantiated, this loader is never constructed and no CUDA symbols +// are ever resolved. The library can be safely installed and loaded on machines without +// a GPU or CUDA runtime. +// +// ABI STABILITY +// ------------- +// The types and constants below are part of NVIDIA's stable C ABI for the CUDA Runtime API. +// They have not changed across CUDA 10.x, 11.x, and 12.x. Specifically: +// - cudaError_t is a C enum (int-sized). cudaSuccess has been 0 since CUDA 1.0. +// - cudaIpcMemHandle_t is a 64-byte opaque struct (CUDA_IPC_HANDLE_SIZE = 64). +// - cudaIpcMemLazyEnablePeerAccess is a flag constant (0x01). +// - cudaDevAttrIntegrated is enum value 73 in cudaDeviceAttr (append-only enum). +// If NVIDIA ever breaks this ABI (extremely unlikely), the static_assert in +// cuda_ipc_backend.cpp will catch size mismatches at compile time. +// +// ADDING NEW CUDA API CALLS +// ------------------------- +// When a new backend or feature needs an additional CUDA runtime function: +// 1. Add the function pointer typedef below (e.g., using cudaMalloc_t = ...). +// 2. Add a public member to CudartLoader (e.g., cudaMalloc_t cudaMalloc;). +// 3. Add a load_symbol() call in the constructor. +// Keep this file as the single place that touches dlopen/dlsym for CUDA. +// +// LIBRARY NAME FALLBACK CHAIN +// --------------------------- +// The constructor tries these names in order: +// 1. "libcudart.so" — unversioned symlink, present when the CUDA dev package is +// installed (e.g., cuda-cudart-dev-12-*) or with the runfile +// installer. +// 2. "libcudart.so.12" — SONAME for CUDA 12.x. Present with the runtime-only deb +// package (cuda-cudart-12-*) even without the dev package. +// 3. "libcudart.so.11.0"— SONAME for CUDA 11.x (NVIDIA used major.minor for the SONAME +// in the 11.x series; all 11.0–11.8 share this SONAME). +// If a future CUDA major version (e.g., 13.x) is released, add its SONAME to the list. +// ============================================================================ +#pragma once + +#include + +#include +#include + +namespace agnocast::cuda +{ + +// --------------------------------------------------------------------------- +// ABI-compatible CUDA Runtime API type definitions. +// These replace #include so that no CUDA headers are needed at build time. +// See "ABI STABILITY" section in the file header for rationale. +// --------------------------------------------------------------------------- +using cudaError_t = int; +constexpr cudaError_t cudaSuccess = 0; + +struct cudaIpcMemHandle_t +{ + char reserved[64]; // CUDA_IPC_HANDLE_SIZE = 64 +}; + +constexpr unsigned int cudaIpcMemLazyEnablePeerAccess = 0x01; + +using cudaDeviceAttr = int; +constexpr cudaDeviceAttr cudaDevAttrIntegrated = 18; // driver_types.h: cudaDevAttrIntegrated = 18 + +// --------------------------------------------------------------------------- +// Function pointer types matching CUDA Runtime API signatures. 
+// --------------------------------------------------------------------------- +using cudaGetDevice_t = cudaError_t (*)(int *); +using cudaDeviceGetAttribute_t = cudaError_t (*)(int *, cudaDeviceAttr, int); +using cudaIpcGetMemHandle_t = cudaError_t (*)(cudaIpcMemHandle_t *, void *); +using cudaIpcOpenMemHandle_t = cudaError_t (*)(void **, cudaIpcMemHandle_t, unsigned int); +using cudaIpcCloseMemHandle_t = cudaError_t (*)(void *); +using cudaFree_t = cudaError_t (*)(void *); +using cudaGetErrorString_t = const char * (*)(cudaError_t); + +// --------------------------------------------------------------------------- +// CudartLoader — lazy-loading singleton for the CUDA runtime. +// +// Thread safety: the function-local static in instance() is guaranteed to be +// initialized exactly once (C++11 "magic statics"). After construction, all +// function pointers are immutable and safe to read from any thread. +// +// Lifetime: the singleton is never destroyed during normal execution. We +// intentionally do NOT call dlclose() — see destructor comment. +// --------------------------------------------------------------------------- +class CudartLoader +{ +public: + static CudartLoader & instance() + { + static CudartLoader loader; + return loader; + } + + // Public function pointers — valid immediately after instance() returns. + // Callers use these like: CudartLoader::instance().cudaFree(ptr) + cudaGetDevice_t cudaGetDevice; + cudaDeviceGetAttribute_t cudaDeviceGetAttribute; + cudaIpcGetMemHandle_t cudaIpcGetMemHandle; + cudaIpcOpenMemHandle_t cudaIpcOpenMemHandle; + cudaIpcCloseMemHandle_t cudaIpcCloseMemHandle; + cudaFree_t cudaFree; + cudaGetErrorString_t cudaGetErrorString; + +private: + void * handle_ = nullptr; + + CudartLoader() + { + // Try library names in order. See "LIBRARY NAME FALLBACK CHAIN" in the file header. + const char * names[] = {"libcudart.so", "libcudart.so.12", "libcudart.so.11.0"}; + for (const char * name : names) { + handle_ = dlopen(name, RTLD_NOW | RTLD_LOCAL); + if (handle_ != nullptr) { + break; + } + } + if (handle_ == nullptr) { + std::fprintf( + stderr, + "[agnocast_cuda] FATAL: Could not load libcudart.so.\n" + " Your code uses a CUDA message type (e.g., agnocast::cuda::PointCloud2 or\n" + " agnocast::cuda::Image), which requires the NVIDIA CUDA runtime to be installed\n" + " on this machine. If you do not need GPU-accelerated message passing, use the\n" + " standard (non-CUDA) message types instead.\n" + " To fix: install the CUDA runtime (e.g., 'sudo apt install cuda-cudart-12-*')\n" + " and ensure libcudart.so is on LD_LIBRARY_PATH.\n" + " dlopen error: %s\n", + dlerror()); + std::abort(); + } + + load_symbol(cudaGetDevice, "cudaGetDevice"); + load_symbol(cudaDeviceGetAttribute, "cudaDeviceGetAttribute"); + load_symbol(cudaIpcGetMemHandle, "cudaIpcGetMemHandle"); + load_symbol(cudaIpcOpenMemHandle, "cudaIpcOpenMemHandle"); + load_symbol(cudaIpcCloseMemHandle, "cudaIpcCloseMemHandle"); + load_symbol(cudaFree, "cudaFree"); + load_symbol(cudaGetErrorString, "cudaGetErrorString"); + } + + // Intentionally never close the handle. The CUDA runtime maintains process-global state + // (device context, allocations, etc.). Calling dlclose() during static destruction can + // race with other CUDA cleanup and cause segfaults. The OS reclaims everything at exit. 
+ ~CudartLoader() = default; + + CudartLoader(const CudartLoader &) = delete; + CudartLoader & operator=(const CudartLoader &) = delete; + + template + void load_symbol(T & func_ptr, const char * name) + { + // POSIX requires clearing dlerror() before calling dlsym(), then checking dlerror() + // after, because dlsym() can legitimately return NULL for some symbols. + dlerror(); + func_ptr = reinterpret_cast(dlsym(handle_, name)); + const char * err = dlerror(); + if (err != nullptr) { + std::fprintf( + stderr, "[agnocast_cuda] FATAL: Could not load symbol '%s' from libcudart.so: %s\n", name, + err); + std::abort(); + } + } +}; + +} // namespace agnocast::cuda diff --git a/src/agnocast_cuda/src/get_backend.cpp b/src/agnocast_cuda/src/get_backend.cpp index 87068ff54..12b428099 100644 --- a/src/agnocast_cuda/src/get_backend.cpp +++ b/src/agnocast_cuda/src/get_backend.cpp @@ -1,11 +1,10 @@ #include "agnocast/gpu_transfer_backend.hpp" #include "cuda_ipc_backend.hpp" +#include "cudart_loader.hpp" #include "nvscibuf_backend.hpp" #include "unified_memory_backend.hpp" #include "vmm_backend.hpp" -#include - #include #include #include @@ -18,20 +17,22 @@ namespace std::unique_ptr select_backend() { + const auto & api = CudartLoader::instance(); + int device = 0; - cudaError_t err = cudaGetDevice(&device); + cudaError_t err = api.cudaGetDevice(&device); if (err != cudaSuccess) { std::fprintf( - stderr, "[agnocast_cuda] FATAL: cudaGetDevice failed: %s\n", cudaGetErrorString(err)); + stderr, "[agnocast_cuda] FATAL: cudaGetDevice failed: %s\n", api.cudaGetErrorString(err)); std::abort(); } int is_integrated = 0; - err = cudaDeviceGetAttribute(&is_integrated, cudaDevAttrIntegrated, device); + err = api.cudaDeviceGetAttribute(&is_integrated, cudaDevAttrIntegrated, device); if (err != cudaSuccess) { std::fprintf( stderr, "[agnocast_cuda] FATAL: cudaDeviceGetAttribute failed: %s\n", - cudaGetErrorString(err)); + api.cudaGetErrorString(err)); std::abort(); } diff --git a/src/agnocast_sample_application/CMakeLists.txt b/src/agnocast_sample_application/CMakeLists.txt index 52dd3153c..c3631df79 100644 --- a/src/agnocast_sample_application/CMakeLists.txt +++ b/src/agnocast_sample_application/CMakeLists.txt @@ -165,11 +165,16 @@ install(TARGETS no_rclcpp_client install(TARGETS sim_time_timer DESTINATION lib/${PROJECT_NAME}) -# CUDA sample applications (only built when CUDA and agnocast_cuda are available) -find_package(CUDAToolkit QUIET) +# CUDA sample applications (only built when the CUDA compiler, CUDA toolkit, and +# agnocast_cuda are all available). Note: agnocast_cuda itself has no build-time CUDA +# dependency (it dlopen's libcudart.so at runtime), but these sample apps compile .cu +# files with CUDA kernels, so they need the full CUDA toolchain. 
+include(CheckLanguage) +check_language(CUDA) find_package(agnocast_cuda QUIET) -if(CUDAToolkit_FOUND AND agnocast_cuda_FOUND) +if(CMAKE_CUDA_COMPILER AND agnocast_cuda_FOUND) enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) add_executable(cuda_talker src/cuda_publisher.cpp) set_source_files_properties(src/cuda_publisher.cpp PROPERTIES LANGUAGE CUDA) diff --git a/src/agnocast_sample_application/package.xml b/src/agnocast_sample_application/package.xml index 86596a893..5818580c4 100644 --- a/src/agnocast_sample_application/package.xml +++ b/src/agnocast_sample_application/package.xml @@ -31,9 +31,10 @@ ament_lint_auto ament_lint_common - + + agnocast_cuda ament_cmake From 31a6fb8fe82a81e8a7f1e94dcf93a49b920e2eea Mon Sep 17 00:00:00 2001 From: sykwer Date: Thu, 9 Apr 2026 16:45:21 +0900 Subject: [PATCH 14/18] fix Signed-off-by: sykwer --- src/agnocast/package.xml | 2 ++ src/agnocast_cuda/src/cudart_loader.hpp | 3 ++- src/agnocast_sample_application/package.xml | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/agnocast/package.xml b/src/agnocast/package.xml index 2332f1dea..8e7a132eb 100644 --- a/src/agnocast/package.xml +++ b/src/agnocast/package.xml @@ -18,6 +18,8 @@ agnocast_cie_config_msgs agnocast_cie_thread_configurator agnocast_components + agnocast_cuda agnocast_ioctl_wrapper agnocast_sample_application diff --git a/src/agnocast_cuda/src/cudart_loader.hpp b/src/agnocast_cuda/src/cudart_loader.hpp index af31b8967..7ebc1ab7d 100644 --- a/src/agnocast_cuda/src/cudart_loader.hpp +++ b/src/agnocast_cuda/src/cudart_loader.hpp @@ -27,7 +27,7 @@ // - cudaError_t is a C enum (int-sized). cudaSuccess has been 0 since CUDA 1.0. // - cudaIpcMemHandle_t is a 64-byte opaque struct (CUDA_IPC_HANDLE_SIZE = 64). // - cudaIpcMemLazyEnablePeerAccess is a flag constant (0x01). -// - cudaDevAttrIntegrated is enum value 73 in cudaDeviceAttr (append-only enum). +// - cudaDevAttrIntegrated is enum value 18 in cudaDeviceAttr (append-only enum). // If NVIDIA ever breaks this ABI (extremely unlikely), the static_assert in // cuda_ipc_backend.cpp will catch size mismatches at compile time. // @@ -125,6 +125,7 @@ class CudartLoader CudartLoader() { // Try library names in order. See "LIBRARY NAME FALLBACK CHAIN" in the file header. + dlerror(); // Clear any stale error before the dlopen loop. 
const char * names[] = {"libcudart.so", "libcudart.so.12", "libcudart.so.11.0"}; for (const char * name : names) { handle_ = dlopen(name, RTLD_NOW | RTLD_LOCAL); diff --git a/src/agnocast_sample_application/package.xml b/src/agnocast_sample_application/package.xml index 5818580c4..2a6cf3d7f 100644 --- a/src/agnocast_sample_application/package.xml +++ b/src/agnocast_sample_application/package.xml @@ -33,6 +33,8 @@ agnocast_cuda From a6531484ce6f90ca9043040aad2c15bd60568c83 Mon Sep 17 00:00:00 2001 From: sykwer Date: Thu, 9 Apr 2026 16:49:48 +0900 Subject: [PATCH 15/18] fix Signed-off-by: sykwer --- src/agnocast_cuda/src/cuda_ipc_backend.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.cpp b/src/agnocast_cuda/src/cuda_ipc_backend.cpp index 6f924db9e..36548dc32 100644 --- a/src/agnocast_cuda/src/cuda_ipc_backend.cpp +++ b/src/agnocast_cuda/src/cuda_ipc_backend.cpp @@ -1,4 +1,5 @@ #include "cuda_ipc_backend.hpp" + #include "cudart_loader.hpp" #include @@ -45,8 +46,7 @@ void * CudaIpcBackend::import_handle(const GpuHandle & handle, size_t /*size*/) std::memcpy(&ipc_handle, handle.opaque, sizeof(ipc_handle)); void * ptr = nullptr; check_cuda_error( - CudartLoader::instance().cudaIpcOpenMemHandle( - &ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), + CudartLoader::instance().cudaIpcOpenMemHandle(&ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess), "cudaIpcOpenMemHandle"); return ptr; } From fc4f87ab9fc453ad05b6e30ff0fbd78d0d8f46d3 Mon Sep 17 00:00:00 2001 From: sykwer Date: Thu, 9 Apr 2026 17:13:54 +0900 Subject: [PATCH 16/18] rename Signed-off-by: sykwer --- .../include/agnocast/cuda/{types.hpp => message_types.hpp} | 0 src/agnocast_sample_application/src/cuda_publisher.cpp | 2 +- src/agnocast_sample_application/src/cuda_subscriber.cpp | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename src/agnocast_cuda/include/agnocast/cuda/{types.hpp => message_types.hpp} (100%) diff --git a/src/agnocast_cuda/include/agnocast/cuda/types.hpp b/src/agnocast_cuda/include/agnocast/cuda/message_types.hpp similarity index 100% rename from src/agnocast_cuda/include/agnocast/cuda/types.hpp rename to src/agnocast_cuda/include/agnocast/cuda/message_types.hpp diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp index a6efbec7e..edb661c64 100644 --- a/src/agnocast_sample_application/src/cuda_publisher.cpp +++ b/src/agnocast_sample_application/src/cuda_publisher.cpp @@ -1,5 +1,5 @@ #include "agnocast/agnocast.hpp" -#include "agnocast/cuda/types.hpp" +#include "agnocast/cuda/message_types.hpp" #include diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp index 939b36c30..2a5940412 100644 --- a/src/agnocast_sample_application/src/cuda_subscriber.cpp +++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp @@ -1,5 +1,5 @@ #include "agnocast/agnocast.hpp" -#include "agnocast/cuda/types.hpp" +#include "agnocast/cuda/message_types.hpp" #include From ce5f5f04a855154b31bec69a2701ce5e8672c6fd Mon Sep 17 00:00:00 2001 From: sykwer Date: Thu, 9 Apr 2026 17:40:23 +0900 Subject: [PATCH 17/18] fix Signed-off-by: sykwer --- src/agnocast_sample_application/CMakeLists.txt | 2 ++ src/agnocast_sample_application/package.xml | 8 ++++---- .../src/cuda_publisher.cpp | 18 +++++++++++------- .../src/cuda_subscriber.cpp | 9 ++++++--- .../agnocast/agnocast_callback_info.hpp | 10 ++++++---- 
.../include/agnocast/agnocast_publisher.hpp | 17 +++++++++-------- 6 files changed, 38 insertions(+), 26 deletions(-) diff --git a/src/agnocast_sample_application/CMakeLists.txt b/src/agnocast_sample_application/CMakeLists.txt index c3631df79..7195e2e5a 100644 --- a/src/agnocast_sample_application/CMakeLists.txt +++ b/src/agnocast_sample_application/CMakeLists.txt @@ -169,6 +169,8 @@ install(TARGETS sim_time_timer # agnocast_cuda are all available). Note: agnocast_cuda itself has no build-time CUDA # dependency (it dlopen's libcudart.so at runtime), but these sample apps compile .cu # files with CUDA kernels, so they need the full CUDA toolchain. +# These binaries are NOT included in the pre-built deb from the ROS build farm (which +# has no CUDA). Users who want to try the CUDA samples must build from source with CUDA. include(CheckLanguage) check_language(CUDA) find_package(agnocast_cuda QUIET) diff --git a/src/agnocast_sample_application/package.xml b/src/agnocast_sample_application/package.xml index 2a6cf3d7f..1315e81e3 100644 --- a/src/agnocast_sample_application/package.xml +++ b/src/agnocast_sample_application/package.xml @@ -32,11 +32,11 @@ ament_lint_common - agnocast_cuda + Packages that compile .cu files must also check for CUDAToolkit in CMakeLists.txt. --> + agnocast_cuda + sensor_msgs ament_cmake diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp index edb661c64..62a0312cc 100644 --- a/src/agnocast_sample_application/src/cuda_publisher.cpp +++ b/src/agnocast_sample_application/src/cuda_publisher.cpp @@ -34,9 +34,12 @@ class CudaPublisher : public agnocast::Node msg->is_dense = true; // Allocate and fill GPU data - const size_t gpu_size = msg->height * msg->width * msg->point_step; - if (cudaMalloc(&msg->data, gpu_size) != cudaSuccess) { - RCLCPP_ERROR(get_logger(), "cudaMalloc failed: %s", cudaGetErrorString(cudaGetLastError())); + const size_t gpu_size = + static_cast(msg->height) * static_cast(msg->width) * + static_cast(msg->point_step); + const cudaError_t malloc_result = cudaMalloc(&msg->data, gpu_size); + if (malloc_result != cudaSuccess) { + RCLCPP_ERROR(get_logger(), "cudaMalloc failed: %s", cudaGetErrorString(malloc_result)); return; } @@ -45,9 +48,9 @@ class CudaPublisher : public agnocast::Node // cppcheck-suppress shiftTooManyBits // false positive: <<< >>> is CUDA kernel launch syntax fill_kernel<<>>(msg->data, gpu_size, static_cast(count_)); - if (cudaStreamSynchronize(nullptr) != cudaSuccess) { - RCLCPP_ERROR( - get_logger(), "kernel launch failed: %s", cudaGetErrorString(cudaGetLastError())); + const cudaError_t sync_result = cudaStreamSynchronize(nullptr); + if (sync_result != cudaSuccess) { + RCLCPP_ERROR(get_logger(), "kernel launch failed: %s", cudaGetErrorString(sync_result)); cudaFree(msg->data); msg->data = nullptr; return; @@ -55,7 +58,8 @@ class CudaPublisher : public agnocast::Node pub_->publish(std::move(msg)); RCLCPP_INFO( - get_logger(), "published CUDA PointCloud2: seq=%ld, gpu_size=%zu", count_++, gpu_size); + get_logger(), "published CUDA PointCloud2: seq=%lld, gpu_size=%zu", + static_cast(count_++), gpu_size); } public: diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp index 2a5940412..f63eb0047 100644 --- a/src/agnocast_sample_application/src/cuda_subscriber.cpp +++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp @@ -16,7 +16,8 @@ class CudaSubscriber : public agnocast::Node // Read 
CPU metadata from shared memory const uint32_t width = msg->width; const uint32_t point_step = msg->point_step; - const size_t gpu_size = msg->height * width * point_step; + const size_t gpu_size = + static_cast(msg->height) * static_cast(width) * static_cast(point_step); // Get subscriber-local GPU pointer (mapped via CUDA IPC) auto * gpu_ptr = static_cast(msg.gpu_data()); @@ -28,8 +29,10 @@ class CudaSubscriber : public agnocast::Node // Read first few bytes from GPU to verify data uint8_t host_buf[16]{}; const size_t copy_size = std::min(gpu_size, sizeof(host_buf)); - if (cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost) != cudaSuccess) { - RCLCPP_ERROR(get_logger(), "cudaMemcpy failed: %s", cudaGetErrorString(cudaGetLastError())); + const cudaError_t memcpy_result = + cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost); + if (memcpy_result != cudaSuccess) { + RCLCPP_ERROR(get_logger(), "cudaMemcpy failed: %s", cudaGetErrorString(memcpy_result)); return; } diff --git a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp index 08e101447..ff7b183de 100644 --- a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp @@ -1,10 +1,12 @@ #pragma once #include "agnocast/agnocast_smart_pointer.hpp" +#include "agnocast/agnocast_utils.hpp" #include "agnocast/cuda_message_tag.hpp" #include "agnocast/gpu_metadata.hpp" #include "agnocast/gpu_transfer_backend.hpp" +#include #include #include @@ -77,10 +79,10 @@ agnocast::ipc_shared_ptr create_subscriber_ipc_ptr( if constexpr (is_cuda_message_v) { auto * meta = static_cast(msg->gpu_metadata_); if (!meta) { - std::fprintf( - stderr, - "[agnocast] FATAL: CUDA message on topic '%s' has null gpu_metadata_. " - "The publisher may have failed to set GpuMetadata during publish().\n", + RCLCPP_ERROR( + logger, + "CUDA message on topic '%s' has null gpu_metadata_. " + "The publisher may have failed to set GpuMetadata during publish().", topic_name.c_str()); std::abort(); } diff --git a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp index 035436e74..720f14da7 100644 --- a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp +++ b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -219,19 +220,19 @@ class BasicPublisher // All CUDA message types must provide this by shadowing the base ROS message's data field. if constexpr (is_cuda_message_v) { if (!raw_ptr->data) { - std::fprintf( - stderr, - "[agnocast] FATAL: CUDA message on topic '%s' has null data pointer. " - "Did you forget to cudaMalloc(&msg->data, size) before publish()?\n", + RCLCPP_ERROR( + logger, + "CUDA message on topic '%s' has null data pointer. " + "Did you forget to cudaMalloc(&msg->data, size) before publish()?", topic_name_.c_str()); std::abort(); } const size_t gpu_size = get_cuda_gpu_data_size(*raw_ptr); if (gpu_size == 0) { - std::fprintf( - stderr, - "[agnocast] FATAL: CUDA message on topic '%s' has gpu_data_size == 0. " - "Ensure message fields (height, width, point_step, etc.) are set before publish().\n", + RCLCPP_ERROR( + logger, + "CUDA message on topic '%s' has gpu_data_size == 0. " + "Ensure message fields (height, width, point_step, etc.) 
are set before publish().",
+          topic_name_.c_str());
+        std::abort();
+      }

From 2336546e03b84803374397128dc53d6aae1bec60 Mon Sep 17 00:00:00 2001
From: sykwer
Date: Thu, 9 Apr 2026 17:42:37 +0900
Subject: [PATCH 18/18] fix

Signed-off-by: sykwer
---
 src/agnocast_sample_application/src/cuda_publisher.cpp  | 5 ++---
 src/agnocast_sample_application/src/cuda_subscriber.cpp | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp
index 62a0312cc..43775e30c 100644
--- a/src/agnocast_sample_application/src/cuda_publisher.cpp
+++ b/src/agnocast_sample_application/src/cuda_publisher.cpp
@@ -34,9 +34,8 @@ class CudaPublisher : public agnocast::Node
     msg->is_dense = true;
 
     // Allocate and fill GPU data
-    const size_t gpu_size =
-      static_cast<size_t>(msg->height) * static_cast<size_t>(msg->width) *
-      static_cast<size_t>(msg->point_step);
+    const size_t gpu_size = static_cast<size_t>(msg->height) * static_cast<size_t>(msg->width) *
+                            static_cast<size_t>(msg->point_step);
     const cudaError_t malloc_result = cudaMalloc(&msg->data, gpu_size);
     if (malloc_result != cudaSuccess) {
       RCLCPP_ERROR(get_logger(), "cudaMalloc failed: %s", cudaGetErrorString(malloc_result));
diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp
index f63eb0047..9f1be14e9 100644
--- a/src/agnocast_sample_application/src/cuda_subscriber.cpp
+++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp
@@ -16,8 +16,8 @@ class CudaSubscriber : public agnocast::Node
     // Read CPU metadata from shared memory
     const uint32_t width = msg->width;
     const uint32_t point_step = msg->point_step;
-    const size_t gpu_size =
-      static_cast<size_t>(msg->height) * static_cast<size_t>(width) * static_cast<size_t>(point_step);
+    const size_t gpu_size = static_cast<size_t>(msg->height) * static_cast<size_t>(width) *
+                            static_cast<size_t>(point_step);
 
     // Get subscriber-local GPU pointer (mapped via CUDA IPC)
     auto * gpu_ptr = static_cast<uint8_t *>(msg.gpu_data());
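
[Editor's note] The cudart_loader.hpp introduced in PATCH 13 documents a three-step recipe under "ADDING NEW CUDA API CALLS". As a concrete illustration, the hedged sketch below shows what exposing cudaMemcpy through CudartLoader could look like; the lines are meant to sit inside cudart_loader.hpp next to the existing typedefs and are not part of this patch series. The cudaMemcpy signature and the cudaMemcpyKind constant follow NVIDIA's public CUDA Runtime C API (cudaMemcpyDeviceToHost has value 2), mirroring the ABI-replication style the loader already uses.

// Hedged sketch, placed inside cudart_loader.hpp (not part of the patch series).
// Step 1: ABI-compatible types and the function pointer typedef.
using cudaMemcpyKind = int;
constexpr cudaMemcpyKind cudaMemcpyDeviceToHost = 2;
using cudaMemcpy_t = cudaError_t (*)(void *, const void *, size_t, cudaMemcpyKind);

// Step 2: a public member on CudartLoader, next to the existing pointers:
//   cudaMemcpy_t cudaMemcpy;
// Step 3: one more load_symbol() call in the CudartLoader constructor:
//   load_symbol(cudaMemcpy, "cudaMemcpy");

// A caller (for example, a future backend) would then write:
//   auto err = agnocast::cuda::CudartLoader::instance().cudaMemcpy(
//     dst, src, n, agnocast::cuda::cudaMemcpyDeviceToHost);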