diff --git a/scripts/sample_application/run_cuda_listener.bash b/scripts/sample_application/run_cuda_listener.bash
new file mode 100755
index 000000000..296600719
--- /dev/null
+++ b/scripts/sample_application/run_cuda_listener.bash
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+source install/setup.bash
+ros2 launch agnocast_sample_application cuda_listener.launch.xml
diff --git a/scripts/sample_application/run_cuda_talker.bash b/scripts/sample_application/run_cuda_talker.bash
new file mode 100755
index 000000000..3c8318bad
--- /dev/null
+++ b/scripts/sample_application/run_cuda_talker.bash
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+source install/setup.bash
+ros2 launch agnocast_sample_application cuda_talker.launch.xml
diff --git a/src/agnocast/package.xml b/src/agnocast/package.xml
index ae223d158..8e7a132eb 100644
--- a/src/agnocast/package.xml
+++ b/src/agnocast/package.xml
@@ -18,6 +18,9 @@
agnocast_cie_config_msgs
agnocast_cie_thread_configurator
agnocast_components
+
+  <exec_depend>agnocast_cuda</exec_depend>
agnocast_ioctl_wrapper
agnocast_sample_application
agnocast_sample_interfaces
diff --git a/src/agnocast_cuda/CMakeLists.txt b/src/agnocast_cuda/CMakeLists.txt
new file mode 100644
index 000000000..dba9e71f1
--- /dev/null
+++ b/src/agnocast_cuda/CMakeLists.txt
@@ -0,0 +1,69 @@
+cmake_minimum_required(VERSION 3.14)
+project(agnocast_cuda LANGUAGES CXX)
+
+if(NOT CMAKE_CXX_STANDARD)
+ set(CMAKE_CXX_STANDARD 17)
+endif()
+
+if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ add_compile_options(-Wall -Wextra -Wpedantic)
+endif()
+
+find_package(ament_cmake REQUIRED)
+find_package(agnocastlib REQUIRED)
+find_package(sensor_msgs REQUIRED)
+
+# ---- Runtime CUDA loading (no build-time CUDA dependency) ----
+#
+# This package has ZERO build-time dependency on the CUDA toolkit. It does NOT use
+# find_package(CUDAToolkit), enable_language(CUDA), or link against CUDA::cudart.
+#
+# Instead, libcudart.so is loaded at runtime via dlopen() (see src/cudart_loader.hpp).
+# This allows the package to be built as a pre-built .deb on the ROS build farm, which
+# has no CUDA toolchain. Users install CUDA separately on their target machines.
+#
+# The only system library we link is libdl (via ${CMAKE_DL_LIBS}) for dlopen/dlsym.
+#
+# IMPORTANT FOR DOWNSTREAM PACKAGES:
+# Since this library always builds (even without CUDA), find_package(agnocast_cuda) and
+# TARGET agnocast_cuda::agnocast_cuda are NOT reliable indicators that CUDA is available.
+# Downstream packages that compile .cu files or call CUDA APIs directly (e.g., cudaMalloc)
+# must ALSO check find_package(CUDAToolkit) independently. See the sample application's
+# CMakeLists.txt for the correct pattern:
+# find_package(CUDAToolkit QUIET)
+# find_package(agnocast_cuda QUIET)
+# if(CUDAToolkit_FOUND AND agnocast_cuda_FOUND)
+# ...
+# endif()
+add_library(agnocast_cuda SHARED
+ src/cuda_ipc_backend.cpp
+ src/vmm_backend.cpp
+ src/nvscibuf_backend.cpp
+ src/unified_memory_backend.cpp
+ src/get_backend.cpp)
+
+target_include_directories(agnocast_cuda PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  $<INSTALL_INTERFACE:include>)
+
+ament_target_dependencies(agnocast_cuda agnocastlib sensor_msgs)
+target_link_libraries(agnocast_cuda ${CMAKE_DL_LIBS})
+
+install(TARGETS agnocast_cuda
+ EXPORT export_${PROJECT_NAME}
+ LIBRARY DESTINATION lib
+ ARCHIVE DESTINATION lib
+ RUNTIME DESTINATION bin
+ INCLUDES DESTINATION include)
+
+ament_export_targets(export_${PROJECT_NAME} HAS_LIBRARY_TARGET)
+ament_export_dependencies(agnocastlib sensor_msgs)
+
+# Always install headers so downstream packages can use the type definitions
+# (e.g., cuda_message_tag.hpp is in agnocastlib, but types.hpp is here).
+install(
+ DIRECTORY include/
+ DESTINATION include)
+
+ament_export_include_directories(include)
+ament_package()
diff --git a/src/agnocast_cuda/include/agnocast/cuda/message_types.hpp b/src/agnocast_cuda/include/agnocast/cuda/message_types.hpp
new file mode 100644
index 000000000..c1d8a90c8
--- /dev/null
+++ b/src/agnocast_cuda/include/agnocast/cuda/message_types.hpp
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "agnocast/cuda_message_tag.hpp"
+
+#include <sensor_msgs/msg/image.hpp>
+#include <sensor_msgs/msg/point_cloud2.hpp>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace agnocast::cuda
+{
+
+struct PointCloud2 : public sensor_msgs::msg::PointCloud2, public agnocast::cuda_message_tag
+{
+ uint8_t * data = nullptr; // GPU device pointer (shadows base class std::vector data)
+};
+
+struct Image : public sensor_msgs::msg::Image, public agnocast::cuda_message_tag
+{
+ uint8_t * data = nullptr; // GPU device pointer (shadows base class std::vector data)
+};
+
+} // namespace agnocast::cuda
+
+namespace agnocast
+{
+
+template <>
+inline size_t get_cuda_gpu_data_size(const agnocast::cuda::PointCloud2 & msg)
+{
+  return static_cast<size_t>(msg.height) * static_cast<size_t>(msg.width) *
+         static_cast<size_t>(msg.point_step);
+}
+
+template <>
+inline size_t get_cuda_gpu_data_size(const agnocast::cuda::Image & msg)
+{
+  return static_cast<size_t>(msg.height) * msg.step;
+}
+
+} // namespace agnocast
diff --git a/src/agnocast_cuda/package.xml b/src/agnocast_cuda/package.xml
new file mode 100644
index 000000000..b01f3bc15
--- /dev/null
+++ b/src/agnocast_cuda/package.xml
@@ -0,0 +1,25 @@
+
+
+
+ agnocast_cuda
+ 2.3.3
+
+ CUDA IPC support for Agnocast. Provides GPU message types and cross-process GPU buffer sharing.
+
+ Takahiro Ishikawa-Aso
+ Koichi Imai
+ Masaki Nishikawa
+ Apache License 2.0
+
+
+ Takahiro Ishikawa-Aso
+
+ ament_cmake
+
+ agnocastlib
+ sensor_msgs
+
+
+ ament_cmake
+
+
diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.cpp b/src/agnocast_cuda/src/cuda_ipc_backend.cpp
new file mode 100644
index 000000000..36548dc32
--- /dev/null
+++ b/src/agnocast_cuda/src/cuda_ipc_backend.cpp
@@ -0,0 +1,60 @@
+#include "cuda_ipc_backend.hpp"
+
+#include "cudart_loader.hpp"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+namespace
+{
+
+void check_cuda_error(agnocast::cuda::cudaError_t err, const char * operation)
+{
+ if (err != agnocast::cuda::cudaSuccess) {
+ std::fprintf(
+ stderr, "[agnocast_cuda] FATAL: %s failed: %s\n", operation,
+ agnocast::cuda::CudartLoader::instance().cudaGetErrorString(err));
+ std::abort();
+ }
+}
+
+} // namespace
+
+namespace agnocast::cuda
+{
+
+GpuHandle CudaIpcBackend::export_handle(void * device_ptr, size_t /*size*/)
+{
+ GpuHandle h{};
+ static_assert(sizeof(cudaIpcMemHandle_t) <= sizeof(h.opaque));
+ cudaIpcMemHandle_t ipc_handle;
+ check_cuda_error(
+ CudartLoader::instance().cudaIpcGetMemHandle(&ipc_handle, device_ptr), "cudaIpcGetMemHandle");
+ std::memcpy(h.opaque, &ipc_handle, sizeof(ipc_handle));
+ return h;
+}
+
+void CudaIpcBackend::free_device_memory(void * device_ptr)
+{
+ check_cuda_error(CudartLoader::instance().cudaFree(device_ptr), "cudaFree");
+}
+
+void * CudaIpcBackend::import_handle(const GpuHandle & handle, size_t /*size*/)
+{
+ cudaIpcMemHandle_t ipc_handle;
+ std::memcpy(&ipc_handle, handle.opaque, sizeof(ipc_handle));
+ void * ptr = nullptr;
+ check_cuda_error(
+ CudartLoader::instance().cudaIpcOpenMemHandle(&ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess),
+ "cudaIpcOpenMemHandle");
+ return ptr;
+}
+
+void CudaIpcBackend::release_handle(void * local_ptr)
+{
+ check_cuda_error(
+ CudartLoader::instance().cudaIpcCloseMemHandle(local_ptr), "cudaIpcCloseMemHandle");
+}
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/cuda_ipc_backend.hpp b/src/agnocast_cuda/src/cuda_ipc_backend.hpp
new file mode 100644
index 000000000..fc262cea8
--- /dev/null
+++ b/src/agnocast_cuda/src/cuda_ipc_backend.hpp
@@ -0,0 +1,19 @@
+// Internal header — kept in src/ so it is NOT installed or visible to downstream packages.
+// Only get_backend.cpp includes this to instantiate the singleton.
+#pragma once
+
+#include "agnocast/gpu_transfer_backend.hpp"
+
+namespace agnocast::cuda
+{
+
+class CudaIpcBackend : public GpuTransferBackend
+{
+public:
+ GpuHandle export_handle(void * device_ptr, size_t size) override;
+ void free_device_memory(void * device_ptr) override;
+ void * import_handle(const GpuHandle & handle, size_t size) override;
+ void release_handle(void * local_ptr) override;
+};
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/cudart_loader.hpp b/src/agnocast_cuda/src/cudart_loader.hpp
new file mode 100644
index 000000000..7ebc1ab7d
--- /dev/null
+++ b/src/agnocast_cuda/src/cudart_loader.hpp
@@ -0,0 +1,185 @@
+// ============================================================================
+// Runtime loader for the CUDA Runtime API (libcudart.so) via dlopen/dlsym.
+//
+// WHY THIS EXISTS
+// ---------------
+// agnocast_cuda is distributed as a pre-built .deb from the ROS build farm, which has no
+// CUDA toolchain. To eliminate the build-time dependency on <cuda_runtime.h> and -lcudart,
+// we replicate the small set of CUDA types/constants we need (stable ABI) and load the
+// actual CUDA runtime library at runtime via dlopen. This means:
+// - Build time: pure C++17, no CUDA headers or compiler required.
+// - Run time: libcudart.so must be present on the target machine (user installs CUDA).
+//
+// SAFE ON NON-CUDA SYSTEMS
+// ------------------------
+// Users who do not use CUDA message types are completely unaffected by this library.
+// CudartLoader is lazy: dlopen("libcudart.so") only happens on the first call to
+// CudartLoader::instance(), which is only reachable through get_backend(), which is
+// only called from if-constexpr branches guarded by is_cuda_message_v. If no CUDA
+// message type is ever instantiated, this loader is never constructed and no CUDA symbols
+// are ever resolved. The library can be safely installed and loaded on machines without
+// a GPU or CUDA runtime.
+//
+// ABI STABILITY
+// -------------
+// The types and constants below are part of NVIDIA's stable C ABI for the CUDA Runtime API.
+// They have not changed across CUDA 10.x, 11.x, and 12.x. Specifically:
+// - cudaError_t is a C enum (int-sized). cudaSuccess has been 0 since CUDA 1.0.
+// - cudaIpcMemHandle_t is a 64-byte opaque struct (CUDA_IPC_HANDLE_SIZE = 64).
+// - cudaIpcMemLazyEnablePeerAccess is a flag constant (0x01).
+// - cudaDevAttrIntegrated is enum value 18 in cudaDeviceAttr (append-only enum).
+// If NVIDIA ever breaks this ABI (extremely unlikely), the static_assert in
+// cuda_ipc_backend.cpp will catch size mismatches at compile time.
+//
+// ADDING NEW CUDA API CALLS
+// -------------------------
+// When a new backend or feature needs an additional CUDA runtime function:
+// 1. Add the function pointer typedef below (e.g., using cudaMalloc_t = ...).
+// 2. Add a public member to CudartLoader (e.g., cudaMalloc_t cudaMalloc;).
+// 3. Add a load_symbol() call in the constructor.
+// Keep this file as the single place that touches dlopen/dlsym for CUDA.
+//
+// LIBRARY NAME FALLBACK CHAIN
+// ---------------------------
+// The constructor tries these names in order:
+// 1. "libcudart.so" — unversioned symlink, present when the CUDA dev package is
+// installed (e.g., cuda-cudart-dev-12-*) or with the runfile
+// installer.
+// 2. "libcudart.so.12" — SONAME for CUDA 12.x. Present with the runtime-only deb
+// package (cuda-cudart-12-*) even without the dev package.
+// 3. "libcudart.so.11.0"— SONAME for CUDA 11.x (NVIDIA used major.minor for the SONAME
+// in the 11.x series; all 11.0–11.8 share this SONAME).
+// If a future CUDA major version (e.g., 13.x) is released, add its SONAME to the list.
+// ============================================================================
+#pragma once
+
+#include <dlfcn.h>
+
+#include <cstdio>
+#include <cstdlib>
+
+namespace agnocast::cuda
+{
+
+// ---------------------------------------------------------------------------
+// ABI-compatible CUDA Runtime API type definitions.
+// These replace #include <cuda_runtime.h> so that no CUDA headers are needed at build time.
+// See "ABI STABILITY" section in the file header for rationale.
+// ---------------------------------------------------------------------------
+using cudaError_t = int;
+constexpr cudaError_t cudaSuccess = 0;
+
+struct cudaIpcMemHandle_t
+{
+ char reserved[64]; // CUDA_IPC_HANDLE_SIZE = 64
+};
+
+constexpr unsigned int cudaIpcMemLazyEnablePeerAccess = 0x01;
+
+using cudaDeviceAttr = int;
+constexpr cudaDeviceAttr cudaDevAttrIntegrated = 18; // driver_types.h: cudaDevAttrIntegrated = 18
+
+// ---------------------------------------------------------------------------
+// Function pointer types matching CUDA Runtime API signatures.
+// ---------------------------------------------------------------------------
+using cudaGetDevice_t = cudaError_t (*)(int *);
+using cudaDeviceGetAttribute_t = cudaError_t (*)(int *, cudaDeviceAttr, int);
+using cudaIpcGetMemHandle_t = cudaError_t (*)(cudaIpcMemHandle_t *, void *);
+using cudaIpcOpenMemHandle_t = cudaError_t (*)(void **, cudaIpcMemHandle_t, unsigned int);
+using cudaIpcCloseMemHandle_t = cudaError_t (*)(void *);
+using cudaFree_t = cudaError_t (*)(void *);
+using cudaGetErrorString_t = const char * (*)(cudaError_t);
+
+// ---------------------------------------------------------------------------
+// CudartLoader — lazy-loading singleton for the CUDA runtime.
+//
+// Thread safety: the function-local static in instance() is guaranteed to be
+// initialized exactly once (C++11 "magic statics"). After construction, all
+// function pointers are immutable and safe to read from any thread.
+//
+// Lifetime: the singleton is never destroyed during normal execution. We
+// intentionally do NOT call dlclose() — see destructor comment.
+// ---------------------------------------------------------------------------
+class CudartLoader
+{
+public:
+ static CudartLoader & instance()
+ {
+ static CudartLoader loader;
+ return loader;
+ }
+
+ // Public function pointers — valid immediately after instance() returns.
+ // Callers use these like: CudartLoader::instance().cudaFree(ptr)
+ cudaGetDevice_t cudaGetDevice;
+ cudaDeviceGetAttribute_t cudaDeviceGetAttribute;
+ cudaIpcGetMemHandle_t cudaIpcGetMemHandle;
+ cudaIpcOpenMemHandle_t cudaIpcOpenMemHandle;
+ cudaIpcCloseMemHandle_t cudaIpcCloseMemHandle;
+ cudaFree_t cudaFree;
+ cudaGetErrorString_t cudaGetErrorString;
+
+private:
+ void * handle_ = nullptr;
+
+ CudartLoader()
+ {
+ // Try library names in order. See "LIBRARY NAME FALLBACK CHAIN" in the file header.
+ dlerror(); // Clear any stale error before the dlopen loop.
+ const char * names[] = {"libcudart.so", "libcudart.so.12", "libcudart.so.11.0"};
+ for (const char * name : names) {
+ handle_ = dlopen(name, RTLD_NOW | RTLD_LOCAL);
+ if (handle_ != nullptr) {
+ break;
+ }
+ }
+ if (handle_ == nullptr) {
+ std::fprintf(
+ stderr,
+ "[agnocast_cuda] FATAL: Could not load libcudart.so.\n"
+ " Your code uses a CUDA message type (e.g., agnocast::cuda::PointCloud2 or\n"
+ " agnocast::cuda::Image), which requires the NVIDIA CUDA runtime to be installed\n"
+ " on this machine. If you do not need GPU-accelerated message passing, use the\n"
+ " standard (non-CUDA) message types instead.\n"
+ " To fix: install the CUDA runtime (e.g., 'sudo apt install cuda-cudart-12-*')\n"
+ " and ensure libcudart.so is on LD_LIBRARY_PATH.\n"
+ " dlopen error: %s\n",
+ dlerror());
+ std::abort();
+ }
+
+ load_symbol(cudaGetDevice, "cudaGetDevice");
+ load_symbol(cudaDeviceGetAttribute, "cudaDeviceGetAttribute");
+ load_symbol(cudaIpcGetMemHandle, "cudaIpcGetMemHandle");
+ load_symbol(cudaIpcOpenMemHandle, "cudaIpcOpenMemHandle");
+ load_symbol(cudaIpcCloseMemHandle, "cudaIpcCloseMemHandle");
+ load_symbol(cudaFree, "cudaFree");
+ load_symbol(cudaGetErrorString, "cudaGetErrorString");
+ }
+
+ // Intentionally never close the handle. The CUDA runtime maintains process-global state
+ // (device context, allocations, etc.). Calling dlclose() during static destruction can
+ // race with other CUDA cleanup and cause segfaults. The OS reclaims everything at exit.
+ ~CudartLoader() = default;
+
+ CudartLoader(const CudartLoader &) = delete;
+ CudartLoader & operator=(const CudartLoader &) = delete;
+
+  template <typename T>
+ void load_symbol(T & func_ptr, const char * name)
+ {
+ // POSIX requires clearing dlerror() before calling dlsym(), then checking dlerror()
+ // after, because dlsym() can legitimately return NULL for some symbols.
+ dlerror();
+    func_ptr = reinterpret_cast<T>(dlsym(handle_, name));
+ const char * err = dlerror();
+ if (err != nullptr) {
+ std::fprintf(
+ stderr, "[agnocast_cuda] FATAL: Could not load symbol '%s' from libcudart.so: %s\n", name,
+ err);
+ std::abort();
+ }
+ }
+};
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/get_backend.cpp b/src/agnocast_cuda/src/get_backend.cpp
new file mode 100644
index 000000000..12b428099
--- /dev/null
+++ b/src/agnocast_cuda/src/get_backend.cpp
@@ -0,0 +1,65 @@
+#include "agnocast/gpu_transfer_backend.hpp"
+#include "cuda_ipc_backend.hpp"
+#include "cudart_loader.hpp"
+#include "nvscibuf_backend.hpp"
+#include "unified_memory_backend.hpp"
+#include "vmm_backend.hpp"
+
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+
+namespace agnocast::cuda
+{
+
+namespace
+{
+
+std::unique_ptr<GpuTransferBackend> select_backend()
+{
+ const auto & api = CudartLoader::instance();
+
+ int device = 0;
+ cudaError_t err = api.cudaGetDevice(&device);
+ if (err != cudaSuccess) {
+ std::fprintf(
+ stderr, "[agnocast_cuda] FATAL: cudaGetDevice failed: %s\n", api.cudaGetErrorString(err));
+ std::abort();
+ }
+
+ int is_integrated = 0;
+ err = api.cudaDeviceGetAttribute(&is_integrated, cudaDevAttrIntegrated, device);
+ if (err != cudaSuccess) {
+ std::fprintf(
+ stderr, "[agnocast_cuda] FATAL: cudaDeviceGetAttribute failed: %s\n",
+ api.cudaGetErrorString(err));
+ std::abort();
+ }
+
+ if (!is_integrated) {
+ // Discrete GPU (GeForce, Quadro, Tesla, A/H series) — CUDA IPC is supported.
+ std::fprintf(stderr, "[agnocast_cuda] Discrete GPU detected, using CudaIpcBackend.\n");
+    return std::make_unique<CudaIpcBackend>();
+ }
+
+ // Integrated GPU (Jetson Xavier/Orin/Thor, DRIVE).
+ // TODO(agnocast): Implement and select the appropriate backend.
+ // - Jetson Thor (CUDA 13.0+): CudaIpcBackend may work via OpenRM.
+ // - Jetson Xavier/Orin: NvSciBufBackend or UnifiedMemoryBackend.
+ // - DRIVE: NvSciBufBackend.
+ std::fprintf(
+ stderr,
+ "[agnocast_cuda] FATAL: Integrated GPU detected (Jetson/DRIVE). "
+ "No backend is implemented yet for this platform.\n");
+ std::abort();
+}
+
+} // namespace
+
+GpuTransferBackend & get_backend()
+{
+ static auto instance = select_backend();
+ return *instance;
+}
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/nvscibuf_backend.cpp b/src/agnocast_cuda/src/nvscibuf_backend.cpp
new file mode 100644
index 000000000..0e900cedf
--- /dev/null
+++ b/src/agnocast_cuda/src/nvscibuf_backend.cpp
@@ -0,0 +1,30 @@
+#include "nvscibuf_backend.hpp"
+
+#include <stdexcept>
+
+namespace agnocast::cuda
+{
+
+GpuHandle NvSciBufBackend::export_handle(void * /*device_ptr*/, size_t /*size*/)
+{
+ throw std::runtime_error(
+ "[agnocast_cuda] NvSciBufBackend is not yet implemented. "
+ "Requires NvSciBuf (Jetson Xavier/Orin, NVIDIA DRIVE).");
+}
+
+void NvSciBufBackend::free_device_memory(void * /*device_ptr*/)
+{
+ throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented.");
+}
+
+void * NvSciBufBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/)
+{
+ throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented.");
+}
+
+void NvSciBufBackend::release_handle(void * /*local_ptr*/)
+{
+ throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented.");
+}
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/nvscibuf_backend.hpp b/src/agnocast_cuda/src/nvscibuf_backend.hpp
new file mode 100644
index 000000000..b631d1ca4
--- /dev/null
+++ b/src/agnocast_cuda/src/nvscibuf_backend.hpp
@@ -0,0 +1,21 @@
+// Internal header — kept in src/ so it is NOT installed or visible to downstream packages.
+// Only get_backend.cpp includes this to instantiate the singleton.
+#pragma once
+
+#include "agnocast/gpu_transfer_backend.hpp"
+
+namespace agnocast::cuda
+{
+
+// Placeholder backend using NvSciBuf / NvSciSync.
+// Target platforms: Jetson Xavier/Orin, NVIDIA DRIVE.
+class NvSciBufBackend : public GpuTransferBackend
+{
+public:
+ GpuHandle export_handle(void * device_ptr, size_t size) override;
+ void free_device_memory(void * device_ptr) override;
+ void * import_handle(const GpuHandle & handle, size_t size) override;
+ void release_handle(void * local_ptr) override;
+};
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/unified_memory_backend.cpp b/src/agnocast_cuda/src/unified_memory_backend.cpp
new file mode 100644
index 000000000..af97fff3b
--- /dev/null
+++ b/src/agnocast_cuda/src/unified_memory_backend.cpp
@@ -0,0 +1,30 @@
+#include "unified_memory_backend.hpp"
+
+#include <stdexcept>
+
+namespace agnocast::cuda
+{
+
+GpuHandle UnifiedMemoryBackend::export_handle(void * /*device_ptr*/, size_t /*size*/)
+{
+ throw std::runtime_error(
+ "[agnocast_cuda] UnifiedMemoryBackend is not yet implemented. "
+ "Requires POSIX shm + cudaHostRegister (Jetson unified memory).");
+}
+
+void UnifiedMemoryBackend::free_device_memory(void * /*device_ptr*/)
+{
+ throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented.");
+}
+
+void * UnifiedMemoryBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/)
+{
+ throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented.");
+}
+
+void UnifiedMemoryBackend::release_handle(void * /*local_ptr*/)
+{
+ throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented.");
+}
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/unified_memory_backend.hpp b/src/agnocast_cuda/src/unified_memory_backend.hpp
new file mode 100644
index 000000000..f9916f203
--- /dev/null
+++ b/src/agnocast_cuda/src/unified_memory_backend.hpp
@@ -0,0 +1,21 @@
+// Internal header — kept in src/ so it is NOT installed or visible to downstream packages.
+// Only get_backend.cpp includes this to instantiate the singleton.
+#pragma once
+
+#include "agnocast/gpu_transfer_backend.hpp"
+
+namespace agnocast::cuda
+{
+
+// Placeholder backend using POSIX shared memory + cudaHostRegister for Jetson platforms
+// where CPU and GPU share the same physical memory (unified memory architecture).
+class UnifiedMemoryBackend : public GpuTransferBackend
+{
+public:
+ GpuHandle export_handle(void * device_ptr, size_t size) override;
+ void free_device_memory(void * device_ptr) override;
+ void * import_handle(const GpuHandle & handle, size_t size) override;
+ void release_handle(void * local_ptr) override;
+};
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/vmm_backend.cpp b/src/agnocast_cuda/src/vmm_backend.cpp
new file mode 100644
index 000000000..4c14cbcd9
--- /dev/null
+++ b/src/agnocast_cuda/src/vmm_backend.cpp
@@ -0,0 +1,30 @@
+#include "vmm_backend.hpp"
+
+#include <stdexcept>
+
+namespace agnocast::cuda
+{
+
+GpuHandle VmmBackend::export_handle(void * /*device_ptr*/, size_t /*size*/)
+{
+ throw std::runtime_error(
+ "[agnocast_cuda] VmmBackend is not yet implemented. "
+ "Requires cuMemExportToShareableHandle (CUDA Driver API).");
+}
+
+void VmmBackend::free_device_memory(void * /*device_ptr*/)
+{
+ throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented.");
+}
+
+void * VmmBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/)
+{
+ throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented.");
+}
+
+void VmmBackend::release_handle(void * /*local_ptr*/)
+{
+ throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented.");
+}
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_cuda/src/vmm_backend.hpp b/src/agnocast_cuda/src/vmm_backend.hpp
new file mode 100644
index 000000000..491e5544a
--- /dev/null
+++ b/src/agnocast_cuda/src/vmm_backend.hpp
@@ -0,0 +1,22 @@
+// Internal header — kept in src/ so it is NOT installed or visible to downstream packages.
+// Only get_backend.cpp includes this to instantiate the singleton.
+#pragma once
+
+#include "agnocast/gpu_transfer_backend.hpp"
+
+namespace agnocast::cuda
+{
+
+// Placeholder backend using CUDA Virtual Memory Management (VMM) API.
+// Uses cuMemExportToShareableHandle / cuMemImportFromShareableHandle.
+// Supported on Jetson Orin (newer CUDA) and discrete GPUs with CUDA 10.2+.
+class VmmBackend : public GpuTransferBackend
+{
+public:
+ GpuHandle export_handle(void * device_ptr, size_t size) override;
+ void free_device_memory(void * device_ptr) override;
+ void * import_handle(const GpuHandle & handle, size_t size) override;
+ void release_handle(void * local_ptr) override;
+};
+
+} // namespace agnocast::cuda
diff --git a/src/agnocast_sample_application/CMakeLists.txt b/src/agnocast_sample_application/CMakeLists.txt
index 41d2aa423..7195e2e5a 100644
--- a/src/agnocast_sample_application/CMakeLists.txt
+++ b/src/agnocast_sample_application/CMakeLists.txt
@@ -165,6 +165,41 @@ install(TARGETS no_rclcpp_client
install(TARGETS sim_time_timer
DESTINATION lib/${PROJECT_NAME})
+# CUDA sample applications (only built when the CUDA compiler, CUDA toolkit, and
+# agnocast_cuda are all available). Note: agnocast_cuda itself has no build-time CUDA
+# dependency (it dlopen's libcudart.so at runtime), but these sample apps compile .cu
+# files with CUDA kernels, so they need the full CUDA toolchain.
+# These binaries are NOT included in the pre-built deb from the ROS build farm (which
+# has no CUDA). Users who want to try the CUDA samples must build from source with CUDA.
+include(CheckLanguage)
+check_language(CUDA)
+find_package(agnocast_cuda QUIET)
+if(CMAKE_CUDA_COMPILER AND agnocast_cuda_FOUND)
+ enable_language(CUDA)
+ find_package(CUDAToolkit REQUIRED)
+
+ add_executable(cuda_talker src/cuda_publisher.cpp)
+ set_source_files_properties(src/cuda_publisher.cpp PROPERTIES LANGUAGE CUDA)
+ ament_target_dependencies(cuda_talker agnocastlib agnocast_cuda sensor_msgs)
+ target_link_libraries(cuda_talker CUDA::cudart)
+ target_include_directories(cuda_talker PRIVATE
+ ${agnocastlib_INCLUDE_DIRS}
+ ${agnocast_cuda_INCLUDE_DIRS}
+ )
+
+ add_executable(cuda_listener src/cuda_subscriber.cpp)
+ set_source_files_properties(src/cuda_subscriber.cpp PROPERTIES LANGUAGE CUDA)
+ ament_target_dependencies(cuda_listener agnocastlib agnocast_cuda sensor_msgs)
+ target_link_libraries(cuda_listener CUDA::cudart)
+ target_include_directories(cuda_listener PRIVATE
+ ${agnocastlib_INCLUDE_DIRS}
+ ${agnocast_cuda_INCLUDE_DIRS}
+ )
+
+ install(TARGETS cuda_talker cuda_listener
+ DESTINATION lib/${PROJECT_NAME})
+endif()
+
install(DIRECTORY launch
DESTINATION share/${PROJECT_NAME}/
)
diff --git a/src/agnocast_sample_application/launch/cuda_listener.launch.xml b/src/agnocast_sample_application/launch/cuda_listener.launch.xml
new file mode 100644
index 000000000..4136b9649
--- /dev/null
+++ b/src/agnocast_sample_application/launch/cuda_listener.launch.xml
@@ -0,0 +1,5 @@
+<launch>
+  <node pkg="agnocast_sample_application" exec="cuda_listener" output="screen">
+    <env name="LD_PRELOAD" value="libagnocast_heaphook.so"/>
+  </node>
+</launch>
diff --git a/src/agnocast_sample_application/launch/cuda_talker.launch.xml b/src/agnocast_sample_application/launch/cuda_talker.launch.xml
new file mode 100644
index 000000000..ffad40cb2
--- /dev/null
+++ b/src/agnocast_sample_application/launch/cuda_talker.launch.xml
@@ -0,0 +1,5 @@
+<launch>
+  <node pkg="agnocast_sample_application" exec="cuda_talker" output="screen">
+    <env name="LD_PRELOAD" value="libagnocast_heaphook.so"/>
+  </node>
+</launch>
diff --git a/src/agnocast_sample_application/package.xml b/src/agnocast_sample_application/package.xml
index cf534190d..1315e81e3 100644
--- a/src/agnocast_sample_application/package.xml
+++ b/src/agnocast_sample_application/package.xml
@@ -31,6 +31,13 @@
ament_lint_auto
ament_lint_common
+
+  <depend>agnocast_cuda</depend>
+  <depend>sensor_msgs</depend>
+
ament_cmake
diff --git a/src/agnocast_sample_application/src/cuda_publisher.cpp b/src/agnocast_sample_application/src/cuda_publisher.cpp
new file mode 100644
index 000000000..43775e30c
--- /dev/null
+++ b/src/agnocast_sample_application/src/cuda_publisher.cpp
@@ -0,0 +1,82 @@
+#include "agnocast/agnocast.hpp"
+#include "agnocast/cuda/message_types.hpp"
+
+#include <cuda_runtime.h>
+
+using namespace std::chrono_literals;
+
+// Simple CUDA kernel: fills GPU buffer with incrementing values.
+__global__ void fill_kernel(uint8_t * data, size_t size, uint8_t offset)
+{
+ size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+ if (idx < size) {
+ data[idx] = static_cast((idx + offset) % 256);
+ }
+}
+
+class CudaPublisher : public agnocast::Node
+{
+ int64_t count_ = 0;
+  agnocast::Publisher<agnocast::cuda::PointCloud2>::SharedPtr pub_;
+ agnocast::TimerBase::SharedPtr timer_;
+
+ void timer_callback()
+ {
+ auto msg = pub_->borrow_loaned_message();
+
+ // Set CPU metadata
+    msg->header.stamp.sec = static_cast<int32_t>(count_);
+ msg->header.frame_id = "lidar";
+ msg->height = 1;
+ msg->width = 1024;
+ msg->point_step = 16;
+ msg->row_step = msg->width * msg->point_step;
+ msg->is_dense = true;
+
+ // Allocate and fill GPU data
+    const size_t gpu_size = static_cast<size_t>(msg->height) * static_cast<size_t>(msg->width) *
+                            static_cast<size_t>(msg->point_step);
+ const cudaError_t malloc_result = cudaMalloc(&msg->data, gpu_size);
+ if (malloc_result != cudaSuccess) {
+ RCLCPP_ERROR(get_logger(), "cudaMalloc failed: %s", cudaGetErrorString(malloc_result));
+ return;
+ }
+
+ const int threads = 256;
+ const int blocks = (gpu_size + threads - 1) / threads;
+ // cppcheck-suppress shiftTooManyBits // false positive: <<< >>> is CUDA kernel launch syntax
+    fill_kernel<<<blocks, threads>>>(msg->data, gpu_size, static_cast<uint8_t>(count_));
+
+ const cudaError_t sync_result = cudaStreamSynchronize(nullptr);
+ if (sync_result != cudaSuccess) {
+ RCLCPP_ERROR(get_logger(), "kernel launch failed: %s", cudaGetErrorString(sync_result));
+ cudaFree(msg->data);
+ msg->data = nullptr;
+ return;
+ }
+
+ pub_->publish(std::move(msg));
+ RCLCPP_INFO(
+ get_logger(), "published CUDA PointCloud2: seq=%lld, gpu_size=%zu",
+      static_cast<long long>(count_++), gpu_size);
+ }
+
+public:
+ CudaPublisher() : Node("cuda_publisher")
+ {
+    pub_ = this->create_publisher<agnocast::cuda::PointCloud2>("/cuda_pointcloud", 1);
+ timer_ = agnocast::create_timer(
+      this, std::make_shared<rclcpp::Clock>(RCL_STEADY_TIME), rclcpp::Duration(100ms),
+ std::bind(&CudaPublisher::timer_callback, this));
+ }
+};
+
+int main(int argc, char ** argv)
+{
+ agnocast::init(argc, argv);
+ agnocast::AgnocastOnlySingleThreadedExecutor executor;
+  auto node = std::make_shared<CudaPublisher>();
+ executor.add_node(node);
+ executor.spin();
+ return 0;
+}
diff --git a/src/agnocast_sample_application/src/cuda_subscriber.cpp b/src/agnocast_sample_application/src/cuda_subscriber.cpp
new file mode 100644
index 000000000..9f1be14e9
--- /dev/null
+++ b/src/agnocast_sample_application/src/cuda_subscriber.cpp
@@ -0,0 +1,62 @@
+#include "agnocast/agnocast.hpp"
+#include "agnocast/cuda/message_types.hpp"
+
+#include <cuda_runtime.h>
+
+#include <algorithm>
+
+using std::placeholders::_1;
+
+class CudaSubscriber : public agnocast::Node
+{
+  agnocast::Subscription<agnocast::cuda::PointCloud2>::SharedPtr sub_;
+
+  void callback(agnocast::ipc_shared_ptr<agnocast::cuda::PointCloud2> msg)
+ {
+ // Read CPU metadata from shared memory
+ const uint32_t width = msg->width;
+ const uint32_t point_step = msg->point_step;
+    const size_t gpu_size = static_cast<size_t>(msg->height) * static_cast<size_t>(width) *
+                            static_cast<size_t>(point_step);
+
+ // Get subscriber-local GPU pointer (mapped via CUDA IPC)
+    auto * gpu_ptr = static_cast<uint8_t *>(msg.gpu_data());
+ if (!gpu_ptr) {
+ RCLCPP_ERROR(get_logger(), "gpu_data() returned nullptr");
+ return;
+ }
+
+ // Read first few bytes from GPU to verify data
+ uint8_t host_buf[16]{};
+ const size_t copy_size = std::min(gpu_size, sizeof(host_buf));
+ const cudaError_t memcpy_result =
+ cudaMemcpy(host_buf, gpu_ptr, copy_size, cudaMemcpyDeviceToHost);
+ if (memcpy_result != cudaSuccess) {
+ RCLCPP_ERROR(get_logger(), "cudaMemcpy failed: %s", cudaGetErrorString(memcpy_result));
+ return;
+ }
+
+ RCLCPP_INFO(
+ get_logger(),
+ "received CUDA PointCloud2: width=%u, point_step=%u, gpu_size=%zu, "
+ "first_bytes=[%u,%u,%u,%u]",
+ width, point_step, gpu_size, host_buf[0], host_buf[1], host_buf[2], host_buf[3]);
+ }
+
+public:
+ CudaSubscriber() : Node("cuda_subscriber")
+ {
+    sub_ = this->create_subscription<agnocast::cuda::PointCloud2>(
+ "/cuda_pointcloud", 1, std::bind(&CudaSubscriber::callback, this, _1));
+ }
+};
+
+int main(int argc, char ** argv)
+{
+ agnocast::init(argc, argv);
+ agnocast::AgnocastOnlySingleThreadedExecutor executor;
+  auto node = std::make_shared<CudaSubscriber>();
+ executor.add_node(node);
+ executor.spin();
+ return 0;
+}
diff --git a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp
index 13b66fbf7..ff7b183de 100644
--- a/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp
+++ b/src/agnocastlib/include/agnocast/agnocast_callback_info.hpp
@@ -1,7 +1,12 @@
#pragma once
#include "agnocast/agnocast_smart_pointer.hpp"
+#include "agnocast/agnocast_utils.hpp"
+#include "agnocast/cuda_message_tag.hpp"
+#include "agnocast/gpu_metadata.hpp"
+#include "agnocast/gpu_transfer_backend.hpp"
+#include <cstdlib>
#include
#include
@@ -44,7 +49,7 @@ struct CallbackInfo
rclcpp::CallbackGroup::SharedPtr callback_group;
TypeErasedCallback callback;
std::function(
- const void *, const std::string &, const topic_local_id_t, const uint64_t)>
+ const void *, const std::string &, const topic_local_id_t, const int64_t)>
message_creator;
bool need_epoll_update = true;
};
@@ -61,6 +66,42 @@ extern std::atomic need_epoll_updates;
uint32_t allocate_callback_info_id();
+// Creates an ipc_shared_ptr for a subscriber-received message.
+// For CUDA messages: imports the GPU handle, stores the subscriber-local GPU pointer in
+// control_block->gpu_data_ptr, and registers a gpu_release_fn to release the mapping
+// on last reference. The pointer is accessed via ipc_shared_ptr::gpu_data().
+// For non-CUDA messages: simply wraps the pointer.
+template <typename MessageT>
+agnocast::ipc_shared_ptr<MessageT> create_subscriber_ipc_ptr(
+ MessageT * msg, const std::string & topic_name, const topic_local_id_t subscriber_id,
+ const int64_t entry_id)
+{
+ if constexpr (is_cuda_message_v<MessageT>) {
+ auto * meta = static_cast<const GpuMetadata *>(msg->gpu_metadata_);
+ if (!meta) {
+ RCLCPP_ERROR(
+ logger,
+ "CUDA message on topic '%s' has null gpu_metadata_. "
+ "The publisher may have failed to set GpuMetadata during publish().",
+ topic_name.c_str());
+ std::abort();
+ }
+ void * gpu_data_ptr =
+ agnocast::cuda::get_backend().import_handle(meta->handle, meta->gpu_data_size);
+ // NOTE: If import_handle() fails, the backend aborts (fail-fast). If a future backend
+ // returns nullptr instead, the subscriber would get a null gpu pointer. Callers should
+ // check gpu_data() != nullptr before use.
+
+ auto ipc_ptr = agnocast::ipc_shared_ptr<MessageT>(msg, topic_name, subscriber_id, entry_id);
+ ipc_ptr.set_gpu_data_ptr(gpu_data_ptr);
+ ipc_ptr.set_gpu_release_fn(
+ [](void * ptr) { agnocast::cuda::get_backend().release_handle(ptr); });
+ return ipc_ptr;
+ } else {
+ return agnocast::ipc_shared_ptr<MessageT>(msg, topic_name, subscriber_id, entry_id);
+ }
+}
+
template
TypeErasedCallback get_erased_callback(Func && callback)
{
@@ -95,9 +136,9 @@ uint32_t register_callback(
auto message_creator = [](
const void * ptr, const std::string & topic_name,
const topic_local_id_t subscriber_id, const int64_t entry_id) {
- return std::make_unique>(agnocast::ipc_shared_ptr(
- const_cast(static_cast(ptr)), topic_name, subscriber_id,
- entry_id));
+ auto * msg = const_cast<MessageT *>(static_cast<const MessageT *>(ptr));
+ return std::make_unique>(
+ create_subscriber_ipc_ptr(msg, topic_name, subscriber_id, entry_id));
};
uint32_t callback_info_id = allocate_callback_info_id();
diff --git a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp
index 4e08394a0..720f14da7 100644
--- a/src/agnocastlib/include/agnocast/agnocast_publisher.hpp
+++ b/src/agnocastlib/include/agnocast/agnocast_publisher.hpp
@@ -6,6 +6,9 @@
#include "agnocast/agnocast_smart_pointer.hpp"
#include "agnocast/agnocast_tracepoint_wrapper.h"
#include "agnocast/agnocast_utils.hpp"
+#include "agnocast/cuda_message_tag.hpp"
+#include "agnocast/gpu_metadata.hpp"
+#include "agnocast/gpu_transfer_backend.hpp"
#include "rclcpp/detail/qos_parameters.hpp"
#include "rclcpp/rclcpp.hpp"
@@ -19,6 +22,7 @@
#include
#include
+#include
#include
#include
#include
@@ -207,7 +211,38 @@ class BasicPublisher
}
// Capture raw pointer BEFORE invalidation (get() returns nullptr after invalidation).
- const uint64_t msg_virtual_address = reinterpret_cast(message.get());
+ MessageT * raw_ptr = message.get();
+ const uint64_t msg_virtual_address = reinterpret_cast(raw_ptr);
+
+ // CUDA publish hook: export GPU handle and allocate GpuMetadata in shared memory.
+ // Runs while heaphook is still active, so GpuMetadata lands in the publisher's shared memory.
+ // NOTE: Assumes MessageT has a public `data` member (uint8_t*) pointing to the GPU allocation.
+ // All CUDA message types must provide this by shadowing the base ROS message's data field.
+ if constexpr (is_cuda_message_v<MessageT>) {
+ if (!raw_ptr->data) {
+ RCLCPP_ERROR(
+ logger,
+ "CUDA message on topic '%s' has null data pointer. "
+ "Did you forget to cudaMalloc(&msg->data, size) before publish()?",
+ topic_name_.c_str());
+ std::abort();
+ }
+ const size_t gpu_size = get_cuda_gpu_data_size(*raw_ptr);
+ if (gpu_size == 0) {
+ RCLCPP_ERROR(
+ logger,
+ "CUDA message on topic '%s' has gpu_data_size == 0. "
+ "Ensure message fields (height, width, point_step, etc.) are set before publish().",
+ topic_name_.c_str());
+ std::abort();
+ }
+ auto & backend = agnocast::cuda::get_backend();
+ auto * meta = new GpuMetadata();
+ meta->publisher_gpu_ptr = raw_ptr->data;
+ meta->gpu_data_size = gpu_size;
+ meta->handle = backend.export_handle(raw_ptr->data, gpu_size);
+ raw_ptr->gpu_metadata_ = meta;
+ }
// Invalidate all references sharing this handle's control block.
// Any remaining copies held elsewhere will fail-fast on dereference.
@@ -220,6 +255,16 @@ class BasicPublisher
for (uint32_t i = 0; i < publish_msg_args.ret_released_num; i++) {
MessageT * release_ptr = reinterpret_cast(publish_msg_args.ret_released_addrs[i]);
+ // CUDA reclaim hook: free GPU buffer before deleting the message.
+ // On abnormal publisher exit, free_device_memory() is never called, but GPU device
+ // memory is reclaimed by the CUDA driver when the process exits.
+ if constexpr (is_cuda_message_v<MessageT>) {
+ if (release_ptr->gpu_metadata_) {
+ auto * meta = static_cast<GpuMetadata *>(release_ptr->gpu_metadata_);
+ agnocast::cuda::get_backend().free_device_memory(meta->publisher_gpu_ptr);
+ delete meta;
+ }
+ }
delete release_ptr;
}
diff --git a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp
index c0181ceda..b3ffafe8f 100644
--- a/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp
+++ b/src/agnocastlib/include/agnocast/agnocast_smart_pointer.hpp
@@ -40,10 +40,18 @@ extern int agnocast_fd;
// Sentinel value indicating entry_id has not been assigned (publisher-side, before publish).
constexpr int64_t ENTRY_ID_NOT_ASSIGNED = -1;
-// Forward declaration for friend access
+// Forward declarations for friend access
+template <typename T>
+class ipc_shared_ptr;
+
template
class BasicPublisher;
+template <typename MessageT>
+ipc_shared_ptr<MessageT> create_subscriber_ipc_ptr(
+ MessageT * msg, const std::string & topic_name, const topic_local_id_t subscriber_id,
+ const int64_t entry_id);
+
namespace detail
{
@@ -60,6 +68,18 @@ struct control_block
topic_local_id_t pubsub_id; // 4-byte alignment
std::atomic valid{true}; // 1-byte alignment
+ // Optional GPU cleanup function pointer. Null for non-CUDA messages.
+ // Called as gpu_release_fn(gpu_data_ptr) before bitmap release in reset() to ensure GPU
+ // mappings are released before the publisher can free the underlying GPU buffer.
+ // Uses a plain function pointer instead of std::function to avoid heap allocation and
+ // minimize overhead for non-CUDA messages (16 bytes for two pointers vs ~40+ bytes).
+ void (*gpu_release_fn)(void *) = nullptr;
+
+ // Subscriber-local GPU device pointer obtained via import_handle().
+ // Stored here because the shared memory message is mapped read-only by the subscriber,
+ // so we cannot inject the local pointer into msg->data.
+ void * gpu_data_ptr = nullptr;
+
control_block(std::string topic, topic_local_id_t pubsub, int64_t entry)
: topic_name(std::move(topic)), entry_id(entry), pubsub_id(pubsub)
{
@@ -105,6 +125,11 @@ class ipc_shared_ptr
template
friend class BasicPublisher;
+ // Allow create_subscriber_ipc_ptr to call set_gpu_release_fn() and set_gpu_data_ptr()
+ template <typename MessageT>
+ friend ipc_shared_ptr<MessageT> create_subscriber_ipc_ptr(
+ MessageT *, const std::string &, const topic_local_id_t, const int64_t);
+
// Allow converting constructors to access private members of ipc_shared_ptr
template
friend class ipc_shared_ptr;
@@ -133,6 +158,24 @@ class ipc_shared_ptr
}
}
+ // Sets a GPU release function to be invoked as fn(gpu_data_ptr) when the last reference
+ // is released. Private: only create_subscriber_ipc_ptr() should call this.
+ void set_gpu_release_fn(void (*fn)(void *))
+ {
+ if (control_) {
+ control_->gpu_release_fn = fn;
+ }
+ }
+
+ // Sets the subscriber-local GPU pointer (obtained via import_handle).
+ // Private: only create_subscriber_ipc_ptr() should call this.
+ void set_gpu_data_ptr(void * ptr)
+ {
+ if (control_) {
+ control_->gpu_data_ptr = ptr;
+ }
+ }
+
// Publisher-side constructor (entry_id not yet assigned).
// Creates control block for reference counting and one-shot invalidation.
// Private: users must call BasicPublisher::borrow_loaned_message() instead of constructing
@@ -339,6 +382,9 @@ class ipc_shared_ptr
AGNOCAST_PUBLIC
T * get() const noexcept { return is_invalidated_() ? nullptr : ptr_; }
+ // Returns the subscriber-local GPU device pointer, or nullptr for non-CUDA messages.
+ void * gpu_data() const noexcept { return control_ ? control_->gpu_data_ptr : nullptr; }
+
/**
* @brief Release ownership of the managed message. If this is the last reference: on the
* subscriber side, notifies the kernel module that the message can be reclaimed; on the
@@ -354,6 +400,12 @@ class ipc_shared_ptr
const bool was_last = control_->decrement_and_check();
if (was_last) {
+ // GPU cleanup must run BEFORE bitmap release: unmapping the GPU buffer before
+ // the publisher is allowed to cudaFree the underlying allocation.
+ if (control_->gpu_release_fn) {
+ control_->gpu_release_fn(control_->gpu_data_ptr);
+ }
+
if (control_->entry_id != ENTRY_ID_NOT_ASSIGNED) {
// Subscriber side: notify kmod that all references are released.
release_subscriber_reference(control_->topic_name, control_->pubsub_id, control_->entry_id);
diff --git a/src/agnocastlib/include/agnocast/agnocast_subscription.hpp b/src/agnocastlib/include/agnocast/agnocast_subscription.hpp
index f2d296485..9c06196c9 100644
--- a/src/agnocastlib/include/agnocast/agnocast_subscription.hpp
+++ b/src/agnocastlib/include/agnocast/agnocast_subscription.hpp
@@ -356,7 +356,7 @@ class BasicTakeSubscription : public SubscriptionBase
MessageT * ptr = reinterpret_cast(take_args.ret_addr);
auto result =
- agnocast::ipc_shared_ptr(ptr, topic_name_, id_, take_args.ret_entry_id);
+ create_subscriber_ipc_ptr(ptr, topic_name_, id_, take_args.ret_entry_id);
old_ptr = std::move(last_taken_ptr_);
last_taken_ptr_ = result;
return result;
@@ -364,7 +364,7 @@ class BasicTakeSubscription : public SubscriptionBase
}
MessageT * ptr = reinterpret_cast(take_args.ret_addr);
- return agnocast::ipc_shared_ptr(ptr, topic_name_, id_, take_args.ret_entry_id);
+ return create_subscriber_ipc_ptr(ptr, topic_name_, id_, take_args.ret_entry_id);
}
};
diff --git a/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp b/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp
index 984930f6e..942ad7978 100644
--- a/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp
+++ b/src/agnocastlib/include/agnocast/bridge/agnocast_bridge_node.hpp
@@ -4,6 +4,7 @@
#include "agnocast/agnocast_publisher.hpp"
#include "agnocast/agnocast_subscription.hpp"
#include "agnocast/bridge/agnocast_bridge_utils.hpp"
+#include "agnocast/cuda_message_tag.hpp"
#include "rclcpp/rclcpp.hpp"
#include
@@ -39,11 +40,25 @@ template
void request_bridge_core(
const std::string & topic_name, topic_local_id_t id, BridgeDirection direction)
{
- auto bridge_mode = get_bridge_mode();
- if (bridge_mode == BridgeMode::Standard) {
- send_bridge_request(topic_name, id, direction);
- } else if (bridge_mode == BridgeMode::Performance) {
- send_performance_bridge_request(topic_name, id, direction);
+ // CUDA message types cannot be bridged to ROS 2 directly (GPU pointers are not serializable).
+ // Bridge support for CUDA types (via cudaMemcpy D2H) is future work.
+ if constexpr (is_cuda_message_v) {
+ static const auto logger = rclcpp::get_logger("agnocast_bridge_requester");
+ RCLCPP_WARN(
+ logger,
+ "Bridge skipped for CUDA topic '%s': GPU message types cannot be bridged to ROS 2. "
+ "Use cudaMemcpy to a standard ROS message if DDS bridging is needed.",
+ topic_name.c_str());
+ (void)id;
+ (void)direction;
+ return;
+ } else {
+ auto bridge_mode = get_bridge_mode();
+ if (bridge_mode == BridgeMode::Standard) {
+ send_bridge_request(topic_name, id, direction);
+ } else if (bridge_mode == BridgeMode::Performance) {
+ send_performance_bridge_request(topic_name, id, direction);
+ }
}
}
diff --git a/src/agnocastlib/include/agnocast/cuda_message_tag.hpp b/src/agnocastlib/include/agnocast/cuda_message_tag.hpp
new file mode 100644
index 000000000..969221b01
--- /dev/null
+++ b/src/agnocastlib/include/agnocast/cuda_message_tag.hpp
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <cstddef>
+#include <type_traits>
+
+namespace agnocast
+{
+
+// Base class for CUDA message types. Provides a pointer to GpuMetadata stored in shared memory.
+// CUDA message types (e.g., agnocast::cuda::PointCloud2) inherit from both a ROS message type
+// and this tag. The publish/receive machinery uses is_cuda_message_v to detect CUDA messages
+// at compile time via if constexpr, so no CUDA headers are needed in agnocastlib.
+struct cuda_message_tag
+{
+ void * gpu_metadata_ = nullptr;
+};
+
+template <typename T>
+inline constexpr bool is_cuda_message_v =
+ std::is_base_of_v<cuda_message_tag, std::remove_cv_t<T>>;
+
+// Returns the GPU data size for a CUDA message. Must be specialized by each CUDA message type
+// in the agnocast_cuda package (e.g., for PointCloud2: height * width * point_step).
+template <typename T>
+size_t get_cuda_gpu_data_size(const T & msg);
+
+} // namespace agnocast
diff --git a/src/agnocastlib/include/agnocast/gpu_metadata.hpp b/src/agnocastlib/include/agnocast/gpu_metadata.hpp
new file mode 100644
index 000000000..f4c860cef
--- /dev/null
+++ b/src/agnocastlib/include/agnocast/gpu_metadata.hpp
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace agnocast
+{
+
+// Opaque handle large enough for cudaIpcMemHandle_t (64 bytes).
+struct GpuHandle
+{
+ uint8_t opaque[64];
+};
+
+// GPU sharing metadata stored in shared memory alongside the message.
+// Allocated by the publish path (while heaphook is active) so it lands in the publisher's
+// shared memory region and is readable by subscribers.
+struct GpuMetadata
+{
+ GpuHandle handle; // backend-specific shareable handle
+ size_t gpu_data_size; // size of the GPU allocation in bytes
+ void * publisher_gpu_ptr; // original device pointer for publisher-side free on reclaim
+};
+
+} // namespace agnocast
diff --git a/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp b/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp
new file mode 100644
index 000000000..4f91aca30
--- /dev/null
+++ b/src/agnocastlib/include/agnocast/gpu_transfer_backend.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "agnocast/gpu_metadata.hpp"
+
+#include <cstddef>
+
+namespace agnocast::cuda
+{
+
+// Abstract interface for GPU memory sharing backends.
+// Concrete implementations (CudaIpcBackend, etc.) live in the agnocast_cuda package.
+class GpuTransferBackend
+{
+public:
+ virtual ~GpuTransferBackend() = default;
+
+ // Publisher side: create a shareable handle from a device pointer.
+ virtual GpuHandle export_handle(void * device_ptr, size_t size) = 0;
+
+ // Publisher side: free GPU buffer on reclaim.
+ virtual void free_device_memory(void * device_ptr) = 0;
+
+ // Subscriber side: map GPU buffer into this process.
+ virtual void * import_handle(const GpuHandle & handle, size_t size) = 0;
+
+ // Subscriber side: unmap GPU buffer from this process.
+ virtual void release_handle(void * local_ptr) = 0;
+};
+
+// Returns the singleton backend instance. Defined in the agnocast_cuda package and resolved
+// at link time. Only called from if-constexpr branches guarded by is_cuda_message_v,
+// so the symbol is never referenced unless a CUDA message type is actually used.
+GpuTransferBackend & get_backend();
+
+} // namespace agnocast::cuda