Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions src/agnocast_cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
cmake_minimum_required(VERSION 3.14)
project(agnocast_cuda LANGUAGES CXX CUDA)

if(NOT CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
endif()

# Strict warnings on GCC/Clang only; other compilers keep their defaults.
if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
  add_compile_options(-Wall -Wextra -Wpedantic)
endif()

find_package(ament_cmake REQUIRED)
find_package(agnocastlib REQUIRED)
find_package(sensor_msgs REQUIRED)
find_package(CUDAToolkit REQUIRED)

# Single shared library containing every backend implementation; the
# appropriate one is selected at runtime (see src/get_backend.cpp).
add_library(agnocast_cuda SHARED
  src/cuda_ipc_backend.cpp
  src/vmm_backend.cpp
  src/nvscibuf_backend.cpp
  src/unified_memory_backend.cpp
  src/get_backend.cpp)

target_include_directories(agnocast_cuda PUBLIC
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
  $<INSTALL_INTERFACE:include>)

ament_target_dependencies(agnocast_cuda agnocastlib sensor_msgs)
target_link_libraries(agnocast_cuda CUDA::cudart)

install(TARGETS agnocast_cuda
  EXPORT export_${PROJECT_NAME}
  LIBRARY DESTINATION lib
  ARCHIVE DESTINATION lib
  RUNTIME DESTINATION bin
  INCLUDES DESTINATION include)

install(
  DIRECTORY include/
  DESTINATION include)

ament_export_targets(export_${PROJECT_NAME} HAS_LIBRARY_TARGET)
ament_export_include_directories(include)
ament_export_dependencies(agnocastlib sensor_msgs)
ament_package()
42 changes: 42 additions & 0 deletions src/agnocast_cuda/include/agnocast/cuda/types.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#pragma once

#include "agnocast/cuda_message_tag.hpp"

#include <sensor_msgs/msg/image.hpp>
#include <sensor_msgs/msg/point_cloud2.hpp>

#include <cstddef>
#include <cstdint>

namespace agnocast::cuda
{

// GPU-resident variant of sensor_msgs::msg::PointCloud2, tagged with
// agnocast::cuda_message_tag so the transport layer can recognize it.
// NOTE(review): `data` deliberately shadows the base class's
// `std::vector<uint8_t> data`; the (empty) CPU-side vector still exists in
// the base subobject and is still reachable through a base-class
// reference/pointer — confirm that no generic code (e.g. serialization)
// ever accesses the base member on these types.
struct PointCloud2 : public sensor_msgs::msg::PointCloud2, public agnocast::cuda_message_tag
{
uint8_t * data = nullptr; // GPU device pointer (shadows base class std::vector<uint8_t> data)
};

// GPU-resident variant of sensor_msgs::msg::Image; same shadowing caveat
// as PointCloud2 above applies to `data`.
struct Image : public sensor_msgs::msg::Image, public agnocast::cuda_message_tag
{
uint8_t * data = nullptr; // GPU device pointer (shadows base class std::vector<uint8_t> data)
};

} // namespace agnocast::cuda

namespace agnocast
{

// Byte size of the GPU payload backing a cuda::PointCloud2, derived from
// the message metadata: height * width * point_step.
// NOTE(review): sensor_msgs also carries `row_step`, which may include
// per-row padding; this assumes row_step == width * point_step — confirm
// publishers always satisfy that.
template <>
inline size_t get_cuda_gpu_data_size(const agnocast::cuda::PointCloud2 & msg)
{
  const auto rows = static_cast<size_t>(msg.height);
  const auto points_per_row = static_cast<size_t>(msg.width);
  const auto bytes_per_point = static_cast<size_t>(msg.point_step);
  return rows * points_per_row * bytes_per_point;
}

// Byte size of the GPU payload backing a cuda::Image: height * step
// (step is the full row length in bytes).
template <>
inline size_t get_cuda_gpu_data_size(const agnocast::cuda::Image & msg)
{
  const auto rows = static_cast<size_t>(msg.height);
  return rows * msg.step;
}

}  // namespace agnocast
20 changes: 20 additions & 0 deletions src/agnocast_cuda/package.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
<?xml version="1.0"?>
<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
<package format="3">
<name>agnocast_cuda</name>
<version>0.1.0</version>
<description>
CUDA IPC support for Agnocast. Provides GPU message types and cross-process GPU buffer sharing.
</description>
<maintainer email="sykwer@gmail.com">Takahiro Ishikawa-Aso</maintainer>
<license>Apache License 2.0</license>

<buildtool_depend>ament_cmake</buildtool_depend>

<!-- NOTE(review): the CUDA toolkit is located via CMake's find_package(CUDAToolkit)
     rather than declared here; confirm whether a rosdep key for CUDA should be added. -->
<depend>agnocastlib</depend>
<depend>sensor_msgs</depend>

<export>
<build_type>ament_cmake</build_type>
</export>
</package>
57 changes: 57 additions & 0 deletions src/agnocast_cuda/src/cuda_ipc_backend.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#include "cuda_ipc_backend.hpp"

#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <cstring>

namespace
{

// Print a diagnostic and abort if a CUDA runtime call failed.
// Failures here are treated as unrecoverable for this process: there is no
// meaningful way to continue sharing GPU buffers after an IPC-layer error.
void check_cuda_error(cudaError_t err, const char * operation)
{
  if (err == cudaSuccess) {
    return;
  }
  std::fprintf(
    stderr, "[agnocast_cuda] FATAL: %s failed: %s\n", operation, cudaGetErrorString(err));
  std::abort();
}

}  // namespace

namespace agnocast::cuda
{

// Export a device allocation as a POD handle that can cross process
// boundaries. `size` is unused: cudaIpcGetMemHandle identifies the whole
// allocation that `device_ptr` belongs to.
// NOTE(review): per CUDA docs this requires memory obtained via cudaMalloc —
// confirm all callers allocate that way.
GpuHandle CudaIpcBackend::export_handle(void * device_ptr, size_t /*size*/)
{
GpuHandle h{};
// Compile-time guarantee that the CUDA handle fits in the opaque byte array.
static_assert(sizeof(cudaIpcMemHandle_t) <= sizeof(h.opaque));
cudaIpcMemHandle_t ipc_handle;
check_cuda_error(cudaIpcGetMemHandle(&ipc_handle, device_ptr), "cudaIpcGetMemHandle");
std::memcpy(h.opaque, &ipc_handle, sizeof(ipc_handle));
return h;
}

// Owner-side release of a device allocation previously exported.
void CudaIpcBackend::free_device_memory(void * device_ptr)
{
check_cuda_error(cudaFree(device_ptr), "cudaFree");
}

// Importer-side: reconstruct the CUDA IPC handle from the opaque bytes and
// map the remote allocation into this process. `size` is unused; the handle
// maps the whole allocation.
// NOTE(review): CUDA allows a handle to be opened only once per device
// context per process — confirm callers cache the returned pointer.
void * CudaIpcBackend::import_handle(const GpuHandle & handle, size_t /*size*/)
{
cudaIpcMemHandle_t ipc_handle;
std::memcpy(&ipc_handle, handle.opaque, sizeof(ipc_handle));
void * ptr = nullptr;
check_cuda_error(
cudaIpcOpenMemHandle(&ptr, ipc_handle, cudaIpcMemLazyEnablePeerAccess),
"cudaIpcOpenMemHandle");
return ptr;
}

// Importer-side unmap of a pointer obtained from import_handle. Must not be
// called on owner-side pointers (those go through free_device_memory).
void CudaIpcBackend::release_handle(void * local_ptr)
{
check_cuda_error(cudaIpcCloseMemHandle(local_ptr), "cudaIpcCloseMemHandle");
}

} // namespace agnocast::cuda
19 changes: 19 additions & 0 deletions src/agnocast_cuda/src/cuda_ipc_backend.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Internal header — kept in src/ so it is NOT installed or visible to downstream packages.
// Only get_backend.cpp includes this to instantiate the singleton.
#pragma once

#include "agnocast/gpu_transfer_backend.hpp"

namespace agnocast::cuda
{

/// GPU buffer sharing backend built on CUDA IPC
/// (cudaIpcGetMemHandle / cudaIpcOpenMemHandle).
/// Chosen at runtime for discrete GPUs; see get_backend.cpp.
class CudaIpcBackend : public GpuTransferBackend
{
public:
/// Pack the allocation containing `device_ptr` into a process-portable handle.
GpuHandle export_handle(void * device_ptr, size_t size) override;
/// Owner-side free of an exported allocation (cudaFree).
void free_device_memory(void * device_ptr) override;
/// Map a handle exported by another process into this one; returns the local device pointer.
void * import_handle(const GpuHandle & handle, size_t size) override;
/// Unmap a pointer previously returned by import_handle (importer side only).
void release_handle(void * local_ptr) override;
};

} // namespace agnocast::cuda
64 changes: 64 additions & 0 deletions src/agnocast_cuda/src/get_backend.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#include "agnocast/gpu_transfer_backend.hpp"
#include "cuda_ipc_backend.hpp"
#include "nvscibuf_backend.hpp"
#include "unified_memory_backend.hpp"
#include "vmm_backend.hpp"

#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <memory>

namespace agnocast::cuda
{

namespace
{

// Decide which GPU transfer backend fits the device this process runs on.
// Discrete GPUs support CUDA IPC; integrated (Tegra-family) GPUs do not,
// and no alternative backend is implemented yet, so we fail fast there.
// CUDA runtime failures are fatal: without a working backend the library
// cannot provide GPU buffer sharing at all.
std::unique_ptr<GpuTransferBackend> select_backend()
{
  int device = 0;
  cudaError_t err = cudaGetDevice(&device);
  if (err != cudaSuccess) {
    std::fprintf(
      stderr, "[agnocast_cuda] FATAL: cudaGetDevice failed: %s\n", cudaGetErrorString(err));
    std::abort();
  }

  // cudaDevAttrIntegrated is 1 on GPUs that share physical memory with the
  // CPU (Jetson / DRIVE), 0 on discrete boards.
  int is_integrated = 0;
  err = cudaDeviceGetAttribute(&is_integrated, cudaDevAttrIntegrated, device);
  if (err != cudaSuccess) {
    std::fprintf(
      stderr, "[agnocast_cuda] FATAL: cudaDeviceGetAttribute failed: %s\n",
      cudaGetErrorString(err));
    std::abort();
  }

  if (!is_integrated) {
    // Discrete GPU (GeForce, Quadro, Tesla, A/H series) — CUDA IPC is supported.
    std::fprintf(stderr, "[agnocast_cuda] Discrete GPU detected, using CudaIpcBackend.\n");
    return std::make_unique<CudaIpcBackend>();
  }

  // Integrated GPU (Jetson Xavier/Orin/Thor, DRIVE).
  // TODO(agnocast): Implement and select the appropriate backend.
  // - Jetson Thor (CUDA 13.0+): CudaIpcBackend may work via OpenRM.
  // - Jetson Xavier/Orin: NvSciBufBackend or UnifiedMemoryBackend.
  // - DRIVE: NvSciBufBackend.
  std::fprintf(
    stderr,
    "[agnocast_cuda] FATAL: Integrated GPU detected (Jetson/DRIVE). "
    "No backend is implemented yet for this platform.\n");
  std::abort();
}

}  // namespace

GpuTransferBackend & get_backend()
{
  // Function-local static: the backend is selected exactly once, on first
  // use, with thread-safe initialization (C++11 magic statics), and lives
  // for the remainder of the process.
  static std::unique_ptr<GpuTransferBackend> backend = select_backend();
  return *backend;
}

} // namespace agnocast::cuda
30 changes: 30 additions & 0 deletions src/agnocast_cuda/src/nvscibuf_backend.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include "nvscibuf_backend.hpp"

#include <stdexcept>

namespace agnocast::cuda
{

namespace
{
// Shared throw site for the stub entry points below; keeps the message in
// one place until the real implementation lands.
[[noreturn]] void fail_not_implemented()
{
  throw std::runtime_error("[agnocast_cuda] NvSciBufBackend is not yet implemented.");
}
}  // namespace

// Every entry point is a stub: this backend is a placeholder for NvSciBuf
// support. export_handle carries the platform hint in its message.
GpuHandle NvSciBufBackend::export_handle(void * /*device_ptr*/, size_t /*size*/)
{
  throw std::runtime_error(
    "[agnocast_cuda] NvSciBufBackend is not yet implemented. "
    "Requires NvSciBuf (Jetson Xavier/Orin, NVIDIA DRIVE).");
}

void NvSciBufBackend::free_device_memory(void * /*device_ptr*/) { fail_not_implemented(); }

void * NvSciBufBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/)
{
  fail_not_implemented();
}

void NvSciBufBackend::release_handle(void * /*local_ptr*/) { fail_not_implemented(); }

}  // namespace agnocast::cuda
21 changes: 21 additions & 0 deletions src/agnocast_cuda/src/nvscibuf_backend.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Internal header — kept in src/ so it is NOT installed or visible to downstream packages.
// Only get_backend.cpp includes this to instantiate the singleton.
#pragma once

#include "agnocast/gpu_transfer_backend.hpp"

namespace agnocast::cuda
{

// Placeholder backend using NvSciBuf / NvSciSync.
// Target platforms: Jetson Xavier/Orin, NVIDIA DRIVE.
// All methods currently throw std::runtime_error (see nvscibuf_backend.cpp);
// the class exists so the runtime selection in get_backend.cpp can be wired
// up ahead of the implementation.
class NvSciBufBackend : public GpuTransferBackend
{
public:
GpuHandle export_handle(void * device_ptr, size_t size) override;
void free_device_memory(void * device_ptr) override;
void * import_handle(const GpuHandle & handle, size_t size) override;
void release_handle(void * local_ptr) override;
};

} // namespace agnocast::cuda
30 changes: 30 additions & 0 deletions src/agnocast_cuda/src/unified_memory_backend.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include "unified_memory_backend.hpp"

#include <stdexcept>

namespace agnocast::cuda
{

namespace
{
// Shared throw site for the stubs below; single place to update once the
// unified-memory implementation exists.
[[noreturn]] void fail_not_implemented()
{
  throw std::runtime_error("[agnocast_cuda] UnifiedMemoryBackend is not yet implemented.");
}
}  // namespace

// Every entry point is a stub: this backend is a placeholder for the
// shm + cudaHostRegister scheme. export_handle carries the hint in its message.
GpuHandle UnifiedMemoryBackend::export_handle(void * /*device_ptr*/, size_t /*size*/)
{
  throw std::runtime_error(
    "[agnocast_cuda] UnifiedMemoryBackend is not yet implemented. "
    "Requires POSIX shm + cudaHostRegister (Jetson unified memory).");
}

void UnifiedMemoryBackend::free_device_memory(void * /*device_ptr*/) { fail_not_implemented(); }

void * UnifiedMemoryBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/)
{
  fail_not_implemented();
}

void UnifiedMemoryBackend::release_handle(void * /*local_ptr*/) { fail_not_implemented(); }

}  // namespace agnocast::cuda
21 changes: 21 additions & 0 deletions src/agnocast_cuda/src/unified_memory_backend.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Internal header — kept in src/ so it is NOT installed or visible to downstream packages.
// Only get_backend.cpp includes this to instantiate the singleton.
#pragma once

#include "agnocast/gpu_transfer_backend.hpp"

namespace agnocast::cuda
{

// Placeholder backend using POSIX shared memory + cudaHostRegister for Jetson platforms
// where CPU and GPU share the same physical memory (unified memory architecture).
// All methods currently throw std::runtime_error (see unified_memory_backend.cpp);
// the class exists so get_backend.cpp can reference it before the
// implementation lands.
class UnifiedMemoryBackend : public GpuTransferBackend
{
public:
GpuHandle export_handle(void * device_ptr, size_t size) override;
void free_device_memory(void * device_ptr) override;
void * import_handle(const GpuHandle & handle, size_t size) override;
void release_handle(void * local_ptr) override;
};

} // namespace agnocast::cuda
30 changes: 30 additions & 0 deletions src/agnocast_cuda/src/vmm_backend.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include "vmm_backend.hpp"

#include <stdexcept>

namespace agnocast::cuda
{

namespace
{
// Shared throw site for the stubs below; single place to update once the
// CUDA VMM implementation exists.
[[noreturn]] void fail_not_implemented()
{
  throw std::runtime_error("[agnocast_cuda] VmmBackend is not yet implemented.");
}
}  // namespace

// Every entry point is a stub: this backend is a placeholder for the CUDA
// virtual memory management path. export_handle carries the API hint.
GpuHandle VmmBackend::export_handle(void * /*device_ptr*/, size_t /*size*/)
{
  throw std::runtime_error(
    "[agnocast_cuda] VmmBackend is not yet implemented. "
    "Requires cuMemExportToShareableHandle (CUDA Driver API).");
}

void VmmBackend::free_device_memory(void * /*device_ptr*/) { fail_not_implemented(); }

void * VmmBackend::import_handle(const GpuHandle & /*handle*/, size_t /*size*/)
{
  fail_not_implemented();
}

void VmmBackend::release_handle(void * /*local_ptr*/) { fail_not_implemented(); }

}  // namespace agnocast::cuda
Loading
Loading