Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions cub/cub/device/device_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
# pragma system_header
#endif // no system header

#include <cub/detail/env_dispatch.cuh>
#include <cub/device/dispatch/dispatch_batch_memcpy.cuh>

#include <cuda/std/__execution/env.h>
#include <cuda/std/__type_traits/is_pointer.h>
#include <cuda/std/cstdint>

Expand Down Expand Up @@ -169,6 +171,96 @@ struct DeviceMemcpy
DispatchBatchMemcpy<InputBufferIt, OutputBufferIt, BufferSizeIteratorT, BlockOffsetT, CopyAlg::Memcpy>::Dispatch(
d_temp_storage, temp_storage_bytes, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, stream);
}

//! @rst
//! Copies data from a batch of given source buffers to their corresponding destination buffer.
//!
//! .. versionadded:: 3.4.0
//!    First appears in CUDA Toolkit 13.4.
//!
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! .. note::
//!
//!   If any input buffer aliases memory from any output buffer the behavior is undefined.
//!   If any output buffer aliases memory of another output buffer the behavior is undefined.
//!   Input buffers can alias one another.
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates usage of DeviceMemcpy::Batched with an environment:
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_memcpy_env_api.cu
//!     :language: c++
//!     :dedent:
//!     :start-after: example-begin memcpy-batched-env
//!     :end-before: example-end memcpy-batched-env
//!
//! @endrst
//!
//! @tparam InputBufferIt
//!   **[inferred]** Device-accessible random-access input iterator type providing the pointers to
//!   the source memory buffers
//!
//! @tparam OutputBufferIt
//!   **[inferred]** Device-accessible random-access input iterator type providing the pointers to
//!   the destination memory buffers
//!
//! @tparam BufferSizeIteratorT
//!   **[inferred]** Device-accessible random-access input iterator type providing the number of bytes
//!   to be copied for each pair of buffers
//!
//! @tparam EnvT
//!   **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
//!
//! @param[in] input_buffer_it
//!   Device-accessible iterator providing the pointers to the source memory buffers
//!
//! @param[in] output_buffer_it
//!   Device-accessible iterator providing the pointers to the destination memory buffers
//!
//! @param[in] buffer_sizes
//!   Device-accessible iterator providing the number of bytes to be copied for each pair of buffers
//!
//! @param[in] num_buffers
//!   The total number of buffer pairs
//!
//! @param[in] env
//!   @rst
//!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//!   @endrst
template <typename InputBufferIt,
          typename OutputBufferIt,
          typename BufferSizeIteratorT,
          typename EnvT = ::cuda::std::execution::env<>,
          // Disambiguates this overload from the temp-storage API whose first argument is `void*`
          ::cuda::std::enable_if_t<!::cuda::std::is_same_v<InputBufferIt, void*>, int> = 0>
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
Batched(InputBufferIt input_buffer_it,
        OutputBufferIt output_buffer_it,
        BufferSizeIteratorT buffer_sizes,
        ::cuda::std::int64_t num_buffers,
        EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceMemcpy::Batched");
  // DeviceMemcpy moves raw bytes between memory buffers, so the iterators must yield pointers.
  // Fixed: the two message fragments were concatenated without a separating space, which
  // produced the diagnostic "...buffers.Please consider...".
  static_assert(::cuda::std::is_pointer_v<cub::detail::it_value_t<InputBufferIt>>,
                "DeviceMemcpy::Batched only supports copying of memory buffers. "
                "Please consider using DeviceCopy::Batched instead.");
  static_assert(::cuda::std::is_pointer_v<cub::detail::it_value_t<OutputBufferIt>>,
                "DeviceMemcpy::Batched only supports copying of memory buffers. "
                "Please consider using DeviceCopy::Batched instead.");

  // Offset type used by the dispatch layer. Qualified with ::cuda::std:: for consistency with
  // the ::cuda::std::int64_t parameter above (<cuda/std/cstdint> is already included).
  using BlockOffsetT = ::cuda::std::uint32_t;

  // dispatch_with_env drives the usual two-phase CUB invocation (size query, then launch),
  // taking the stream — and, per the documentation above, the memory resource for temporary
  // storage — from `env`.
  return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
    return detail::
      DispatchBatchMemcpy<InputBufferIt, OutputBufferIt, BufferSizeIteratorT, BlockOffsetT, CopyAlg::Memcpy>::Dispatch(
        storage, bytes, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, stream);
  });
}
};

CUB_NAMESPACE_END
131 changes: 131 additions & 0 deletions cub/test/catch2_test_device_memcpy_env.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Should precede any includes
struct stream_registry_factory_t;
#define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY stream_registry_factory_t

#include "insert_nested_NVTX_range_guard.h"

#include <cub/device/device_memcpy.cuh>

#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include "catch2_test_env_launch_helper.h"

DECLARE_LAUNCH_WRAPPER(cub::DeviceMemcpy::Batched, device_memcpy_batched);

// %PARAM% TEST_LAUNCH lid 0:1:2

#include <cuda/__execution/require.h>

#include <c2h/catch2_test_helper.h>

namespace stdexec = cuda::std::execution;

// Maps a buffer index to the pointer at which that buffer starts, by applying
// the index-th entry of an offsets table to a base pointer.
template <typename T>
struct index_to_ptr
{
  T* base; // start of the backing allocation
  const int* offsets; // per-buffer element offsets into `base`
  __host__ __device__ __forceinline__ T* operator()(int index) const
  {
    const int shift = offsets[index];
    return &base[shift];
  }
};

// Computes the size in bytes of the index-th buffer from an exclusive-scan
// offsets table (offsets[i+1] - offsets[i] elements of int each).
struct get_size
{
  const int* offsets; // per-buffer element offsets; entry i+1 must be valid
  __host__ __device__ __forceinline__ int operator()(int index) const
  {
    const int begin = offsets[index];
    const int end   = offsets[index + 1];
    return (end - begin) * static_cast<int>(sizeof(int));
  }
};

#if TEST_LAUNCH == 0
// NOTE(review): this test calls cub::DeviceMemcpy::Batched directly rather than through the
// device_memcpy_batched launch wrapper, so it is compiled for the host launch id only —
// presumably the wrapper cannot forward this temp-storage-free overload; confirm whether
// launch ids 1 and 2 could be supported.
TEST_CASE("DeviceMemcpy::Batched works with default environment", "[memcpy][device]")
{
  // Three buffers of sizes 2, 3, and 1: [10, 20], [30, 40, 50], [60]
  auto source      = c2h::device_vector<int>{10, 20, 30, 40, 50, 60};
  auto destination = c2h::device_vector<int>(6, 0);
  auto offsets     = c2h::device_vector<int>{0, 2, 5, 6};

  const int buffer_count = 3;

  // Build iterators of source pointers, destination pointers, and byte counts
  // from the shared offsets table.
  auto indices  = thrust::counting_iterator<int>{0};
  auto src_ptrs = thrust::make_transform_iterator(
    indices,
    index_to_ptr<const int>{thrust::raw_pointer_cast(source.data()), thrust::raw_pointer_cast(offsets.data())});
  auto dst_ptrs = thrust::make_transform_iterator(
    indices, index_to_ptr<int>{thrust::raw_pointer_cast(destination.data()), thrust::raw_pointer_cast(offsets.data())});
  auto byte_counts = thrust::make_transform_iterator(indices, get_size{thrust::raw_pointer_cast(offsets.data())});

  // Default-constructed environment: default stream, default memory resource.
  REQUIRE(cudaSuccess == cub::DeviceMemcpy::Batched(src_ptrs, dst_ptrs, byte_counts, buffer_count));

  REQUIRE(destination == source);
}
#endif // TEST_LAUNCH == 0

C2H_TEST("DeviceMemcpy::Batched uses environment", "[memcpy][device]")
{
  // Three buffers of sizes 2, 3, and 1: [10, 20], [30, 40, 50], [60]
  auto source      = c2h::device_vector<int>{10, 20, 30, 40, 50, 60};
  auto destination = c2h::device_vector<int>(6, 0);
  auto offsets     = c2h::device_vector<int>{0, 2, 5, 6};

  const int buffer_count = 3;

  // Build iterators of source pointers, destination pointers, and byte counts
  // from the shared offsets table.
  auto indices  = thrust::counting_iterator<int>{0};
  auto src_ptrs = thrust::make_transform_iterator(
    indices,
    index_to_ptr<const int>{thrust::raw_pointer_cast(source.data()), thrust::raw_pointer_cast(offsets.data())});
  auto dst_ptrs = thrust::make_transform_iterator(
    indices, index_to_ptr<int>{thrust::raw_pointer_cast(destination.data()), thrust::raw_pointer_cast(offsets.data())});
  auto byte_counts = thrust::make_transform_iterator(indices, get_size{thrust::raw_pointer_cast(offsets.data())});

  // Query the temp-storage size via the classic two-phase API so the launch wrapper can
  // verify the environment-based overload allocates exactly that much.
  size_t temp_storage_query{};
  REQUIRE(cudaSuccess
          == cub::DeviceMemcpy::Batched(nullptr, temp_storage_query, src_ptrs, dst_ptrs, byte_counts, buffer_count));

  auto env = stdexec::env{expected_allocation_size(temp_storage_query)};

  device_memcpy_batched(src_ptrs, dst_ptrs, byte_counts, buffer_count, env);

  REQUIRE(destination == source);
}

TEST_CASE("DeviceMemcpy::Batched uses custom stream", "[memcpy][device]")
{
  // Three buffers of sizes 2, 3, and 1: [10, 20], [30, 40, 50], [60]
  auto source      = c2h::device_vector<int>{10, 20, 30, 40, 50, 60};
  auto destination = c2h::device_vector<int>(6, 0);
  auto offsets     = c2h::device_vector<int>{0, 2, 5, 6};

  const int buffer_count = 3;

  // Build iterators of source pointers, destination pointers, and byte counts
  // from the shared offsets table.
  auto indices  = thrust::counting_iterator<int>{0};
  auto src_ptrs = thrust::make_transform_iterator(
    indices,
    index_to_ptr<const int>{thrust::raw_pointer_cast(source.data()), thrust::raw_pointer_cast(offsets.data())});
  auto dst_ptrs = thrust::make_transform_iterator(
    indices, index_to_ptr<int>{thrust::raw_pointer_cast(destination.data()), thrust::raw_pointer_cast(offsets.data())});
  auto byte_counts = thrust::make_transform_iterator(indices, get_size{thrust::raw_pointer_cast(offsets.data())});

  // NOTE(review): a raw cudaStreamCreate stream is used here; a reviewer suggested
  // cuda::stream for extra coverage — kept as-is to preserve behavior.
  cudaStream_t custom_stream;
  REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream));

  // Query the temp-storage size so the launch wrapper can check the allocation made
  // by the environment-based overload.
  size_t temp_storage_query{};
  REQUIRE(cudaSuccess
          == cub::DeviceMemcpy::Batched(nullptr, temp_storage_query, src_ptrs, dst_ptrs, byte_counts, buffer_count));

  // Attach the custom stream to the environment via the cuda::get_stream query.
  auto env = stdexec::env{stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}},
                          expected_allocation_size(temp_storage_query)};

  device_memcpy_batched(src_ptrs, dst_ptrs, byte_counts, buffer_count, env);

  REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream));
  REQUIRE(destination == source);
  REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream));
}
72 changes: 72 additions & 0 deletions cub/test/catch2_test_device_memcpy_env_api.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "insert_nested_NVTX_range_guard.h"

#include <cub/device/device_memcpy.cuh>

#include <thrust/detail/raw_pointer_cast.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include <cuda/devices>
#include <cuda/stream>

#include <iostream>

#include <c2h/catch2_test_helper.h>

// Turns a buffer index into the address where that buffer begins:
// base pointer advanced by the index-th table offset.
template <typename T>
struct index_to_ptr
{
  T* base; // start of the backing allocation
  const int* offsets; // per-buffer element offsets into `base`
  __host__ __device__ __forceinline__ T* operator()(int index) const
  {
    return base + static_cast<int>(offsets[index]);
  }
};

// Yields the byte count of the index-th buffer, derived from consecutive
// entries of an offsets table holding element offsets of type int.
struct get_size
{
  const int* offsets; // per-buffer element offsets; entry i+1 must be valid
  __host__ __device__ __forceinline__ int operator()(int index) const
  {
    const int elem_count = offsets[index + 1] - offsets[index];
    return static_cast<int>(sizeof(int)) * elem_count;
  }
};

// Documentation example: the region between the example-begin/example-end markers is
// published via literalinclude from device_memcpy.cuh — keep the markers intact.
C2H_TEST("cub::DeviceMemcpy::Batched accepts env with stream", "[memcpy][env]")
{
  // example-begin memcpy-batched-env
  // 3 buffers of different sizes: [10, 20], [30, 40, 50], [60]
  auto d_src     = thrust::device_vector<int>{10, 20, 30, 40, 50, 60};
  auto d_dst     = thrust::device_vector<int>(6, 0);
  auto d_offsets = thrust::device_vector<int>{0, 2, 5, 6};

  const int num_buffers = 3;

  // Iterators of source pointers, destination pointers, and per-buffer byte counts,
  // all derived from the same offsets table.
  thrust::counting_iterator<int> index_begin(0);
  auto input_it = thrust::make_transform_iterator(
    index_begin,
    index_to_ptr<const int>{thrust::raw_pointer_cast(d_src.data()), thrust::raw_pointer_cast(d_offsets.data())});
  auto output_it = thrust::make_transform_iterator(
    index_begin,
    index_to_ptr<int>{thrust::raw_pointer_cast(d_dst.data()), thrust::raw_pointer_cast(d_offsets.data())});
  auto sizes = thrust::make_transform_iterator(index_begin, get_size{thrust::raw_pointer_cast(d_offsets.data())});

  // Run the batched copy on an explicitly created stream passed through the environment.
  cuda::stream stream{cuda::devices[0]};
  auto env = cuda::std::execution::env{cuda::stream_ref{stream}};

  auto error = cub::DeviceMemcpy::Batched(input_it, output_it, sizes, num_buffers, env);
  if (error != cudaSuccess)
  {
    std::cerr << "cub::DeviceMemcpy::Batched failed with status: " << error << std::endl;
  }

  thrust::device_vector<int> expected{10, 20, 30, 40, 50, 60};
  // example-end memcpy-batched-env

  REQUIRE(error == cudaSuccess);
  REQUIRE(d_dst == expected);
}
Loading