diff --git a/cub/cub/device/device_memcpy.cuh b/cub/cub/device/device_memcpy.cuh index cd57f11ac35..32679ce03e6 100644 --- a/cub/cub/device/device_memcpy.cuh +++ b/cub/cub/device/device_memcpy.cuh @@ -16,8 +16,10 @@ # pragma system_header #endif // no system header +#include #include +#include #include #include @@ -169,6 +171,96 @@ struct DeviceMemcpy DispatchBatchMemcpy::Dispatch( d_temp_storage, temp_storage_bytes, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, stream); } + + //! @rst + //! Copies data from a batch of given source buffers to their corresponding destination buffer. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! .. note:: + //! + //! If any input buffer aliases memory from any output buffer the behavior is undefined. + //! If any output buffer aliases memory of another output buffer the behavior is undefined. + //! Input buffers can alias one another. + //! + //! Snippet + //! +++++++ + //! + //! The code snippet below illustrates usage of DeviceMemcpy::Batched with an environment: + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_memcpy_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin memcpy-batched-env + //! :end-before: example-end memcpy-batched-env + //! + //! @endrst + //! + //! @tparam InputBufferIt + //! **[inferred]** Device-accessible random-access input iterator type providing the pointers to + //! the source memory buffers + //! + //! @tparam OutputBufferIt + //! **[inferred]** Device-accessible random-access input iterator type providing the pointers to + //! the destination memory buffers + //! + //! @tparam BufferSizeIteratorT + //! **[inferred]** Device-accessible random-access input iterator type providing the number of bytes + //! to be copied for each pair of buffers + //! + //! @tparam EnvT + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! + //! @param[in] input_buffer_it + //! Device-accessible iterator providing the pointers to the source memory buffers + //! + //! @param[in] output_buffer_it + //! Device-accessible iterator providing the pointers to the destination memory buffers + //! + //! @param[in] buffer_sizes + //! Device-accessible iterator providing the number of bytes to be copied for each pair of buffers + //! + //! @param[in] num_buffers + //! The total number of buffer pairs + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template , + ::cuda::std::enable_if_t, int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t + Batched(InputBufferIt input_buffer_it, + OutputBufferIt output_buffer_it, + BufferSizeIteratorT buffer_sizes, + ::cuda::std::int64_t num_buffers, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceMemcpy::Batched"); + static_assert(::cuda::std::is_pointer_v>, + "DeviceMemcpy::Batched only supports copying of memory buffers." + "Please consider using DeviceCopy::Batched instead."); + static_assert(::cuda::std::is_pointer_v>, + "DeviceMemcpy::Batched only supports copying of memory buffers." + "Please consider using DeviceCopy::Batched instead."); + + using BlockOffsetT = uint32_t; + + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { + return detail:: + DispatchBatchMemcpy::Dispatch( + storage, bytes, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, stream); + }); + } }; CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_device_memcpy_env.cu b/cub/test/catch2_test_device_memcpy_env.cu new file mode 100644 index 00000000000..a3a4b428cd8 --- /dev/null +++ b/cub/test/catch2_test_device_memcpy_env.cu @@ -0,0 +1,130 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Should precede any includes +struct stream_registry_factory_t; +#define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY stream_registry_factory_t + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include + +#include +#include + +#include "catch2_test_env_launch_helper.h" + +DECLARE_LAUNCH_WRAPPER(cub::DeviceMemcpy::Batched, device_memcpy_batched); + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +#include + +#include + +namespace stdexec = cuda::std::execution; + +template +struct index_to_ptr +{ + T* base; + const int* offsets; + __host__ __device__ __forceinline__ T* operator()(int index) const + { + return base + offsets[index]; + } +}; + +struct get_size +{ + const int* offsets; + __host__ __device__ __forceinline__ int operator()(int index) const + { + return (offsets[index + 1] - offsets[index]) * static_cast(sizeof(int)); + } +}; + +#if TEST_LAUNCH == 0 + +TEST_CASE("DeviceMemcpy::Batched works with default environment", "[memcpy][device]") +{ + // 3 buffers: [10, 20], [30, 40, 50], [60] + auto d_src = c2h::device_vector{10, 20, 30, 40, 50, 60}; + auto d_dst = c2h::device_vector(6); + auto d_offsets = c2h::device_vector{0, 2, 5, 6}; + + int num_buffers = 3; + + cuda::counting_iterator iota(0); + auto input_it = cuda::transform_iterator( + iota, index_to_ptr{thrust::raw_pointer_cast(d_src.data()), thrust::raw_pointer_cast(d_offsets.data())}); + auto output_it = cuda::transform_iterator( + iota, index_to_ptr{thrust::raw_pointer_cast(d_dst.data()), thrust::raw_pointer_cast(d_offsets.data())}); + auto sizes = cuda::transform_iterator(iota, get_size{thrust::raw_pointer_cast(d_offsets.data())}); + + REQUIRE(cudaSuccess == cub::DeviceMemcpy::Batched(input_it, output_it, sizes, num_buffers)); + + REQUIRE(d_dst == d_src); +} + +#endif + +C2H_TEST("DeviceMemcpy::Batched uses environment", "[memcpy][device]") +{ + // 3 buffers: [10, 20], [30, 40, 50], [60] + auto d_src = c2h::device_vector{10, 20, 30, 40, 50, 60}; + auto d_dst = c2h::device_vector(6, 0); + auto d_offsets = c2h::device_vector{0, 2, 5, 6}; + + int num_buffers = 3; + + cuda::counting_iterator iota(0); + auto input_it = cuda::transform_iterator( + iota, index_to_ptr{thrust::raw_pointer_cast(d_src.data()), thrust::raw_pointer_cast(d_offsets.data())}); + auto output_it = cuda::transform_iterator( + iota, index_to_ptr{thrust::raw_pointer_cast(d_dst.data()), thrust::raw_pointer_cast(d_offsets.data())}); + auto sizes = cuda::transform_iterator(iota, get_size{thrust::raw_pointer_cast(d_offsets.data())}); + + size_t expected_bytes_allocated{}; + REQUIRE(cudaSuccess + == cub::DeviceMemcpy::Batched(nullptr, expected_bytes_allocated, input_it, output_it, sizes, num_buffers)); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + device_memcpy_batched(input_it, output_it, sizes, num_buffers, env); + + REQUIRE(d_dst == d_src); +} + +TEST_CASE("DeviceMemcpy::Batched uses custom stream", "[memcpy][device]") +{ + // 3 buffers: [10, 20], [30, 40, 50], [60] + auto d_src = c2h::device_vector{10, 20, 30, 40, 50, 60}; + auto d_dst = c2h::device_vector(6, 0); + auto d_offsets = c2h::device_vector{0, 2, 5, 6}; + + int num_buffers = 3; + + cuda::counting_iterator iota(0); + auto input_it = cuda::transform_iterator( + iota, index_to_ptr{thrust::raw_pointer_cast(d_src.data()), thrust::raw_pointer_cast(d_offsets.data())}); + auto output_it = cuda::transform_iterator( + iota, index_to_ptr{thrust::raw_pointer_cast(d_dst.data()), thrust::raw_pointer_cast(d_offsets.data())}); + auto sizes = cuda::transform_iterator(iota, get_size{thrust::raw_pointer_cast(d_offsets.data())}); + + cuda::stream custom_stream(cuda::device_ref{0}); + + size_t expected_bytes_allocated{}; + REQUIRE(cudaSuccess + == cub::DeviceMemcpy::Batched(nullptr, expected_bytes_allocated, input_it, output_it, sizes, num_buffers)); + + auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}}; + auto env = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)}; + + device_memcpy_batched(input_it, output_it, sizes, num_buffers, env); + + custom_stream.sync(); + REQUIRE(d_dst == d_src); +} diff --git a/cub/test/catch2_test_device_memcpy_env_api.cu b/cub/test/catch2_test_device_memcpy_env_api.cu new file mode 100644 index 00000000000..7f11263591d --- /dev/null +++ b/cub/test/catch2_test_device_memcpy_env_api.cu @@ -0,0 +1,69 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include +#include + +#include +#include + +#include + +#include + +C2H_TEST("cub::DeviceMemcpy::Batched accepts env with stream", "[memcpy][env]") +{ + // example-begin memcpy-batched-env + // Source data: 3 buffers of different sizes laid out contiguously + // Buffer 0: [10, 20] Buffer 1: [30, 40, 50] Buffer 2: [60] + auto d_src = thrust::device_vector{10, 20, 30, 40, 50, 60}; + + // Copy into two separate destination buffers to highlight the API's flexibility + auto d_dst_a = thrust::device_vector(5, 0); + auto d_dst_b = thrust::device_vector(1, 0); + + // Source pointers: one per buffer, pointing into d_src + auto d_src_ptrs = thrust::device_vector{ + thrust::raw_pointer_cast(d_src.data()) + 0, + thrust::raw_pointer_cast(d_src.data()) + 2, + thrust::raw_pointer_cast(d_src.data()) + 5}; + + // Destination pointers: buffers 0,1 go to d_dst_a, buffer 2 goes to d_dst_b + auto d_dst_ptrs = thrust::device_vector{ + thrust::raw_pointer_cast(d_dst_a.data()) + 0, + thrust::raw_pointer_cast(d_dst_a.data()) + 2, + thrust::raw_pointer_cast(d_dst_b.data()) + 0}; + + // Sizes in bytes for each buffer + auto d_sizes = thrust::device_vector{ + 2 * static_cast(sizeof(int)), 3 * static_cast(sizeof(int)), 1 * static_cast(sizeof(int))}; + + int num_buffers = 3; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceMemcpy::Batched( + thrust::raw_pointer_cast(d_src_ptrs.data()), + thrust::raw_pointer_cast(d_dst_ptrs.data()), + thrust::raw_pointer_cast(d_sizes.data()), + num_buffers, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceMemcpy::Batched failed with status: " << error << std::endl; + } + + thrust::device_vector expected_a{10, 20, 30, 40, 50}; + thrust::device_vector expected_b{60}; + // example-end memcpy-batched-env + + REQUIRE(error == cudaSuccess); + REQUIRE(d_dst_a == expected_a); + REQUIRE(d_dst_b == expected_b); +}