Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions cub/cub/device/device_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
# pragma system_header
#endif // no system header

#include <cub/detail/env_dispatch.cuh>
#include <cub/device/dispatch/dispatch_batch_memcpy.cuh>

#include <cuda/std/__execution/env.h>
#include <cuda/std/__type_traits/is_pointer.h>
#include <cuda/std/cstdint>

Expand Down Expand Up @@ -169,6 +171,96 @@ struct DeviceMemcpy
DispatchBatchMemcpy<InputBufferIt, OutputBufferIt, BufferSizeIteratorT, BlockOffsetT, CopyAlg::Memcpy>::Dispatch(
d_temp_storage, temp_storage_bytes, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, stream);
}

//! @rst
//! Copies data from a batch of given source buffers to their corresponding destination buffer.
//!
//! .. versionadded:: 3.4.0
//!    First appears in CUDA Toolkit 13.4.
//!
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! .. note::
//!
//!   If any input buffer aliases memory from any output buffer the behavior is undefined.
//!   If any output buffer aliases memory of another output buffer the behavior is undefined.
//!   Input buffers can alias one another.
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates usage of DeviceMemcpy::Batched with an environment:
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_memcpy_env_api.cu
//!     :language: c++
//!     :dedent:
//!     :start-after: example-begin memcpy-batched-env
//!     :end-before: example-end memcpy-batched-env
//!
//! @endrst
//!
//! @tparam InputBufferIt
//!   **[inferred]** Device-accessible random-access input iterator type providing the pointers to
//!   the source memory buffers
//!
//! @tparam OutputBufferIt
//!   **[inferred]** Device-accessible random-access input iterator type providing the pointers to
//!   the destination memory buffers
//!
//! @tparam BufferSizeIteratorT
//!   **[inferred]** Device-accessible random-access input iterator type providing the number of bytes
//!   to be copied for each pair of buffers
//!
//! @tparam EnvT
//!   **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
//!
//! @param[in] input_buffer_it
//!   Device-accessible iterator providing the pointers to the source memory buffers
//!
//! @param[in] output_buffer_it
//!   Device-accessible iterator providing the pointers to the destination memory buffers
//!
//! @param[in] buffer_sizes
//!   Device-accessible iterator providing the number of bytes to be copied for each pair of buffers
//!
//! @param[in] num_buffers
//!   The total number of buffer pairs
//!
//! @param[in] env
//!   @rst
//!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//!   @endrst
template <typename InputBufferIt,
          typename OutputBufferIt,
          typename BufferSizeIteratorT,
          typename EnvT = ::cuda::std::execution::env<>,
          // Disambiguates this overload from the temp-storage API whose first argument is `void*`
          ::cuda::std::enable_if_t<!::cuda::std::is_same_v<InputBufferIt, void*>, int> = 0>
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
Batched(InputBufferIt input_buffer_it,
        OutputBufferIt output_buffer_it,
        BufferSizeIteratorT buffer_sizes,
        ::cuda::std::int64_t num_buffers,
        EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceMemcpy::Batched");
  // DeviceMemcpy moves raw bytes between memory buffers, so the iterators must yield pointers.
  // Fixed: the two message fragments were concatenated without a separating space, which
  // produced the diagnostic "...buffers.Please consider...".
  static_assert(::cuda::std::is_pointer_v<cub::detail::it_value_t<InputBufferIt>>,
                "DeviceMemcpy::Batched only supports copying of memory buffers. "
                "Please consider using DeviceCopy::Batched instead.");
  static_assert(::cuda::std::is_pointer_v<cub::detail::it_value_t<OutputBufferIt>>,
                "DeviceMemcpy::Batched only supports copying of memory buffers. "
                "Please consider using DeviceCopy::Batched instead.");

  // Offset type used by the dispatch layer. Qualified with ::cuda::std:: for consistency with
  // the ::cuda::std::int64_t parameter above (<cuda/std/cstdint> is already included).
  using BlockOffsetT = ::cuda::std::uint32_t;

  // dispatch_with_env drives the usual two-phase CUB invocation (size query, then launch),
  // taking the stream — and, per the documentation above, the memory resource for temporary
  // storage — from `env`.
  return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
    return detail::
      DispatchBatchMemcpy<InputBufferIt, OutputBufferIt, BufferSizeIteratorT, BlockOffsetT, CopyAlg::Memcpy>::Dispatch(
        storage, bytes, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, stream);
  });
}
};

CUB_NAMESPACE_END
131 changes: 131 additions & 0 deletions cub/test/catch2_test_device_memcpy_env.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Should precede any includes
struct stream_registry_factory_t;
#define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY stream_registry_factory_t

#include "insert_nested_NVTX_range_guard.h"

#include <cub/device/device_memcpy.cuh>

#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include "catch2_test_env_launch_helper.h"

DECLARE_LAUNCH_WRAPPER(cub::DeviceMemcpy::Batched, device_memcpy_batched);

// %PARAM% TEST_LAUNCH lid 0:1:2

#include <cuda/__execution/require.h>

#include <c2h/catch2_test_helper.h>

namespace stdexec = cuda::std::execution;

// Maps a buffer index to the pointer at which that buffer starts, by applying
// the index-th entry of an offsets table to a base pointer.
template <typename T>
struct index_to_ptr
{
  T* base; // start of the backing allocation
  const int* offsets; // per-buffer element offsets into `base`
  __host__ __device__ __forceinline__ T* operator()(int index) const
  {
    const int shift = offsets[index];
    return &base[shift];
  }
};

// Computes the size in bytes of the index-th buffer from an exclusive-scan
// offsets table (offsets[i+1] - offsets[i] elements of int each).
struct get_size
{
  const int* offsets; // per-buffer element offsets; entry i+1 must be valid
  __host__ __device__ __forceinline__ int operator()(int index) const
  {
    const int begin = offsets[index];
    const int end   = offsets[index + 1];
    return (end - begin) * static_cast<int>(sizeof(int));
  }
};

#if TEST_LAUNCH == 0
// NOTE(review): this test calls cub::DeviceMemcpy::Batched directly rather than through the
// device_memcpy_batched launch wrapper, so it is compiled for the host launch id only —
// presumably the wrapper cannot forward this temp-storage-free overload; confirm whether
// launch ids 1 and 2 could be supported.
TEST_CASE("DeviceMemcpy::Batched works with default environment", "[memcpy][device]")
{
  // Three buffers of sizes 2, 3, and 1: [10, 20], [30, 40, 50], [60]
  auto source      = c2h::device_vector<int>{10, 20, 30, 40, 50, 60};
  auto destination = c2h::device_vector<int>(6, 0);
  auto offsets     = c2h::device_vector<int>{0, 2, 5, 6};

  const int buffer_count = 3;

  // Build iterators of source pointers, destination pointers, and byte counts
  // from the shared offsets table.
  auto indices  = thrust::counting_iterator<int>{0};
  auto src_ptrs = thrust::make_transform_iterator(
    indices,
    index_to_ptr<const int>{thrust::raw_pointer_cast(source.data()), thrust::raw_pointer_cast(offsets.data())});
  auto dst_ptrs = thrust::make_transform_iterator(
    indices, index_to_ptr<int>{thrust::raw_pointer_cast(destination.data()), thrust::raw_pointer_cast(offsets.data())});
  auto byte_counts = thrust::make_transform_iterator(indices, get_size{thrust::raw_pointer_cast(offsets.data())});

  // Default-constructed environment: default stream, default memory resource.
  REQUIRE(cudaSuccess == cub::DeviceMemcpy::Batched(src_ptrs, dst_ptrs, byte_counts, buffer_count));

  REQUIRE(destination == source);
}
#endif // TEST_LAUNCH == 0

C2H_TEST("DeviceMemcpy::Batched uses environment", "[memcpy][device]")
{
  // Three buffers of sizes 2, 3, and 1: [10, 20], [30, 40, 50], [60]
  auto source      = c2h::device_vector<int>{10, 20, 30, 40, 50, 60};
  auto destination = c2h::device_vector<int>(6, 0);
  auto offsets     = c2h::device_vector<int>{0, 2, 5, 6};

  const int buffer_count = 3;

  // Build iterators of source pointers, destination pointers, and byte counts
  // from the shared offsets table.
  auto indices  = thrust::counting_iterator<int>{0};
  auto src_ptrs = thrust::make_transform_iterator(
    indices,
    index_to_ptr<const int>{thrust::raw_pointer_cast(source.data()), thrust::raw_pointer_cast(offsets.data())});
  auto dst_ptrs = thrust::make_transform_iterator(
    indices, index_to_ptr<int>{thrust::raw_pointer_cast(destination.data()), thrust::raw_pointer_cast(offsets.data())});
  auto byte_counts = thrust::make_transform_iterator(indices, get_size{thrust::raw_pointer_cast(offsets.data())});

  // Query the temp-storage size via the classic two-phase API so the launch wrapper can
  // verify the environment-based overload allocates exactly that much.
  size_t temp_storage_query{};
  REQUIRE(cudaSuccess
          == cub::DeviceMemcpy::Batched(nullptr, temp_storage_query, src_ptrs, dst_ptrs, byte_counts, buffer_count));

  auto env = stdexec::env{expected_allocation_size(temp_storage_query)};

  device_memcpy_batched(src_ptrs, dst_ptrs, byte_counts, buffer_count, env);

  REQUIRE(destination == source);
}

TEST_CASE("DeviceMemcpy::Batched uses custom stream", "[memcpy][device]")
{
  // Three buffers of sizes 2, 3, and 1: [10, 20], [30, 40, 50], [60]
  auto source      = c2h::device_vector<int>{10, 20, 30, 40, 50, 60};
  auto destination = c2h::device_vector<int>(6, 0);
  auto offsets     = c2h::device_vector<int>{0, 2, 5, 6};

  const int buffer_count = 3;

  // Build iterators of source pointers, destination pointers, and byte counts
  // from the shared offsets table.
  auto indices  = thrust::counting_iterator<int>{0};
  auto src_ptrs = thrust::make_transform_iterator(
    indices,
    index_to_ptr<const int>{thrust::raw_pointer_cast(source.data()), thrust::raw_pointer_cast(offsets.data())});
  auto dst_ptrs = thrust::make_transform_iterator(
    indices, index_to_ptr<int>{thrust::raw_pointer_cast(destination.data()), thrust::raw_pointer_cast(offsets.data())});
  auto byte_counts = thrust::make_transform_iterator(indices, get_size{thrust::raw_pointer_cast(offsets.data())});

  // NOTE(review): a raw cudaStreamCreate stream is used here; a reviewer suggested
  // cuda::stream for extra coverage — kept as-is to preserve behavior.
  cudaStream_t custom_stream;
  REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream));

  // Query the temp-storage size so the launch wrapper can check the allocation made
  // by the environment-based overload.
  size_t temp_storage_query{};
  REQUIRE(cudaSuccess
          == cub::DeviceMemcpy::Batched(nullptr, temp_storage_query, src_ptrs, dst_ptrs, byte_counts, buffer_count));

  // Attach the custom stream to the environment via the cuda::get_stream query.
  auto env = stdexec::env{stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}},
                          expected_allocation_size(temp_storage_query)};

  device_memcpy_batched(src_ptrs, dst_ptrs, byte_counts, buffer_count, env);

  REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream));
  REQUIRE(destination == source);
  REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream));
}
72 changes: 72 additions & 0 deletions cub/test/catch2_test_device_memcpy_env_api.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "insert_nested_NVTX_range_guard.h"

#include <cub/device/device_memcpy.cuh>

#include <thrust/detail/raw_pointer_cast.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include <cuda/devices>
#include <cuda/stream>

#include <iostream>

#include <c2h/catch2_test_helper.h>

// Turns a buffer index into the address where that buffer begins:
// base pointer advanced by the index-th table offset.
template <typename T>
struct index_to_ptr
{
  T* base; // start of the backing allocation
  const int* offsets; // per-buffer element offsets into `base`
  __host__ __device__ __forceinline__ T* operator()(int index) const
  {
    return base + static_cast<int>(offsets[index]);
  }
};

// Yields the byte count of the index-th buffer, derived from consecutive
// entries of an offsets table holding element offsets of type int.
struct get_size
{
  const int* offsets; // per-buffer element offsets; entry i+1 must be valid
  __host__ __device__ __forceinline__ int operator()(int index) const
  {
    const int elem_count = offsets[index + 1] - offsets[index];
    return static_cast<int>(sizeof(int)) * elem_count;
  }
};

// Documentation example: the region between the example-begin/example-end markers is
// published via literalinclude from device_memcpy.cuh — keep the markers intact.
C2H_TEST("cub::DeviceMemcpy::Batched accepts env with stream", "[memcpy][env]")
{
  // example-begin memcpy-batched-env
  // 3 buffers of different sizes: [10, 20], [30, 40, 50], [60]
  auto d_src     = thrust::device_vector<int>{10, 20, 30, 40, 50, 60};
  auto d_dst     = thrust::device_vector<int>(6, 0);
  auto d_offsets = thrust::device_vector<int>{0, 2, 5, 6};

  const int num_buffers = 3;

  // Iterators of source pointers, destination pointers, and per-buffer byte counts,
  // all derived from the same offsets table.
  thrust::counting_iterator<int> index_begin(0);
  auto input_it = thrust::make_transform_iterator(
    index_begin,
    index_to_ptr<const int>{thrust::raw_pointer_cast(d_src.data()), thrust::raw_pointer_cast(d_offsets.data())});
  auto output_it = thrust::make_transform_iterator(
    index_begin,
    index_to_ptr<int>{thrust::raw_pointer_cast(d_dst.data()), thrust::raw_pointer_cast(d_offsets.data())});
  auto sizes = thrust::make_transform_iterator(index_begin, get_size{thrust::raw_pointer_cast(d_offsets.data())});

  // Run the batched copy on an explicitly created stream passed through the environment.
  cuda::stream stream{cuda::devices[0]};
  auto env = cuda::std::execution::env{cuda::stream_ref{stream}};

  auto error = cub::DeviceMemcpy::Batched(input_it, output_it, sizes, num_buffers, env);
  if (error != cudaSuccess)
  {
    std::cerr << "cub::DeviceMemcpy::Batched failed with status: " << error << std::endl;
  }

  thrust::device_vector<int> expected{10, 20, 30, 40, 50, 60};
  // example-end memcpy-batched-env

  REQUIRE(error == cudaSuccess);
  REQUIRE(d_dst == expected);
}
Loading