Add env DeviceCopy::* and tests

gonidelis · gonidelis · commit 35a7116aab0f · 2026-03-09T18:39:14.000-07:00
diff --git a/cub/cub/device/device_copy.cuh b/cub/cub/device/device_copy.cuh
@@ -16,12 +16,14 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cub/detail/env_dispatch.cuh>
 #include <cub/device/dispatch/dispatch_batch_memcpy.cuh>
 #include <cub/device/dispatch/dispatch_copy_mdspan.cuh>
 #include <cub/device/dispatch/tuning/tuning_batch_memcpy.cuh>
 
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
+#include <cuda/std/__execution/env.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/mdspan>
 
@@ -164,6 +166,85 @@ struct DeviceCopy
       d_temp_storage, temp_storage_bytes, input_it, output_it, sizes, num_ranges, stream);
   }
 
+  //! @rst
+  //! Copies data from a batch of given source ranges to their corresponding destination ranges.
+  //!
+  //! .. versionadded:: 3.4.0
+  //!    First appears in CUDA Toolkit 13.4.
+  //!
+  //! This is an environment-based API that allows customization of:
+  //!
+  //! - Stream: Query via ``cuda::get_stream``
+  //! - Memory resource: Query via ``cuda::mr::get_memory_resource``
+  //!
+  //! This operation provides ``gpu_to_gpu`` determinism: results are identical across different GPU architectures.
+  //!
+  //! .. note::
+  //!
+  //!    If any input range aliases any output range the behavior is undefined.
+  //!    If any output range aliases another output range the behavior is undefined.
+  //!    Input ranges can alias one another.
+  //!
+  //! Snippet
+  //! +++++++
+  //!
+  //! The code snippet below illustrates usage of DeviceCopy::Batched with an environment:
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_copy_env_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin copy-batched-env
+  //!     :end-before: example-end copy-batched-env
+  //!
+  //! @endrst
+  //!
+  //! @tparam InputIt
+  //!   **[inferred]** Device-accessible random-access input iterator type providing the iterators to the source ranges
+  //!
+  //! @tparam OutputIt
+  //!  **[inferred]** Device-accessible random-access input iterator type providing the iterators to
+  //!  the destination ranges
+  //!
+  //! @tparam SizeIteratorT
+  //!   **[inferred]** Device-accessible random-access input iterator type providing the number of items to be
+  //!   copied for each pair of ranges
+  //!
+  //! @tparam EnvT
+  //!   **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
+  //!
+  //! @param[in] input_it
+  //!   Device-accessible iterator providing the iterators to the source ranges
+  //!
+  //! @param[in] output_it
+  //!   Device-accessible iterator providing the iterators to the destination ranges
+  //!
+  //! @param[in] sizes
+  //!   Device-accessible iterator providing the number of elements to be copied for each pair of ranges
+  //!
+  //! @param[in] num_ranges
+  //!   The total number of range pairs
+  //!
+  //! @param[in] env
+  //!   **[optional]** Execution environment.
+  //!   Default is `cuda::std::execution::env{}`.
+  template <typename InputIt,
+            typename OutputIt,
+            typename SizeIteratorT,
+            typename EnvT                                                          = ::cuda::std::execution::env<>,
+            ::cuda::std::enable_if_t<!::cuda::std::is_same_v<InputIt, void*>, int> = 0>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
+  Batched(InputIt input_it, OutputIt output_it, SizeIteratorT sizes, ::cuda::std::int64_t num_ranges, EnvT env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceCopy::Batched");
+
+    // Batched-memcpy dispatcher instantiated for CopyAlg::Copy with 32-bit block offsets
+    using dispatch_t = detail::DispatchBatchMemcpy<InputIt, OutputIt, SizeIteratorT, uint32_t, CopyAlg::Copy>;
+    // dispatch_with_env calls back with the temporary storage, its size in bytes, and the stream to use
+    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* d_temp, size_t& nbytes, auto stream) {
+      return dispatch_t::Dispatch(d_temp, nbytes, input_it, output_it, sizes, num_ranges, stream);
+    });
+  }
+
   //! @rst
   //! Copies data from a multidimensional source mdspan to a destination mdspan.
   //!
@@ -277,6 +358,110 @@ struct DeviceCopy
     }
     return detail::copy_mdspan::copy(mdspan_in, mdspan_out, stream);
   }
+
+  //! @rst
+  //! Copies data from a multidimensional source mdspan to a destination mdspan.
+  //!
+  //! .. versionadded:: 3.4.0
+  //!    First appears in CUDA Toolkit 13.4.
+  //!
+  //! This is an environment-based API that allows customization of:
+  //!
+  //! - Stream: Query via ``cuda::get_stream``
+  //! - Memory resource: Query via ``cuda::mr::get_memory_resource``
+  //!
+  //! This operation provides ``gpu_to_gpu`` determinism: results are identical across different GPU architectures.
+  //!
+  //! This function performs a parallel copy operation between two mdspan objects with potentially different layouts but
+  //! identical extents. The copy operation handles arbitrary-dimensional arrays and automatically manages layout
+  //! transformations.
+  //!
+  //! Preconditions
+  //! +++++++++++++
+  //!
+  //!    * The source and destination mdspans must have identical extents (same ranks and sizes).
+  //!    * The source and destination mdspans data handle must not be nullptr if the size is not 0.
+  //!    * The underlying memory of the source and destination must not overlap.
+  //!    * Both mdspans must point to device memory.
+  //!
+  //! Snippet
+  //! +++++++
+  //!
+  //! The code snippet below illustrates usage of DeviceCopy::Copy with an environment:
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_copy_env_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin copy-mdspan-env
+  //!     :end-before: example-end copy-mdspan-env
+  //!
+  //! @endrst
+  //!
+  //! @tparam T_In
+  //!   **[inferred]** The element type of the source mdspan
+  //!
+  //! @tparam Extents_In
+  //!   **[inferred]** The extents type of the source mdspan
+  //!
+  //! @tparam Layout_In
+  //!   **[inferred]** The layout type of the source mdspan
+  //!
+  //! @tparam Accessor_In
+  //!   **[inferred]** The accessor type of the source mdspan
+  //!
+  //! @tparam T_Out
+  //!   **[inferred]** The element type of the destination mdspan
+  //!
+  //! @tparam Extents_Out
+  //!   **[inferred]** The extents type of the destination mdspan
+  //!
+  //! @tparam Layout_Out
+  //!   **[inferred]** The layout type of the destination mdspan
+  //!
+  //! @tparam Accessor_Out
+  //!   **[inferred]** The accessor type of the destination mdspan
+  //!
+  //! @tparam EnvT
+  //!   **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
+  //!
+  //! @param[in] mdspan_in
+  //!   Source mdspan containing the data to be copied
+  //!
+  //! @param[out] mdspan_out
+  //!   Destination mdspan where the data will be copied
+  //!
+  //! @param[in] env
+  //!   **[optional]** Execution environment.
+  //!   Default is `cuda::std::execution::env{}`.
+  template <typename T_In,
+            typename Extents_In,
+            typename Layout_In,
+            typename Accessor_In,
+            typename T_Out,
+            typename Extents_Out,
+            typename Layout_Out,
+            typename Accessor_Out,
+            typename EnvT = ::cuda::std::execution::env<>>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
+  Copy(::cuda::std::mdspan<T_In, Extents_In, Layout_In, Accessor_In> mdspan_in,
+       ::cuda::std::mdspan<T_Out, Extents_Out, Layout_Out, Accessor_Out> mdspan_out,
+       EnvT env = {})
+  {
+    // no nvtx range because Copy delegates to Transform/ForEachInExtents which emit their own NVTX ranges
+    _CCCL_ASSERT(mdspan_in.extents() == mdspan_out.extents(), "mdspan extents must be equal");
+    _CCCL_ASSERT((mdspan_in.data_handle() != nullptr && mdspan_out.data_handle() != nullptr) || mdspan_in.size() == 0,
+                 "mdspan data handle must not be nullptr if the size is not 0");
+    if (mdspan_in.size() != 0)
+    {
+      auto in_start  = mdspan_in.data_handle();
+      auto in_end    = in_start + mdspan_in.mapping().required_span_size();
+      auto out_start = mdspan_out.data_handle();
+      auto out_end   = out_start + mdspan_out.mapping().required_span_size();
+      // [start, end) are half-open: buffers that merely touch (in_end == out_start) are disjoint, so compare strictly
+      _CCCL_ASSERT(!(in_end > out_start && out_end > in_start), "mdspan memory ranges must not overlap");
+    }
+    return detail::copy_mdspan::copy(mdspan_in, mdspan_out, env);
+  }
 };
 
 CUB_NAMESPACE_END
diff --git a/cub/cub/device/dispatch/dispatch_copy_mdspan.cuh b/cub/cub/device/dispatch/dispatch_copy_mdspan.cuh
@@ -52,11 +52,12 @@ template <typename T_In,
           typename T_Out,
           typename E_Out,
           typename L_Out,
-          typename A_Out>
+          typename A_Out,
+          typename EnvT = ::cuda::std::execution::env<>>
 [[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
 copy(::cuda::std::mdspan<T_In, E_In, L_In, A_In> mdspan_in,
      ::cuda::std::mdspan<T_Out, E_Out, L_Out, A_Out> mdspan_out,
-     ::cudaStream_t stream)
+     EnvT env = {})
 {
   if (mdspan_in.is_exhaustive() && mdspan_out.is_exhaustive()
       && detail::have_same_strides(mdspan_in.mapping(), mdspan_out.mapping()))
@@ -66,11 +67,11 @@ copy(::cuda::std::mdspan<T_In, E_In, L_In, A_In> mdspan_in,
       mdspan_out.data_handle(),
       mdspan_in.size(),
       ::cuda::proclaim_copyable_arguments(::cuda::std::identity{}),
-      stream);
+      env);
   }
   // TODO (fbusato): add ForEachInLayout when mdspan_in and mdspan_out have compatible layouts
   // Compatible layouts could use more efficient iteration patterns
-  return cub::DeviceFor::ForEachInExtents(mdspan_in.extents(), copy_mdspan_t{mdspan_in, mdspan_out}, stream);
+  return cub::DeviceFor::ForEachInExtents(mdspan_in.extents(), copy_mdspan_t{mdspan_in, mdspan_out}, env);
 }
 } // namespace detail::copy_mdspan
 
diff --git a/cub/test/catch2_test_device_copy_env.cu b/cub/test/catch2_test_device_copy_env.cu
@@ -0,0 +1,132 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Should precede any includes
+struct stream_registry_factory_t;
+#define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY stream_registry_factory_t
+
+#include "insert_nested_NVTX_range_guard.h"
+
+#include <cub/device/device_copy.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+
+#include "catch2_test_env_launch_helper.h"
+
+DECLARE_LAUNCH_WRAPPER(cub::DeviceCopy::Batched, device_copy_batched);
+
+// %PARAM% TEST_LAUNCH lid 0:1:2
+
+#include <cuda/__execution/require.h>
+
+#include <c2h/catch2_test_helper.h>
+
+namespace stdexec = cuda::std::execution;
+
+template <typename T>
+struct index_to_ptr
+{
+  T* base;            // pointer that every returned range pointer is offset from
+  const int* offsets; // offsets[i] is added to base to form the pointer for index i
+  __host__ __device__ __forceinline__ T* operator()(int index) const
+  {
+    return offsets[index] + base;
+  }
+};
+
+struct get_size
+{
+  const int* offsets; // size of range i is the distance between consecutive entries i and i + 1
+  __host__ __device__ __forceinline__ int operator()(int index) const
+  {
+    return *(offsets + index + 1) - *(offsets + index);
+  }
+};
+
+#if TEST_LAUNCH == 0
+
+TEST_CASE("DeviceCopy::Batched works with default environment", "[copy][device]")
+{
+  // Default-environment path: 3 ranges [10, 20], [30, 40, 50], [60] copied without passing an env argument
+  c2h::device_vector<int> d_src{10, 20, 30, 40, 50, 60};
+  c2h::device_vector<int> d_dst(6, 0);
+  c2h::device_vector<int> d_offsets{0, 2, 5, 6};
+  int num_ranges = 3;
+
+  const int* src_ptr     = thrust::raw_pointer_cast(d_src.data());
+  int* dst_ptr           = thrust::raw_pointer_cast(d_dst.data());
+  const int* offsets_ptr = thrust::raw_pointer_cast(d_offsets.data());
+
+  thrust::counting_iterator<int> iota(0);
+  auto input_it  = thrust::make_transform_iterator(iota, index_to_ptr<const int>{src_ptr, offsets_ptr});
+  auto output_it = thrust::make_transform_iterator(iota, index_to_ptr<int>{dst_ptr, offsets_ptr});
+  auto sizes     = thrust::make_transform_iterator(iota, get_size{offsets_ptr});
+
+  REQUIRE(cudaSuccess == cub::DeviceCopy::Batched(input_it, output_it, sizes, num_ranges));
+  REQUIRE(d_dst == d_src);
+}
+
+#endif
+
+C2H_TEST("DeviceCopy::Batched uses environment", "[copy][device]")
+{
+  // 3 ranges: [10, 20], [30, 40, 50], [60]
+  c2h::device_vector<int> d_src{10, 20, 30, 40, 50, 60};
+  c2h::device_vector<int> d_dst(6, 0);
+  c2h::device_vector<int> d_offsets{0, 2, 5, 6};
+  int num_ranges = 3;
+
+  const int* src_ptr     = thrust::raw_pointer_cast(d_src.data());
+  int* dst_ptr           = thrust::raw_pointer_cast(d_dst.data());
+  const int* offsets_ptr = thrust::raw_pointer_cast(d_offsets.data());
+
+  thrust::counting_iterator<int> iota(0);
+  auto input_it  = thrust::make_transform_iterator(iota, index_to_ptr<const int>{src_ptr, offsets_ptr});
+  auto output_it = thrust::make_transform_iterator(iota, index_to_ptr<int>{dst_ptr, offsets_ptr});
+  auto sizes     = thrust::make_transform_iterator(iota, get_size{offsets_ptr});
+
+  // Query the temp-storage size through the two-phase overload; it is handed to the env as the expected allocation
+  size_t expected_bytes_allocated{};
+  REQUIRE(
+    cudaSuccess == cub::DeviceCopy::Batched(nullptr, expected_bytes_allocated, input_it, output_it, sizes, num_ranges));
+
+  auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)};
+  device_copy_batched(input_it, output_it, sizes, num_ranges, env);
+  REQUIRE(d_dst == d_src);
+}
+
+TEST_CASE("DeviceCopy::Batched uses custom stream", "[copy][device]")
+{
+  // 3 ranges: [10, 20], [30, 40, 50], [60]
+  c2h::device_vector<int> d_src{10, 20, 30, 40, 50, 60};
+  c2h::device_vector<int> d_dst(6, 0);
+  c2h::device_vector<int> d_offsets{0, 2, 5, 6};
+  int num_ranges = 3;
+
+  const int* src_ptr     = thrust::raw_pointer_cast(d_src.data());
+  int* dst_ptr           = thrust::raw_pointer_cast(d_dst.data());
+  const int* offsets_ptr = thrust::raw_pointer_cast(d_offsets.data());
+
+  thrust::counting_iterator<int> iota(0);
+  auto input_it  = thrust::make_transform_iterator(iota, index_to_ptr<const int>{src_ptr, offsets_ptr});
+  auto output_it = thrust::make_transform_iterator(iota, index_to_ptr<int>{dst_ptr, offsets_ptr});
+  auto sizes     = thrust::make_transform_iterator(iota, get_size{offsets_ptr});
+
+  cudaStream_t custom_stream;
+  REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream));
+
+  size_t expected_bytes_allocated{};
+  REQUIRE(
+    cudaSuccess == cub::DeviceCopy::Batched(nullptr, expected_bytes_allocated, input_it, output_it, sizes, num_ranges));
+
+  auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}};
+  auto env         = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)};
+  device_copy_batched(input_it, output_it, sizes, num_ranges, env);
+  // Wait for the custom stream to finish before comparing the results and destroying the stream
+  REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream));
+  REQUIRE(d_dst == d_src);
+  REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream));
+}
diff --git a/cub/test/catch2_test_device_copy_env_api.cu b/cub/test/catch2_test_device_copy_env_api.cu