Skip to content

Commit bce1b13

Browse files
committed
Add env DeviceCopy::* and tests
1 parent d0befe5 commit bce1b13

File tree

4 files changed

+426
-4
lines changed

4 files changed

+426
-4
lines changed

cub/cub/device/device_copy.cuh

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,14 @@
1616
# pragma system_header
1717
#endif // no system header
1818

19+
#include <cub/detail/env_dispatch.cuh>
1920
#include <cub/device/dispatch/dispatch_batch_memcpy.cuh>
2021
#include <cub/device/dispatch/dispatch_copy_mdspan.cuh>
2122
#include <cub/device/dispatch/tuning/tuning_batch_memcpy.cuh>
2223

2324
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
2425

26+
#include <cuda/std/__execution/env.h>
2527
#include <cuda/std/cstdint>
2628
#include <cuda/std/mdspan>
2729

@@ -164,6 +166,85 @@ struct DeviceCopy
164166
d_temp_storage, temp_storage_bytes, input_it, output_it, sizes, num_ranges, stream);
165167
}
166168

169+
//! @rst
//! Copies data from a batch of given source ranges to their corresponding destination ranges.
//!
//! .. versionadded:: 3.4.0
//!    First appears in CUDA Toolkit 13.4.
//!
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! This operation provides ``gpu_to_gpu`` determinism: results are identical across different GPU architectures.
//!
//! .. note::
//!
//!    If any input range aliases any output range the behavior is undefined.
//!    If any output range aliases another output range the behavior is undefined.
//!    Input ranges can alias one another.
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates usage of DeviceCopy::Batched with an environment:
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_copy_env_api.cu
//!     :language: c++
//!     :dedent:
//!     :start-after: example-begin copy-batched-env
//!     :end-before: example-end copy-batched-env
//!
//! @endrst
//!
//! @tparam InputIt
//!   **[inferred]** Device-accessible random-access input iterator type providing the iterators to the source ranges
//!
//! @tparam OutputIt
//!   **[inferred]** Device-accessible random-access input iterator type providing the iterators to
//!   the destination ranges
//!
//! @tparam SizeIteratorT
//!   **[inferred]** Device-accessible random-access input iterator type providing the number of items to be
//!   copied for each pair of ranges
//!
//! @tparam EnvT
//!   **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
//!
//! @param[in] input_it
//!   Device-accessible iterator providing the iterators to the source ranges
//!
//! @param[in] output_it
//!   Device-accessible iterator providing the iterators to the destination ranges
//!
//! @param[in] sizes
//!   Device-accessible iterator providing the number of elements to be copied for each pair of ranges
//!
//! @param[in] num_ranges
//!   The total number of range pairs
//!
//! @param[in] env
//!   **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
template <typename InputIt,
          typename OutputIt,
          typename SizeIteratorT,
          typename EnvT = ::cuda::std::execution::env<>,
          // Disambiguates this overload from the classic temp-storage API whose first parameter is `void*`
          ::cuda::std::enable_if_t<!::cuda::std::is_same_v<InputIt, void*>, int> = 0>
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
Batched(InputIt input_it, OutputIt output_it, SizeIteratorT sizes, ::cuda::std::int64_t num_ranges, EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceCopy::Batched");

  // Offset type used to address within a single copied buffer
  using BlockOffsetT = uint32_t;

  // dispatch_with_env performs the two-phase (size query + launch) protocol, honoring the
  // stream and memory resource carried by `env`.
  return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
    return detail::DispatchBatchMemcpy<InputIt, OutputIt, SizeIteratorT, BlockOffsetT, CopyAlg::Copy>::Dispatch(
      storage, bytes, input_it, output_it, sizes, num_ranges, stream);
  });
}
247+
167248
//! @rst
168249
//! Copies data from a multidimensional source mdspan to a destination mdspan.
169250
//!
@@ -277,6 +358,110 @@ struct DeviceCopy
277358
}
278359
return detail::copy_mdspan::copy(mdspan_in, mdspan_out, stream);
279360
}
361+
362+
//! @rst
//! Copies data from a multidimensional source mdspan to a destination mdspan.
//!
//! .. versionadded:: 3.4.0
//!    First appears in CUDA Toolkit 13.4.
//!
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! This operation provides ``gpu_to_gpu`` determinism: results are identical across different GPU architectures.
//!
//! This function performs a parallel copy operation between two mdspan objects with potentially different layouts but
//! identical extents. The copy operation handles arbitrary-dimensional arrays and automatically manages layout
//! transformations.
//!
//! Preconditions
//! +++++++++++++
//!
//! * The source and destination mdspans must have identical extents (same ranks and sizes).
//! * The source and destination mdspans data handle must not be nullptr if the size is not 0.
//! * The underlying memory of the source and destination must not overlap.
//! * Both mdspans must point to device memory.
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates usage of DeviceCopy::Copy with an environment:
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_copy_env_api.cu
//!     :language: c++
//!     :dedent:
//!     :start-after: example-begin copy-mdspan-env
//!     :end-before: example-end copy-mdspan-env
//!
//! @endrst
//!
//! @tparam T_In
//!   **[inferred]** The element type of the source mdspan
//!
//! @tparam Extents_In
//!   **[inferred]** The extents type of the source mdspan
//!
//! @tparam Layout_In
//!   **[inferred]** The layout type of the source mdspan
//!
//! @tparam Accessor_In
//!   **[inferred]** The accessor type of the source mdspan
//!
//! @tparam T_Out
//!   **[inferred]** The element type of the destination mdspan
//!
//! @tparam Extents_Out
//!   **[inferred]** The extents type of the destination mdspan
//!
//! @tparam Layout_Out
//!   **[inferred]** The layout type of the destination mdspan
//!
//! @tparam Accessor_Out
//!   **[inferred]** The accessor type of the destination mdspan
//!
//! @tparam EnvT
//!   **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
//!
//! @param[in] mdspan_in
//!   Source mdspan containing the data to be copied
//!
//! @param[out] mdspan_out
//!   Destination mdspan where the data will be copied
//!
//! @param[in] env
//!   **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
template <typename T_In,
          typename Extents_In,
          typename Layout_In,
          typename Accessor_In,
          typename T_Out,
          typename Extents_Out,
          typename Layout_Out,
          typename Accessor_Out,
          typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
Copy(::cuda::std::mdspan<T_In, Extents_In, Layout_In, Accessor_In> mdspan_in,
     ::cuda::std::mdspan<T_Out, Extents_Out, Layout_Out, Accessor_Out> mdspan_out,
     EnvT env = {})
{
  // no nvtx range because Copy delegates to Transform/ForEachInExtents which emit their own NVTX ranges
  _CCCL_ASSERT(mdspan_in.extents() == mdspan_out.extents(), "mdspan extents must be equal");
  _CCCL_ASSERT((mdspan_in.data_handle() != nullptr && mdspan_out.data_handle() != nullptr) || mdspan_in.size() == 0,
               "mdspan data handle must not be nullptr if the size is not 0");
  if (mdspan_in.size() != 0)
  {
    auto in_start  = mdspan_in.data_handle();
    auto in_end    = in_start + mdspan_in.mapping().required_span_size();
    auto out_start = mdspan_out.data_handle();
    auto out_end   = out_start + mdspan_out.mapping().required_span_size();
    // [start, end) are half-open ranges: they overlap only under STRICT inequality.
    // Using `>=` here would falsely reject merely-adjacent ranges (in_end == out_start).
    _CCCL_ASSERT(!(in_end > out_start && out_end > in_start), "mdspan memory ranges must not overlap");
  }

  return detail::copy_mdspan::copy(mdspan_in, mdspan_out, env);
}
280465
};
281466

282467
CUB_NAMESPACE_END

cub/cub/device/dispatch/dispatch_copy_mdspan.cuh

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,12 @@ template <typename T_In,
5252
typename T_Out,
5353
typename E_Out,
5454
typename L_Out,
55-
typename A_Out>
55+
typename A_Out,
56+
typename EnvT = ::cuda::std::execution::env<>>
5657
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
5758
copy(::cuda::std::mdspan<T_In, E_In, L_In, A_In> mdspan_in,
5859
::cuda::std::mdspan<T_Out, E_Out, L_Out, A_Out> mdspan_out,
59-
::cudaStream_t stream)
60+
EnvT env = {})
6061
{
6162
if (mdspan_in.is_exhaustive() && mdspan_out.is_exhaustive()
6263
&& detail::have_same_strides(mdspan_in.mapping(), mdspan_out.mapping()))
@@ -66,11 +67,11 @@ copy(::cuda::std::mdspan<T_In, E_In, L_In, A_In> mdspan_in,
6667
mdspan_out.data_handle(),
6768
mdspan_in.size(),
6869
::cuda::proclaim_copyable_arguments(::cuda::std::identity{}),
69-
stream);
70+
env);
7071
}
7172
// TODO (fbusato): add ForEachInLayout when mdspan_in and mdspan_out have compatible layouts
7273
// Compatible layouts could use more efficient iteration patterns
73-
return cub::DeviceFor::ForEachInExtents(mdspan_in.extents(), copy_mdspan_t{mdspan_in, mdspan_out}, stream);
74+
return cub::DeviceFor::ForEachInExtents(mdspan_in.extents(), copy_mdspan_t{mdspan_in, mdspan_out}, env);
7475
}
7576
} // namespace detail::copy_mdspan
7677

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3+
4+
// Should precede any includes
5+
struct stream_registry_factory_t;
6+
#define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY stream_registry_factory_t
7+
8+
#include "insert_nested_NVTX_range_guard.h"
9+
10+
#include <cub/device/device_copy.cuh>
11+
12+
#include <thrust/device_vector.h>
13+
#include <thrust/iterator/constant_iterator.h>
14+
#include <thrust/iterator/counting_iterator.h>
15+
#include <thrust/iterator/transform_iterator.h>
16+
#include "catch2_test_env_launch_helper.h"
17+
18+
DECLARE_LAUNCH_WRAPPER(cub::DeviceCopy::Batched, device_copy_batched);
19+
20+
// %PARAM% TEST_LAUNCH lid 0:1:2
21+
22+
#include <cuda/__execution/require.h>
23+
24+
#include <c2h/catch2_test_helper.h>
25+
26+
namespace stdexec = cuda::std::execution;
27+
28+
// Functor mapping a range index to the device pointer where that range begins,
// using a shared base pointer plus a per-range element offset table.
template <typename T>
struct index_to_ptr
{
  T* base; // start of the flat buffer holding all ranges back to back
  const int* offsets; // offsets[i] = element offset of range i within `base`

  __host__ __device__ __forceinline__ T* operator()(int index) const
  {
    const int offset = offsets[index];
    return base + offset;
  }
};
38+
39+
// Functor yielding the length of range `index` as the difference of two
// consecutive entries in an (N+1)-element offsets table.
struct get_size
{
  const int* offsets; // offsets[i] = start of range i; offsets[i+1] = one-past-end

  __host__ __device__ __forceinline__ int operator()(int index) const
  {
    const int begin = offsets[index];
    const int end   = offsets[index + 1];
    return end - begin;
  }
};
47+
48+
#if TEST_LAUNCH == 0

TEST_CASE("DeviceCopy::Batched works with default environment", "[copy][device]")
{
  // Three ranges concatenated back to back: [10, 20], [30, 40, 50], [60]
  c2h::device_vector<int> source{10, 20, 30, 40, 50, 60};
  c2h::device_vector<int> destination(6, 0);
  c2h::device_vector<int> range_offsets{0, 2, 5, 6};

  const int range_count = 3;

  auto* src_ptr = thrust::raw_pointer_cast(source.data());
  auto* dst_ptr = thrust::raw_pointer_cast(destination.data());
  auto* off_ptr = thrust::raw_pointer_cast(range_offsets.data());

  // Fancy iterators produce, per range index, its source pointer, destination
  // pointer, and element count.
  thrust::counting_iterator<int> indices(0);
  auto srcs  = thrust::make_transform_iterator(indices, index_to_ptr<const int>{src_ptr, off_ptr});
  auto dsts  = thrust::make_transform_iterator(indices, index_to_ptr<int>{dst_ptr, off_ptr});
  auto sizes = thrust::make_transform_iterator(indices, get_size{off_ptr});

  // No environment argument: default stream and default memory resource.
  REQUIRE(cudaSuccess == cub::DeviceCopy::Batched(srcs, dsts, sizes, range_count));

  REQUIRE(destination == source);
}

#endif
70+
71+
C2H_TEST("DeviceCopy::Batched uses environment", "[copy][device]")
{
  // Three ranges concatenated back to back: [10, 20], [30, 40, 50], [60]
  c2h::device_vector<int> source{10, 20, 30, 40, 50, 60};
  c2h::device_vector<int> destination(6, 0);
  c2h::device_vector<int> range_offsets{0, 2, 5, 6};

  const int range_count = 3;

  auto* src_ptr = thrust::raw_pointer_cast(source.data());
  auto* dst_ptr = thrust::raw_pointer_cast(destination.data());
  auto* off_ptr = thrust::raw_pointer_cast(range_offsets.data());

  thrust::counting_iterator<int> indices(0);
  auto srcs  = thrust::make_transform_iterator(indices, index_to_ptr<const int>{src_ptr, off_ptr});
  auto dsts  = thrust::make_transform_iterator(indices, index_to_ptr<int>{dst_ptr, off_ptr});
  auto sizes = thrust::make_transform_iterator(indices, get_size{off_ptr});

  // Query the temp-storage requirement via the classic two-phase API so the
  // launch wrapper can verify the env-based call allocates exactly that much.
  size_t expected_bytes_allocated{};
  REQUIRE(cudaSuccess == cub::DeviceCopy::Batched(nullptr, expected_bytes_allocated, srcs, dsts, sizes, range_count));

  auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)};

  device_copy_batched(srcs, dsts, sizes, range_count, env);

  REQUIRE(destination == source);
}
96+
97+
TEST_CASE("DeviceCopy::Batched uses custom stream", "[copy][device]")
{
  // Three ranges concatenated back to back: [10, 20], [30, 40, 50], [60]
  c2h::device_vector<int> source{10, 20, 30, 40, 50, 60};
  c2h::device_vector<int> destination(6, 0);
  c2h::device_vector<int> range_offsets{0, 2, 5, 6};

  const int range_count = 3;

  auto* src_ptr = thrust::raw_pointer_cast(source.data());
  auto* dst_ptr = thrust::raw_pointer_cast(destination.data());
  auto* off_ptr = thrust::raw_pointer_cast(range_offsets.data());

  thrust::counting_iterator<int> indices(0);
  auto srcs  = thrust::make_transform_iterator(indices, index_to_ptr<const int>{src_ptr, off_ptr});
  auto dsts  = thrust::make_transform_iterator(indices, index_to_ptr<int>{dst_ptr, off_ptr});
  auto sizes = thrust::make_transform_iterator(indices, get_size{off_ptr});

  cudaStream_t custom_stream;
  REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream));

  // Query the temp-storage requirement via the classic two-phase API.
  size_t expected_bytes_allocated{};
  REQUIRE(cudaSuccess == cub::DeviceCopy::Batched(nullptr, expected_bytes_allocated, srcs, dsts, sizes, range_count));

  // Attach the user stream to the environment; the copy must run on it.
  auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}};
  auto env         = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)};

  device_copy_batched(srcs, dsts, sizes, range_count, env);

  // Synchronize the custom stream before inspecting results on the host.
  REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream));
  REQUIRE(destination == source);
  REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream));
}
128+

0 commit comments

Comments
 (0)