cccl/cub/cub/device/device_memcpy.cuh at f1ddc074b0ef47bfd06ae2253649c94f8065d27c · NVIDIA/cccl · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
// SPDX-FileCopyrightText: Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3

//! @file
//! cub::DeviceMemcpy provides device-wide, parallel operations for copying data.

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cub/detail/env_dispatch.cuh>
#include <cub/device/dispatch/dispatch_batch_memcpy.cuh>

#include <cuda/std/__execution/env.h>
#include <cuda/std/__type_traits/is_pointer.h>
#include <cuda/std/cstdint>

CUB_NAMESPACE_BEGIN

//! @brief cub::DeviceMemcpy provides device-wide, parallel operations for copying data.
struct DeviceMemcpy
{
  //! @rst
  //! Copies data from a batch of given source buffers to their corresponding destination buffer.
  //!
  //! .. versionadded:: 2.2.0
  //!    First appears in CUDA Toolkit 12.3.
  //!
  //! .. note::
  //!
  //!    If any input buffer aliases memory from any output buffer the behavior is undefined.
  //!    If any output buffer aliases memory of another output buffer the behavior is undefined.
  //!    Input buffers can alias one another.
  //!
  //! Snippet
  //! +++++++
  //!
  //! The code snippet below illustrates usage of DeviceMemcpy::Batched for mutating strings withing
  //! a single string buffer.
  //!
  //! .. code-block:: c++
  //!
  //!    struct GetPtrToStringItem
  //!    {
  //!      __host__ __device__ __forceinline__ void *operator()(uint32_t index)
  //!      {
  //!        return &d_string_data_in[d_string_offsets[index]];
  //!      }
  //!      char *d_string_data_in;
  //!      uint32_t *d_string_offsets;
  //!    };
  //!
  //!    struct GetStringItemSize
  //!    {
  //!      __host__ __device__ __forceinline__ uint32_t operator()(uint32_t index)
  //!      {
  //!        return d_string_offsets[index + 1] - d_string_offsets[index];
  //!      }
  //!      uint32_t *d_string_offsets;
  //!    };
  //!
  //!    uint32_t num_strings = 5;
  //!    char *d_string_data_in;         // e.g., "TomatoesBananasApplesOrangesGrapes"
  //!    char *d_string_data_out;        // e.g., "                ...               "
  //!    uint32_t *d_string_offsets_old; // e.g., [0, 8, 15, 21, 28, 34]
  //!    uint32_t *d_string_offsets_new; // e.g., [0, 6, 13, 19, 26, 34]
  //!    uint32_t *d_gather_index;       // e.g., [2, 1, 4, 3, 0]
  //!
  //!    // Initialize an iterator that returns d_gather_index[i] when the i-th item is dereferenced
  //!    auto gather_iterator = thrust::make_permutation_iterator(thrust::make_counting_iterator(0),
  //!    d_gather_index);
  //!
  //!    // Returns pointers to the input buffer for each string
  //!    auto str_ptrs_in = thrust::make_transform_iterator(gather_iterator,
  //!                                                       GetPtrToStringItem{d_string_data_in,
  //!    d_string_offsets_old});
  //!
  //!    // Returns the string size of the i-th string
  //!    auto str_sizes = thrust::make_transform_iterator(gather_iterator,
  //!    GetStringItemSize{d_string_offsets_old});
  //!
  //!    // Returns pointers to the output buffer for each string
  //!    auto str_ptrs_out = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
  //!                                                        GetPtrToStringItem{d_string_data_out,
  //!    d_string_offsets_new});
  //!
  //!    // Determine temporary device storage requirements
  //!    void *d_temp_storage      = nullptr;
  //!    size_t temp_storage_bytes = 0;
  //!    cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out,
  //!    str_sizes, num_strings);
  //!
  //!    // Allocate temporary storage
  //!    cudaMalloc(&d_temp_storage, temp_storage_bytes);
  //!
  //!    // Run batched copy algorithm (used to permute strings)
  //!    cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, str_ptrs_in, str_ptrs_out,
  //!    str_sizes, num_strings);
  //!
  //!    // d_string_data_out       <-- "ApplesBananasGrapesOrangesTomatoe"
  //!
  //! @endrst
  //!
  //! @tparam InputBufferIt
  //!   **[inferred]** Device-accessible random-access input iterator type providing the pointers to
  //!   the source memory buffers
  //!
  //! @tparam OutputBufferIt
  //!   **[inferred]** Device-accessible random-access input iterator type providing the pointers to
  //!   the destination memory buffers
  //!
  //! @tparam BufferSizeIteratorT
  //!   **[inferred]** Device-accessible random-access input iterator type providing the number of bytes
  //!   to be copied for each pair of buffers
  //!
  //! @param[in] d_temp_storage
  //!   Device-accessible allocation of temporary storage. When `nullptr`, the
  //!   required allocation size is written to `temp_storage_bytes` and no work is done.
  //!
  //! @param[in,out] temp_storage_bytes
  //!   Reference to size in bytes of `d_temp_storage` allocation
  //!
  //! @param[in] input_buffer_it
  //!   Device-accessible iterator providing the pointers to the source memory buffers
  //!
  //! @param[in] output_buffer_it
  //!   Device-accessible iterator providing the pointers to the destination memory buffers
  //!
  //! @param[in] buffer_sizes
  //!   Device-accessible iterator providing the number of bytes to be copied for each pair of buffers
  //!
  //! @param[in] num_buffers
  //!   The total number of buffer pairs
  //!
  //! @param[in] stream
  //!   @rst
  //!   **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
  //!   @endrst
  template <typename InputBufferIt, typename OutputBufferIt, typename BufferSizeIteratorT>
  CUB_RUNTIME_FUNCTION static cudaError_t Batched(
    void* d_temp_storage,
    size_t& temp_storage_bytes,
    InputBufferIt input_buffer_it,
    OutputBufferIt output_buffer_it,
    BufferSizeIteratorT buffer_sizes,
    ::cuda::std::int64_t num_buffers,
    cudaStream_t stream = 0)
  {
    _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceMemcpy::Batched");
    static_assert(::cuda::std::is_pointer_v<cub::detail::it_value_t<InputBufferIt>>,
                  "DeviceMemcpy::Batched only supports copying of memory buffers."
                  "Please consider using DeviceCopy::Batched instead.");
    static_assert(::cuda::std::is_pointer_v<cub::detail::it_value_t<OutputBufferIt>>,
                  "DeviceMemcpy::Batched only supports copying of memory buffers."
                  "Please consider using DeviceCopy::Batched instead.");

    // Integer type large enough to hold any offset in [0, num_thread_blocks_launched), where a safe
    // upper bound on num_thread_blocks_launched can be assumed to be given by
    // IDIV_CEIL(num_buffers, 64)
    using BlockOffsetT = uint32_t;

    return detail::
      DispatchBatchMemcpy<InputBufferIt, OutputBufferIt, BufferSizeIteratorT, BlockOffsetT, CopyAlg::Memcpy>::Dispatch(
        d_temp_storage, temp_storage_bytes, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, stream);
  }

  //! @rst
  //! Copies data from a batch of given source buffers to their corresponding destination buffer.
  //!
  //! .. versionadded:: 3.4.0
  //!    First appears in CUDA Toolkit 13.4.
  //!
  //! This is an environment-based API that allows customization of:
  //!
  //! - Stream: Query via ``cuda::get_stream``
  //! - Memory resource: Query via ``cuda::mr::get_memory_resource``
  //!
  //! .. note::
  //!
  //!    If any input buffer aliases memory from any output buffer the behavior is undefined.
  //!    If any output buffer aliases memory of another output buffer the behavior is undefined.
  //!    Input buffers can alias one another.
  //!
  //! Snippet
  //! +++++++
  //!
  //! The code snippet below illustrates usage of DeviceMemcpy::Batched with an environment:
  //!
  //! .. literalinclude:: ../../../cub/test/catch2_test_device_memcpy_env_api.cu
  //!     :language: c++
  //!     :dedent:
  //!     :start-after: example-begin memcpy-batched-env
  //!     :end-before: example-end memcpy-batched-env
  //!
  //! @endrst
  //!
  //! @tparam InputBufferIt
  //!   **[inferred]** Device-accessible random-access input iterator type providing the pointers to
  //!   the source memory buffers
  //!
  //! @tparam OutputBufferIt
  //!   **[inferred]** Device-accessible random-access input iterator type providing the pointers to
  //!   the destination memory buffers
  //!
  //! @tparam BufferSizeIteratorT
  //!   **[inferred]** Device-accessible random-access input iterator type providing the number of bytes
  //!   to be copied for each pair of buffers
  //!
  //! @tparam EnvT
  //!   **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
  //!
  //! @param[in] input_buffer_it
  //!   Device-accessible iterator providing the pointers to the source memory buffers
  //!
  //! @param[in] output_buffer_it
  //!   Device-accessible iterator providing the pointers to the destination memory buffers
  //!
  //! @param[in] buffer_sizes
  //!   Device-accessible iterator providing the number of bytes to be copied for each pair of buffers
  //!
  //! @param[in] num_buffers
  //!   The total number of buffer pairs
  //!
  //! @param[in] env
  //!   @rst
  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
  //!   @endrst
  template <typename InputBufferIt,
            typename OutputBufferIt,
            typename BufferSizeIteratorT,
            typename EnvT = ::cuda::std::execution::env<>,
            ::cuda::std::enable_if_t<!::cuda::std::is_same_v<InputBufferIt, void*>, int> = 0>
  [[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
  Batched(InputBufferIt input_buffer_it,
          OutputBufferIt output_buffer_it,
          BufferSizeIteratorT buffer_sizes,
          ::cuda::std::int64_t num_buffers,
          EnvT env = {})
  {
    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceMemcpy::Batched");
    static_assert(::cuda::std::is_pointer_v<cub::detail::it_value_t<InputBufferIt>>,
                  "DeviceMemcpy::Batched only supports copying of memory buffers."
                  "Please consider using DeviceCopy::Batched instead.");
    static_assert(::cuda::std::is_pointer_v<cub::detail::it_value_t<OutputBufferIt>>,
                  "DeviceMemcpy::Batched only supports copying of memory buffers."
                  "Please consider using DeviceCopy::Batched instead.");

    using BlockOffsetT = uint32_t;

    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
      return detail::
        DispatchBatchMemcpy<InputBufferIt, OutputBufferIt, BufferSizeIteratorT, BlockOffsetT, CopyAlg::Memcpy>::Dispatch(
          storage, bytes, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, stream);
    });
  }
};

CUB_NAMESPACE_END