|
16 | 16 | # pragma system_header |
17 | 17 | #endif // no system header |
18 | 18 |
|
| 19 | +#include <cub/detail/env_dispatch.cuh> |
19 | 20 | #include <cub/device/dispatch/dispatch_batch_memcpy.cuh> |
20 | 21 | #include <cub/device/dispatch/dispatch_copy_mdspan.cuh> |
21 | 22 | #include <cub/device/dispatch/tuning/tuning_batch_memcpy.cuh> |
22 | 23 |
|
23 | 24 | #include <thrust/system/cuda/detail/core/triple_chevron_launch.h> |
24 | 25 |
|
| 26 | +#include <cuda/std/__execution/env.h> |
25 | 27 | #include <cuda/std/cstdint> |
26 | 28 | #include <cuda/std/mdspan> |
27 | 29 |
|
@@ -164,6 +166,85 @@ struct DeviceCopy |
164 | 166 | d_temp_storage, temp_storage_bytes, input_it, output_it, sizes, num_ranges, stream); |
165 | 167 | } |
166 | 168 |
|
| 169 | + //! @rst |
| 170 | + //! Copies data from a batch of given source ranges to their corresponding destination ranges. |
| 171 | + //! |
| 172 | + //! .. versionadded:: 3.4.0 |
| 173 | + //! First appears in CUDA Toolkit 13.4. |
| 174 | + //! |
| 175 | + //! This is an environment-based API that allows customization of: |
| 176 | + //! |
| 177 | + //! - Stream: Query via ``cuda::get_stream`` |
| 178 | + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` |
| 179 | + //! |
| 180 | + //! - This operation provides ``gpu_to_gpu`` determinism: results are identical across different GPU architectures. |
| 181 | + //! |
| 182 | + //! .. note:: |
| 183 | + //! |
| 184 | + //! If any input range aliases any output range the behavior is undefined. |
| 185 | + //! If any output range aliases another output range the behavior is undefined. |
| 186 | + //! Input ranges can alias one another. |
| 187 | + //! |
| 188 | + //! Snippet |
| 189 | + //! +++++++ |
| 190 | + //! |
| 191 | + //! The code snippet below illustrates usage of DeviceCopy::Batched with an environment: |
| 192 | + //! |
| 193 | + //! .. literalinclude:: ../../../cub/test/catch2_test_device_copy_env_api.cu |
| 194 | + //! :language: c++ |
| 195 | + //! :dedent: |
| 196 | + //! :start-after: example-begin copy-batched-env |
| 197 | + //! :end-before: example-end copy-batched-env |
| 198 | + //! |
| 199 | + //! @endrst |
| 200 | + //! |
| 201 | + //! @tparam InputIt |
| 202 | + //! **[inferred]** Device-accessible random-access input iterator type providing the iterators to the source ranges |
| 203 | + //! |
| 204 | + //! @tparam OutputIt |
| 205 | + //! **[inferred]** Device-accessible random-access input iterator type providing the iterators to |
| 206 | + //! the destination ranges |
| 207 | + //! |
| 208 | + //! @tparam SizeIteratorT |
| 209 | + //! **[inferred]** Device-accessible random-access input iterator type providing the number of items to be |
| 210 | + //! copied for each pair of ranges |
| 211 | + //! |
| 212 | + //! @tparam EnvT |
| 213 | + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) |
| 214 | + //! |
| 215 | + //! @param[in] input_it |
| 216 | + //! Device-accessible iterator providing the iterators to the source ranges |
| 217 | + //! |
| 218 | + //! @param[in] output_it |
| 219 | + //! Device-accessible iterator providing the iterators to the destination ranges |
| 220 | + //! |
| 221 | + //! @param[in] sizes |
| 222 | + //! Device-accessible iterator providing the number of elements to be copied for each pair of ranges |
| 223 | + //! |
| 224 | + //! @param[in] num_ranges |
| 225 | + //! The total number of range pairs |
| 226 | + //! |
| 227 | + //! @param[in] env |
| 228 | + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. |
| 229 | + //! @endrst |
| 230 | + template <typename InputIt, |
| 231 | + typename OutputIt, |
| 232 | + typename SizeIteratorT, |
| 233 | + typename EnvT = ::cuda::std::execution::env<>, |
| 234 | + ::cuda::std::enable_if_t<!::cuda::std::is_same_v<InputIt, void*>, int> = 0> |
| 235 | + [[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t |
| 236 | + Batched(InputIt input_it, OutputIt output_it, SizeIteratorT sizes, ::cuda::std::int64_t num_ranges, EnvT env = {}) |
| 237 | + { |
| 238 | + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceCopy::Batched"); |
| 239 | + |
| 240 | + using BlockOffsetT = uint32_t; |
| 241 | + |
| 242 | + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { |
| 243 | + return detail::DispatchBatchMemcpy<InputIt, OutputIt, SizeIteratorT, BlockOffsetT, CopyAlg::Copy>::Dispatch( |
| 244 | + storage, bytes, input_it, output_it, sizes, num_ranges, stream); |
| 245 | + }); |
| 246 | + } |
| 247 | + |
167 | 248 | //! @rst |
168 | 249 | //! Copies data from a multidimensional source mdspan to a destination mdspan. |
169 | 250 | //! |
@@ -277,6 +358,110 @@ struct DeviceCopy |
277 | 358 | } |
278 | 359 | return detail::copy_mdspan::copy(mdspan_in, mdspan_out, stream); |
279 | 360 | } |
| 361 | + |
| 362 | + //! @rst |
| 363 | + //! Copies data from a multidimensional source mdspan to a destination mdspan. |
| 364 | + //! |
| 365 | + //! .. versionadded:: 3.4.0 |
| 366 | + //! First appears in CUDA Toolkit 13.4. |
| 367 | + //! |
| 368 | + //! This is an environment-based API that allows customization of: |
| 369 | + //! |
| 370 | + //! - Stream: Query via ``cuda::get_stream`` |
| 371 | + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` |
| 372 | + //! |
| 373 | + //! - This operation provides ``gpu_to_gpu`` determinism: results are identical across different GPU architectures. |
| 374 | + //! |
| 375 | + //! This function performs a parallel copy operation between two mdspan objects with potentially different layouts but |
| 376 | + //! identical extents. The copy operation handles arbitrary-dimensional arrays and automatically manages layout |
| 377 | + //! transformations. |
| 378 | + //! |
| 379 | + //! Preconditions |
| 380 | + //! +++++++++++++ |
| 381 | + //! |
| 382 | + //! * The source and destination mdspans must have identical extents (same ranks and sizes). |
| 383 | + //! * The source and destination mdspans data handle must not be nullptr if the size is not 0. |
| 384 | + //! * The underlying memory of the source and destination must not overlap. |
| 385 | + //! * Both mdspans must point to device memory. |
| 386 | + //! |
| 387 | + //! Snippet |
| 388 | + //! +++++++ |
| 389 | + //! |
| 390 | + //! The code snippet below illustrates usage of DeviceCopy::Copy with an environment: |
| 391 | + //! |
| 392 | + //! .. literalinclude:: ../../../cub/test/catch2_test_device_copy_env_api.cu |
| 393 | + //! :language: c++ |
| 394 | + //! :dedent: |
| 395 | + //! :start-after: example-begin copy-mdspan-env |
| 396 | + //! :end-before: example-end copy-mdspan-env |
| 397 | + //! |
| 398 | + //! @endrst |
| 399 | + //! |
| 400 | + //! @tparam T_In |
| 401 | + //! **[inferred]** The element type of the source mdspan |
| 402 | + //! |
| 403 | + //! @tparam Extents_In |
| 404 | + //! **[inferred]** The extents type of the source mdspan |
| 405 | + //! |
| 406 | + //! @tparam Layout_In |
| 407 | + //! **[inferred]** The layout type of the source mdspan |
| 408 | + //! |
| 409 | + //! @tparam Accessor_In |
| 410 | + //! **[inferred]** The accessor type of the source mdspan |
| 411 | + //! |
| 412 | + //! @tparam T_Out |
| 413 | + //! **[inferred]** The element type of the destination mdspan |
| 414 | + //! |
| 415 | + //! @tparam Extents_Out |
| 416 | + //! **[inferred]** The extents type of the destination mdspan |
| 417 | + //! |
| 418 | + //! @tparam Layout_Out |
| 419 | + //! **[inferred]** The layout type of the destination mdspan |
| 420 | + //! |
| 421 | + //! @tparam Accessor_Out |
| 422 | + //! **[inferred]** The accessor type of the destination mdspan |
| 423 | + //! |
| 424 | + //! @tparam EnvT |
| 425 | + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) |
| 426 | + //! |
| 427 | + //! @param[in] mdspan_in |
| 428 | + //! Source mdspan containing the data to be copied |
| 429 | + //! |
| 430 | + //! @param[out] mdspan_out |
| 431 | + //! Destination mdspan where the data will be copied |
| 432 | + //! |
| 433 | + //! @param[in] env |
| 434 | + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. |
| 435 | + //! @endrst |
| 436 | + template <typename T_In, |
| 437 | + typename Extents_In, |
| 438 | + typename Layout_In, |
| 439 | + typename Accessor_In, |
| 440 | + typename T_Out, |
| 441 | + typename Extents_Out, |
| 442 | + typename Layout_Out, |
| 443 | + typename Accessor_Out, |
| 444 | + typename EnvT = ::cuda::std::execution::env<>> |
| 445 | + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t |
| 446 | + Copy(::cuda::std::mdspan<T_In, Extents_In, Layout_In, Accessor_In> mdspan_in, |
| 447 | + ::cuda::std::mdspan<T_Out, Extents_Out, Layout_Out, Accessor_Out> mdspan_out, |
| 448 | + EnvT env = {}) |
| 449 | + { |
| 450 | + // no nvtx range because Copy delegates to Transform/ForEachInExtents which emit their own NVTX ranges |
| 451 | + _CCCL_ASSERT(mdspan_in.extents() == mdspan_out.extents(), "mdspan extents must be equal"); |
| 452 | + _CCCL_ASSERT((mdspan_in.data_handle() != nullptr && mdspan_out.data_handle() != nullptr) || mdspan_in.size() == 0, |
| 453 | + "mdspan data handle must not be nullptr if the size is not 0"); |
| 454 | + if (mdspan_in.size() != 0) |
| 455 | + { |
| 456 | + auto in_start = mdspan_in.data_handle(); |
| 457 | + auto in_end = in_start + mdspan_in.mapping().required_span_size(); |
| 458 | + auto out_start = mdspan_out.data_handle(); |
| 459 | + auto out_end = out_start + mdspan_out.mapping().required_span_size(); |
| 460 | + _CCCL_ASSERT(!(in_end >= out_start && out_end >= in_start), "mdspan memory ranges must not overlap"); |
| 461 | + } |
| 462 | + |
| 463 | + return detail::copy_mdspan::copy(mdspan_in, mdspan_out, env); |
| 464 | + } |
280 | 465 | }; |
281 | 466 |
|
282 | 467 | CUB_NAMESPACE_END |
0 commit comments