Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 182 additions & 0 deletions cub/cub/device/device_copy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@
# pragma system_header
#endif // no system header

#include <cub/detail/env_dispatch.cuh>
#include <cub/device/dispatch/dispatch_batch_memcpy.cuh>
#include <cub/device/dispatch/dispatch_copy_mdspan.cuh>
#include <cub/device/dispatch/tuning/tuning_batch_memcpy.cuh>

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <cuda/std/__execution/env.h>
#include <cuda/std/cstdint>
#include <cuda/std/mdspan>

Expand Down Expand Up @@ -164,6 +166,81 @@ struct DeviceCopy
d_temp_storage, temp_storage_bytes, input_it, output_it, sizes, num_ranges, stream);
}

//! @rst
//! Copies data from a batch of given source ranges to their corresponding destination ranges.
//!
//! .. versionadded:: 3.4.0
//!    First appears in CUDA Toolkit 13.4.
//!
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! This operation provides ``gpu_to_gpu`` determinism: results are identical across different GPU architectures.
//!
//! .. note::
//!
//!    If any input range aliases any output range the behavior is undefined.
//!    If any output range aliases another output range the behavior is undefined.
//!    Input ranges can alias one another.
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates usage of DeviceCopy::Batched with an environment:
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_copy_env_api.cu
//!     :language: c++
//!     :dedent:
//!     :start-after: example-begin copy-batched-env
//!     :end-before: example-end copy-batched-env
//!
//! @endrst
//!
//! @tparam InputIt
//!   **[inferred]** Device-accessible random-access input iterator type providing the iterators to the source ranges
//!
//! @tparam OutputIt
//!   **[inferred]** Device-accessible random-access input iterator type providing the iterators to
//!   the destination ranges
//!
//! @tparam SizeIteratorT
//!   **[inferred]** Device-accessible random-access input iterator type providing the number of items to be
//!   copied for each pair of ranges
//!
//! @tparam EnvT
//!   **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
//!
//! @param[in] input_it
//!   Device-accessible iterator providing the iterators to the source ranges
//!
//! @param[in] output_it
//!   Device-accessible iterator providing the iterators to the destination ranges
//!
//! @param[in] sizes
//!   Device-accessible iterator providing the number of elements to be copied for each pair of ranges
//!
//! @param[in] num_ranges
//!   The total number of range pairs
//!
//! @param[in] env
//!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
template <typename InputIt, typename OutputIt, typename SizeIteratorT, typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
Batched(InputIt input_it, OutputIt output_it, SizeIteratorT sizes, ::cuda::std::int64_t num_ranges, EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceCopy::Batched");

  // Offset type handed to the batch-memcpy dispatch machinery
  using block_offset_t = uint32_t;

  // The environment dispatcher drives the usual two-phase (size query, then launch) protocol;
  // the tuning argument is accepted but unused by this algorithm.
  auto const invoke_dispatch =
    [&]([[maybe_unused]] auto tuning, void* d_temp_storage, size_t& temp_storage_bytes, auto stream) {
      return detail::DispatchBatchMemcpy<InputIt, OutputIt, SizeIteratorT, block_offset_t, CopyAlg::Copy>::Dispatch(
        d_temp_storage, temp_storage_bytes, input_it, output_it, sizes, num_ranges, stream);
    };
  return detail::dispatch_with_env(env, invoke_dispatch);
}

//! @rst
//! Copies data from a multidimensional source mdspan to a destination mdspan.
//!
Expand Down Expand Up @@ -277,6 +354,111 @@ struct DeviceCopy
}
return detail::copy_mdspan::copy(mdspan_in, mdspan_out, stream);
}

//! @rst
//! Copies data from a multidimensional source mdspan to a destination mdspan.
//!
//! .. versionadded:: 3.4.0
//!    First appears in CUDA Toolkit 13.4.
//!
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! This operation provides ``gpu_to_gpu`` determinism: results are identical across different GPU architectures.
//!
//! This function performs a parallel copy operation between two mdspan objects with potentially different layouts but
//! identical extents. The copy operation handles arbitrary-dimensional arrays and automatically manages layout
//! transformations.
//!
//! Preconditions
//! +++++++++++++
//!
//! * The source and destination mdspans must have identical extents (same ranks and sizes).
//! * The source and destination mdspans data handle must not be nullptr if the size is not 0.
//! * The underlying memory of the source and destination must not overlap.
//! * Both mdspans must point to device memory.
//!
//! Snippet
//! +++++++
//!
//! The code snippet below illustrates usage of DeviceCopy::Copy with an environment:
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_copy_env_api.cu
//!     :language: c++
//!     :dedent:
//!     :start-after: example-begin copy-mdspan-env
//!     :end-before: example-end copy-mdspan-env
//!
//! @endrst
//!
//! @tparam T_In
//!   **[inferred]** The element type of the source mdspan
//!
//! @tparam Extents_In
//!   **[inferred]** The extents type of the source mdspan
//!
//! @tparam Layout_In
//!   **[inferred]** The layout type of the source mdspan
//!
//! @tparam Accessor_In
//!   **[inferred]** The accessor type of the source mdspan
//!
//! @tparam T_Out
//!   **[inferred]** The element type of the destination mdspan
//!
//! @tparam Extents_Out
//!   **[inferred]** The extents type of the destination mdspan
//!
//! @tparam Layout_Out
//!   **[inferred]** The layout type of the destination mdspan
//!
//! @tparam Accessor_Out
//!   **[inferred]** The accessor type of the destination mdspan
//!
//! @tparam EnvT
//!   **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
//!
//! @param[in] mdspan_in
//!   Source mdspan containing the data to be copied
//!
//! @param[out] mdspan_out
//!   Destination mdspan where the data will be copied
//!
//! @param[in] env
//!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
template <typename T_In,
          typename Extents_In,
          typename Layout_In,
          typename Accessor_In,
          typename T_Out,
          typename Extents_Out,
          typename Layout_Out,
          typename Accessor_Out,
          typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t
Copy(::cuda::std::mdspan<T_In, Extents_In, Layout_In, Accessor_In> mdspan_in,
     ::cuda::std::mdspan<T_Out, Extents_Out, Layout_Out, Accessor_Out> mdspan_out,
     EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE("cub::DeviceCopy::Copy");
  _CCCL_ASSERT(mdspan_in.extents() == mdspan_out.extents(), "mdspan extents must be equal");
  _CCCL_ASSERT((mdspan_in.data_handle() != nullptr && mdspan_out.data_handle() != nullptr) || mdspan_in.size() == 0,
               "mdspan data handle must not be nullptr if the size is not 0");
  if (mdspan_in.size() != 0)
  {
    // Debug-only overlap detection over the full span each mapping may address
    auto in_start  = mdspan_in.data_handle();
    auto in_end    = in_start + mdspan_in.mapping().required_span_size();
    auto out_start = mdspan_out.data_handle();
    auto out_end   = out_start + mdspan_out.mapping().required_span_size();
    // TODO(fbusato): replace with __are_ptrs_overlapping
    _CCCL_ASSERT(!(in_end >= out_start && out_end >= in_start), "mdspan memory ranges must not overlap");
  }

  return detail::copy_mdspan::copy(mdspan_in, mdspan_out, env);
}
};

CUB_NAMESPACE_END
25 changes: 25 additions & 0 deletions cub/cub/device/device_for.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1318,6 +1318,31 @@ public:
return ForEachInLayout(layout_mapping, op, ::cuda::stream_ref{stream});
}

// Internal version of ForEachInExtents without NVTX range, for use by DeviceCopy::Copy.
// Iterates every multi-dimensional index of `extents`, invoking `op` with the decomposed
// coordinates; the stream is taken from `env` when one is provided.
template <typename IndexType, size_t... Extents, typename OpType, typename EnvT = ::cuda::std::execution::env<>>
CUB_RUNTIME_FUNCTION static cudaError_t
__for_each_in_extents_internal(const ::cuda::std::extents<IndexType, Extents...>& extents, OpType op, EnvT env = {})
{
  using extents_type      = ::cuda::std::extents<IndexType, Extents...>;
  using extent_index_type = typename extents_type::index_type;
  using div_mod_array_t = ::cuda::std::array<cub::detail::fast_div_mod<extent_index_type>, extents_type::rank()>;
  static constexpr auto rank_seq = ::cuda::std::make_index_sequence<extents_type::rank()>{};
  // Indices are decomposed following a layout_right ordering (flag forwarded to the helpers below)
  constexpr bool is_layout_right = true;
  // Precomputed fast division/modulo tables used to turn a linear index back into coordinates
  div_mod_array_t fast_sub_sizes = cub::detail::sub_sizes_fast_div_mod<is_layout_right>(extents, rank_seq);
  div_mod_array_t fast_extents   = cub::detail::extents_fast_div_mod(extents, rank_seq);
  cub::detail::for_each::op_wrapper_extents_t<OpType, extents_type, is_layout_right, div_mod_array_t> wrapped_op{
    op, extents, fast_sub_sizes, fast_extents};
  using shape_t  = cub::detail::implicit_prom_t<extent_index_type>;
  auto num_items = static_cast<shape_t>(cub::detail::size(extents));
  if (num_items == 0)
  {
    // Empty extents: nothing to launch
    return cudaSuccess;
  }
  // Query the stream from the environment, falling back to the default stream
  auto stream = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStream_t{}}, env);
  return cub::detail::for_each::dispatch<shape_t>(num_items, wrapped_op, stream.get());
}

#ifndef _CCCL_DOXYGEN_INVOKED

_CCCL_TEMPLATE(typename LayoutMapping, typename OpType)
Expand Down
22 changes: 10 additions & 12 deletions cub/cub/device/device_transform.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,14 @@ struct tuning
//! sequences into an output sequence.
struct DeviceTransform
{
private:
template <detail::transform::requires_stable_address StableAddress = detail::transform::requires_stable_address::no,
typename... RandomAccessIteratorsIn,
typename RandomAccessIteratorOut,
typename NumItemsT,
typename Predicate,
typename TransformOp,
typename Env>
CUB_RUNTIME_FUNCTION static cudaError_t TransformInternal(
CUB_RUNTIME_FUNCTION static cudaError_t __transform_internal(
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
RandomAccessIteratorOut output,
NumItemsT num_items,
Expand Down Expand Up @@ -130,15 +129,15 @@ private:
typename Predicate,
typename TransformOp,
typename Env>
CUB_RUNTIME_FUNCTION static cudaError_t TransformInternal(
CUB_RUNTIME_FUNCTION static cudaError_t __transform_internal(
::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
::cuda::std::tuple<RandomAccessIteratorsOut...> outputs,
NumItemsT num_items,
Predicate predicate,
TransformOp transform_op,
Env env)
{
return TransformInternal<StableAddress>(
return __transform_internal<StableAddress>(
::cuda::std::move(inputs),
::cuda::make_zip_iterator(::cuda::std::move(outputs)),
num_items,
Expand All @@ -147,7 +146,6 @@ private:
::cuda::std::move(env));
}

public:
//! @rst
//! Overview
//! +++++++++++++++++++++++++++++++++++++++++++++
Expand Down Expand Up @@ -191,7 +189,7 @@ public:
Env env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
return TransformInternal(
return __transform_internal(
::cuda::std::move(inputs),
::cuda::std::move(outputs),
num_items,
Expand Down Expand Up @@ -270,7 +268,7 @@ public:
Env env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Transform");
return TransformInternal(
return __transform_internal(
::cuda::std::move(inputs),
::cuda::std::move(output),
num_items,
Expand Down Expand Up @@ -398,7 +396,7 @@ public:
"The return value of the generator's call operator must be assignable to the dereferenced output iterator");

_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Generate");
return TransformInternal(
return __transform_internal(
::cuda::std::make_tuple(),
::cuda::std::move(output),
num_items,
Expand Down Expand Up @@ -454,7 +452,7 @@ public:
"The passed value must be assignable to the dereferenced output iterator");

_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::Fill");
return TransformInternal(
return __transform_internal(
::cuda::std::make_tuple(),
::cuda::std::move(output),
num_items,
Expand Down Expand Up @@ -535,7 +533,7 @@ public:
Env env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformIf");
return TransformInternal(
return __transform_internal(
::cuda::std::move(inputs),
::cuda::std::move(output),
num_items,
Expand Down Expand Up @@ -711,7 +709,7 @@ public:
Env env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformStableArgumentAddresses");
return TransformInternal<detail::transform::requires_stable_address::yes>(
return __transform_internal<detail::transform::requires_stable_address::yes>(
::cuda::std::move(inputs),
::cuda::std::move(output),
num_items,
Expand Down Expand Up @@ -823,7 +821,7 @@ public:
Env env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceTransform::TransformIfStableArgumentAddresses");
return TransformInternal<detail::transform::requires_stable_address::yes>(
return __transform_internal<detail::transform::requires_stable_address::yes>(
::cuda::std::move(inputs),
::cuda::std::move(output),
num_items,
Expand Down
17 changes: 10 additions & 7 deletions cub/cub/device/dispatch/dispatch_copy_mdspan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <cub/device/device_for.cuh>
#include <cub/device/device_transform.cuh>
#include <cub/device/dispatch/tuning/tuning_transform.cuh>
#include <cub/util_debug.cuh>

#include <cuda/std/functional>
// Dispatches an mdspan-to-mdspan copy.
// Fast path: when both mdspans are exhaustive and share the same strides, the copy is a flat
// 1D element-wise transform over the underlying storage. Otherwise, fall back to iterating
// every multi-dimensional index and copying element by element.
template <typename T_In,
          typename E_In,
          typename L_In,
          typename A_In,
          typename T_Out,
          typename E_Out,
          typename L_Out,
          typename A_Out,
          typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
copy(::cuda::std::mdspan<T_In, E_In, L_In, A_In> mdspan_in,
     ::cuda::std::mdspan<T_Out, E_Out, L_Out, A_Out> mdspan_out,
     EnvT env = {})
{
  if (mdspan_in.is_exhaustive() && mdspan_out.is_exhaustive()
      && detail::have_same_strides(mdspan_in.mapping(), mdspan_out.mapping()))
  {
    // Identical memory footprint: copy the storage linearly with an identity transform
    return cub::DeviceTransform::__transform_internal(
      ::cuda::std::make_tuple(mdspan_in.data_handle()),
      mdspan_out.data_handle(),
      mdspan_in.size(),
      detail::transform::always_true_predicate{},
      ::cuda::std::identity{},
      env);
  }
  // TODO (fbusato): add ForEachInLayout when mdspan_in and mdspan_out have compatible layouts
  // Compatible layouts could use more efficient iteration patterns
  return cub::DeviceFor::__for_each_in_extents_internal(mdspan_in.extents(), copy_mdspan_t{mdspan_in, mdspan_out}, env);
}
} // namespace detail::copy_mdspan

Expand Down
Loading
Loading