Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
331 changes: 331 additions & 0 deletions cub/cub/device/device_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,14 @@
#endif // no system header

#include <cub/detail/choose_offset.cuh>
#include <cub/detail/env_dispatch.cuh>
#include <cub/device/dispatch/dispatch_merge_sort.cuh>
#include <cub/util_namespace.cuh>

#include <cuda/__execution/determinism.h>
#include <cuda/__execution/require.h>
#include <cuda/std/__execution/env.h>

CUB_NAMESPACE_BEGIN

/**
Expand Down Expand Up @@ -225,6 +230,94 @@ public:
return SortPairsNoNVTX(d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream);
}

//! @rst
//! Sorts items using a merge sorting method.
//!
//! .. versionadded:: 3.4.0
//! First appears in CUDA Toolkit 13.4.
//!
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! - This operation provides ``run_to_run`` determinism.
//! - SortPairs is not guaranteed to be stable.
//!
//! Snippet
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_merge_sort_env_api.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin sort-pairs-env
//! :end-before: example-end sort-pairs-env
//!
//! @endrst
//!
//! @tparam KeyIteratorT
//! **[inferred]** Random-access iterator type for keys @iterator
//!
//! @tparam ValueIteratorT
//! **[inferred]** Random-access iterator type for values @iterator
//!
//! @tparam OffsetT
//! **[inferred]** Integer type for offsets
//!
//! @tparam CompareOpT
//! **[inferred]** Comparison function object type
//!
//! @tparam EnvT
//! **[inferred]** Execution environment type
//!
//! @param[in,out] d_keys
//! Keys to sort
//!
//! @param[in,out] d_items
//! Values corresponding to keys
//!
//! @param[in] num_items
//! Number of items to sort
//!
//! @param[in] compare_op
//! Comparison function object
//!
//! @param[in] env
//! @rst
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//! @endrst
template <typename KeyIteratorT,
          typename ValueIteratorT,
          typename OffsetT,
          typename CompareOpT,
          typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
SortPairs(KeyIteratorT d_keys, ValueIteratorT d_items, OffsetT num_items, CompareOpT compare_op, EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE(GetName());

  // Reject environments that expose a determinism query directly: the level is only looked up
  // inside the requirements below, so a top-level query would have no effect.
  static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
                "Determinism should be used inside requires to have an effect.");

  // Requirements carried by the environment; falls back to an empty env<> when none are attached.
  using requirements_t = ::cuda::std::execution::
    __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;

  // Requested determinism level; falls back to run_to_run when the requirements do not name one.
  using determinism_t =
    ::cuda::std::execution::__query_result_or_t<requirements_t,
                                                ::cuda::execution::determinism::__get_determinism_t,
                                                ::cuda::execution::determinism::run_to_run_t>;

  // NOTE(review): the determinism guarantee offered by this overload still needs to be settled.
  // The merge sort implementation is stable according to the docs, so it should provide
  // gpu_to_gpu determinism. Steering users who need stability away from SortPairs and towards
  // StableSortPairs is fine, but even an unstable algorithm can be gpu_to_gpu deterministic.
  // Until that is resolved, a gpu_to_gpu request is rejected at compile time.
  static_assert(!::cuda::std::is_same_v<determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
                "gpu_to_gpu determinism is not supported for unstable sort. Use StableSortPairs instead.");

  using ChooseOffsetT = detail::choose_offset_t<OffsetT>;

  // The callback receives the temporary storage pointer, its size and the stream from
  // dispatch_with_env; the tuning argument is not used by this algorithm.
  return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
    using DispatchMergeSortT =
      DispatchMergeSort<KeyIteratorT, ValueIteratorT, KeyIteratorT, ValueIteratorT, ChooseOffsetT, CompareOpT>;

    // The same key/item iterators are passed twice, so the sort operates in place
    // (matching the [in,out] documentation of d_keys and d_items).
    return DispatchMergeSortT::Dispatch(
      storage, bytes, d_keys, d_items, d_keys, d_items, num_items, compare_op, stream);
  });
}

/**
* @brief Sorts items using a merge sorting method.
*
Expand Down Expand Up @@ -506,6 +599,92 @@ public:
return SortKeysNoNVTX(d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream);
}

//! @rst
//! Sorts keys using a merge sorting method.
//!
//! .. versionadded:: 3.4.0
//! First appears in CUDA Toolkit 13.4.
//!
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! - This operation provides ``run_to_run`` determinism.
//! - SortKeys is not guaranteed to be stable.
//!
//! Snippet
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_merge_sort_env_api.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin sort-keys-env
//! :end-before: example-end sort-keys-env
//!
//! @endrst
//!
//! @tparam KeyIteratorT
//! **[inferred]** Random-access iterator type for keys @iterator
//!
//! @tparam OffsetT
//! **[inferred]** Integer type for offsets
//!
//! @tparam CompareOpT
//! **[inferred]** Comparison function object type
//!
//! @tparam EnvT
//! **[inferred]** Execution environment type
//!
//! @param[in,out] d_keys
//! Keys to sort
//!
//! @param[in] num_items
//! Number of items to sort
//!
//! @param[in] compare_op
//! Comparison function object
//!
//! @param[in] env
//! @rst
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//! @endrst
template <typename KeyIteratorT, typename OffsetT, typename CompareOpT, typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
SortKeys(KeyIteratorT d_keys, OffsetT num_items, CompareOpT compare_op, EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE(GetName());

  // --- Compile-time validation of the execution environment -------------------------------

  // A determinism query sitting directly on the environment is not consulted below,
  // so it is diagnosed instead of being silently ignored.
  static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
                "Determinism should be used inside requires to have an effect.");

  // Requirements attached to the environment (empty env<> when absent).
  using env_requirements_t = ::cuda::std::execution::
    __query_result_or_t<EnvT, ::cuda::execution::__get_requirements_t, ::cuda::std::execution::env<>>;

  // Determinism level named by those requirements (run_to_run when absent).
  using requested_determinism_t =
    ::cuda::std::execution::__query_result_or_t<env_requirements_t,
                                                ::cuda::execution::determinism::__get_determinism_t,
                                                ::cuda::execution::determinism::run_to_run_t>;

  static_assert(!::cuda::std::is_same_v<requested_determinism_t, ::cuda::execution::determinism::gpu_to_gpu_t>,
                "gpu_to_gpu determinism is not supported for unstable sort. Use StableSortKeys instead.");

  // --- Dispatch ---------------------------------------------------------------------------

  using offset_t = detail::choose_offset_t<OffsetT>;

  // Keys-only sort: the value iterators are NullType pointers.
  using dispatch_t = DispatchMergeSort<KeyIteratorT, NullType*, KeyIteratorT, NullType*, offset_t, CompareOpT>;

  // Null placeholder handed to the dispatcher for both value iterators.
  NullType* no_values = nullptr;

  // dispatch_with_env invokes the callback with the temporary storage, its size and the stream;
  // the tuning argument is unused here.
  return detail::dispatch_with_env(
    env, [&]([[maybe_unused]] auto tuning, void* d_temp_storage, size_t& temp_storage_bytes, auto stream) {
      // d_keys is passed twice, so the keys are sorted in place.
      return dispatch_t::Dispatch(
        d_temp_storage, temp_storage_bytes, d_keys, no_values, d_keys, no_values, num_items, compare_op, stream);
    });
}

private:
// Internal version without NVTX range
template <typename KeyInputIteratorT, typename KeyIteratorT, typename OffsetT, typename CompareOpT>
Expand Down Expand Up @@ -765,6 +944,83 @@ public:
d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream);
}

//! @rst
//! Stably sorts items using a merge sorting method.
//!
//! .. versionadded:: 3.4.0
//! First appears in CUDA Toolkit 13.4.
//!
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! - This operation provides ``gpu_to_gpu`` determinism.
//! - StableSortPairs preserves the relative ordering of equivalent elements.
//!
//! Snippet
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_merge_sort_env_api.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin stable-sort-pairs-env
//! :end-before: example-end stable-sort-pairs-env
//!
//! @endrst
//!
//! @tparam KeyIteratorT
//! **[inferred]** Random-access iterator type for keys @iterator
//!
//! @tparam ValueIteratorT
//! **[inferred]** Random-access iterator type for values @iterator
//!
//! @tparam OffsetT
//! **[inferred]** Integer type for offsets
//!
//! @tparam CompareOpT
//! **[inferred]** Comparison function object type
//!
//! @tparam EnvT
//! **[inferred]** Execution environment type
//!
//! @param[in,out] d_keys
//! Keys to sort
//!
//! @param[in,out] d_items
//! Values corresponding to keys
//!
//! @param[in] num_items
//! Number of items to sort
//!
//! @param[in] compare_op
//! Comparison function object
//!
//! @param[in] env
//! @rst
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//! @endrst
template <typename KeyIteratorT,
          typename ValueIteratorT,
          typename OffsetT,
          typename CompareOpT,
          typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
StableSortPairs(KeyIteratorT d_keys, ValueIteratorT d_items, OffsetT num_items, CompareOpT compare_op, EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE(GetName());

  // Offset type used by the dispatcher, derived from the caller's OffsetT.
  using offset_t = detail::choose_offset_t<OffsetT>;

  // The key and value iterator types each appear twice because the same iterators are passed
  // twice to Dispatch below, i.e. the pairs are sorted in place.
  using dispatch_t =
    DispatchMergeSort<KeyIteratorT, ValueIteratorT, KeyIteratorT, ValueIteratorT, offset_t, CompareOpT>;

  // No determinism requirement is inspected here; the environment is forwarded as-is.
  // dispatch_with_env invokes the callback with the temporary storage, its size and the stream;
  // the tuning argument is unused.
  return detail::dispatch_with_env(
    env, [&]([[maybe_unused]] auto tuning, void* d_temp_storage, size_t& temp_storage_bytes, auto stream) {
      return dispatch_t::Dispatch(
        d_temp_storage, temp_storage_bytes, d_keys, d_items, d_keys, d_items, num_items, compare_op, stream);
    });
}

/**
* @brief Sorts items using a merge sorting method.
*
Expand Down Expand Up @@ -869,6 +1125,81 @@ public:
d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream);
}

//! @rst
//! Stably sorts keys using a merge sorting method.
//!
//! .. versionadded:: 3.4.0
//! First appears in CUDA Toolkit 13.4.
//!
//! This is an environment-based API that allows customization of:
//!
//! - Stream: Query via ``cuda::get_stream``
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
//!
//! - This operation provides ``gpu_to_gpu`` determinism.
//! - StableSortKeys preserves the relative ordering of equivalent elements.
//!
//! Snippet
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! .. literalinclude:: ../../../cub/test/catch2_test_device_merge_sort_env_api.cu
//! :language: c++
//! :dedent:
//! :start-after: example-begin stable-sort-keys-env
//! :end-before: example-end stable-sort-keys-env
//!
//! @endrst
//!
//! @tparam KeyIteratorT
//! **[inferred]** Random-access iterator type for keys @iterator
//!
//! @tparam OffsetT
//! **[inferred]** Integer type for offsets
//!
//! @tparam CompareOpT
//! **[inferred]** Comparison function object type
//!
//! @tparam EnvT
//! **[inferred]** Execution environment type
//!
//! @param[in,out] d_keys
//! Keys to sort
//!
//! @param[in] num_items
//! Number of items to sort
//!
//! @param[in] compare_op
//! Comparison function object
//!
//! @param[in] env
//! @rst
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//! @endrst
template <typename KeyIteratorT, typename OffsetT, typename CompareOpT, typename EnvT = ::cuda::std::execution::env<>>
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
StableSortKeys(KeyIteratorT d_keys, OffsetT num_items, CompareOpT compare_op, EnvT env = {})
{
  _CCCL_NVTX_RANGE_SCOPE(GetName());

  // Offset type used by the dispatcher, derived from the caller's OffsetT.
  using offset_t = detail::choose_offset_t<OffsetT>;

  // Keys-only sort: value iterators are NullType pointers, and null is passed for both of them.
  using null_values_t = NullType*;
  using dispatch_t    = DispatchMergeSort<KeyIteratorT, null_values_t, KeyIteratorT, null_values_t, offset_t, CompareOpT>;

  // No determinism requirement is inspected here; the environment is forwarded as-is.
  // dispatch_with_env invokes the callback with the temporary storage, its size and the stream;
  // the tuning argument is unused.
  return detail::dispatch_with_env(
    env, [&]([[maybe_unused]] auto tuning, void* d_temp_storage, size_t& temp_storage_bytes, auto stream) {
      // d_keys is passed twice, so the keys are sorted in place.
      return dispatch_t::Dispatch(
        d_temp_storage,
        temp_storage_bytes,
        d_keys,
        null_values_t{},
        d_keys,
        null_values_t{},
        num_items,
        compare_op,
        stream);
    });
}

/**
* @brief Sorts items using a merge sorting method.
*
Expand Down
Loading