diff --git a/cub/cub/device/device_merge.cuh b/cub/cub/device/device_merge.cuh index 2d26bde0822..dc223c9f31b 100644 --- a/cub/cub/device/device_merge.cuh +++ b/cub/cub/device/device_merge.cuh @@ -13,9 +13,11 @@ # pragma system_header #endif // no system header +#include #include #include +#include #include #include @@ -103,6 +105,111 @@ struct DeviceMerge stream); } + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! Merges two sorted sequences of values (called keys) into a sorted output sequence. Merging is unstable, + //! which means any two equivalent values (neither value is ordered before the other) may be written to the output + //! sequence in any order. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Snippet + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_merge_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin merge-keys-env + //! :end-before: example-end merge-keys-env + //! + //! @endrst + //! + //! @tparam KeyIteratorIn1 + //! **[deduced]** Random access iterator to the first sorted input sequence. Must have the same + //! value type as KeyIteratorIn2. + //! + //! @tparam KeyIteratorIn2 + //! **[deduced]** Random access iterator to the second sorted input sequence. Must have the + //! same value type as KeyIteratorIn1. + //! + //! @tparam KeyIteratorOut + //! **[deduced]** Random access iterator to the output sequence. + //! + //! @tparam CompareOp + //! **[deduced]** Binary predicate to compare the input iterator's value types. Must have a + //! signature equivalent to `bool operator()(Key lhs, Key rhs)` and establish a [strict weak ordering]. + //! + //! @tparam EnvT + //! **[deduced]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! + //! @param[in] keys_in1 + //! Iterator to the beginning of the first sorted input sequence. + //! + //! @param[in] num_keys1 + //! Number of keys in the first input sequence. + //! + //! @param[in] keys_in2 + //! Iterator to the beginning of the second sorted input sequence. + //! + //! @param[in] num_keys2 + //! Number of keys in the second input sequence. + //! + //! @param[out] keys_out + //! Iterator to the beginning of the output sequence. + //! + //! @param[in] compare_op + //! Comparison function object, returning true if the first argument is ordered before the + //! second. Must establish a [strict weak ordering]. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + //! [strict weak ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order + template , + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t + && !::cuda::std::is_same_v + && !::cuda::std::is_arithmetic_v, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MergeKeys( + KeyIteratorIn1 keys_in1, + ::cuda::std::int64_t num_keys1, + KeyIteratorIn2 keys_in2, + ::cuda::std::int64_t num_keys2, + KeyIteratorOut keys_out, + CompareOp compare_op = {}, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceMerge::MergeKeys"); + + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) { + return detail::merge::dispatch( + d_temp_storage, + temp_storage_bytes, + keys_in1, + static_cast(nullptr), + num_keys1, + keys_in2, + static_cast(nullptr), + num_keys2, + keys_out, + static_cast(nullptr), + compare_op, + stream); + }); + } + //! @rst //! Overview //! +++++++++++++++++++++++++++++++++++++++++++++ @@ -191,6 +298,138 @@ struct DeviceMerge compare_op, stream); } + + //! @rst + //! Overview + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! Merges two sorted sequences of key-value pairs into a sorted output sequence. Merging is unstable, + //! which means any two equivalent values (neither value is ordered before the other) may be written to the output + //! sequence in any order. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! Snippet + //! +++++++++++++++++++++++++++++++++++++++++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_merge_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin merge-pairs-env + //! :end-before: example-end merge-pairs-env + //! + //! @endrst + //! + //! @tparam KeyIteratorIn1 + //! **[deduced]** Random access iterator to the keys of the first sorted input sequence. Must + //! have the same value type as KeyIteratorIn2. + //! + //! @tparam ValueIteratorIn1 + //! **[deduced]** Random access iterator to the values of the first sorted input sequence. + //! Must have the same value type as ValueIteratorIn2. + //! + //! @tparam KeyIteratorIn2 + //! **[deduced]** Random access iterator to the second sorted input sequence. Must have the + //! same value type as KeyIteratorIn1. + //! + //! @tparam ValueIteratorIn2 + //! **[deduced]** Random access iterator to the values of the second sorted input sequence. + //! Must have the same value type as ValueIteratorIn1. + //! + //! @tparam KeyIteratorOut + //! **[deduced]** Random access iterator to the keys of the output sequence. + //! + //! @tparam ValueIteratorOut + //! **[deduced]** Random access iterator to the values of the output sequence. + //! + //! @tparam CompareOp + //! **[deduced]** Binary predicate to compare the key input iterator's value types. Must have a + //! signature equivalent to `bool operator()(Key lhs, Key rhs)` and establish a [strict weak ordering]. + //! + //! @tparam EnvT + //! **[deduced]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! + //! @param[in] keys_in1 + //! Iterator to the beginning of the keys of the first sorted input sequence. + //! + //! @param[in] values_in1 + //! Iterator to the beginning of the values of the first sorted input sequence. + //! + //! @param[in] num_pairs1 + //! Number of key-value pairs in the first input sequence. + //! + //! @param[in] keys_in2 + //! Iterator to the beginning of the keys of the second sorted input sequence. + //! + //! @param[in] values_in2 + //! Iterator to the beginning of the values of the second sorted input sequence. + //! + //! @param[in] num_pairs2 + //! Number of key-value pairs in the second input sequence. + //! + //! @param[out] keys_out + //! Iterator to the beginning of the keys of the output sequence. + //! + //! @param[out] values_out + //! Iterator to the beginning of the values of the output sequence. + //! + //! @param[in] compare_op + //! Comparison function object, returning true if the first argument is ordered before the + //! second. Must establish a [strict weak ordering]. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + //! [strict weak ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order + template , + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t + && !::cuda::std::is_same_v + && !::cuda::std::is_arithmetic_v, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MergePairs( + KeyIteratorIn1 keys_in1, + ValueIteratorIn1 values_in1, + ::cuda::std::int64_t num_pairs1, + KeyIteratorIn2 keys_in2, + ValueIteratorIn2 values_in2, + ::cuda::std::int64_t num_pairs2, + KeyIteratorOut keys_out, + ValueIteratorOut values_out, + CompareOp compare_op = {}, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceMerge::MergePairs"); + + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) { + return detail::merge::dispatch( + d_temp_storage, + temp_storage_bytes, + keys_in1, + values_in1, + num_pairs1, + keys_in2, + values_in2, + num_pairs2, + keys_out, + values_out, + compare_op, + stream); + }); + } }; CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_device_merge_env.cu b/cub/test/catch2_test_device_merge_env.cu new file mode 100644 index 00000000000..dee1b9f2e61 --- /dev/null +++ b/cub/test/catch2_test_device_merge_env.cu @@ -0,0 +1,242 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Should precede any includes +struct stream_registry_factory_t; +#define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY stream_registry_factory_t + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include + +#include "catch2_test_env_launch_helper.h" + +DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergeKeys, merge_keys); +DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergePairs, merge_pairs); + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +#include + +#include + +namespace stdexec = cuda::std::execution; + +#if TEST_LAUNCH == 0 + +TEST_CASE("DeviceMerge::MergeKeys works with default environment", "[merge][device]") +{ + auto keys1 = c2h::device_vector{0, 2, 5}; + auto keys2 = c2h::device_vector{0, 3, 3, 4}; + auto result = c2h::device_vector(7); + + REQUIRE( + cudaSuccess + == cub::DeviceMerge::MergeKeys( + keys1.begin(), static_cast(keys1.size()), keys2.begin(), static_cast(keys2.size()), result.begin())); + + c2h::device_vector expected{0, 0, 2, 3, 3, 4, 5}; + REQUIRE(result == expected); +} + +TEST_CASE("DeviceMerge::MergePairs works with default environment", "[merge][device]") +{ + auto keys1 = c2h::device_vector{0, 2, 5}; + auto values1 = c2h::device_vector{'a', 'b', 'c'}; + auto keys2 = c2h::device_vector{0, 3, 3, 4}; + auto values2 = c2h::device_vector{'A', 'B', 'C', 'D'}; + + auto result_keys = c2h::device_vector(7); + auto result_values = c2h::device_vector(7); + + REQUIRE( + cudaSuccess + == cub::DeviceMerge::MergePairs( + keys1.begin(), + values1.begin(), + static_cast(keys1.size()), + keys2.begin(), + values2.begin(), + static_cast(keys2.size()), + result_keys.begin(), + result_values.begin())); + + c2h::device_vector expected_keys{0, 0, 2, 3, 3, 4, 5}; + c2h::device_vector expected_values{'a', 'A', 'b', 'B', 'C', 'D', 'c'}; + REQUIRE(result_keys == expected_keys); + REQUIRE(result_values == expected_values); +} + +#endif + +C2H_TEST("DeviceMerge::MergeKeys uses environment", "[merge][device]") +{ + auto keys1 = c2h::device_vector{0, 2, 5}; + auto keys2 = c2h::device_vector{0, 3, 3, 4}; + auto result = c2h::device_vector(7); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceMerge::MergeKeys( + nullptr, + expected_bytes_allocated, + keys1.begin(), + static_cast(keys1.size()), + keys2.begin(), + static_cast(keys2.size()), + result.begin())); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + merge_keys(keys1.begin(), + static_cast(keys1.size()), + keys2.begin(), + static_cast(keys2.size()), + result.begin(), + cuda::std::less<>{}, + env); + + c2h::device_vector expected{0, 0, 2, 3, 3, 4, 5}; + REQUIRE(result == expected); +} + +TEST_CASE("DeviceMerge::MergeKeys uses custom stream", "[merge][device]") +{ + auto keys1 = c2h::device_vector{0, 2, 5}; + auto keys2 = c2h::device_vector{0, 3, 3, 4}; + auto result = c2h::device_vector(7); + + cudaStream_t custom_stream; + REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream)); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceMerge::MergeKeys( + nullptr, + expected_bytes_allocated, + keys1.begin(), + static_cast(keys1.size()), + keys2.begin(), + static_cast(keys2.size()), + result.begin())); + + auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}}; + auto env = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)}; + + merge_keys(keys1.begin(), + static_cast(keys1.size()), + keys2.begin(), + static_cast(keys2.size()), + result.begin(), + cuda::std::less<>{}, + env); + + REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream)); + + c2h::device_vector expected{0, 0, 2, 3, 3, 4, 5}; + REQUIRE(result == expected); + + REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream)); +} + +C2H_TEST("DeviceMerge::MergePairs uses environment", "[merge][device]") +{ + auto keys1 = c2h::device_vector{0, 2, 5}; + auto values1 = c2h::device_vector{'a', 'b', 'c'}; + auto keys2 = c2h::device_vector{0, 3, 3, 4}; + auto values2 = c2h::device_vector{'A', 'B', 'C', 'D'}; + + auto result_keys = c2h::device_vector(7); + auto result_values = c2h::device_vector(7); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceMerge::MergePairs( + nullptr, + expected_bytes_allocated, + keys1.begin(), + values1.begin(), + static_cast(keys1.size()), + keys2.begin(), + values2.begin(), + static_cast(keys2.size()), + result_keys.begin(), + result_values.begin())); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + merge_pairs( + keys1.begin(), + values1.begin(), + static_cast(keys1.size()), + keys2.begin(), + values2.begin(), + static_cast(keys2.size()), + result_keys.begin(), + result_values.begin(), + cuda::std::less<>{}, + env); + + c2h::device_vector expected_keys{0, 0, 2, 3, 3, 4, 5}; + c2h::device_vector expected_values{'a', 'A', 'b', 'B', 'C', 'D', 'c'}; + REQUIRE(result_keys == expected_keys); + REQUIRE(result_values == expected_values); +} + +TEST_CASE("DeviceMerge::MergePairs uses custom stream", "[merge][device]") +{ + auto keys1 = c2h::device_vector{0, 2, 5}; + auto values1 = c2h::device_vector{'a', 'b', 'c'}; + auto keys2 = c2h::device_vector{0, 3, 3, 4}; + auto values2 = c2h::device_vector{'A', 'B', 'C', 'D'}; + + auto result_keys = c2h::device_vector(7); + auto result_values = c2h::device_vector(7); + + cudaStream_t custom_stream; + REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream)); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceMerge::MergePairs( + nullptr, + expected_bytes_allocated, + keys1.begin(), + values1.begin(), + static_cast(keys1.size()), + keys2.begin(), + values2.begin(), + static_cast(keys2.size()), + result_keys.begin(), + result_values.begin())); + + auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}}; + auto env = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)}; + + merge_pairs( + keys1.begin(), + values1.begin(), + static_cast(keys1.size()), + keys2.begin(), + values2.begin(), + static_cast(keys2.size()), + result_keys.begin(), + result_values.begin(), + cuda::std::less<>{}, + env); + + REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream)); + + c2h::device_vector expected_keys{0, 0, 2, 3, 3, 4, 5}; + c2h::device_vector expected_values{'a', 'A', 'b', 'B', 'C', 'D', 'c'}; + REQUIRE(result_keys == expected_keys); + REQUIRE(result_values == expected_values); + + REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream)); +} diff --git a/cub/test/catch2_test_device_merge_env_api.cu b/cub/test/catch2_test_device_merge_env_api.cu new file mode 100644 index 00000000000..6e75f2221fe --- /dev/null +++ b/cub/test/catch2_test_device_merge_env_api.cu @@ -0,0 +1,86 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include + +#include +#include + +#include + +#include + +C2H_TEST("cub::DeviceMerge::MergeKeys accepts env with stream", "[merge][env]") +{ + // example-begin merge-keys-env + auto keys1 = thrust::device_vector{0, 2, 5}; + auto keys2 = thrust::device_vector{0, 3, 3, 4}; + auto result = thrust::device_vector(7); + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceMerge::MergeKeys( + keys1.begin(), + static_cast(keys1.size()), + keys2.begin(), + static_cast(keys2.size()), + result.begin(), + cuda::std::less<>{}, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceMerge::MergeKeys failed with status: " << error << std::endl; + } + + thrust::device_vector expected{0, 0, 2, 3, 3, 4, 5}; + // example-end merge-keys-env + + REQUIRE(error == cudaSuccess); + REQUIRE(result == expected); +} + +C2H_TEST("cub::DeviceMerge::MergePairs accepts env with stream", "[merge][env]") +{ + // example-begin merge-pairs-env + auto keys1 = thrust::device_vector{0, 2, 5}; + auto values1 = thrust::device_vector{'a', 'b', 'c'}; + auto keys2 = thrust::device_vector{0, 3, 3, 4}; + auto values2 = thrust::device_vector{'A', 'B', 'C', 'D'}; + + auto result_keys = thrust::device_vector(7); + auto result_values = thrust::device_vector(7); + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceMerge::MergePairs( + keys1.begin(), + values1.begin(), + static_cast(keys1.size()), + keys2.begin(), + values2.begin(), + static_cast(keys2.size()), + result_keys.begin(), + result_values.begin(), + cuda::std::less<>{}, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceMerge::MergePairs failed with status: " << error << std::endl; + } + + thrust::device_vector expected_keys{0, 0, 2, 3, 3, 4, 5}; + thrust::device_vector expected_values{'a', 'A', 'b', 'B', 'C', 'D', 'c'}; + // example-end merge-pairs-env + + REQUIRE(error == cudaSuccess); + REQUIRE(result_keys == expected_keys); + REQUIRE(result_values == expected_values); +}