diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 3368c52cc98..d01613a4978 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -1,5 +1,5 @@ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved. -// SPDX-FileCopyrightText: Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 //! @file @@ -33,6 +33,9 @@ #include #include #include +#include +#include +#include CUB_NAMESPACE_BEGIN @@ -184,7 +187,6 @@ struct DeviceScan static_assert(!is_determinism_required || is_safe_integral_op, "run_to_run or gpu_to_gpu is only supported for integral types with known operators"); - // Dispatch with environment - handles all boilerplate return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { using tuning_t = decltype(tuning); return scan_impl_determinism< @@ -387,10 +389,10 @@ struct DeviceScan { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveSum"); - using InitT = cub::detail::it_value_t; - InitT init_value{}; + using init_t = cub::detail::it_value_t; + init_t init_value{}; - return scan_impl_env(d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue(init_value), num_items, env); + return scan_impl_env(d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue(init_value), num_items, env); } //! @rst @@ -2368,6 +2370,481 @@ struct DeviceScan stream); } + //! @rst + //! Computes a device-wide exclusive prefix sum-by-key with key equality + //! defined by ``equality_op``. The value of ``0`` is applied as the initial + //! value, and is assigned to the beginning of each segment in ``d_values_out``. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! ``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! + //! Snippet + //! + //! The code snippet below illustrates the exclusive prefix sum-by-key of an ``int`` device vector. + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_by_key_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin exclusive-sum-by-key-env + //! :end-before: example-end exclusive-sum-by-key-env + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! + //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is `::cuda::std::execution::env<>`. + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! @param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) + //! + //! @param[in] equality_op + //! Binary functor that defines the equality of keys. + //! Default is cuda::std::equal_to<>{}. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is `::cuda::std::execution::env{}`. + //! @endrst + template , + typename NumItemsT = uint32_t, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t< + !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey( + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + NumItemsT num_items, + EqualityOpT equality_op = EqualityOpT(), + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveSumByKey"); + + using init_t = cub::detail::it_value_t; + init_t init_value{}; + + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { + using offset_t = detail::choose_offset_t; + return DispatchScanByKey< + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + EqualityOpT, + ::cuda::std::plus, + init_t, + offset_t>::Dispatch(storage, + bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + ::cuda::std::plus{}, + init_value, + static_cast(num_items), + stream); + }); + } + + //! @rst + //! Computes a device-wide exclusive prefix scan-by-key using the + //! specified binary associative ``scan_op`` functor. The key equality is defined by + //! ``equality_op``. The ``init_value`` value is applied as the initial + //! value, and is assigned to the beginning of each segment in ``d_values_out``. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! ``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! + //! Snippet + //! + //! The code snippet below illustrates the exclusive prefix scan-by-key of an ``int`` device vector. + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_by_key_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin exclusive-scan-by-key-env + //! :end-before: example-end exclusive-scan-by-key-env + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! + //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam ScanOpT + //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam InitValueT + //! **[inferred]** Type of the `init_value` + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is `::cuda::std::execution::env<>`. + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! @param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] scan_op + //! Binary associative scan functor + //! + //! @param[in] init_value + //! Initial value to seed the exclusive scan (and is assigned to the + //! beginning of each segment in `d_values_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) + //! + //! @param[in] equality_op + //! Binary functor that defines the equality of keys. + //! Default is cuda::std::equal_to<>{}. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is `::cuda::std::execution::env{}`. + //! @endrst + template , + typename NumItemsT = uint32_t, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t< + !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey( + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + ScanOpT scan_op, + InitValueT init_value, + NumItemsT num_items, + EqualityOpT equality_op = EqualityOpT(), + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveScanByKey"); + + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { + using offset_t = detail::choose_offset_t; + return DispatchScanByKey< + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + EqualityOpT, + ScanOpT, + InitValueT, + offset_t>::Dispatch(storage, + bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + scan_op, + init_value, + static_cast(num_items), + stream); + }); + } + + //! @rst + //! Computes a device-wide inclusive prefix sum-by-key with key equality defined by ``equality_op``. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! ``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! + //! Snippet + //! + //! The code snippet below illustrates the inclusive prefix sum-by-key of an ``int`` device vector. + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_by_key_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin inclusive-sum-by-key-env + //! :end-before: example-end inclusive-sum-by-key-env + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! + //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is `::cuda::std::execution::env<>`. + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! @param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) + //! + //! @param[in] equality_op + //! Binary functor that defines the equality of keys. + //! Default is cuda::std::equal_to<>{}. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is `::cuda::std::execution::env{}`. + //! @endrst + template , + typename NumItemsT = uint32_t, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t< + !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey( + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + NumItemsT num_items, + EqualityOpT equality_op = EqualityOpT(), + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::InclusiveSumByKey"); + + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { + using offset_t = detail::choose_offset_t; + using value_t = cub::detail::it_value_t; + return DispatchScanByKey< + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + EqualityOpT, + ::cuda::std::plus, + NullType, + offset_t>::Dispatch(storage, + bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + ::cuda::std::plus{}, + NullType{}, + static_cast(num_items), + stream); + }); + } + + //! @rst + //! Computes a device-wide inclusive prefix scan-by-key using the + //! specified binary associative ``scan_op`` functor. The key equality is defined by ``equality_op``. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! ``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! + //! Snippet + //! + //! The code snippet below illustrates the inclusive prefix scan-by-key of an ``int`` device vector. + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_by_key_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin inclusive-scan-by-key-env + //! :end-before: example-end inclusive-scan-by-key-env + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! + //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam ScanOpT + //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is `::cuda::std::execution::env<>`. + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! @param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] scan_op + //! Binary associative scan functor + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) + //! + //! @param[in] equality_op + //! Binary functor that defines the equality of keys. + //! Default is cuda::std::equal_to<>{}. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is `::cuda::std::execution::env{}`. + //! @endrst + template , + typename NumItemsT = uint32_t, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t< + !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey( + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + ScanOpT scan_op, + NumItemsT num_items, + EqualityOpT equality_op = EqualityOpT(), + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::InclusiveScanByKey"); + + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { + using offset_t = detail::choose_offset_t; + return DispatchScanByKey< + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + EqualityOpT, + ScanOpT, + NullType, + offset_t>::Dispatch(storage, + bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + scan_op, + NullType{}, + static_cast(num_items), + stream); + }); + } + //! @} }; diff --git a/cub/test/catch2_test_device_scan_by_key_env.cu b/cub/test/catch2_test_device_scan_by_key_env.cu new file mode 100644 index 00000000000..31930801dc8 --- /dev/null +++ b/cub/test/catch2_test_device_scan_by_key_env.cu @@ -0,0 +1,210 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Should precede any includes +struct stream_registry_factory_t; +#define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY stream_registry_factory_t + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include + +#include "catch2_test_env_launch_helper.h" + +DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::ExclusiveSumByKey, device_scan_exclusive_sum_by_key); +DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::ExclusiveScanByKey, device_scan_exclusive_scan_by_key); +DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::InclusiveSumByKey, device_scan_inclusive_sum_by_key); +DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::InclusiveScanByKey, device_scan_inclusive_scan_by_key); + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +#include + +namespace stdexec = cuda::std::execution; + +#if TEST_LAUNCH == 0 + +TEST_CASE("Device scan exclusive-sum-by-key works with default environment", "[scan][by_key][device]") +{ + auto num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto d_out = thrust::device_vector(num_items); + + REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveSumByKey(d_keys.begin(), d_in.begin(), d_out.begin(), num_items)); + + thrust::device_vector expected{0, 8, 0, 7, 12, 0, 0}; + REQUIRE(d_out == expected); +} + +TEST_CASE("Device scan exclusive-scan-by-key works with default environment", "[scan][by_key][device]") +{ + auto num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto d_out = thrust::device_vector(num_items); + auto init = 0; + + REQUIRE(cudaSuccess + == cub::DeviceScan::ExclusiveScanByKey( + d_keys.begin(), d_in.begin(), d_out.begin(), cuda::std::plus{}, init, num_items)); + + thrust::device_vector expected{0, 8, 0, 7, 12, 0, 0}; + REQUIRE(d_out == expected); +} + +TEST_CASE("Device scan inclusive-sum-by-key works with default environment", "[scan][by_key][device]") +{ + auto num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto d_out = thrust::device_vector(num_items); + + REQUIRE(cudaSuccess == cub::DeviceScan::InclusiveSumByKey(d_keys.begin(), d_in.begin(), d_out.begin(), num_items)); + + thrust::device_vector expected{8, 14, 7, 12, 15, 0, 9}; + REQUIRE(d_out == expected); +} + +TEST_CASE("Device scan inclusive-scan-by-key works with default environment", "[scan][by_key][device]") +{ + auto num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto d_out = thrust::device_vector(num_items); + + REQUIRE( + cudaSuccess + == cub::DeviceScan::InclusiveScanByKey(d_keys.begin(), d_in.begin(), d_out.begin(), cuda::std::plus{}, num_items)); + + thrust::device_vector expected{8, 14, 7, 12, 15, 0, 9}; + REQUIRE(d_out == expected); +} + +#endif + +C2H_TEST("Device scan exclusive-sum-by-key uses environment", "[scan][by_key][device]") +{ + using num_items_t = int; + + num_items_t num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto d_out = thrust::device_vector(num_items); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceScan::ExclusiveSumByKey( + nullptr, + expected_bytes_allocated, + d_keys.begin(), + d_in.begin(), + d_out.begin(), + num_items, + cuda::std::equal_to<>{})); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + device_scan_exclusive_sum_by_key(d_keys.begin(), d_in.begin(), d_out.begin(), num_items, cuda::std::equal_to<>{}, env); + + thrust::device_vector expected{0.0f, 8.0f, 0.0f, 7.0f, 12.0f, 0.0f, 0.0f}; + REQUIRE(d_out == expected); +} + +C2H_TEST("Device scan exclusive-scan-by-key uses environment", "[scan][by_key][device]") +{ + using scan_op_t = cuda::std::plus<>; + using num_items_t = int; + + num_items_t num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto d_out = thrust::device_vector(num_items); + auto init = 0.0f; + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceScan::ExclusiveScanByKey( + nullptr, + expected_bytes_allocated, + d_keys.begin(), + d_in.begin(), + d_out.begin(), + scan_op_t{}, + init, + num_items, + cuda::std::equal_to<>{})); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + device_scan_exclusive_scan_by_key( + d_keys.begin(), d_in.begin(), d_out.begin(), scan_op_t{}, init, num_items, cuda::std::equal_to<>{}, env); + + thrust::device_vector expected{0.0f, 8.0f, 0.0f, 7.0f, 12.0f, 0.0f, 0.0f}; + REQUIRE(d_out == expected); +} + +C2H_TEST("Device scan inclusive-sum-by-key uses environment", "[scan][by_key][device]") +{ + using num_items_t = int; + + num_items_t num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto d_out = thrust::device_vector(num_items); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceScan::InclusiveSumByKey( + nullptr, + expected_bytes_allocated, + d_keys.begin(), + d_in.begin(), + d_out.begin(), + num_items, + cuda::std::equal_to<>{})); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + device_scan_inclusive_sum_by_key(d_keys.begin(), d_in.begin(), d_out.begin(), num_items, cuda::std::equal_to<>{}, env); + + thrust::device_vector expected{8.0f, 14.0f, 7.0f, 12.0f, 15.0f, 0.0f, 9.0f}; + REQUIRE(d_out == expected); +} + +C2H_TEST("Device scan inclusive-scan-by-key uses environment", "[scan][by_key][device]") +{ + using scan_op_t = cuda::std::plus<>; + using num_items_t = int; + + num_items_t num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto d_out = thrust::device_vector(num_items); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceScan::InclusiveScanByKey( + nullptr, + expected_bytes_allocated, + d_keys.begin(), + d_in.begin(), + d_out.begin(), + scan_op_t{}, + num_items, + cuda::std::equal_to<>{})); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + device_scan_inclusive_scan_by_key( + d_keys.begin(), d_in.begin(), d_out.begin(), scan_op_t{}, num_items, cuda::std::equal_to<>{}, env); + + thrust::device_vector expected{8.0f, 14.0f, 7.0f, 12.0f, 15.0f, 0.0f, 9.0f}; + REQUIRE(d_out == expected); +} diff --git a/cub/test/catch2_test_device_scan_by_key_env_api.cu b/cub/test/catch2_test_device_scan_by_key_env_api.cu new file mode 100644 index 00000000000..351776aafa3 --- /dev/null +++ b/cub/test/catch2_test_device_scan_by_key_env_api.cu @@ -0,0 +1,197 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include + +#include +#include + +#include + +#include + +C2H_TEST("cub::DeviceScan::ExclusiveSumByKey accepts environment", "[scan][by_key][env]") +{ + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto output = thrust::device_vector(7); + + auto error = + cub::DeviceScan::ExclusiveSumByKey(keys.begin(), input.begin(), output.begin(), static_cast(input.size())); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::ExclusiveSumByKey failed with status: " << error << std::endl; + } + + thrust::device_vector expected{0, 8, 0, 7, 12, 0, 0}; + + REQUIRE(error == cudaSuccess); + REQUIRE(output == expected); +} + +C2H_TEST("cub::DeviceScan::ExclusiveSumByKey accepts stream environment", "[scan][by_key][env]") +{ + // example-begin exclusive-sum-by-key-env + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto output = thrust::device_vector(7); + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceScan::ExclusiveSumByKey( + keys.begin(), input.begin(), output.begin(), static_cast(input.size()), cuda::std::equal_to<>{}, env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::ExclusiveSumByKey failed with status: " << error << std::endl; + } + + thrust::device_vector expected{0.0f, 8.0f, 0.0f, 7.0f, 12.0f, 0.0f, 0.0f}; + // example-end exclusive-sum-by-key-env + + REQUIRE(error == cudaSuccess); + REQUIRE(output == expected); +} + +C2H_TEST("cub::DeviceScan::ExclusiveScanByKey accepts environment", "[scan][by_key][env]") +{ + auto op = cuda::std::plus{}; + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto output = thrust::device_vector(7); + auto init = 0; + + auto error = cub::DeviceScan::ExclusiveScanByKey( + keys.begin(), input.begin(), output.begin(), op, init, static_cast(input.size())); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::ExclusiveScanByKey failed with status: " << error << std::endl; + } + + thrust::device_vector expected{0, 8, 0, 7, 12, 0, 0}; + + REQUIRE(error == cudaSuccess); + REQUIRE(output == expected); +} + +C2H_TEST("cub::DeviceScan::ExclusiveScanByKey accepts stream environment", "[scan][by_key][env]") +{ + // example-begin exclusive-scan-by-key-env + auto op = cuda::std::plus{}; + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto output = thrust::device_vector(7); + auto init = 0.0f; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceScan::ExclusiveScanByKey( + keys.begin(), input.begin(), output.begin(), op, init, static_cast(input.size()), cuda::std::equal_to<>{}, env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::ExclusiveScanByKey failed with status: " << error << std::endl; + } + + thrust::device_vector expected{0.0f, 8.0f, 0.0f, 7.0f, 12.0f, 0.0f, 0.0f}; + // example-end exclusive-scan-by-key-env + + REQUIRE(error == cudaSuccess); + REQUIRE(output == expected); +} + +C2H_TEST("cub::DeviceScan::InclusiveSumByKey accepts environment", "[scan][by_key][env]") +{ + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto output = thrust::device_vector(7); + + auto error = + cub::DeviceScan::InclusiveSumByKey(keys.begin(), input.begin(), output.begin(), static_cast(input.size())); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::InclusiveSumByKey failed with status: " << error << std::endl; + } + + thrust::device_vector expected{8, 14, 7, 12, 15, 0, 9}; + + REQUIRE(error == cudaSuccess); + REQUIRE(output == expected); +} + +C2H_TEST("cub::DeviceScan::InclusiveSumByKey accepts stream environment", "[scan][by_key][env]") +{ + // example-begin inclusive-sum-by-key-env + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto output = thrust::device_vector(7); + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceScan::InclusiveSumByKey( + keys.begin(), input.begin(), output.begin(), static_cast(input.size()), cuda::std::equal_to<>{}, env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::InclusiveSumByKey failed with status: " << error << std::endl; + } + + thrust::device_vector expected{8.0f, 14.0f, 7.0f, 12.0f, 15.0f, 0.0f, 9.0f}; + // example-end inclusive-sum-by-key-env + + REQUIRE(error == cudaSuccess); + REQUIRE(output == expected); +} + +C2H_TEST("cub::DeviceScan::InclusiveScanByKey accepts environment", "[scan][by_key][env]") +{ + auto op = cuda::std::plus{}; + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto output = thrust::device_vector(7); + + auto error = cub::DeviceScan::InclusiveScanByKey( + keys.begin(), input.begin(), output.begin(), op, static_cast(input.size())); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::InclusiveScanByKey failed with status: " << error << std::endl; + } + + thrust::device_vector expected{8, 14, 7, 12, 15, 0, 9}; + + REQUIRE(error == cudaSuccess); + REQUIRE(output == expected); +} + +C2H_TEST("cub::DeviceScan::InclusiveScanByKey accepts stream environment", "[scan][by_key][env]") +{ + // example-begin inclusive-scan-by-key-env + auto op = cuda::std::plus{}; + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto output = thrust::device_vector(7); + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceScan::InclusiveScanByKey( + keys.begin(), input.begin(), output.begin(), op, static_cast(input.size()), cuda::std::equal_to<>{}, env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::InclusiveScanByKey failed with status: " << error << std::endl; + } + + thrust::device_vector expected{8.0f, 14.0f, 7.0f, 12.0f, 15.0f, 0.0f, 9.0f}; + // example-end inclusive-scan-by-key-env + + REQUIRE(error == cudaSuccess); + REQUIRE(output == expected); +}