From 58698d034ea7396cf41dd3a642a35edaf80fba8b Mon Sep 17 00:00:00 2001 From: gonidelis Date: Tue, 10 Mar 2026 01:12:36 -0700 Subject: [PATCH 1/5] Add DeviceScan::*ByKey env algorithms --- cub/cub/device/device_scan.cuh | 443 ++++++++++++++++++ .../catch2_test_device_scan_by_key_env.cu | 210 +++++++++ .../catch2_test_device_scan_by_key_env_api.cu | 177 +++++++ 3 files changed, 830 insertions(+) create mode 100644 cub/test/catch2_test_device_scan_by_key_env.cu create mode 100644 cub/test/catch2_test_device_scan_by_key_env_api.cu diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 3368c52cc98..7ed8fd0833d 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -33,6 +33,9 @@ #include #include #include +#include +#include +#include CUB_NAMESPACE_BEGIN @@ -198,6 +201,46 @@ struct DeviceScan EnforceInclusive>(storage, bytes, d_in, d_out, scan_op, init, num_items, requested_determinism_t{}, stream); }); } + template + CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t scan_bykey_impl_env( + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + EqualityOpT equality_op, + ScanOpT scan_op, + InitValueT init_value, + NumItemsT num_items, + EnvT env) + { + // Dispatch with environment - handles all boilerplate + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { + using offset_t = detail::choose_offset_t; + return DispatchScanByKey< + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + EqualityOpT, + ScanOpT, + InitValueT, + offset_t>::Dispatch(storage, + bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + scan_op, + init_value, + static_cast(num_items), + stream); + }); + } //! @endcond //! @name Exclusive scans @@ -2368,6 +2411,406 @@ struct DeviceScan stream); } + //! @rst + //! Computes a device-wide exclusive prefix sum-by-key with key equality + //! defined by ``equality_op``. The value of ``0`` is applied as the initial + //! value, and is assigned to the beginning of each segment in ``d_values_out``. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! ``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! + //! Snippet + //! + //! The code snippet below illustrates the exclusive prefix sum-by-key of an ``int`` device vector. + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_by_key_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin exclusive-sum-by-key-env + //! :end-before: example-end exclusive-sum-by-key-env + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! + //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is `::cuda::std::execution::env<>`. + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! @param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) + //! + //! @param[in] equality_op + //! Binary functor that defines the equality of keys. + //! Default is cuda::std::equal_to<>{}. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is `::cuda::std::execution::env{}`. + //! @endrst + template , + typename NumItemsT = uint32_t, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t< + !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey( + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + NumItemsT num_items, + EqualityOpT equality_op = EqualityOpT(), + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveSumByKey"); + + using InitT = cub::detail::it_value_t; + InitT init_value{}; + + return scan_bykey_impl_env( + d_keys_in, d_values_in, d_values_out, equality_op, ::cuda::std::plus<>{}, init_value, num_items, env); + } + + //! @rst + //! Computes a device-wide exclusive prefix scan-by-key using the + //! specified binary associative ``scan_op`` functor. The key equality is defined by + //! ``equality_op``. The ``init_value`` value is applied as the initial + //! value, and is assigned to the beginning of each segment in ``d_values_out``. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! ``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! + //! Snippet + //! + //! The code snippet below illustrates the exclusive prefix scan-by-key of an ``int`` device vector. + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_by_key_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin exclusive-scan-by-key-env + //! :end-before: example-end exclusive-scan-by-key-env + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! + //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam ScanOpT + //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam InitValueT + //! **[inferred]** Type of the `init_value` + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is `::cuda::std::execution::env<>`. + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! @param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] scan_op + //! Binary associative scan functor + //! + //! @param[in] init_value + //! Initial value to seed the exclusive scan (and is assigned to the + //! beginning of each segment in `d_values_out`) + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) + //! + //! @param[in] equality_op + //! Binary functor that defines the equality of keys. + //! Default is cuda::std::equal_to<>{}. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is `::cuda::std::execution::env{}`. + //! @endrst + template , + typename NumItemsT = uint32_t, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t< + !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey( + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + ScanOpT scan_op, + InitValueT init_value, + NumItemsT num_items, + EqualityOpT equality_op = EqualityOpT(), + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveScanByKey"); + + return scan_bykey_impl_env(d_keys_in, d_values_in, d_values_out, equality_op, scan_op, init_value, num_items, env); + } + + //! @rst + //! Computes a device-wide inclusive prefix sum-by-key with key equality defined by ``equality_op``. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! - Supports non-commutative sum operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! ``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! + //! Snippet + //! + //! The code snippet below illustrates the inclusive prefix sum-by-key of an ``int`` device vector. + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_by_key_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin inclusive-sum-by-key-env + //! :end-before: example-end inclusive-sum-by-key-env + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! + //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is `::cuda::std::execution::env<>`. + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! @param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) + //! + //! @param[in] equality_op + //! Binary functor that defines the equality of keys. + //! Default is cuda::std::equal_to<>{}. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is `::cuda::std::execution::env{}`. + //! @endrst + template , + typename NumItemsT = uint32_t, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t< + !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey( + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + NumItemsT num_items, + EqualityOpT equality_op = EqualityOpT(), + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::InclusiveSumByKey"); + + return scan_bykey_impl_env( + d_keys_in, d_values_in, d_values_out, equality_op, ::cuda::std::plus<>{}, NullType{}, num_items, env); + } + + //! @rst + //! Computes a device-wide inclusive prefix scan-by-key using the + //! specified binary associative ``scan_op`` functor. The key equality is defined by ``equality_op``. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! - Supports non-commutative scan operators. + //! - Results are not deterministic for pseudo-associative operators (e.g., + //! addition of floating-point types). Results for pseudo-associative + //! operators may vary from run to run. Additional details can be found in + //! the @lookback description. + //! - ``d_keys_in`` may equal ``d_values_out`` but the range + //! ``[d_keys_in, d_keys_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! - ``d_values_in`` may equal ``d_values_out`` but the range + //! ``[d_values_in, d_values_in + num_items)`` and the range + //! ``[d_values_out, d_values_out + num_items)`` shall not overlap otherwise. + //! + //! Snippet + //! + //! The code snippet below illustrates the inclusive prefix scan-by-key of an ``int`` device vector. + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_scan_by_key_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin inclusive-scan-by-key-env + //! :end-before: example-end inclusive-scan-by-key-env + //! + //! @endrst + //! + //! @tparam KeysInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan keys inputs @iterator + //! + //! @tparam ValuesInputIteratorT + //! **[inferred]** Random-access input iterator type for reading scan values inputs @iterator + //! + //! @tparam ValuesOutputIteratorT + //! **[inferred]** Random-access output iterator type for writing scan values outputs @iterator + //! + //! @tparam ScanOpT + //! **[inferred]** Binary associative scan functor type having member `T operator()(const T &a, const T &b)` + //! + //! @tparam EqualityOpT + //! **[inferred]** Functor type having member + //! `T operator()(const T &a, const T &b)` for binary operations that defines the equality of keys + //! + //! @tparam NumItemsT + //! **[inferred]** An integral type representing the number of input elements + //! + //! @tparam EnvT + //! **[inferred]** Execution environment type. Default is `::cuda::std::execution::env<>`. + //! + //! @param[in] d_keys_in + //! Random-access input iterator to the input sequence of key items + //! + //! @param[in] d_values_in + //! Random-access input iterator to the input sequence of value items + //! + //! @param[out] d_values_out + //! Random-access output iterator to the output sequence of value items + //! + //! @param[in] scan_op + //! Binary associative scan functor + //! + //! @param[in] num_items + //! Total number of input items (i.e., the length of `d_keys_in` and `d_values_in`) + //! + //! @param[in] equality_op + //! Binary functor that defines the equality of keys. + //! Default is cuda::std::equal_to<>{}. + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is `::cuda::std::execution::env{}`. + //! @endrst + template , + typename NumItemsT = uint32_t, + typename EnvT = ::cuda::std::execution::env<>, + ::cuda::std::enable_if_t< + !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v, + int> = 0> + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey( + KeysInputIteratorT d_keys_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + ScanOpT scan_op, + NumItemsT num_items, + EqualityOpT equality_op = EqualityOpT(), + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::InclusiveScanByKey"); + + return scan_bykey_impl_env(d_keys_in, d_values_in, d_values_out, equality_op, scan_op, NullType{}, num_items, env); + } + //! @} }; diff --git a/cub/test/catch2_test_device_scan_by_key_env.cu b/cub/test/catch2_test_device_scan_by_key_env.cu new file mode 100644 index 00000000000..31930801dc8 --- /dev/null +++ b/cub/test/catch2_test_device_scan_by_key_env.cu @@ -0,0 +1,210 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Should precede any includes +struct stream_registry_factory_t; +#define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY stream_registry_factory_t + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include + +#include "catch2_test_env_launch_helper.h" + +DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::ExclusiveSumByKey, device_scan_exclusive_sum_by_key); +DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::ExclusiveScanByKey, device_scan_exclusive_scan_by_key); +DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::InclusiveSumByKey, device_scan_inclusive_sum_by_key); +DECLARE_LAUNCH_WRAPPER(cub::DeviceScan::InclusiveScanByKey, device_scan_inclusive_scan_by_key); + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +#include + +namespace stdexec = cuda::std::execution; + +#if TEST_LAUNCH == 0 + +TEST_CASE("Device scan exclusive-sum-by-key works with default environment", "[scan][by_key][device]") +{ + auto num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto d_out = thrust::device_vector(num_items); + + REQUIRE(cudaSuccess == cub::DeviceScan::ExclusiveSumByKey(d_keys.begin(), d_in.begin(), d_out.begin(), num_items)); + + thrust::device_vector expected{0, 8, 0, 7, 12, 0, 0}; + REQUIRE(d_out == expected); +} + +TEST_CASE("Device scan exclusive-scan-by-key works with default environment", "[scan][by_key][device]") +{ + auto num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto d_out = thrust::device_vector(num_items); + auto init = 0; + + REQUIRE(cudaSuccess + == cub::DeviceScan::ExclusiveScanByKey( + d_keys.begin(), d_in.begin(), d_out.begin(), cuda::std::plus{}, init, num_items)); + + thrust::device_vector expected{0, 8, 0, 7, 12, 0, 0}; + REQUIRE(d_out == expected); +} + +TEST_CASE("Device scan inclusive-sum-by-key works with default environment", "[scan][by_key][device]") +{ + auto num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto d_out = thrust::device_vector(num_items); + + REQUIRE(cudaSuccess == cub::DeviceScan::InclusiveSumByKey(d_keys.begin(), d_in.begin(), d_out.begin(), num_items)); + + thrust::device_vector expected{8, 14, 7, 12, 15, 0, 9}; + REQUIRE(d_out == expected); +} + +TEST_CASE("Device scan inclusive-scan-by-key works with default environment", "[scan][by_key][device]") +{ + auto num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto d_out = thrust::device_vector(num_items); + + REQUIRE( + cudaSuccess + == cub::DeviceScan::InclusiveScanByKey(d_keys.begin(), d_in.begin(), d_out.begin(), cuda::std::plus{}, num_items)); + + thrust::device_vector expected{8, 14, 7, 12, 15, 0, 9}; + REQUIRE(d_out == expected); +} + +#endif + +C2H_TEST("Device scan exclusive-sum-by-key uses environment", "[scan][by_key][device]") +{ + using num_items_t = int; + + num_items_t num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto d_out = thrust::device_vector(num_items); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceScan::ExclusiveSumByKey( + nullptr, + expected_bytes_allocated, + d_keys.begin(), + d_in.begin(), + d_out.begin(), + num_items, + cuda::std::equal_to<>{})); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + device_scan_exclusive_sum_by_key(d_keys.begin(), d_in.begin(), d_out.begin(), num_items, cuda::std::equal_to<>{}, env); + + thrust::device_vector expected{0.0f, 8.0f, 0.0f, 7.0f, 12.0f, 0.0f, 0.0f}; + REQUIRE(d_out == expected); +} + +C2H_TEST("Device scan exclusive-scan-by-key uses environment", "[scan][by_key][device]") +{ + using scan_op_t = cuda::std::plus<>; + using num_items_t = int; + + num_items_t num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto d_out = thrust::device_vector(num_items); + auto init = 0.0f; + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceScan::ExclusiveScanByKey( + nullptr, + expected_bytes_allocated, + d_keys.begin(), + d_in.begin(), + d_out.begin(), + scan_op_t{}, + init, + num_items, + cuda::std::equal_to<>{})); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + device_scan_exclusive_scan_by_key( + d_keys.begin(), d_in.begin(), d_out.begin(), scan_op_t{}, init, num_items, cuda::std::equal_to<>{}, env); + + thrust::device_vector expected{0.0f, 8.0f, 0.0f, 7.0f, 12.0f, 0.0f, 0.0f}; + REQUIRE(d_out == expected); +} + +C2H_TEST("Device scan inclusive-sum-by-key uses environment", "[scan][by_key][device]") +{ + using num_items_t = int; + + num_items_t num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto d_out = thrust::device_vector(num_items); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceScan::InclusiveSumByKey( + nullptr, + expected_bytes_allocated, + d_keys.begin(), + d_in.begin(), + d_out.begin(), + num_items, + cuda::std::equal_to<>{})); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + device_scan_inclusive_sum_by_key(d_keys.begin(), d_in.begin(), d_out.begin(), num_items, cuda::std::equal_to<>{}, env); + + thrust::device_vector expected{8.0f, 14.0f, 7.0f, 12.0f, 15.0f, 0.0f, 9.0f}; + REQUIRE(d_out == expected); +} + +C2H_TEST("Device scan inclusive-scan-by-key uses environment", "[scan][by_key][device]") +{ + using scan_op_t = cuda::std::plus<>; + using num_items_t = int; + + num_items_t num_items = 7; + auto d_keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto d_in = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto d_out = thrust::device_vector(num_items); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceScan::InclusiveScanByKey( + nullptr, + expected_bytes_allocated, + d_keys.begin(), + d_in.begin(), + d_out.begin(), + scan_op_t{}, + num_items, + cuda::std::equal_to<>{})); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + device_scan_inclusive_scan_by_key( + d_keys.begin(), d_in.begin(), d_out.begin(), scan_op_t{}, num_items, cuda::std::equal_to<>{}, env); + + thrust::device_vector expected{8.0f, 14.0f, 7.0f, 12.0f, 15.0f, 0.0f, 9.0f}; + REQUIRE(d_out == expected); +} diff --git a/cub/test/catch2_test_device_scan_by_key_env_api.cu b/cub/test/catch2_test_device_scan_by_key_env_api.cu new file mode 100644 index 00000000000..969c0fc0d15 --- /dev/null +++ b/cub/test/catch2_test_device_scan_by_key_env_api.cu @@ -0,0 +1,177 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include + +#include +#include + +#include + +#include + +C2H_TEST("cub::DeviceScan::ExclusiveSumByKey accepts environment", "[scan][by_key][env]") +{ + // example-begin exclusive-sum-by-key-env + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto output = thrust::device_vector(7); + + auto error = + cub::DeviceScan::ExclusiveSumByKey(keys.begin(), input.begin(), output.begin(), static_cast(input.size())); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::ExclusiveSumByKey failed with status: " << error << std::endl; + } + + thrust::device_vector expected{0, 8, 0, 7, 12, 0, 0}; + // example-end exclusive-sum-by-key-env + + REQUIRE(error == cudaSuccess); + REQUIRE(output == expected); +} + +C2H_TEST("cub::DeviceScan::ExclusiveSumByKey accepts stream environment", "[scan][by_key][env]") +{ + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto output = thrust::device_vector(7); + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceScan::ExclusiveSumByKey( + keys.begin(), input.begin(), output.begin(), static_cast(input.size()), cuda::std::equal_to<>{}, env); + REQUIRE(error == cudaSuccess); + + thrust::device_vector expected{0.0f, 8.0f, 0.0f, 7.0f, 12.0f, 0.0f, 0.0f}; + REQUIRE(output == expected); +} + +C2H_TEST("cub::DeviceScan::ExclusiveScanByKey accepts environment", "[scan][by_key][env]") +{ + // example-begin exclusive-scan-by-key-env + auto op = cuda::std::plus{}; + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto output = thrust::device_vector(7); + auto init = 0; + + auto error = cub::DeviceScan::ExclusiveScanByKey( + keys.begin(), input.begin(), output.begin(), op, init, static_cast(input.size())); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::ExclusiveScanByKey failed with status: " << error << std::endl; + } + + thrust::device_vector expected{0, 8, 0, 7, 12, 0, 0}; + // example-end exclusive-scan-by-key-env + + REQUIRE(error == cudaSuccess); + REQUIRE(output == expected); +} + +C2H_TEST("cub::DeviceScan::ExclusiveScanByKey accepts stream environment", "[scan][by_key][env]") +{ + auto op = cuda::std::plus{}; + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto output = thrust::device_vector(7); + auto init = 0.0f; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceScan::ExclusiveScanByKey( + keys.begin(), input.begin(), output.begin(), op, init, static_cast(input.size()), cuda::std::equal_to<>{}, env); + REQUIRE(error == cudaSuccess); + + thrust::device_vector expected{0.0f, 8.0f, 0.0f, 7.0f, 12.0f, 0.0f, 0.0f}; + REQUIRE(output == expected); +} + +C2H_TEST("cub::DeviceScan::InclusiveSumByKey accepts environment", "[scan][by_key][env]") +{ + // example-begin inclusive-sum-by-key-env + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto output = thrust::device_vector(7); + + auto error = + cub::DeviceScan::InclusiveSumByKey(keys.begin(), input.begin(), output.begin(), static_cast(input.size())); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::InclusiveSumByKey failed with status: " << error << std::endl; + } + + thrust::device_vector expected{8, 14, 7, 12, 15, 0, 9}; + // example-end inclusive-sum-by-key-env + + REQUIRE(error == cudaSuccess); + REQUIRE(output == expected); +} + +C2H_TEST("cub::DeviceScan::InclusiveSumByKey accepts stream environment", "[scan][by_key][env]") +{ + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto output = thrust::device_vector(7); + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceScan::InclusiveSumByKey( + keys.begin(), input.begin(), output.begin(), static_cast(input.size()), cuda::std::equal_to<>{}, env); + REQUIRE(error == cudaSuccess); + + thrust::device_vector expected{8.0f, 14.0f, 7.0f, 12.0f, 15.0f, 0.0f, 9.0f}; + REQUIRE(output == expected); +} + +C2H_TEST("cub::DeviceScan::InclusiveScanByKey accepts environment", "[scan][by_key][env]") +{ + // example-begin inclusive-scan-by-key-env + auto op = cuda::std::plus{}; + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; + auto output = thrust::device_vector(7); + + auto error = cub::DeviceScan::InclusiveScanByKey( + keys.begin(), input.begin(), output.begin(), op, static_cast(input.size())); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::InclusiveScanByKey failed with status: " << error << std::endl; + } + + thrust::device_vector expected{8, 14, 7, 12, 15, 0, 9}; + // example-end inclusive-scan-by-key-env + + REQUIRE(error == cudaSuccess); + REQUIRE(output == expected); +} + +C2H_TEST("cub::DeviceScan::InclusiveScanByKey accepts stream environment", "[scan][by_key][env]") +{ + auto op = cuda::std::plus{}; + auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; + auto input = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; + auto output = thrust::device_vector(7); + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceScan::InclusiveScanByKey( + keys.begin(), input.begin(), output.begin(), op, static_cast(input.size()), cuda::std::equal_to<>{}, env); + REQUIRE(error == cudaSuccess); + + thrust::device_vector expected{8.0f, 14.0f, 7.0f, 12.0f, 15.0f, 0.0f, 9.0f}; + REQUIRE(output == expected); +} From bf70f6b9d2035a8866ea84631b81050d3098bfba Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 11 Mar 2026 19:14:38 -0700 Subject: [PATCH 2/5] * Remove scan_by_key_impl_env helper, inline dispatch_with_env directly * Rename InitT to init_t in env overloads * Use plus/plus instead of plus<> to avoid integer promotion --- cub/cub/device/device_scan.cuh | 140 ++++++++++++++++++++------------- 1 file changed, 87 insertions(+), 53 deletions(-) diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 7ed8fd0833d..d01613a4978 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -1,5 +1,5 @@ // SPDX-FileCopyrightText: Copyright (c) 2011, Duane Merrill. All rights reserved. -// SPDX-FileCopyrightText: Copyright (c) 2011-2025, NVIDIA CORPORATION. All rights reserved. +// SPDX-FileCopyrightText: Copyright (c) 2011-2026, NVIDIA CORPORATION. All rights reserved. // SPDX-License-Identifier: BSD-3 //! @file @@ -187,7 +187,6 @@ struct DeviceScan static_assert(!is_determinism_required || is_safe_integral_op, "run_to_run or gpu_to_gpu is only supported for integral types with known operators"); - // Dispatch with environment - handles all boilerplate return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { using tuning_t = decltype(tuning); return scan_impl_determinism< @@ -201,46 +200,6 @@ struct DeviceScan EnforceInclusive>(storage, bytes, d_in, d_out, scan_op, init, num_items, requested_determinism_t{}, stream); }); } - template - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t scan_bykey_impl_env( - KeysInputIteratorT d_keys_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - EqualityOpT equality_op, - ScanOpT scan_op, - InitValueT init_value, - NumItemsT num_items, - EnvT env) - { - // Dispatch with environment - handles all boilerplate - return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { - using offset_t = detail::choose_offset_t; - return DispatchScanByKey< - KeysInputIteratorT, - ValuesInputIteratorT, - ValuesOutputIteratorT, - EqualityOpT, - ScanOpT, - InitValueT, - offset_t>::Dispatch(storage, - bytes, - d_keys_in, - d_values_in, - d_values_out, - equality_op, - scan_op, - init_value, - static_cast(num_items), - stream); - }); - } //! @endcond //! @name Exclusive scans @@ -430,10 +389,10 @@ struct DeviceScan { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveSum"); - using InitT = cub::detail::it_value_t; - InitT init_value{}; + using init_t = cub::detail::it_value_t; + init_t init_value{}; - return scan_impl_env(d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue(init_value), num_items, env); + return scan_impl_env(d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue(init_value), num_items, env); } //! @rst @@ -2501,11 +2460,29 @@ struct DeviceScan { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveSumByKey"); - using InitT = cub::detail::it_value_t; - InitT init_value{}; + using init_t = cub::detail::it_value_t; + init_t init_value{}; - return scan_bykey_impl_env( - d_keys_in, d_values_in, d_values_out, equality_op, ::cuda::std::plus<>{}, init_value, num_items, env); + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { + using offset_t = detail::choose_offset_t; + return DispatchScanByKey< + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + EqualityOpT, + ::cuda::std::plus, + init_t, + offset_t>::Dispatch(storage, + bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + ::cuda::std::plus{}, + init_value, + static_cast(num_items), + stream); + }); } //! @rst @@ -2616,7 +2593,26 @@ struct DeviceScan { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveScanByKey"); - return scan_bykey_impl_env(d_keys_in, d_values_in, d_values_out, equality_op, scan_op, init_value, num_items, env); + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { + using offset_t = detail::choose_offset_t; + return DispatchScanByKey< + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + EqualityOpT, + ScanOpT, + InitValueT, + offset_t>::Dispatch(storage, + bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + scan_op, + init_value, + static_cast(num_items), + stream); + }); } //! @rst @@ -2707,8 +2703,27 @@ struct DeviceScan { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::InclusiveSumByKey"); - return scan_bykey_impl_env( - d_keys_in, d_values_in, d_values_out, equality_op, ::cuda::std::plus<>{}, NullType{}, num_items, env); + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { + using offset_t = detail::choose_offset_t; + using value_t = cub::detail::it_value_t; + return DispatchScanByKey< + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + EqualityOpT, + ::cuda::std::plus, + NullType, + offset_t>::Dispatch(storage, + bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + ::cuda::std::plus{}, + NullType{}, + static_cast(num_items), + stream); + }); } //! @rst @@ -2808,7 +2823,26 @@ struct DeviceScan { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::InclusiveScanByKey"); - return scan_bykey_impl_env(d_keys_in, d_values_in, d_values_out, equality_op, scan_op, NullType{}, num_items, env); + return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { + using offset_t = detail::choose_offset_t; + return DispatchScanByKey< + KeysInputIteratorT, + ValuesInputIteratorT, + ValuesOutputIteratorT, + EqualityOpT, + ScanOpT, + NullType, + offset_t>::Dispatch(storage, + bytes, + d_keys_in, + d_values_in, + d_values_out, + equality_op, + scan_op, + NullType{}, + static_cast(num_items), + stream); + }); } //! @} From 3d4d2b4703ff2ffd0df3e7148da596a31ce805e8 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Wed, 11 Mar 2026 19:26:21 -0700 Subject: [PATCH 3/5] Fix api tests so that they use examples where env is passed epxlicitly with stream --- .../catch2_test_device_scan_by_key_env_api.cu | 44 ++++++++++++++----- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/cub/test/catch2_test_device_scan_by_key_env_api.cu b/cub/test/catch2_test_device_scan_by_key_env_api.cu index 969c0fc0d15..351776aafa3 100644 --- a/cub/test/catch2_test_device_scan_by_key_env_api.cu +++ b/cub/test/catch2_test_device_scan_by_key_env_api.cu @@ -16,7 +16,6 @@ C2H_TEST("cub::DeviceScan::ExclusiveSumByKey accepts environment", "[scan][by_key][env]") { - // example-begin exclusive-sum-by-key-env auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; auto output = thrust::device_vector(7); @@ -29,7 +28,6 @@ C2H_TEST("cub::DeviceScan::ExclusiveSumByKey accepts environment", "[scan][by_ke } thrust::device_vector expected{0, 8, 0, 7, 12, 0, 0}; - // example-end exclusive-sum-by-key-env REQUIRE(error == cudaSuccess); REQUIRE(output == expected); @@ -37,6 +35,7 @@ C2H_TEST("cub::DeviceScan::ExclusiveSumByKey accepts environment", "[scan][by_ke C2H_TEST("cub::DeviceScan::ExclusiveSumByKey accepts stream environment", "[scan][by_key][env]") { + // example-begin exclusive-sum-by-key-env auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; auto input = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; auto output = thrust::device_vector(7); @@ -47,15 +46,20 @@ C2H_TEST("cub::DeviceScan::ExclusiveSumByKey accepts stream environment", "[scan auto error = cub::DeviceScan::ExclusiveSumByKey( keys.begin(), input.begin(), output.begin(), static_cast(input.size()), cuda::std::equal_to<>{}, env); - REQUIRE(error == cudaSuccess); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::ExclusiveSumByKey failed with status: " << error << std::endl; + } thrust::device_vector expected{0.0f, 8.0f, 0.0f, 7.0f, 12.0f, 0.0f, 0.0f}; + // example-end exclusive-sum-by-key-env + + REQUIRE(error == cudaSuccess); REQUIRE(output == expected); } C2H_TEST("cub::DeviceScan::ExclusiveScanByKey accepts environment", "[scan][by_key][env]") { - // example-begin exclusive-scan-by-key-env auto op = cuda::std::plus{}; auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; @@ -70,7 +74,6 @@ C2H_TEST("cub::DeviceScan::ExclusiveScanByKey accepts environment", "[scan][by_k } thrust::device_vector expected{0, 8, 0, 7, 12, 0, 0}; - // example-end exclusive-scan-by-key-env REQUIRE(error == cudaSuccess); REQUIRE(output == expected); @@ -78,6 +81,7 @@ C2H_TEST("cub::DeviceScan::ExclusiveScanByKey accepts environment", "[scan][by_k C2H_TEST("cub::DeviceScan::ExclusiveScanByKey accepts stream environment", "[scan][by_key][env]") { + // example-begin exclusive-scan-by-key-env auto op = cuda::std::plus{}; auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; auto input = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; @@ -90,15 +94,20 @@ C2H_TEST("cub::DeviceScan::ExclusiveScanByKey accepts stream environment", "[sca auto error = cub::DeviceScan::ExclusiveScanByKey( keys.begin(), input.begin(), output.begin(), op, init, static_cast(input.size()), cuda::std::equal_to<>{}, env); - REQUIRE(error == cudaSuccess); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::ExclusiveScanByKey failed with status: " << error << std::endl; + } thrust::device_vector expected{0.0f, 8.0f, 0.0f, 7.0f, 12.0f, 0.0f, 0.0f}; + // example-end exclusive-scan-by-key-env + + REQUIRE(error == cudaSuccess); REQUIRE(output == expected); } C2H_TEST("cub::DeviceScan::InclusiveSumByKey accepts environment", "[scan][by_key][env]") { - // example-begin inclusive-sum-by-key-env auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; auto output = thrust::device_vector(7); @@ -111,7 +120,6 @@ C2H_TEST("cub::DeviceScan::InclusiveSumByKey accepts environment", "[scan][by_ke } thrust::device_vector expected{8, 14, 7, 12, 15, 0, 9}; - // example-end inclusive-sum-by-key-env REQUIRE(error == cudaSuccess); REQUIRE(output == expected); @@ -119,6 +127,7 @@ C2H_TEST("cub::DeviceScan::InclusiveSumByKey accepts environment", "[scan][by_ke C2H_TEST("cub::DeviceScan::InclusiveSumByKey accepts stream environment", "[scan][by_key][env]") { + // example-begin inclusive-sum-by-key-env auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; auto input = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; auto output = thrust::device_vector(7); @@ -129,15 +138,20 @@ C2H_TEST("cub::DeviceScan::InclusiveSumByKey accepts stream environment", "[scan auto error = cub::DeviceScan::InclusiveSumByKey( keys.begin(), input.begin(), output.begin(), static_cast(input.size()), cuda::std::equal_to<>{}, env); - REQUIRE(error == cudaSuccess); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::InclusiveSumByKey failed with status: " << error << std::endl; + } thrust::device_vector expected{8.0f, 14.0f, 7.0f, 12.0f, 15.0f, 0.0f, 9.0f}; + // example-end inclusive-sum-by-key-env + + REQUIRE(error == cudaSuccess); REQUIRE(output == expected); } C2H_TEST("cub::DeviceScan::InclusiveScanByKey accepts environment", "[scan][by_key][env]") { - // example-begin inclusive-scan-by-key-env auto op = cuda::std::plus{}; auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; @@ -151,7 +165,6 @@ C2H_TEST("cub::DeviceScan::InclusiveScanByKey accepts environment", "[scan][by_k } thrust::device_vector expected{8, 14, 7, 12, 15, 0, 9}; - // example-end inclusive-scan-by-key-env REQUIRE(error == cudaSuccess); REQUIRE(output == expected); @@ -159,6 +172,7 @@ C2H_TEST("cub::DeviceScan::InclusiveScanByKey accepts environment", "[scan][by_k C2H_TEST("cub::DeviceScan::InclusiveScanByKey accepts stream environment", "[scan][by_key][env]") { + // example-begin inclusive-scan-by-key-env auto op = cuda::std::plus{}; auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; auto input = thrust::device_vector{8.0f, 6.0f, 7.0f, 5.0f, 3.0f, 0.0f, 9.0f}; @@ -170,8 +184,14 @@ C2H_TEST("cub::DeviceScan::InclusiveScanByKey accepts stream environment", "[sca auto error = cub::DeviceScan::InclusiveScanByKey( keys.begin(), input.begin(), output.begin(), op, static_cast(input.size()), cuda::std::equal_to<>{}, env); - REQUIRE(error == cudaSuccess); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceScan::InclusiveScanByKey failed with status: " << error << std::endl; + } thrust::device_vector expected{8.0f, 14.0f, 7.0f, 12.0f, 15.0f, 0.0f, 9.0f}; + // example-end inclusive-scan-by-key-env + + REQUIRE(error == cudaSuccess); REQUIRE(output == expected); } From 5944679d8e052ea68f701c49dcde777fbd1a0f3f Mon Sep 17 00:00:00 2001 From: gonidelis Date: Thu, 12 Mar 2026 16:03:43 -0700 Subject: [PATCH 4/5] Improve sfinae more to conform to other elv algorithms enableifs --- cub/cub/device/device_scan.cuh | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index d01613a4978..6a1e6577ade 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -2448,7 +2448,8 @@ struct DeviceScan typename NumItemsT = uint32_t, typename EnvT = ::cuda::std::execution::env<>, ::cuda::std::enable_if_t< - !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v, + !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v + && !::cuda::std::is_same_v, int> = 0> [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveSumByKey( KeysInputIteratorT d_keys_in, @@ -2470,7 +2471,7 @@ struct DeviceScan ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT, - ::cuda::std::plus, + ::cuda::std::plus<>, init_t, offset_t>::Dispatch(storage, bytes, @@ -2478,7 +2479,7 @@ struct DeviceScan d_values_in, d_values_out, equality_op, - ::cuda::std::plus{}, + ::cuda::std::plus<>{}, init_value, static_cast(num_items), stream); @@ -2579,7 +2580,8 @@ struct DeviceScan typename NumItemsT = uint32_t, typename EnvT = ::cuda::std::execution::env<>, ::cuda::std::enable_if_t< - !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v, + !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v + && !::cuda::std::is_same_v, int> = 0> [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t ExclusiveScanByKey( KeysInputIteratorT d_keys_in, @@ -2691,7 +2693,8 @@ struct DeviceScan typename NumItemsT = uint32_t, typename EnvT = ::cuda::std::execution::env<>, ::cuda::std::enable_if_t< - !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v, + !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v + && !::cuda::std::is_same_v, int> = 0> [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t InclusiveSumByKey( KeysInputIteratorT d_keys_in, @@ -2705,13 +2708,12 @@ struct DeviceScan return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { using offset_t = detail::choose_offset_t; - using value_t = cub::detail::it_value_t; return DispatchScanByKey< KeysInputIteratorT, ValuesInputIteratorT, ValuesOutputIteratorT, EqualityOpT, - ::cuda::std::plus, + ::cuda::std::plus<>, NullType, offset_t>::Dispatch(storage, bytes, @@ -2719,7 +2721,7 @@ struct DeviceScan d_values_in, d_values_out, equality_op, - ::cuda::std::plus{}, + ::cuda::std::plus<>{}, NullType{}, static_cast(num_items), stream); @@ -2810,7 +2812,8 @@ struct DeviceScan typename NumItemsT = uint32_t, typename EnvT = ::cuda::std::execution::env<>, ::cuda::std::enable_if_t< - !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v, + !::cuda::std::is_same_v && !::cuda::std::is_null_pointer_v + && !::cuda::std::is_same_v, int> = 0> [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t InclusiveScanByKey( KeysInputIteratorT d_keys_in, From 6e7d15c7a564e14f0faa64ada3ed03dae055a538 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Fri, 13 Mar 2026 19:11:51 -0700 Subject: [PATCH 5/5] Remove misplaced test from api examples --- .../catch2_test_device_scan_by_key_env_api.cu | 79 ------------------- 1 file changed, 79 deletions(-) diff --git a/cub/test/catch2_test_device_scan_by_key_env_api.cu b/cub/test/catch2_test_device_scan_by_key_env_api.cu index 351776aafa3..a4f08475894 100644 --- a/cub/test/catch2_test_device_scan_by_key_env_api.cu +++ b/cub/test/catch2_test_device_scan_by_key_env_api.cu @@ -14,25 +14,6 @@ #include -C2H_TEST("cub::DeviceScan::ExclusiveSumByKey accepts environment", "[scan][by_key][env]") -{ - auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; - auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; - auto output = thrust::device_vector(7); - - auto error = - cub::DeviceScan::ExclusiveSumByKey(keys.begin(), input.begin(), output.begin(), static_cast(input.size())); - if (error != cudaSuccess) - { - std::cerr << "cub::DeviceScan::ExclusiveSumByKey failed with status: " << error << std::endl; - } - - thrust::device_vector expected{0, 8, 0, 7, 12, 0, 0}; - - REQUIRE(error == cudaSuccess); - REQUIRE(output == expected); -} - C2H_TEST("cub::DeviceScan::ExclusiveSumByKey accepts stream environment", "[scan][by_key][env]") { // example-begin exclusive-sum-by-key-env @@ -58,27 +39,6 @@ C2H_TEST("cub::DeviceScan::ExclusiveSumByKey accepts stream environment", "[scan REQUIRE(output == expected); } -C2H_TEST("cub::DeviceScan::ExclusiveScanByKey accepts environment", "[scan][by_key][env]") -{ - auto op = cuda::std::plus{}; - auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; - auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; - auto output = thrust::device_vector(7); - auto init = 0; - - auto error = cub::DeviceScan::ExclusiveScanByKey( - keys.begin(), input.begin(), output.begin(), op, init, static_cast(input.size())); - if (error != cudaSuccess) - { - std::cerr << "cub::DeviceScan::ExclusiveScanByKey failed with status: " << error << std::endl; - } - - thrust::device_vector expected{0, 8, 0, 7, 12, 0, 0}; - - REQUIRE(error == cudaSuccess); - REQUIRE(output == expected); -} - C2H_TEST("cub::DeviceScan::ExclusiveScanByKey accepts stream environment", "[scan][by_key][env]") { // example-begin exclusive-scan-by-key-env @@ -106,25 +66,6 @@ C2H_TEST("cub::DeviceScan::ExclusiveScanByKey accepts stream environment", "[sca REQUIRE(output == expected); } -C2H_TEST("cub::DeviceScan::InclusiveSumByKey accepts environment", "[scan][by_key][env]") -{ - auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; - auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; - auto output = thrust::device_vector(7); - - auto error = - cub::DeviceScan::InclusiveSumByKey(keys.begin(), input.begin(), output.begin(), static_cast(input.size())); - if (error != cudaSuccess) - { - std::cerr << "cub::DeviceScan::InclusiveSumByKey failed with status: " << error << std::endl; - } - - thrust::device_vector expected{8, 14, 7, 12, 15, 0, 9}; - - REQUIRE(error == cudaSuccess); - REQUIRE(output == expected); -} - C2H_TEST("cub::DeviceScan::InclusiveSumByKey accepts stream environment", "[scan][by_key][env]") { // example-begin inclusive-sum-by-key-env @@ -150,26 +91,6 @@ C2H_TEST("cub::DeviceScan::InclusiveSumByKey accepts stream environment", "[scan REQUIRE(output == expected); } -C2H_TEST("cub::DeviceScan::InclusiveScanByKey accepts environment", "[scan][by_key][env]") -{ - auto op = cuda::std::plus{}; - auto keys = thrust::device_vector{0, 0, 1, 1, 1, 2, 2}; - auto input = thrust::device_vector{8, 6, 7, 5, 3, 0, 9}; - auto output = thrust::device_vector(7); - - auto error = cub::DeviceScan::InclusiveScanByKey( - keys.begin(), input.begin(), output.begin(), op, static_cast(input.size())); - if (error != cudaSuccess) - { - std::cerr << "cub::DeviceScan::InclusiveScanByKey failed with status: " << error << std::endl; - } - - thrust::device_vector expected{8, 14, 7, 12, 15, 0, 9}; - - REQUIRE(error == cudaSuccess); - REQUIRE(output == expected); -} - C2H_TEST("cub::DeviceScan::InclusiveScanByKey accepts stream environment", "[scan][by_key][env]") { // example-begin inclusive-scan-by-key-env