diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index 9e3f114ade3..4436ae163ec 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -18,8 +18,10 @@ # pragma system_header #endif // no system header +#include #include +#include #include #include #include @@ -1504,6 +1506,971 @@ public: } //@} + + //! @name Environment-based overloads + //! @{ + + //! @rst + //! Computes an intensity histogram from a sequence of data samples using equal-width bins. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! - The number of histogram bins is (``num_levels - 1``) + //! - All bins comprise the same width of sample values: ``(upper_level - lower_level) / (num_levels - 1)``. + //! - If the common type of ``SampleT`` and ``LevelT`` is of integral type, the bin for a sample is + //! computed as ``(sample - lower_level) * (num_levels - 1) / (upper_level - lower_level)``, round + //! down to the nearest whole number. To protect against potential overflows, if the product + //! ``(upper_level - lower_level) * (num_levels - 1)`` exceeds the number representable by an + //! ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128 + //! bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only + //! be returned if bin computation would overflow for 128-bit arithmetic. + //! - The ranges ``[d_samples, d_samples + num_samples)`` and + //! ``[d_histogram, d_histogram + num_levels - 1)`` shall not overlap in any way. + //! - ``cuda::std::common_type`` must be valid, and both LevelT and SampleT must be valid + //! arithmetic types. The common type must be convertible to ``int`` and trivially copyable. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_histogram_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin histogram-even-env + //! :end-before: example-end histogram-even-env + //! + //! @endrst + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading input samples @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. + //! + //! @tparam EnvT + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! + //! @param[in] d_samples + //! The pointer to the input sequence of data samples. + //! + //! @param[out] d_histogram + //! The pointer to the histogram counter output array of length `num_levels - 1`. + //! + //! @param[in] num_levels + //! The number of boundaries (levels) for delineating histogram samples. + //! Implies that the number of bins is `num_levels - 1`. + //! + //! @param[in] lower_level + //! The lower sample value bound (inclusive) for the lowest histogram bin. + //! + //! @param[in] upper_level + //! The upper sample value bound (exclusive) for the highest histogram bin. + //! + //! @param[in] num_samples + //! The number of input samples (i.e., the length of `d_samples`) + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template > + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( + SampleIteratorT d_samples, + CounterT* d_histogram, + int num_levels, + LevelT lower_level, + LevelT upper_level, + OffsetT num_samples, + EnvT env = {}) + { + using SampleT = cub::detail::it_value_t; + return MultiHistogramEven<1, 1>( + d_samples, + ::cuda::std::array{d_histogram}, + ::cuda::std::array{num_levels}, + ::cuda::std::array{lower_level}, + ::cuda::std::array{upper_level}, + num_samples, + static_cast(1), + sizeof(SampleT) * num_samples, + env); + } + + //! @rst + //! Computes an intensity histogram from a 2D region of data samples using equal-width bins. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! - A two-dimensional *region of interest* within ``d_samples`` can be specified using + //! the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters. + //! - The row stride must be a whole multiple of the sample data type + //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``. + //! - The number of histogram bins is (``num_levels - 1``) + //! - All bins comprise the same width of sample values: ``(upper_level - lower_level) / (num_levels - 1)`` + //! - If the common type of ``SampleT`` and ``LevelT`` is of integral type, the bin for a sample is + //! computed as ``(sample - lower_level) * (num_levels - 1) / (upper_level - lower_level)``, round + //! down to the nearest whole number. To protect against potential overflows, if the product + //! ``(upper_level - lower_level) * (num_levels - 1)`` exceeds the number representable by an + //! ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128 + //! bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only + //! be returned if bin computation would overflow for 128-bit arithmetic. + //! - For a given row ``r`` in ``[0, num_rows)``, let + //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`` and + //! ``row_end = row_begin + num_row_samples``. The ranges + //! ``[row_begin, row_end)`` and ``[d_histogram, d_histogram + num_levels - 1)`` + //! shall not overlap in any way. + //! - ``cuda::std::common_type`` must be valid, and both LevelT + //! and SampleT must be valid arithmetic types. The common type must be + //! convertible to ``int`` and trivially copyable. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_histogram_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin histogram-even-2d-env + //! :end-before: example-end histogram-even-2d-env + //! + //! @endrst + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading input samples @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. + //! + //! @tparam EnvT + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! + //! @param[in] d_samples + //! The pointer to the input sequence of data samples. + //! + //! @param[out] d_histogram + //! The pointer to the histogram counter output array of length `num_levels - 1`. + //! + //! @param[in] num_levels + //! The number of boundaries (levels) for delineating histogram samples. + //! + //! @param[in] lower_level + //! The lower sample value bound (inclusive) for the lowest histogram bin. + //! + //! @param[in] upper_level + //! The upper sample value bound (exclusive) for the highest histogram bin. + //! + //! @param[in] num_row_samples + //! The number of data samples per row in the region of interest + //! + //! @param[in] num_rows + //! The number of rows in the region of interest + //! + //! @param[in] row_stride_bytes + //! The number of bytes between starts of consecutive rows in the region of interest + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template > + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( + SampleIteratorT d_samples, + CounterT* d_histogram, + int num_levels, + LevelT lower_level, + LevelT upper_level, + OffsetT num_row_samples, + OffsetT num_rows, + size_t row_stride_bytes, + EnvT env = {}) + { + return MultiHistogramEven<1, 1>( + d_samples, + ::cuda::std::array{d_histogram}, + ::cuda::std::array{num_levels}, + ::cuda::std::array{lower_level}, + ::cuda::std::array{upper_level}, + num_row_samples, + num_rows, + row_stride_bytes, + env); + } + + //! @rst + //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples + //! using equal-width bins. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! - The input is a sequence of *pixel* structures, where each pixel comprises + //! a record of ``NUM_CHANNELS`` consecutive data samples + //! (e.g., an *RGBA* pixel). + //! - ``NUM_CHANNELS`` can be up to 4. + //! - Of the ``NUM_CHANNELS`` specified, the function will only compute + //! histograms for the first ``NUM_ACTIVE_CHANNELS`` + //! (e.g., only *RGB* histograms from *RGBA* pixel samples). + //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``. + //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width: + //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`` + //! - If the common type of sample and level is of integral type, the bin for a sample is + //! computed as ``(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - lower_level[i])``, round down + //! to the nearest whole number. To protect against potential overflows, if, for any channel ``i``, the product + //! ``(upper_level[i] - lower_level[i]) * (num_levels[i] - 1)`` exceeds the number representable by an ``uint64_t``, + //! the cuda error ``cudaErrorInvalidValue`` is returned. If the common type is 128 bits wide, bin computation + //! will use 128-bit arithmetic and ``cudaErrorInvalidValue`` will only be returned if bin + //! computation would overflow for 128-bit arithmetic. + //! - For a given channel ``c`` in ``[0, NUM_ACTIVE_CHANNELS)``, the ranges + //! ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` and + //! ``[d_histogram[c], d_histogram[c] + num_levels[c] - 1)`` shall not overlap in any way. + //! - ``cuda::std::common_type`` must be valid, and both LevelT + //! and SampleT must be valid arithmetic types. + //! The common type must be convertible to ``int`` and trivially copyable. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_histogram_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin multi-histogram-even-1d-env + //! :end-before: example-end multi-histogram-even-1d-env + //! + //! @endrst + //! + //! @tparam NUM_CHANNELS + //! Number of channels interleaved in the input data (may be greater than the number of channels being + //! actively histogrammed) + //! + //! @tparam NUM_ACTIVE_CHANNELS + //! **[inferred]** Number of channels actively being histogrammed + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading input samples @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. + //! + //! @tparam EnvT + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! + //! @param[in] d_samples + //! The pointer to the multi-channel input sequence of data samples. + //! + //! @param[out] d_histogram + //! Array of active channel histogram counter output arrays, each of length `num_levels[channel] - 1`. + //! + //! @param[in] num_levels + //! Array of the number of boundaries (levels) for each active channel. + //! + //! @param[in] lower_level + //! Array of the lower sample value bound (inclusive) for the lowest bin of each active channel. + //! + //! @param[in] upper_level + //! Array of the upper sample value bound (exclusive) for the highest bin of each active channel. + //! + //! @param[in] num_pixels + //! The number of multi-channel pixels (i.e., the length of `d_samples / NUM_CHANNELS`) + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template > + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven( + SampleIteratorT d_samples, + ::cuda::std::array d_histogram, + ::cuda::std::array num_levels, + ::cuda::std::array lower_level, + ::cuda::std::array upper_level, + OffsetT num_pixels, + EnvT env = {}) + { + using SampleT = cub::detail::it_value_t; + return MultiHistogramEven( + d_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + num_pixels, + static_cast(1), + sizeof(SampleT) * NUM_CHANNELS * num_pixels, + env); + } + + //! @rst + //! Computes per-channel intensity histograms from a 2D region of multi-channel "pixel" data samples + //! using equal-width bins. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! - The input is a sequence of *pixel* structures, where each pixel + //! comprises a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel). + //! - ``NUM_CHANNELS`` can be up to 4. + //! - Of the ``NUM_CHANNELS`` specified, the function will only compute + //! histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., only *RGB* + //! histograms from *RGBA* pixel samples). + //! - A two-dimensional *region of interest* within ``d_samples`` can be + //! specified using the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters. + //! - The row stride must be a whole multiple of the sample data type + //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``. + //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``. + //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width: + //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`` + //! - If the common type of sample and level is of integral type, the bin for a sample is + //! computed as ``(sample - lower_level[i]) * (num_levels - 1) / (upper_level[i] - lower_level[i])``, + //! round down to the nearest whole number. To protect against potential overflows, if, for any channel ``i``, + //! the product ``(upper_level[i] - lower_level[i]) * (num_levels[i] - 1)`` exceeds the number representable by + //! an ``uint64_t``, the cuda error ``cudaErrorInvalidValue`` is returned. + //! If the common type is 128 bits wide, bin computation will use 128-bit arithmetic and ``cudaErrorInvalidValue`` + //! will only be returned if bin computation would overflow for 128-bit arithmetic. + //! - For a given row ``r`` in ``[0, num_rows)``, and sample ``s`` in + //! ``[0, num_row_pixels)``, let + //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)``, + //! ``sample_begin = row_begin + s * NUM_CHANNELS``, and + //! ``sample_end = sample_begin + NUM_ACTIVE_CHANNELS``. For a given channel ``c`` in + //! ``[0, NUM_ACTIVE_CHANNELS)``, the ranges + //! ``[sample_begin, sample_end)`` and + //! ``[d_histogram[c], d_histogram[c] + num_levels[c] - 1)`` shall not overlap in any way. + //! - ``cuda::std::common_type`` must be valid, and both LevelT + //! and SampleT must be valid arithmetic types. The common type must be + //! convertible to ``int`` and trivially copyable. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_histogram_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin multi-histogram-even-2d-env + //! :end-before: example-end multi-histogram-even-2d-env + //! + //! @endrst + //! + //! @tparam NUM_CHANNELS + //! Number of channels interleaved in the input data (may be greater than the number of channels being + //! actively histogrammed) + //! + //! @tparam NUM_ACTIVE_CHANNELS + //! **[inferred]** Number of channels actively being histogrammed + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading input samples @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. + //! + //! @tparam EnvT + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! + //! @param[in] d_samples + //! The pointer to the multi-channel input sequence of data samples. + //! + //! @param[out] d_histogram + //! Array of active channel histogram counter output arrays, each of length `num_levels[channel] - 1`. + //! + //! @param[in] num_levels + //! Array of the number of boundaries (levels) for each active channel. + //! + //! @param[in] lower_level + //! Array of the lower sample value bound (inclusive) for the lowest bin of each active channel. + //! + //! @param[in] upper_level + //! Array of the upper sample value bound (exclusive) for the highest bin of each active channel. + //! + //! @param[in] num_row_pixels + //! The number of multi-channel pixels per row in the region of interest + //! + //! @param[in] num_rows + //! The number of rows in the region of interest + //! + //! @param[in] row_stride_bytes + //! The number of bytes between starts of consecutive rows in the region of interest + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template > + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramEven( + SampleIteratorT d_samples, + ::cuda::std::array d_histogram, + ::cuda::std::array num_levels, + ::cuda::std::array lower_level, + ::cuda::std::array upper_level, + OffsetT num_row_pixels, + OffsetT num_rows, + size_t row_stride_bytes, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceHistogram::MultiHistogramEven"); + + using SampleT = cub::detail::it_value_t; + ::cuda::std::bool_constant is_byte_sample; + + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) -> cudaError_t { + if constexpr (sizeof(OffsetT) > sizeof(int)) + { + if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX) + { + return DispatchHistogram:: + DispatchEven( + storage, + bytes, + d_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + (int) num_row_pixels, + (int) num_rows, + (int) (row_stride_bytes / sizeof(SampleT)), + stream, + is_byte_sample); + } + } + return DispatchHistogram:: + DispatchEven( + storage, + bytes, + d_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + num_row_pixels, + num_rows, + (OffsetT) (row_stride_bytes / sizeof(SampleT)), + stream, + is_byte_sample); + }); + } + + //! @rst + //! Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! - The number of histogram bins is (``num_levels - 1``) + //! - The value range for bin\ :sub:`i` is ``[level[i], level[i+1])`` + //! - The range ``[d_histogram, d_histogram + num_levels - 1)`` shall not + //! overlap ``[d_samples, d_samples + num_samples)`` nor + //! ``[d_levels, d_levels + num_levels)`` in any way. The ranges + //! ``[d_levels, d_levels + num_levels)`` and + //! ``[d_samples, d_samples + num_samples)`` may overlap. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_histogram_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin histogram-range-env + //! :end-before: example-end histogram-range-env + //! + //! @endrst + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading input samples @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. + //! + //! @tparam EnvT + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! + //! @param[in] d_samples + //! The pointer to the input sequence of data samples. + //! + //! @param[out] d_histogram + //! The pointer to the histogram counter output array of length `num_levels - 1`. + //! + //! @param[in] num_levels + //! The number of boundaries (levels) for delineating histogram samples. + //! Implies that the number of bins is `num_levels - 1`. + //! + //! @param[in] d_levels + //! The pointer to the array of boundaries (levels). Bins are defined by consecutive pairs. + //! + //! @param[in] num_samples + //! The number of input samples (i.e., the length of `d_samples`) + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template > + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( + SampleIteratorT d_samples, + CounterT* d_histogram, + int num_levels, + const LevelT* d_levels, + OffsetT num_samples, + EnvT env = {}) + { + using SampleT = cub::detail::it_value_t; + return MultiHistogramRange<1, 1>( + d_samples, + ::cuda::std::array{d_histogram}, + ::cuda::std::array{num_levels}, + ::cuda::std::array{d_levels}, + num_samples, + static_cast(1), + sizeof(SampleT) * num_samples, + env); + } + + //! @rst + //! Computes an intensity histogram from a 2D region of data samples using the specified bin boundary levels. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! - A two-dimensional *region of interest* within ``d_samples`` can be + //! specified using the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters. + //! - The row stride must be a whole multiple of the sample data type + //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``. + //! - The number of histogram bins is (``num_levels - 1``) + //! - The value range for bin\ :sub:`i` is ``[level[i], level[i+1])`` + //! - For a given row ``r`` in ``[0, num_rows)``, let + //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)`` and + //! ``row_end = row_begin + num_row_samples``. The range + //! ``[d_histogram, d_histogram + num_levels - 1)`` shall not overlap + //! ``[row_begin, row_end)`` nor ``[d_levels, d_levels + num_levels)``. + //! The ranges ``[d_levels, d_levels + num_levels)`` and ``[row_begin, row_end)`` may overlap. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_histogram_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin histogram-range-2d-env + //! :end-before: example-end histogram-range-2d-env + //! + //! @endrst + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading input samples @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. + //! + //! @tparam EnvT + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! + //! @param[in] d_samples + //! The pointer to the input sequence of data samples. + //! + //! @param[out] d_histogram + //! The pointer to the histogram counter output array of length `num_levels - 1`. + //! + //! @param[in] num_levels + //! The number of boundaries (levels) for delineating histogram samples. + //! + //! @param[in] d_levels + //! The pointer to the array of boundaries (levels). Bins are defined by consecutive pairs. + //! + //! @param[in] num_row_samples + //! The number of data samples per row in the region of interest + //! + //! @param[in] num_rows + //! The number of rows in the region of interest + //! + //! @param[in] row_stride_bytes + //! The number of bytes between starts of consecutive rows in the region of interest + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template > + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t HistogramRange( + SampleIteratorT d_samples, + CounterT* d_histogram, + int num_levels, + const LevelT* d_levels, + OffsetT num_row_samples, + OffsetT num_rows, + size_t row_stride_bytes, + EnvT env = {}) + { + return MultiHistogramRange<1, 1>( + d_samples, + ::cuda::std::array{d_histogram}, + ::cuda::std::array{num_levels}, + ::cuda::std::array{d_levels}, + num_row_samples, + num_rows, + row_stride_bytes, + env); + } + + //! @rst + //! Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples + //! using the specified bin boundary levels. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! - The input is a sequence of *pixel* structures, where each pixel + //! comprises a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel). + //! - ``NUM_CHANNELS`` can be up to 4. + //! - Of the ``NUM_CHANNELS`` specified, the function will only compute + //! histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., *RGB* histograms from *RGBA* pixel samples). + //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``. + //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width: + //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`` + //! - For given channels ``c1`` and ``c2`` in ``[0, NUM_ACTIVE_CHANNELS)``, the + //! range ``[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)`` shall + //! not overlap ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` nor + //! ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` in any way. + //! The ranges ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` and + //! ``[d_samples, d_samples + NUM_CHANNELS * num_pixels)`` may overlap. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_histogram_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin multi-histogram-range-1d-env + //! :end-before: example-end multi-histogram-range-1d-env + //! + //! @endrst + //! + //! @tparam NUM_CHANNELS + //! Number of channels interleaved in the input data (may be greater than the number of channels being + //! actively histogrammed) + //! + //! @tparam NUM_ACTIVE_CHANNELS + //! **[inferred]** Number of channels actively being histogrammed + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading input samples @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. + //! + //! @tparam EnvT + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! + //! @param[in] d_samples + //! The pointer to the multi-channel input sequence of data samples. + //! + //! @param[out] d_histogram + //! Array of active channel histogram counter output arrays, each of length `num_levels[channel] - 1`. + //! + //! @param[in] num_levels + //! Array of the number of boundaries (levels) for each active channel. + //! + //! @param[in] d_levels + //! Array of pointers to the arrays of boundaries (levels) for each active channel. + //! + //! @param[in] num_pixels + //! The number of multi-channel pixels (i.e., the length of `d_samples / NUM_CHANNELS`) + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template > + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange( + SampleIteratorT d_samples, + ::cuda::std::array d_histogram, + ::cuda::std::array num_levels, + ::cuda::std::array d_levels, + OffsetT num_pixels, + EnvT env = {}) + { + using SampleT = cub::detail::it_value_t; + return MultiHistogramRange( + d_samples, + d_histogram, + num_levels, + d_levels, + num_pixels, + static_cast(1), + sizeof(SampleT) * NUM_CHANNELS * num_pixels, + env); + } + + //! @rst + //! Computes per-channel intensity histograms from a 2D region of multi-channel "pixel" data samples + //! using the specified bin boundary levels. + //! + //! .. versionadded:: 3.4.0 + //! First appears in CUDA Toolkit 13.4. + //! + //! This is an environment-based API that allows customization of: + //! + //! - Stream: Query via ``cuda::get_stream`` + //! - Memory resource: Query via ``cuda::mr::get_memory_resource`` + //! + //! - The input is a sequence of *pixel* structures, where each pixel comprises + //! a record of ``NUM_CHANNELS`` consecutive data samples (e.g., an *RGBA* pixel). + //! - ``NUM_CHANNELS`` can be up to 4. + //! - Of the ``NUM_CHANNELS`` specified, the function will only compute + //! histograms for the first ``NUM_ACTIVE_CHANNELS`` (e.g., *RGB* histograms from *RGBA* pixel samples). + //! - A two-dimensional *region of interest* within ``d_samples`` can be + //! specified using the ``num_row_samples``, ``num_rows``, and ``row_stride_bytes`` parameters. + //! - The row stride must be a whole multiple of the sample data type + //! size, i.e., ``(row_stride_bytes % sizeof(SampleT)) == 0``. + //! - The number of histogram bins for channel\ :sub:`i` is ``num_levels[i] - 1``. + //! - For channel\ :sub:`i`, the range of values for all histogram bins have the same width: + //! ``(upper_level[i] - lower_level[i]) / (num_levels[i] - 1)`` + //! - For a given row ``r`` in ``[0, num_rows)``, and sample ``s`` in ``[0, num_row_pixels)``, let + //! ``row_begin = d_samples + r * row_stride_bytes / sizeof(SampleT)``, + //! ``sample_begin = row_begin + s * NUM_CHANNELS``, and + //! ``sample_end = sample_begin + NUM_ACTIVE_CHANNELS``. For given channels + //! ``c1`` and ``c2`` in ``[0, NUM_ACTIVE_CHANNELS)``, the range + //! ``[d_histogram[c1], d_histogram[c1] + num_levels[c1] - 1)`` shall not overlap + //! ``[sample_begin, sample_end)`` nor + //! ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` in any way. The ranges + //! ``[d_levels[c2], d_levels[c2] + num_levels[c2])`` and + //! ``[sample_begin, sample_end)`` may overlap. + //! - @devicestorage + //! + //! Snippet + //! +++++++ + //! + //! .. literalinclude:: ../../../cub/test/catch2_test_device_histogram_env_api.cu + //! :language: c++ + //! :dedent: + //! :start-after: example-begin multi-histogram-range-2d-env + //! :end-before: example-end multi-histogram-range-2d-env + //! + //! @endrst + //! + //! @tparam NUM_CHANNELS + //! Number of channels interleaved in the input data (may be greater than the number of channels being + //! actively histogrammed) + //! + //! @tparam NUM_ACTIVE_CHANNELS + //! **[inferred]** Number of channels actively being histogrammed + //! + //! @tparam SampleIteratorT + //! **[inferred]** Random-access input iterator type for reading input samples @iterator + //! + //! @tparam CounterT + //! **[inferred]** Integer type for histogram bin counters + //! + //! @tparam LevelT + //! **[inferred]** Type for specifying boundaries (levels) + //! + //! @tparam OffsetT + //! **[inferred]** Signed integer type for sequence offsets, list lengths, pointer differences, etc. + //! + //! @tparam EnvT + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! + //! @param[in] d_samples + //! The pointer to the multi-channel input sequence of data samples. + //! + //! @param[out] d_histogram + //! Array of active channel histogram counter output arrays, each of length `num_levels[channel] - 1`. + //! + //! @param[in] num_levels + //! Array of the number of boundaries (levels) for each active channel. + //! + //! @param[in] d_levels + //! Array of pointers to the arrays of boundaries (levels) for each active channel. + //! + //! @param[in] num_row_pixels + //! The number of multi-channel pixels per row in the region of interest + //! + //! @param[in] num_rows + //! The number of rows in the region of interest + //! + //! @param[in] row_stride_bytes + //! The number of bytes between starts of consecutive rows in the region of interest + //! + //! @param[in] env + //! @rst + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. + //! @endrst + template > + [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange( + SampleIteratorT d_samples, + ::cuda::std::array d_histogram, + ::cuda::std::array num_levels, + ::cuda::std::array d_levels, + OffsetT num_row_pixels, + OffsetT num_rows, + size_t row_stride_bytes, + EnvT env = {}) + { + _CCCL_NVTX_RANGE_SCOPE("cub::DeviceHistogram::MultiHistogramRange"); + + using SampleT = cub::detail::it_value_t; + ::cuda::std::bool_constant is_byte_sample; + + return detail::dispatch_with_env( + env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) -> cudaError_t { + if constexpr (sizeof(OffsetT) > sizeof(int)) + { + if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX) + { + return DispatchHistogram:: + DispatchRange( + storage, + bytes, + d_samples, + d_histogram, + num_levels, + d_levels, + (int) num_row_pixels, + (int) num_rows, + (int) (row_stride_bytes / sizeof(SampleT)), + stream, + is_byte_sample); + } + } + return DispatchHistogram:: + DispatchRange( + storage, + bytes, + d_samples, + d_histogram, + num_levels, + d_levels, + num_row_pixels, + num_rows, + (OffsetT) (row_stride_bytes / sizeof(SampleT)), + stream, + is_byte_sample); + }); + } + + //@} }; CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_device_histogram_env.cu b/cub/test/catch2_test_device_histogram_env.cu new file mode 100644 index 00000000000..6704280461d --- /dev/null +++ b/cub/test/catch2_test_device_histogram_env.cu @@ -0,0 +1,1144 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Should precede any includes +struct stream_registry_factory_t; +#define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY stream_registry_factory_t + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include + +#include + +#include "catch2_test_env_launch_helper.h" + +DECLARE_LAUNCH_WRAPPER(cub::DeviceHistogram::HistogramEven, histogram_even); +DECLARE_LAUNCH_WRAPPER(cub::DeviceHistogram::HistogramRange, histogram_range); + +DECLARE_TMPL_LAUNCH_WRAPPER(cub::DeviceHistogram::MultiHistogramEven, + multi_histogram_even, + ESCAPE_LIST(int Channels, int ActiveChannels), + ESCAPE_LIST(Channels, ActiveChannels)); + +DECLARE_TMPL_LAUNCH_WRAPPER(cub::DeviceHistogram::MultiHistogramRange, + multi_histogram_range, + ESCAPE_LIST(int Channels, int ActiveChannels), + ESCAPE_LIST(Channels, ActiveChannels)); + +// %PARAM% TEST_LAUNCH lid 0:1:2 + +#include + +#include + +namespace stdexec = cuda::std::execution; + +#if TEST_LAUNCH == 0 + +TEST_CASE("DeviceHistogram::HistogramEven works with default environment", "[histogram][device]") +{ + auto d_samples = c2h::device_vector{0, 2, 1, 0, 3, 4, 2, 1}; + int num_samples = static_cast(d_samples.size()); + int num_levels = 6; + int lower_level = 0; + int upper_level = 5; + auto d_histogram = c2h::device_vector(num_levels - 1, 0); + + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::HistogramEven( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_samples)); + + c2h::device_vector expected{2, 2, 2, 1, 1}; + REQUIRE(d_histogram == expected); +} + +TEST_CASE("DeviceHistogram::HistogramRange works with default environment", "[histogram][device]") +{ + auto d_samples = c2h::device_vector{2.2f, 6.1f, 7.5f, 2.9f, 3.5f, 0.3f, 2.9f, 2.1f}; + int num_samples = static_cast(d_samples.size()); + auto d_levels = c2h::device_vector{0.0f, 2.0f, 4.0f, 6.0f, 8.0f}; + int num_levels = static_cast(d_levels.size()); + auto d_histogram = c2h::device_vector(num_levels - 1, 0); + + REQUIRE(cudaSuccess + == cub::DeviceHistogram::HistogramRange( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + thrust::raw_pointer_cast(d_levels.data()), + num_samples)); + + c2h::device_vector expected{1, 5, 0, 2}; + REQUIRE(d_histogram == expected); +} + +TEST_CASE("DeviceHistogram::MultiHistogramEven works with default environment", "[histogram][device]") +{ + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + // 2 pixels: (R=0, G=2, B=1, A=255), (R=3, G=4, B=2, A=128) + auto d_samples = c2h::device_vector{0, 2, 1, 255, 3, 4, 2, 128}; + int num_pixels = 2; + + cuda::std::array num_levels = {5, 5, 5}; + cuda::std::array lower_level = {0, 0, 0}; + cuda::std::array upper_level = {4, 4, 4}; + + auto d_histogram_r = c2h::device_vector(4, 0); + auto d_histogram_g = c2h::device_vector(4, 0); + auto d_histogram_b = c2h::device_vector(4, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + REQUIRE(cudaSuccess + == cub::DeviceHistogram::MultiHistogramEven( + thrust::raw_pointer_cast(d_samples.data()), d_histogram, num_levels, lower_level, upper_level, num_pixels)); + + c2h::device_vector expected_r{1, 0, 0, 1}; + c2h::device_vector expected_g{0, 0, 1, 0}; + c2h::device_vector expected_b{0, 1, 1, 0}; + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); +} + +TEST_CASE("DeviceHistogram::MultiHistogramRange works with default environment", "[histogram][device]") +{ + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + // 2 pixels: (R=0, G=2, B=1, A=255), (R=3, G=4, B=2, A=128) + auto d_samples = c2h::device_vector{0, 2, 1, 255, 3, 4, 2, 128}; + int num_pixels = 2; + + auto d_levels_r = c2h::device_vector{0, 2, 4}; + auto d_levels_g = c2h::device_vector{0, 3, 5}; + auto d_levels_b = c2h::device_vector{0, 1, 2, 3}; + + cuda::std::array num_levels = {3, 3, 4}; + + cuda::std::array d_levels = { + thrust::raw_pointer_cast(d_levels_r.data()), + thrust::raw_pointer_cast(d_levels_g.data()), + thrust::raw_pointer_cast(d_levels_b.data())}; + + auto d_histogram_r = c2h::device_vector(2, 0); + auto d_histogram_g = c2h::device_vector(2, 0); + auto d_histogram_b = c2h::device_vector(3, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + REQUIRE(cudaSuccess + == cub::DeviceHistogram::MultiHistogramRange( + thrust::raw_pointer_cast(d_samples.data()), d_histogram, num_levels, d_levels, num_pixels)); + + c2h::device_vector expected_r{1, 1}; + c2h::device_vector expected_g{1, 1}; + c2h::device_vector expected_b{0, 1, 1}; + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); +} + +TEST_CASE("DeviceHistogram::HistogramEven 2D works with default environment", "[histogram][device]") +{ + // 2 rows, 3 samples per row, stride of 4 (1 padding element) + auto d_samples = c2h::device_vector{0, 1, 2, -1, 1, 2, 0, -1}; + int num_levels = 4; + int lower_level = 0; + int upper_level = 3; + int num_row_samples = 3; + int num_rows = 2; + size_t row_stride_bytes = 4 * sizeof(int); + auto d_histogram = c2h::device_vector(num_levels - 1, 0); + + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::HistogramEven( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_row_samples, + num_rows, + row_stride_bytes)); + + c2h::device_vector expected{2, 2, 2}; + REQUIRE(d_histogram == expected); +} + +TEST_CASE("DeviceHistogram::HistogramRange 2D works with default environment", "[histogram][device]") +{ + auto d_samples = c2h::device_vector{0, 1, 2, -1, 1, 2, 0, -1}; + auto d_levels = c2h::device_vector{0, 1, 2, 3}; + int num_levels = static_cast(d_levels.size()); + int num_row_samples = 3; + int num_rows = 2; + size_t row_stride_bytes = 4 * sizeof(int); + auto d_histogram = c2h::device_vector(num_levels - 1, 0); + + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::HistogramRange( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + thrust::raw_pointer_cast(d_levels.data()), + num_row_samples, + num_rows, + row_stride_bytes)); + + c2h::device_vector expected{2, 2, 2}; + REQUIRE(d_histogram == expected); +} + +TEST_CASE("DeviceHistogram::MultiHistogramEven 2D works with default environment", "[histogram][device]") +{ + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + // 2 rows, 2 pixels per row, stride includes 1 extra pixel of padding + // Row 0: (R=0, G=2, B=1, A=255), (R=3, G=4, B=2, A=128), (PAD, PAD, PAD, PAD) + // Row 1: (R=1, G=1, B=3, A=200), (R=2, G=3, B=0, A=100), (PAD, PAD, PAD, PAD) + auto d_samples = + c2h::device_vector{0, 2, 1, 255, 3, 4, 2, 128, 0, 0, 0, 0, 1, 1, 3, 200, 2, 3, 0, 100, 0, 0, 0, 0}; + + int num_row_pixels = 2; + int num_rows = 2; + size_t row_stride_bytes = 3 * NUM_CHANNELS * sizeof(unsigned char); // 3 pixels wide, 2 used + + cuda::std::array num_levels = {5, 5, 5}; + cuda::std::array lower_level = {0, 0, 0}; + cuda::std::array upper_level = {4, 4, 4}; + + auto d_histogram_r = c2h::device_vector(4, 0); + auto d_histogram_g = c2h::device_vector(4, 0); + auto d_histogram_b = c2h::device_vector(4, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::MultiHistogramEven( + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + lower_level, + upper_level, + num_row_pixels, + num_rows, + row_stride_bytes)); + + // R: 0,3,1,2 → bin[0]=1, bin[1]=1, bin[2]=1, bin[3]=1 + c2h::device_vector expected_r{1, 1, 1, 1}; + // G: 2,4,1,3 → bin[1]=1, bin[2]=1, bin[3]=1 (4 out of range) + c2h::device_vector expected_g{0, 1, 1, 1}; + // B: 1,2,3,0 → bin[0]=1, bin[1]=1, bin[2]=1, bin[3]=1 + c2h::device_vector expected_b{1, 1, 1, 1}; + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); +} + +TEST_CASE("DeviceHistogram::MultiHistogramRange 2D works with default environment", "[histogram][device]") +{ + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + // Same layout as MultiHistogramEven 2D test + auto d_samples = + c2h::device_vector{0, 2, 1, 255, 3, 4, 2, 128, 0, 0, 0, 0, 1, 1, 3, 200, 2, 3, 0, 100, 0, 0, 0, 0}; + + int num_row_pixels = 2; + int num_rows = 2; + size_t row_stride_bytes = 3 * NUM_CHANNELS * sizeof(unsigned char); + + auto d_levels_r = c2h::device_vector{0, 2, 4}; + auto d_levels_g = c2h::device_vector{0, 3, 5}; + auto d_levels_b = c2h::device_vector{0, 1, 2, 3}; + + cuda::std::array num_levels = {3, 3, 4}; + + cuda::std::array d_levels = { + thrust::raw_pointer_cast(d_levels_r.data()), + thrust::raw_pointer_cast(d_levels_g.data()), + thrust::raw_pointer_cast(d_levels_b.data())}; + + auto d_histogram_r = c2h::device_vector(2, 0); + auto d_histogram_g = c2h::device_vector(2, 0); + auto d_histogram_b = c2h::device_vector(3, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::MultiHistogramRange( + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + d_levels, + num_row_pixels, + num_rows, + row_stride_bytes)); + + // R: 0,3,1,2 → [0,2)=2, [2,4)=2 + c2h::device_vector expected_r{2, 2}; + // G: 2,4,1,3 → [0,3)=2, [3,5)=2 + c2h::device_vector expected_g{2, 2}; + // B: 1,2,3,0 → [0,1)=1, [1,2)=1, [2,3)=1 (3 out of range) + c2h::device_vector expected_b{1, 1, 1}; + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); +} + +#endif + +C2H_TEST("DeviceHistogram::HistogramEven uses environment", "[histogram][device]") +{ + auto d_samples = c2h::device_vector{0, 2, 1, 0, 3, 4, 2, 1}; + int num_samples = static_cast(d_samples.size()); + int num_levels = 6; + int lower_level = 0; + int upper_level = 5; + auto d_histogram = c2h::device_vector(num_levels - 1, 0); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::HistogramEven( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_samples)); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + histogram_even( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_samples, + env); + + c2h::device_vector expected{2, 2, 2, 1, 1}; + REQUIRE(d_histogram == expected); +} + +TEST_CASE("DeviceHistogram::HistogramEven uses custom stream", "[histogram][device]") +{ + auto d_samples = c2h::device_vector{0, 2, 1, 0, 3, 4, 2, 1}; + int num_samples = static_cast(d_samples.size()); + int num_levels = 6; + int lower_level = 0; + int upper_level = 5; + auto d_histogram = c2h::device_vector(num_levels - 1, 0); + + cudaStream_t custom_stream; + REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream)); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::HistogramEven( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_samples)); + + auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}}; + auto env = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)}; + + histogram_even( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_samples, + env); + + REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream)); + + c2h::device_vector expected{2, 2, 2, 1, 1}; + REQUIRE(d_histogram == expected); + + REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream)); +} + +C2H_TEST("DeviceHistogram::HistogramRange uses environment", "[histogram][device]") +{ + auto d_samples = c2h::device_vector{2.2f, 6.1f, 7.5f, 2.9f, 3.5f, 0.3f, 2.9f, 2.1f}; + int num_samples = static_cast(d_samples.size()); + auto d_levels = c2h::device_vector{0.0f, 2.0f, 4.0f, 6.0f, 8.0f}; + int num_levels = static_cast(d_levels.size()); + auto d_histogram = c2h::device_vector(num_levels - 1, 0); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::HistogramRange( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + thrust::raw_pointer_cast(d_levels.data()), + num_samples)); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + histogram_range( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + thrust::raw_pointer_cast(d_levels.data()), + num_samples, + env); + + c2h::device_vector expected{1, 5, 0, 2}; + REQUIRE(d_histogram == expected); +} + +TEST_CASE("DeviceHistogram::HistogramRange uses custom stream", "[histogram][device]") +{ + auto d_samples = c2h::device_vector{2.2f, 6.1f, 7.5f, 2.9f, 3.5f, 0.3f, 2.9f, 2.1f}; + int num_samples = static_cast(d_samples.size()); + auto d_levels = c2h::device_vector{0.0f, 2.0f, 4.0f, 6.0f, 8.0f}; + int num_levels = static_cast(d_levels.size()); + auto d_histogram = c2h::device_vector(num_levels - 1, 0); + + cudaStream_t custom_stream; + REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream)); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::HistogramRange( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + thrust::raw_pointer_cast(d_levels.data()), + num_samples)); + + auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}}; + auto env = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)}; + + histogram_range( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + thrust::raw_pointer_cast(d_levels.data()), + num_samples, + env); + + REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream)); + + c2h::device_vector expected{1, 5, 0, 2}; + REQUIRE(d_histogram == expected); + + REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream)); +} + +C2H_TEST("DeviceHistogram::MultiHistogramEven uses environment", "[histogram][device]") +{ + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + auto d_samples = c2h::device_vector{0, 2, 1, 255, 3, 4, 2, 128}; + int num_pixels = 2; + + cuda::std::array num_levels = {5, 5, 5}; + cuda::std::array lower_level = {0, 0, 0}; + cuda::std::array upper_level = {4, 4, 4}; + + auto d_histogram_r = c2h::device_vector(4, 0); + auto d_histogram_g = c2h::device_vector(4, 0); + auto d_histogram_b = c2h::device_vector(4, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::MultiHistogramEven( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + lower_level, + upper_level, + num_pixels)); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + multi_histogram_even( + thrust::raw_pointer_cast(d_samples.data()), d_histogram, num_levels, lower_level, upper_level, num_pixels, env); + + c2h::device_vector expected_r{1, 0, 0, 1}; + c2h::device_vector expected_g{0, 0, 1, 0}; + c2h::device_vector expected_b{0, 1, 1, 0}; + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); +} + +TEST_CASE("DeviceHistogram::MultiHistogramEven uses custom stream", "[histogram][device]") +{ + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + auto d_samples = c2h::device_vector{0, 2, 1, 255, 3, 4, 2, 128}; + int num_pixels = 2; + + cuda::std::array num_levels = {5, 5, 5}; + cuda::std::array lower_level = {0, 0, 0}; + cuda::std::array upper_level = {4, 4, 4}; + + auto d_histogram_r = c2h::device_vector(4, 0); + auto d_histogram_g = c2h::device_vector(4, 0); + auto d_histogram_b = c2h::device_vector(4, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + cudaStream_t custom_stream; + REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream)); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::MultiHistogramEven( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + lower_level, + upper_level, + num_pixels)); + + auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}}; + auto env = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)}; + + multi_histogram_even( + thrust::raw_pointer_cast(d_samples.data()), d_histogram, num_levels, lower_level, upper_level, num_pixels, env); + + REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream)); + + c2h::device_vector expected_r{1, 0, 0, 1}; + c2h::device_vector expected_g{0, 0, 1, 0}; + c2h::device_vector expected_b{0, 1, 1, 0}; + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); + + REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream)); +} + +C2H_TEST("DeviceHistogram::MultiHistogramRange uses environment", "[histogram][device]") +{ + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + auto d_samples = c2h::device_vector{0, 2, 1, 255, 3, 4, 2, 128}; + int num_pixels = 2; + + auto d_levels_r = c2h::device_vector{0, 2, 4}; + auto d_levels_g = c2h::device_vector{0, 3, 5}; + auto d_levels_b = c2h::device_vector{0, 1, 2, 3}; + + cuda::std::array num_levels = {3, 3, 4}; + + cuda::std::array d_levels = { + thrust::raw_pointer_cast(d_levels_r.data()), + thrust::raw_pointer_cast(d_levels_g.data()), + thrust::raw_pointer_cast(d_levels_b.data())}; + + auto d_histogram_r = c2h::device_vector(2, 0); + auto d_histogram_g = c2h::device_vector(2, 0); + auto d_histogram_b = c2h::device_vector(3, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::MultiHistogramRange( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + d_levels, + num_pixels)); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + multi_histogram_range( + thrust::raw_pointer_cast(d_samples.data()), d_histogram, num_levels, d_levels, num_pixels, env); + + c2h::device_vector expected_r{1, 1}; + c2h::device_vector expected_g{1, 1}; + c2h::device_vector expected_b{0, 1, 1}; + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); +} + +TEST_CASE("DeviceHistogram::MultiHistogramRange uses custom stream", "[histogram][device]") +{ + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + auto d_samples = c2h::device_vector{0, 2, 1, 255, 3, 4, 2, 128}; + int num_pixels = 2; + + auto d_levels_r = c2h::device_vector{0, 2, 4}; + auto d_levels_g = c2h::device_vector{0, 3, 5}; + auto d_levels_b = c2h::device_vector{0, 1, 2, 3}; + + cuda::std::array num_levels = {3, 3, 4}; + + cuda::std::array d_levels = { + thrust::raw_pointer_cast(d_levels_r.data()), + thrust::raw_pointer_cast(d_levels_g.data()), + thrust::raw_pointer_cast(d_levels_b.data())}; + + auto d_histogram_r = c2h::device_vector(2, 0); + auto d_histogram_g = c2h::device_vector(2, 0); + auto d_histogram_b = c2h::device_vector(3, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + cudaStream_t custom_stream; + REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream)); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::MultiHistogramRange( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + d_levels, + num_pixels)); + + auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}}; + auto env = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)}; + + multi_histogram_range( + thrust::raw_pointer_cast(d_samples.data()), d_histogram, num_levels, d_levels, num_pixels, env); + + REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream)); + + c2h::device_vector expected_r{1, 1}; + c2h::device_vector expected_g{1, 1}; + c2h::device_vector expected_b{0, 1, 1}; + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); + + REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream)); +} + +C2H_TEST("DeviceHistogram::HistogramEven 2D uses environment", "[histogram][device]") +{ + auto d_samples = c2h::device_vector{0, 1, 2, -1, 1, 2, 0, -1}; + int num_levels = 4; + int lower_level = 0; + int upper_level = 3; + int num_row_samples = 3; + int num_rows = 2; + size_t row_stride_bytes = 4 * sizeof(int); + auto d_histogram = c2h::device_vector(num_levels - 1, 0); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::HistogramEven( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_row_samples, + num_rows, + row_stride_bytes)); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + histogram_even( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_row_samples, + num_rows, + row_stride_bytes, + env); + + c2h::device_vector expected{2, 2, 2}; + REQUIRE(d_histogram == expected); +} + +TEST_CASE("DeviceHistogram::HistogramEven 2D uses custom stream", "[histogram][device]") +{ + auto d_samples = c2h::device_vector{0, 1, 2, -1, 1, 2, 0, -1}; + int num_levels = 4; + int lower_level = 0; + int upper_level = 3; + int num_row_samples = 3; + int num_rows = 2; + size_t row_stride_bytes = 4 * sizeof(int); + auto d_histogram = c2h::device_vector(num_levels - 1, 0); + + cudaStream_t custom_stream; + REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream)); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::HistogramEven( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_row_samples, + num_rows, + row_stride_bytes)); + + auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}}; + auto env = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)}; + + histogram_even( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_row_samples, + num_rows, + row_stride_bytes, + env); + + REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream)); + + c2h::device_vector expected{2, 2, 2}; + REQUIRE(d_histogram == expected); + + REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream)); +} + +C2H_TEST("DeviceHistogram::HistogramRange 2D uses environment", "[histogram][device]") +{ + auto d_samples = c2h::device_vector{0, 1, 2, -1, 1, 2, 0, -1}; + auto d_levels = c2h::device_vector{0, 1, 2, 3}; + int num_levels = static_cast(d_levels.size()); + int num_row_samples = 3; + int num_rows = 2; + size_t row_stride_bytes = 4 * sizeof(int); + auto d_histogram = c2h::device_vector(num_levels - 1, 0); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::HistogramRange( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + thrust::raw_pointer_cast(d_levels.data()), + num_row_samples, + num_rows, + row_stride_bytes)); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + histogram_range( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + thrust::raw_pointer_cast(d_levels.data()), + num_row_samples, + num_rows, + row_stride_bytes, + env); + + c2h::device_vector expected{2, 2, 2}; + REQUIRE(d_histogram == expected); +} + +TEST_CASE("DeviceHistogram::HistogramRange 2D uses custom stream", "[histogram][device]") +{ + auto d_samples = c2h::device_vector{0, 1, 2, -1, 1, 2, 0, -1}; + auto d_levels = c2h::device_vector{0, 1, 2, 3}; + int num_levels = static_cast(d_levels.size()); + int num_row_samples = 3; + int num_rows = 2; + size_t row_stride_bytes = 4 * sizeof(int); + auto d_histogram = c2h::device_vector(num_levels - 1, 0); + + cudaStream_t custom_stream; + REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream)); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::HistogramRange( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + thrust::raw_pointer_cast(d_levels.data()), + num_row_samples, + num_rows, + row_stride_bytes)); + + auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}}; + auto env = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)}; + + histogram_range( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + thrust::raw_pointer_cast(d_levels.data()), + num_row_samples, + num_rows, + row_stride_bytes, + env); + + REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream)); + + c2h::device_vector expected{2, 2, 2}; + REQUIRE(d_histogram == expected); + + REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream)); +} + +C2H_TEST("DeviceHistogram::MultiHistogramEven 2D uses environment", "[histogram][device]") +{ + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + auto d_samples = + c2h::device_vector{0, 2, 1, 255, 3, 4, 2, 128, 0, 0, 0, 0, 1, 1, 3, 200, 2, 3, 0, 100, 0, 0, 0, 0}; + + int num_row_pixels = 2; + int num_rows = 2; + size_t row_stride_bytes = 3 * NUM_CHANNELS * sizeof(unsigned char); + + cuda::std::array num_levels = {5, 5, 5}; + cuda::std::array lower_level = {0, 0, 0}; + cuda::std::array upper_level = {4, 4, 4}; + + auto d_histogram_r = c2h::device_vector(4, 0); + auto d_histogram_g = c2h::device_vector(4, 0); + auto d_histogram_b = c2h::device_vector(4, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::MultiHistogramEven( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + lower_level, + upper_level, + num_row_pixels, + num_rows, + row_stride_bytes)); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + multi_histogram_even( + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + lower_level, + upper_level, + num_row_pixels, + num_rows, + row_stride_bytes, + env); + + c2h::device_vector expected_r{1, 1, 1, 1}; + c2h::device_vector expected_g{0, 1, 1, 1}; + c2h::device_vector expected_b{1, 1, 1, 1}; + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); +} + +TEST_CASE("DeviceHistogram::MultiHistogramEven 2D uses custom stream", "[histogram][device]") +{ + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + auto d_samples = + c2h::device_vector{0, 2, 1, 255, 3, 4, 2, 128, 0, 0, 0, 0, 1, 1, 3, 200, 2, 3, 0, 100, 0, 0, 0, 0}; + + int num_row_pixels = 2; + int num_rows = 2; + size_t row_stride_bytes = 3 * NUM_CHANNELS * sizeof(unsigned char); + + cuda::std::array num_levels = {5, 5, 5}; + cuda::std::array lower_level = {0, 0, 0}; + cuda::std::array upper_level = {4, 4, 4}; + + auto d_histogram_r = c2h::device_vector(4, 0); + auto d_histogram_g = c2h::device_vector(4, 0); + auto d_histogram_b = c2h::device_vector(4, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + cudaStream_t custom_stream; + REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream)); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::MultiHistogramEven( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + lower_level, + upper_level, + num_row_pixels, + num_rows, + row_stride_bytes)); + + auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}}; + auto env = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)}; + + multi_histogram_even( + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + lower_level, + upper_level, + num_row_pixels, + num_rows, + row_stride_bytes, + env); + + REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream)); + + c2h::device_vector expected_r{1, 1, 1, 1}; + c2h::device_vector expected_g{0, 1, 1, 1}; + c2h::device_vector expected_b{1, 1, 1, 1}; + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); + + REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream)); +} + +C2H_TEST("DeviceHistogram::MultiHistogramRange 2D uses environment", "[histogram][device]") +{ + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + auto d_samples = + c2h::device_vector{0, 2, 1, 255, 3, 4, 2, 128, 0, 0, 0, 0, 1, 1, 3, 200, 2, 3, 0, 100, 0, 0, 0, 0}; + + int num_row_pixels = 2; + int num_rows = 2; + size_t row_stride_bytes = 3 * NUM_CHANNELS * sizeof(unsigned char); + + auto d_levels_r = c2h::device_vector{0, 2, 4}; + auto d_levels_g = c2h::device_vector{0, 3, 5}; + auto d_levels_b = c2h::device_vector{0, 1, 2, 3}; + + cuda::std::array num_levels = {3, 3, 4}; + + cuda::std::array d_levels = { + thrust::raw_pointer_cast(d_levels_r.data()), + thrust::raw_pointer_cast(d_levels_g.data()), + thrust::raw_pointer_cast(d_levels_b.data())}; + + auto d_histogram_r = c2h::device_vector(2, 0); + auto d_histogram_g = c2h::device_vector(2, 0); + auto d_histogram_b = c2h::device_vector(3, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::MultiHistogramRange( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + d_levels, + num_row_pixels, + num_rows, + row_stride_bytes)); + + auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; + + multi_histogram_range( + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + d_levels, + num_row_pixels, + num_rows, + row_stride_bytes, + env); + + c2h::device_vector expected_r{2, 2}; + c2h::device_vector expected_g{2, 2}; + c2h::device_vector expected_b{1, 1, 1}; + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); +} + +TEST_CASE("DeviceHistogram::MultiHistogramRange 2D uses custom stream", "[histogram][device]") +{ + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + auto d_samples = + c2h::device_vector{0, 2, 1, 255, 3, 4, 2, 128, 0, 0, 0, 0, 1, 1, 3, 200, 2, 3, 0, 100, 0, 0, 0, 0}; + + int num_row_pixels = 2; + int num_rows = 2; + size_t row_stride_bytes = 3 * NUM_CHANNELS * sizeof(unsigned char); + + auto d_levels_r = c2h::device_vector{0, 2, 4}; + auto d_levels_g = c2h::device_vector{0, 3, 5}; + auto d_levels_b = c2h::device_vector{0, 1, 2, 3}; + + cuda::std::array num_levels = {3, 3, 4}; + + cuda::std::array d_levels = { + thrust::raw_pointer_cast(d_levels_r.data()), + thrust::raw_pointer_cast(d_levels_g.data()), + thrust::raw_pointer_cast(d_levels_b.data())}; + + auto d_histogram_r = c2h::device_vector(2, 0); + auto d_histogram_g = c2h::device_vector(2, 0); + auto d_histogram_b = c2h::device_vector(3, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + cudaStream_t custom_stream; + REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream)); + + size_t expected_bytes_allocated{}; + REQUIRE( + cudaSuccess + == cub::DeviceHistogram::MultiHistogramRange( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + d_levels, + num_row_pixels, + num_rows, + row_stride_bytes)); + + auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}}; + auto env = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)}; + + multi_histogram_range( + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + d_levels, + num_row_pixels, + num_rows, + row_stride_bytes, + env); + + REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream)); + + c2h::device_vector expected_r{2, 2}; + c2h::device_vector expected_g{2, 2}; + c2h::device_vector expected_b{1, 1, 1}; + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); + + REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream)); +} diff --git a/cub/test/catch2_test_device_histogram_env_api.cu b/cub/test/catch2_test_device_histogram_env_api.cu new file mode 100644 index 00000000000..1d1e6ed6b36 --- /dev/null +++ b/cub/test/catch2_test_device_histogram_env_api.cu @@ -0,0 +1,403 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include + +#include +#include +#include + +#include + +#include + +C2H_TEST("cub::DeviceHistogram::HistogramEven accepts env with stream", "[histogram][env]") +{ + // example-begin histogram-even-env + auto d_samples = thrust::device_vector{0, 2, 1, 0, 3, 4, 2, 1}; + int num_samples = static_cast(d_samples.size()); + int num_levels = 6; + int lower_level = 0; + int upper_level = 5; + auto d_histogram = thrust::device_vector(num_levels - 1, 0); + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceHistogram::HistogramEven( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_samples, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceHistogram::HistogramEven failed with status: " << error << std::endl; + } + + thrust::device_vector expected{2, 2, 2, 1, 1}; + // example-end histogram-even-env + + REQUIRE(error == cudaSuccess); + REQUIRE(d_histogram == expected); +} + +C2H_TEST("cub::DeviceHistogram::HistogramEven accepts env with stream (2D)", "[histogram][env]") +{ + // example-begin histogram-even-2d-env + // 2D region of interest: 2 rows, 3 samples per row, row stride includes 1 padding element + // Row 0: [0, 1, 2, PAD] Row 1: [1, 2, 0, PAD] + auto d_samples = thrust::device_vector{0, 1, 2, -1, 1, 2, 0, -1}; + int num_levels = 4; // 3 bins: [0,1), [1,2), [2,3) + int lower_level = 0; + int upper_level = 3; + int num_row_samples = 3; + int num_rows = 2; + size_t row_stride_bytes = 4 * sizeof(int); + + auto d_histogram = thrust::device_vector(num_levels - 1, 0); + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceHistogram::HistogramEven( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_row_samples, + num_rows, + row_stride_bytes, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceHistogram::HistogramEven (2D) failed with status: " << error << std::endl; + } + + // Samples: 0,1,2, 1,2,0 → bin[0]=2, bin[1]=2, bin[2]=2 + thrust::device_vector expected{2, 2, 2}; + // example-end histogram-even-2d-env + + REQUIRE(error == cudaSuccess); + REQUIRE(d_histogram == expected); +} + +C2H_TEST("cub::DeviceHistogram::HistogramRange accepts env with stream", "[histogram][env]") +{ + // example-begin histogram-range-env + auto d_samples = thrust::device_vector{2.2f, 6.1f, 7.5f, 2.9f, 3.5f, 0.3f, 2.9f, 2.1f}; + int num_samples = static_cast(d_samples.size()); + auto d_levels = thrust::device_vector{0.0f, 2.0f, 4.0f, 6.0f, 8.0f}; + int num_levels = static_cast(d_levels.size()); + auto d_histogram = thrust::device_vector(num_levels - 1, 0); + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceHistogram::HistogramRange( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + thrust::raw_pointer_cast(d_levels.data()), + num_samples, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceHistogram::HistogramRange failed with status: " << error << std::endl; + } + + thrust::device_vector expected{1, 5, 0, 2}; + // example-end histogram-range-env + + REQUIRE(error == cudaSuccess); + REQUIRE(d_histogram == expected); +} + +C2H_TEST("cub::DeviceHistogram::HistogramRange accepts env with stream (2D)", "[histogram][env]") +{ + // example-begin histogram-range-2d-env + // 2D region of interest: 2 rows, 3 samples per row, row stride includes 1 padding element + // Row 0: [0, 1, 2, PAD] Row 1: [1, 2, 0, PAD] + auto d_samples = thrust::device_vector{0, 1, 2, -1, 1, 2, 0, -1}; + auto d_levels = thrust::device_vector{0, 1, 2, 3}; // 3 bins: [0,1), [1,2), [2,3) + int num_levels = static_cast(d_levels.size()); + int num_row_samples = 3; + int num_rows = 2; + size_t row_stride_bytes = 4 * sizeof(int); + + auto d_histogram = thrust::device_vector(num_levels - 1, 0); + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceHistogram::HistogramRange( + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + thrust::raw_pointer_cast(d_levels.data()), + num_row_samples, + num_rows, + row_stride_bytes, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceHistogram::HistogramRange (2D) failed with status: " << error << std::endl; + } + + // Samples: 0,1,2, 1,2,0 → bin[0]=2, bin[1]=2, bin[2]=2 + thrust::device_vector expected{2, 2, 2}; + // example-end histogram-range-2d-env + + REQUIRE(error == cudaSuccess); + REQUIRE(d_histogram == expected); +} + +C2H_TEST("cub::DeviceHistogram::MultiHistogramEven accepts env with stream (1D)", "[histogram][env]") +{ + // example-begin multi-histogram-even-1d-env + // 4-channel RGBA pixels, histogram 3 active channels + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + // 2 pixels: (R=0, G=2, B=1, A=255), (R=3, G=4, B=2, A=128) + auto d_samples = thrust::device_vector{0, 2, 1, 255, 3, 4, 2, 128}; + int num_pixels = 2; + + // 5 levels per channel → 4 bins per channel: [0,1), [1,2), [2,3), [3,4) + cuda::std::array num_levels = {5, 5, 5}; + cuda::std::array lower_level = {0, 0, 0}; + cuda::std::array upper_level = {4, 4, 4}; + + auto d_histogram_r = thrust::device_vector(4, 0); + auto d_histogram_g = thrust::device_vector(4, 0); + auto d_histogram_b = thrust::device_vector(4, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceHistogram::MultiHistogramEven( + thrust::raw_pointer_cast(d_samples.data()), d_histogram, num_levels, lower_level, upper_level, num_pixels, env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceHistogram::MultiHistogramEven failed with status: " << error << std::endl; + } + + // R: 0→bin[0], 3→bin[3] + thrust::device_vector expected_r{1, 0, 0, 1}; + // G: 2→bin[2], 4→out of range + thrust::device_vector expected_g{0, 0, 1, 0}; + // B: 1→bin[1], 2→bin[2] + thrust::device_vector expected_b{0, 1, 1, 0}; + // example-end multi-histogram-even-1d-env + + REQUIRE(error == cudaSuccess); + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); +} + +C2H_TEST("cub::DeviceHistogram::MultiHistogramEven accepts env with stream (2D)", "[histogram][env]") +{ + // example-begin multi-histogram-even-2d-env + // 4-channel RGBA pixels, histogram 3 active channels, 2D region + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + // 2 rows, 2 pixels per row, stride includes 1 extra padding pixel per row + // Row 0: (R=0, G=2, B=1, A=255), (R=3, G=4, B=2, A=128), (PAD, PAD, PAD, PAD) + // Row 1: (R=1, G=1, B=3, A=200), (R=2, G=3, B=0, A=100), (PAD, PAD, PAD, PAD) + auto d_samples = thrust::device_vector{ + 0, 2, 1, 255, 3, 4, 2, 128, 0, 0, 0, 0, 1, 1, 3, 200, 2, 3, 0, 100, 0, 0, 0, 0}; + + int num_row_pixels = 2; + int num_rows = 2; + size_t row_stride_bytes = 3 * NUM_CHANNELS * sizeof(unsigned char); // 3 pixels wide, 2 used + + cuda::std::array num_levels = {5, 5, 5}; + cuda::std::array lower_level = {0, 0, 0}; + cuda::std::array upper_level = {4, 4, 4}; + + auto d_histogram_r = thrust::device_vector(4, 0); + auto d_histogram_g = thrust::device_vector(4, 0); + auto d_histogram_b = thrust::device_vector(4, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceHistogram::MultiHistogramEven( + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + lower_level, + upper_level, + num_row_pixels, + num_rows, + row_stride_bytes, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceHistogram::MultiHistogramEven (2D) failed with status: " << error << std::endl; + } + + // R: 0, 3, 1, 2 → bin[0]=1, bin[1]=1, bin[2]=1, bin[3]=1 + thrust::device_vector expected_r{1, 1, 1, 1}; + // G: 2, 4, 1, 3 → bin[1]=1, bin[2]=1, bin[3]=1 (4 is out of range) + thrust::device_vector expected_g{0, 1, 1, 1}; + // B: 1, 2, 3, 0 → bin[0]=1, bin[1]=1, bin[2]=1, bin[3]=1 + thrust::device_vector expected_b{1, 1, 1, 1}; + // example-end multi-histogram-even-2d-env + + REQUIRE(error == cudaSuccess); + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); +} + +C2H_TEST("cub::DeviceHistogram::MultiHistogramRange accepts env with stream (1D)", "[histogram][env]") +{ + // example-begin multi-histogram-range-1d-env + // 4-channel RGBA pixels, histogram 3 active channels + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + // 2 pixels: (R=0, G=2, B=1, A=255), (R=3, G=4, B=2, A=128) + auto d_samples = thrust::device_vector{0, 2, 1, 255, 3, 4, 2, 128}; + int num_pixels = 2; + + // Custom bin boundaries per channel + auto d_levels_r = thrust::device_vector{0, 2, 4}; // 2 bins: [0,2), [2,4) + auto d_levels_g = thrust::device_vector{0, 3, 5}; // 2 bins: [0,3), [3,5) + auto d_levels_b = thrust::device_vector{0, 1, 2, 3}; // 3 bins: [0,1), [1,2), [2,3) + + cuda::std::array num_levels = {3, 3, 4}; + + cuda::std::array d_levels = { + thrust::raw_pointer_cast(d_levels_r.data()), + thrust::raw_pointer_cast(d_levels_g.data()), + thrust::raw_pointer_cast(d_levels_b.data())}; + + auto d_histogram_r = thrust::device_vector(2, 0); + auto d_histogram_g = thrust::device_vector(2, 0); + auto d_histogram_b = thrust::device_vector(3, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceHistogram::MultiHistogramRange( + thrust::raw_pointer_cast(d_samples.data()), d_histogram, num_levels, d_levels, num_pixels, env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceHistogram::MultiHistogramRange failed with status: " << error << std::endl; + } + + // R: 0→[0,2), 3→[2,4) + thrust::device_vector expected_r{1, 1}; + // G: 2→[0,3), 4→[3,5) + thrust::device_vector expected_g{1, 1}; + // B: 1→[1,2), 2→[2,3) + thrust::device_vector expected_b{0, 1, 1}; + // example-end multi-histogram-range-1d-env + + REQUIRE(error == cudaSuccess); + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); +} + +C2H_TEST("cub::DeviceHistogram::MultiHistogramRange accepts env with stream (2D)", "[histogram][env]") +{ + // example-begin multi-histogram-range-2d-env + // 4-channel RGBA pixels, histogram 3 active channels, 2D region + constexpr int NUM_CHANNELS = 4; + constexpr int NUM_ACTIVE_CHANNELS = 3; + + // 2 rows, 2 pixels per row, stride includes 1 extra padding pixel per row + // Row 0: (R=0, G=2, B=1, A=255), (R=3, G=4, B=2, A=128), (PAD, PAD, PAD, PAD) + // Row 1: (R=1, G=1, B=3, A=200), (R=2, G=3, B=0, A=100), (PAD, PAD, PAD, PAD) + auto d_samples = thrust::device_vector{ + 0, 2, 1, 255, 3, 4, 2, 128, 0, 0, 0, 0, 1, 1, 3, 200, 2, 3, 0, 100, 0, 0, 0, 0}; + + int num_row_pixels = 2; + int num_rows = 2; + size_t row_stride_bytes = 3 * NUM_CHANNELS * sizeof(unsigned char); // 3 pixels wide, 2 used + + auto d_levels_r = thrust::device_vector{0, 2, 4}; // 2 bins: [0,2), [2,4) + auto d_levels_g = thrust::device_vector{0, 3, 5}; // 2 bins: [0,3), [3,5) + auto d_levels_b = thrust::device_vector{0, 1, 2, 3}; // 3 bins: [0,1), [1,2), [2,3) + + cuda::std::array num_levels = {3, 3, 4}; + + cuda::std::array d_levels = { + thrust::raw_pointer_cast(d_levels_r.data()), + thrust::raw_pointer_cast(d_levels_g.data()), + thrust::raw_pointer_cast(d_levels_b.data())}; + + auto d_histogram_r = thrust::device_vector(2, 0); + auto d_histogram_g = thrust::device_vector(2, 0); + auto d_histogram_b = thrust::device_vector(3, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + cuda::stream stream{cuda::devices[0]}; + cuda::stream_ref stream_ref{stream}; + auto env = cuda::std::execution::env{stream_ref}; + + auto error = cub::DeviceHistogram::MultiHistogramRange( + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + d_levels, + num_row_pixels, + num_rows, + row_stride_bytes, + env); + if (error != cudaSuccess) + { + std::cerr << "cub::DeviceHistogram::MultiHistogramRange (2D) failed with status: " << error << std::endl; + } + + // R: 0, 3, 1, 2 → [0,2)=2, [2,4)=2 + thrust::device_vector expected_r{2, 2}; + // G: 2, 4, 1, 3 → [0,3)=2, [3,5)=2 + thrust::device_vector expected_g{2, 2}; + // B: 1, 2, 3, 0 → [0,1)=1, [1,2)=1, [2,3)=1 (3 is out of range) + thrust::device_vector expected_b{1, 1, 1}; + // example-end multi-histogram-range-2d-env + + REQUIRE(error == cudaSuccess); + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); +}