Skip to content

[BUG]: cub::DeviceRunLengthEncode::Encode does not compile when unique_output is a thrust::transform_output_iterator #9486

Description

@fkallen

Is this a duplicate?

Type of Bug

Compile-time Error

Component

CUB

Describe the bug

I am computing an RLE with tuple inputs, but for the output I only need one element of the tuple. To reduce the allocated memory for output, I would like to use a transform_output_iterator which maps a tuple to the required element. This does not compile.

Maybe related: #821

The compilation error is very verbose.

/opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/block/block_load.cuh(117): error: no suitable conversion function from "thrust::_V_300303_SM_1200::iterator_facade<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, thrust::_V_300303_SM_1200::detail::make_zip_iterator_base<cuda::std::__4::tuple<int *, int *>>::value_type, thrust::_V_300303_SM_1200::detail::make_zip_iterator_base<cuda::std::__4::tuple<int *, int *>>::system, thrust::_V_300303_SM_1200::detail::make_zip_iterator_base<cuda::std::__4::tuple<int *, int *>>::traversal_category, thrust::_V_300303_SM_1200::detail::make_zip_iterator_base<cuda::std::__4::tuple<int *, int *>>::reference, thrust::_V_300303_SM_1200::detail::make_zip_iterator_base<cuda::std::__4::tuple<int *, int *>>::difference_type>::reference" (aka "thrust::_V_300303_SM_1200::detail::tuple_of_iterator_references<int &, int &>") to "cub::_V_300303_SM_1200::detail::reduce::AgentReduceByKey<cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<cuda::std::__4::__accumulator_t<cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>>, cub::_V_300303_SM_1200::detail::non_void_value_t<thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, cuda::std::__4::decay<lambda [](cuda::std::__4::tuple</* etc... */>)->cuda::std::__4::tuple_element_t</* etc... */>>::type>, int *>, cub::_V_300303_SM_1200::detail::it_value_t<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>>>>::Policy1000::ReduceByKeyPolicyT, thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, cuda::std::__4::decay<lambda [](cuda::std::__4::tuple<thrust::_V_300303_SM_1200::detail::it_value_t<int *>, thrust::_V_300303_SM_1200::detail::it_value_t<int *>>)->cuda::std::__4::tuple_element_t<0UL, cuda::std::__4::tuple<thrust::_V_300303_SM_1200::detail::it_value_t</* etc... */>, thrust::_V_300303_SM_1200::detail::it_value_t</* etc... */>>>>::type>, int *>, thrust::_V_300303_SM_1200::constant_iterator<cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>, thrust::_V_300303_SM_1200::use_default>, int *, int *, cuda::std::__4::equal_to<void>, cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::reduce::DispatchStreamingReduceByKey<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, cuda::std::__4::decay<lambda [](cuda::std::__4::tuple<thrust::_V_300303_SM_1200::detail::it_value_t</* etc... */>, thrust::_V_300303_SM_1200::detail::it_value_t</* etc... */>>)->cuda::std::__4::tuple_element_t<0UL, cuda::std::__4::tuple</* etc... */>>>::type>, int *>, thrust::_V_300303_SM_1200::constant_iterator<cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>, thrust::_V_300303_SM_1200::use_default>, int *, int *, cuda::std::__4::equal_to<void>, cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>, cuda::std::__4::__accumulator_t<cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>>, cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<cuda::std::__4::__accumulator_t<cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>>, cub::_V_300303_SM_1200::detail::non_void_value_t<thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, cuda::std::__4::decay</* etc... */>::type>, int *>, cub::_V_300303_SM_1200::detail::it_value_t<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple</* etc... */>>>>>>::local_offset_t, cuda::std::__4::__accumulator_t<cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>>, cub::_V_300303_SM_1200::detail::reduce::DispatchStreamingReduceByKey<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, cuda::std::__4::decay<lambda [](cuda::std::__4::tuple<thrust::_V_300303_SM_1200::detail::it_value_t</* etc... */>, thrust::_V_300303_SM_1200::detail::it_value_t</* etc... */>>)->cuda::std::__4::tuple_element_t<0UL, cuda::std::__4::tuple</* etc... */>>>::type>, int *>, thrust::_V_300303_SM_1200::constant_iterator<cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>, thrust::_V_300303_SM_1200::use_default>, int *, int *, cuda::std::__4::equal_to<void>, cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>, cuda::std::__4::__accumulator_t<cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>>, cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<cuda::std::__4::__accumulator_t<cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>>, cub::_V_300303_SM_1200::detail::non_void_value_t<thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, cuda::std::__4::decay</* etc... */>::type>, int *>, cub::_V_300303_SM_1200::detail::it_value_t<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple</* etc... */>>>>>>::streaming_context_t>::KeyOutputT" exists
        dst_items[i] = block_src_it[src_pos];
                       ^
          detected during:
            instantiation of "void cub::_V_300303_SM_1200::LoadDirectBlocked(int, RandomAccessIterator, T (&)[ItemsPerThread], int) [with T=int, ItemsPerThread=14, RandomAccessIterator=thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>]" at line 874
            instantiation of "void cub::_V_300303_SM_1200::BlockLoad<T, BlockDimX, ItemsPerThread, Algorithm, BlockDimY, BlockDimZ>::LoadInternal<cub::_V_300303_SM_1200::BLOCK_LOAD_DIRECT, Dummy>::Load(RandomAccessIterator, T (&)[ItemsPerThread], int) [with T=int, BlockDimX=256, ItemsPerThread=14, Algorithm=cub::_V_300303_SM_1200::BLOCK_LOAD_DIRECT, BlockDimY=1, BlockDimZ=1, Dummy=0, RandomAccessIterator=thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>]" at line 1247
            instantiation of "void cub::_V_300303_SM_1200::BlockLoad<T, BlockDimX, ItemsPerThread, Algorithm, BlockDimY, BlockDimZ>::Load(RandomAccessIterator, T (&)[ItemsPerThread], int) [with T=int, BlockDimX=256, ItemsPerThread=14, Algorithm=cub::_V_300303_SM_1200::BLOCK_LOAD_DIRECT, BlockDimY=1, BlockDimZ=1, RandomAccessIterator=thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>]" at line 535 of /opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/agent/agent_reduce_by_key.cuh
            instantiation of "void cub::_V_300303_SM_1200::detail::reduce::AgentReduceByKey<AgentReduceByKeyPolicyT, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOpT, ReductionOpT, OffsetT, AccumT, StreamingContextT>::ConsumeTile<IS_LAST_TILE>(OffsetT, int, OffsetT, cub::_V_300303_SM_1200::detail::reduce::AgentReduceByKey<AgentReduceByKeyPolicyT, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOpT, ReductionOpT, OffsetT, AccumT, StreamingContextT>::ScanTileStateT &) [with AgentReduceByKeyPolicyT=cub::_V_300303_SM_1200::AgentReduceByKeyPolicy<256, 14, cub::_V_300303_SM_1200::BLOCK_LOAD_DIRECT, cub::_V_300303_SM_1200::LOAD_CA, cub::_V_300303_SM_1200::BLOCK_SCAN_WARP_SCANS, cub::_V_300303_SM_1200::detail::exponential_backon_constructor_t<956U, 70U>>, KeysInputIteratorT=thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, UniqueOutputIteratorT=thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, lambda [](cuda::std::__4::tuple<int, int>)->int>, int *>, ValuesInputIteratorT=thrust::_V_300303_SM_1200::constant_iterator<int, int32_t, thrust::_V_300303_SM_1200::use_default>, AggregatesOutputIteratorT=int *, NumRunsOutputIteratorT=int *, EqualityOpT=cuda::std::__4::equal_to<void>, ReductionOpT=cuda::std::__4::plus<void>, OffsetT=int32_t, AccumT=int, StreamingContextT=cub::_V_300303_SM_1200::NullType, IS_LAST_TILE=false]" at line 769 of /opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/agent/agent_reduce_by_key.cuh
            instantiation of "void cub::_V_300303_SM_1200::detail::reduce::AgentReduceByKey<AgentReduceByKeyPolicyT, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOpT, ReductionOpT, OffsetT, AccumT, StreamingContextT>::ConsumeRange(OffsetT, cub::_V_300303_SM_1200::detail::reduce::AgentReduceByKey<AgentReduceByKeyPolicyT, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOpT, ReductionOpT, OffsetT, AccumT, StreamingContextT>::ScanTileStateT &, int) [with AgentReduceByKeyPolicyT=cub::_V_300303_SM_1200::AgentReduceByKeyPolicy<256, 14, cub::_V_300303_SM_1200::BLOCK_LOAD_DIRECT, cub::_V_300303_SM_1200::LOAD_CA, cub::_V_300303_SM_1200::BLOCK_SCAN_WARP_SCANS, cub::_V_300303_SM_1200::detail::exponential_backon_constructor_t<956U, 70U>>, KeysInputIteratorT=thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, UniqueOutputIteratorT=thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, lambda [](cuda::std::__4::tuple<int, int>)->int>, int *>, ValuesInputIteratorT=thrust::_V_300303_SM_1200::constant_iterator<int, int32_t, thrust::_V_300303_SM_1200::use_default>, AggregatesOutputIteratorT=int *, NumRunsOutputIteratorT=int *, EqualityOpT=cuda::std::__4::equal_to<void>, ReductionOpT=cuda::std::__4::plus<void>, OffsetT=int32_t, AccumT=int, StreamingContextT=cub::_V_300303_SM_1200::NullType]" at line 232 of /opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/device/dispatch/dispatch_reduce_by_key.cuh
            [ 2 instantiation contexts not shown ]
            instantiation of "cudaError_t cub::_V_300303_SM_1200::ChainedPolicy<PolicyPtxVersion, PolicyT, PrevPolicyT>::find_and_invoke_policy<DevicePtxVersion,FunctorT>(FunctorT &) [with PolicyPtxVersion=1000, PolicyT=cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>::Policy1000, PrevPolicyT=cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>::Policy900, DevicePtxVersion=1200, FunctorT=cub::_V_300303_SM_1200::detail::reduce::DispatchStreamingReduceByKey<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, lambda [](cuda::std::__4::tuple<int, int>)->int>, int *>, thrust::_V_300303_SM_1200::constant_iterator<int, int32_t, thrust::_V_300303_SM_1200::use_default>, int *, int *, cuda::std::__4::equal_to<void>, cuda::std::__4::plus<void>, int32_t, int, cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>>]" at line 831 of /opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/util_device.cuh
            instantiation of "cudaError_t cub::_V_300303_SM_1200::ChainedPolicy<PolicyPtxVersion, PolicyT, PrevPolicyT>::runtime_arch_to_compiletime<ArchMult,CudaArches...,FunctorT>(int, FunctorT &) [with PolicyPtxVersion=1000, PolicyT=cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>::Policy1000, PrevPolicyT=cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>::Policy900, ArchMult=1, CudaArches=<1200>, FunctorT=cub::_V_300303_SM_1200::detail::reduce::DispatchStreamingReduceByKey<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, lambda [](cuda::std::__4::tuple<int, int>)->int>, int *>, thrust::_V_300303_SM_1200::constant_iterator<int, int32_t, thrust::_V_300303_SM_1200::use_default>, int *, int *, cuda::std::__4::equal_to<void>, cuda::std::__4::plus<void>, int32_t, int, cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>>]" at line 796 of /opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/util_device.cuh
            instantiation of "cudaError_t cub::_V_300303_SM_1200::ChainedPolicy<PolicyPtxVersion, PolicyT, PrevPolicyT>::Invoke(int, FunctorT &) [with PolicyPtxVersion=1000, PolicyT=cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>::Policy1000, PrevPolicyT=cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>::Policy900, FunctorT=cub::_V_300303_SM_1200::detail::reduce::DispatchStreamingReduceByKey<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, lambda [](cuda::std::__4::tuple<int, int>)->int>, int *>, thrust::_V_300303_SM_1200::constant_iterator<int, int32_t, thrust::_V_300303_SM_1200::use_default>, int *, int *, cuda::std::__4::equal_to<void>, cuda::std::__4::plus<void>, int32_t, int, cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>>]" at line 443 of /opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh
            instantiation of "cudaError_t cub::_V_300303_SM_1200::detail::reduce::DispatchStreamingReduceByKey<KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOpT, ReductionOpT, OffsetT, AccumT, PolicyHub>::Dispatch(void *, size_t &, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOpT, ReductionOpT, cub::_V_300303_SM_1200::detail::reduce::DispatchStreamingReduceByKey<KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOpT, ReductionOpT, OffsetT, AccumT, PolicyHub>::global_offset_t, cudaStream_t) [with KeysInputIteratorT=thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, UniqueOutputIteratorT=thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, lambda [](cuda::std::__4::tuple<int, int>)->int>, int *>, ValuesInputIteratorT=thrust::_V_300303_SM_1200::constant_iterator<int, int32_t, thrust::_V_300303_SM_1200::use_default>, AggregatesOutputIteratorT=int *, NumRunsOutputIteratorT=int *, EqualityOpT=cuda::std::__4::equal_to<void>, ReductionOpT=cuda::std::__4::plus<void>, OffsetT=int32_t, AccumT=int, PolicyHub=cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>]" at line 191 of /opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/device/device_run_length_encode.cuh
            instantiation of "cudaError_t cub::_V_300303_SM_1200::DeviceRunLengthEncode::Encode(void *, size_t &, InputIteratorT, UniqueOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, NumItemsT, cudaStream_t) [with InputIteratorT=thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, UniqueOutputIteratorT=thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, lambda [](cuda::std::__4::tuple<int, int>)->int>, int *>, LengthsOutputIteratorT=int *, NumRunsOutputIteratorT=int *, NumItemsT=int]" at line 36 of <source>

How to Reproduce

https://godbolt.org/z/Prh5qTebT

#include <cub/cub.cuh>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <cuda/functional>



void func(){
    int* items = nullptr;
    int* segmentIds = nullptr;

    auto rle_input = thrust::make_zip_iterator(
        items,
        segmentIds
    );

    int* unique_items = nullptr;
    //does not compile
    auto rle_output = thrust::make_transform_output_iterator(
        unique_items,
        cuda::proclaim_return_type<int>(
            []__device__(cuda::std::tuple<int,int> tup){
                return cuda::std::get<0>(tup);
            }
        )
    );

    //does not compile
    /*auto rle_output = thrust::make_zip_iterator(
        unique_items,
        thrust::make_discard_iterator()
    );*/

    size_t temp_storage_bytes = 0;
    cub::DeviceRunLengthEncode::Encode(
        nullptr,
        temp_storage_bytes,
        //(cuda::std::tuple<int,int>*) nullptr,
        rle_input,
        //(cuda::std::tuple<int,int>*) nullptr,
        rle_output,
        (int*)nullptr,
        (int*)nullptr,
        42
    );
}

Expected behavior

It works with transform_output_iterator

Reproduction link

https://godbolt.org/z/Prh5qTebT

Operating System

No response

nvidia-smi output

No response

NVCC version

No response

Metadata

Metadata

Assignees

Labels

needs triageIssues that require the team's attention

Type

No fields configured for Bug.

Projects

Status
Todo

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions