I am computing an RLE with tuple inputs, but for the output I only need one element of the tuple. To reduce the allocated memory for output, I would like to use a transform_output_iterator which maps a tuple to the required element. This does not compile.
The compilation error is very verbose.
/opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/block/block_load.cuh(117): error: no suitable conversion function from "thrust::_V_300303_SM_1200::iterator_facade<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, thrust::_V_300303_SM_1200::detail::make_zip_iterator_base<cuda::std::__4::tuple<int *, int *>>::value_type, thrust::_V_300303_SM_1200::detail::make_zip_iterator_base<cuda::std::__4::tuple<int *, int *>>::system, thrust::_V_300303_SM_1200::detail::make_zip_iterator_base<cuda::std::__4::tuple<int *, int *>>::traversal_category, thrust::_V_300303_SM_1200::detail::make_zip_iterator_base<cuda::std::__4::tuple<int *, int *>>::reference, thrust::_V_300303_SM_1200::detail::make_zip_iterator_base<cuda::std::__4::tuple<int *, int *>>::difference_type>::reference" (aka "thrust::_V_300303_SM_1200::detail::tuple_of_iterator_references<int &, int &>") to "cub::_V_300303_SM_1200::detail::reduce::AgentReduceByKey<cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<cuda::std::__4::__accumulator_t<cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>>, cub::_V_300303_SM_1200::detail::non_void_value_t<thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, cuda::std::__4::decay<lambda [](cuda::std::__4::tuple</* etc... */>)->cuda::std::__4::tuple_element_t</* etc... */>>::type>, int *>, cub::_V_300303_SM_1200::detail::it_value_t<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>>>>::Policy1000::ReduceByKeyPolicyT, thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, cuda::std::__4::decay<lambda [](cuda::std::__4::tuple<thrust::_V_300303_SM_1200::detail::it_value_t<int *>, thrust::_V_300303_SM_1200::detail::it_value_t<int *>>)->cuda::std::__4::tuple_element_t<0UL, cuda::std::__4::tuple<thrust::_V_300303_SM_1200::detail::it_value_t</* etc... */>, thrust::_V_300303_SM_1200::detail::it_value_t</* etc... */>>>>::type>, int *>, thrust::_V_300303_SM_1200::constant_iterator<cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>, thrust::_V_300303_SM_1200::use_default>, int *, int *, cuda::std::__4::equal_to<void>, cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::reduce::DispatchStreamingReduceByKey<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, cuda::std::__4::decay<lambda [](cuda::std::__4::tuple<thrust::_V_300303_SM_1200::detail::it_value_t</* etc... */>, thrust::_V_300303_SM_1200::detail::it_value_t</* etc... */>>)->cuda::std::__4::tuple_element_t<0UL, cuda::std::__4::tuple</* etc... */>>>::type>, int *>, thrust::_V_300303_SM_1200::constant_iterator<cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>, thrust::_V_300303_SM_1200::use_default>, int *, int *, cuda::std::__4::equal_to<void>, cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>, cuda::std::__4::__accumulator_t<cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>>, cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<cuda::std::__4::__accumulator_t<cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>>, cub::_V_300303_SM_1200::detail::non_void_value_t<thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, cuda::std::__4::decay</* etc... */>::type>, int *>, cub::_V_300303_SM_1200::detail::it_value_t<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple</* etc... */>>>>>>::local_offset_t, cuda::std::__4::__accumulator_t<cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>>, cub::_V_300303_SM_1200::detail::reduce::DispatchStreamingReduceByKey<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, cuda::std::__4::decay<lambda [](cuda::std::__4::tuple<thrust::_V_300303_SM_1200::detail::it_value_t</* etc... */>, thrust::_V_300303_SM_1200::detail::it_value_t</* etc... */>>)->cuda::std::__4::tuple_element_t<0UL, cuda::std::__4::tuple</* etc... */>>>::type>, int *>, thrust::_V_300303_SM_1200::constant_iterator<cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>, thrust::_V_300303_SM_1200::use_default>, int *, int *, cuda::std::__4::equal_to<void>, cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>, cuda::std::__4::__accumulator_t<cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>>, cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<cuda::std::__4::__accumulator_t<cuda::std::__4::plus<void>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>, cub::_V_300303_SM_1200::detail::non_void_value_t<int *, cub::_V_300303_SM_1200::detail::choose_signed_offset_t<int>>>, cub::_V_300303_SM_1200::detail::non_void_value_t<thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, cuda::std::__4::decay</* etc... */>::type>, int *>, cub::_V_300303_SM_1200::detail::it_value_t<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple</* etc... */>>>>>>::streaming_context_t>::KeyOutputT" exists
dst_items[i] = block_src_it[src_pos];
^
detected during:
instantiation of "void cub::_V_300303_SM_1200::LoadDirectBlocked(int, RandomAccessIterator, T (&)[ItemsPerThread], int) [with T=int, ItemsPerThread=14, RandomAccessIterator=thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>]" at line 874
instantiation of "void cub::_V_300303_SM_1200::BlockLoad<T, BlockDimX, ItemsPerThread, Algorithm, BlockDimY, BlockDimZ>::LoadInternal<cub::_V_300303_SM_1200::BLOCK_LOAD_DIRECT, Dummy>::Load(RandomAccessIterator, T (&)[ItemsPerThread], int) [with T=int, BlockDimX=256, ItemsPerThread=14, Algorithm=cub::_V_300303_SM_1200::BLOCK_LOAD_DIRECT, BlockDimY=1, BlockDimZ=1, Dummy=0, RandomAccessIterator=thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>]" at line 1247
instantiation of "void cub::_V_300303_SM_1200::BlockLoad<T, BlockDimX, ItemsPerThread, Algorithm, BlockDimY, BlockDimZ>::Load(RandomAccessIterator, T (&)[ItemsPerThread], int) [with T=int, BlockDimX=256, ItemsPerThread=14, Algorithm=cub::_V_300303_SM_1200::BLOCK_LOAD_DIRECT, BlockDimY=1, BlockDimZ=1, RandomAccessIterator=thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>]" at line 535 of /opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/agent/agent_reduce_by_key.cuh
instantiation of "void cub::_V_300303_SM_1200::detail::reduce::AgentReduceByKey<AgentReduceByKeyPolicyT, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOpT, ReductionOpT, OffsetT, AccumT, StreamingContextT>::ConsumeTile<IS_LAST_TILE>(OffsetT, int, OffsetT, cub::_V_300303_SM_1200::detail::reduce::AgentReduceByKey<AgentReduceByKeyPolicyT, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOpT, ReductionOpT, OffsetT, AccumT, StreamingContextT>::ScanTileStateT &) [with AgentReduceByKeyPolicyT=cub::_V_300303_SM_1200::AgentReduceByKeyPolicy<256, 14, cub::_V_300303_SM_1200::BLOCK_LOAD_DIRECT, cub::_V_300303_SM_1200::LOAD_CA, cub::_V_300303_SM_1200::BLOCK_SCAN_WARP_SCANS, cub::_V_300303_SM_1200::detail::exponential_backon_constructor_t<956U, 70U>>, KeysInputIteratorT=thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, UniqueOutputIteratorT=thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, lambda [](cuda::std::__4::tuple<int, int>)->int>, int *>, ValuesInputIteratorT=thrust::_V_300303_SM_1200::constant_iterator<int, int32_t, thrust::_V_300303_SM_1200::use_default>, AggregatesOutputIteratorT=int *, NumRunsOutputIteratorT=int *, EqualityOpT=cuda::std::__4::equal_to<void>, ReductionOpT=cuda::std::__4::plus<void>, OffsetT=int32_t, AccumT=int, StreamingContextT=cub::_V_300303_SM_1200::NullType, IS_LAST_TILE=false]" at line 769 of /opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/agent/agent_reduce_by_key.cuh
instantiation of "void cub::_V_300303_SM_1200::detail::reduce::AgentReduceByKey<AgentReduceByKeyPolicyT, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOpT, ReductionOpT, OffsetT, AccumT, StreamingContextT>::ConsumeRange(OffsetT, cub::_V_300303_SM_1200::detail::reduce::AgentReduceByKey<AgentReduceByKeyPolicyT, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOpT, ReductionOpT, OffsetT, AccumT, StreamingContextT>::ScanTileStateT &, int) [with AgentReduceByKeyPolicyT=cub::_V_300303_SM_1200::AgentReduceByKeyPolicy<256, 14, cub::_V_300303_SM_1200::BLOCK_LOAD_DIRECT, cub::_V_300303_SM_1200::LOAD_CA, cub::_V_300303_SM_1200::BLOCK_SCAN_WARP_SCANS, cub::_V_300303_SM_1200::detail::exponential_backon_constructor_t<956U, 70U>>, KeysInputIteratorT=thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, UniqueOutputIteratorT=thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, lambda [](cuda::std::__4::tuple<int, int>)->int>, int *>, ValuesInputIteratorT=thrust::_V_300303_SM_1200::constant_iterator<int, int32_t, thrust::_V_300303_SM_1200::use_default>, AggregatesOutputIteratorT=int *, NumRunsOutputIteratorT=int *, EqualityOpT=cuda::std::__4::equal_to<void>, ReductionOpT=cuda::std::__4::plus<void>, OffsetT=int32_t, AccumT=int, StreamingContextT=cub::_V_300303_SM_1200::NullType]" at line 232 of /opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/device/dispatch/dispatch_reduce_by_key.cuh
[ 2 instantiation contexts not shown ]
instantiation of "cudaError_t cub::_V_300303_SM_1200::ChainedPolicy<PolicyPtxVersion, PolicyT, PrevPolicyT>::find_and_invoke_policy<DevicePtxVersion,FunctorT>(FunctorT &) [with PolicyPtxVersion=1000, PolicyT=cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>::Policy1000, PrevPolicyT=cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>::Policy900, DevicePtxVersion=1200, FunctorT=cub::_V_300303_SM_1200::detail::reduce::DispatchStreamingReduceByKey<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, lambda [](cuda::std::__4::tuple<int, int>)->int>, int *>, thrust::_V_300303_SM_1200::constant_iterator<int, int32_t, thrust::_V_300303_SM_1200::use_default>, int *, int *, cuda::std::__4::equal_to<void>, cuda::std::__4::plus<void>, int32_t, int, cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>>]" at line 831 of /opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/util_device.cuh
instantiation of "cudaError_t cub::_V_300303_SM_1200::ChainedPolicy<PolicyPtxVersion, PolicyT, PrevPolicyT>::runtime_arch_to_compiletime<ArchMult,CudaArches...,FunctorT>(int, FunctorT &) [with PolicyPtxVersion=1000, PolicyT=cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>::Policy1000, PrevPolicyT=cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>::Policy900, ArchMult=1, CudaArches=<1200>, FunctorT=cub::_V_300303_SM_1200::detail::reduce::DispatchStreamingReduceByKey<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, lambda [](cuda::std::__4::tuple<int, int>)->int>, int *>, thrust::_V_300303_SM_1200::constant_iterator<int, int32_t, thrust::_V_300303_SM_1200::use_default>, int *, int *, cuda::std::__4::equal_to<void>, cuda::std::__4::plus<void>, int32_t, int, cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>>]" at line 796 of /opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/util_device.cuh
instantiation of "cudaError_t cub::_V_300303_SM_1200::ChainedPolicy<PolicyPtxVersion, PolicyT, PrevPolicyT>::Invoke(int, FunctorT &) [with PolicyPtxVersion=1000, PolicyT=cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>::Policy1000, PrevPolicyT=cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>::Policy900, FunctorT=cub::_V_300303_SM_1200::detail::reduce::DispatchStreamingReduceByKey<thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, lambda [](cuda::std::__4::tuple<int, int>)->int>, int *>, thrust::_V_300303_SM_1200::constant_iterator<int, int32_t, thrust::_V_300303_SM_1200::use_default>, int *, int *, cuda::std::__4::equal_to<void>, cuda::std::__4::plus<void>, int32_t, int, cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>>]" at line 443 of /opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/device/dispatch/dispatch_streaming_reduce_by_key.cuh
instantiation of "cudaError_t cub::_V_300303_SM_1200::detail::reduce::DispatchStreamingReduceByKey<KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOpT, ReductionOpT, OffsetT, AccumT, PolicyHub>::Dispatch(void *, size_t &, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOpT, ReductionOpT, cub::_V_300303_SM_1200::detail::reduce::DispatchStreamingReduceByKey<KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOpT, ReductionOpT, OffsetT, AccumT, PolicyHub>::global_offset_t, cudaStream_t) [with KeysInputIteratorT=thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, UniqueOutputIteratorT=thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, lambda [](cuda::std::__4::tuple<int, int>)->int>, int *>, ValuesInputIteratorT=thrust::_V_300303_SM_1200::constant_iterator<int, int32_t, thrust::_V_300303_SM_1200::use_default>, AggregatesOutputIteratorT=int *, NumRunsOutputIteratorT=int *, EqualityOpT=cuda::std::__4::equal_to<void>, ReductionOpT=cuda::std::__4::plus<void>, OffsetT=int32_t, AccumT=int, PolicyHub=cub::_V_300303_SM_1200::detail::rle::encode::policy_hub<int, int>]" at line 191 of /opt/compiler-explorer/cuda/13.3.0/bin/../targets/x86_64-linux/include/cccl/cub/device/device_run_length_encode.cuh
instantiation of "cudaError_t cub::_V_300303_SM_1200::DeviceRunLengthEncode::Encode(void *, size_t &, InputIteratorT, UniqueOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, NumItemsT, cudaStream_t) [with InputIteratorT=thrust::_V_300303_SM_1200::zip_iterator<cuda::std::__4::tuple<int *, int *>>, UniqueOutputIteratorT=thrust::_V_300303_SM_1200::transform_output_iterator<cuda::__4::__detail::__return_type_wrapper<int, lambda [](cuda::std::__4::tuple<int, int>)->int>, int *>, LengthsOutputIteratorT=int *, NumRunsOutputIteratorT=int *, NumItemsT=int]" at line 36 of <source>
Is this a duplicate?
Type of Bug
Compile-time Error
Component
CUB
Describe the bug
I am computing an RLE with tuple inputs, but for the output I only need one element of the tuple. To reduce the allocated memory for output, I would like to use a transform_output_iterator which maps a tuple to the required element. This does not compile.
Maybe related: #821
The compilation error is very verbose.
How to Reproduce
https://godbolt.org/z/Prh5qTebT
Expected behavior
It works with transform_output_iterator
Reproduction link
https://godbolt.org/z/Prh5qTebT
Operating System
No response
nvidia-smi output
No response
NVCC version
No response