Skip to content

Commit 3a08273

Browse files
committed
Implement parallel cuda::std::iota
This implements the `iota` algorithm for the cuda backend. * `std::iota`, see https://en.cppreference.com/w/cpp/algorithm/iota.html. It provides tests and benchmarks similar to Thrust, and some boilerplate for libcu++. The functionality is not yet publicly available and is implemented in a private internal header. Fixes #7927
1 parent 2c32015 commit 3a08273

File tree

4 files changed

+418
-0
lines changed

4 files changed

+418
-0
lines changed
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <thrust/device_vector.h>
12+
#include <thrust/sequence.h>
13+
14+
#include <cuda/memory_pool>
15+
#include <cuda/std/__pstl_algorithm>
16+
#include <cuda/stream>
17+
18+
#include "nvbench_helper.cuh"
19+
20+
// Benchmarks the parallel cuda::std::iota overload with the implicit unit step:
// fills a device buffer of `Elements` values starting at T{42}.
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  // Output buffer only; iota produces every element, so skip initialization.
  thrust::device_vector<T> output(num_elements, thrust::no_init);

  caching_allocator_t alloc{};

  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(1);
  state.add_global_memory_writes<T>(num_elements);

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               cuda::std::iota(cuda_policy(alloc, launch), output.begin(), output.end(), T{42});
             });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
43+
44+
// Benchmarks the parallel cuda::std::iota overload with an explicit step:
// fills a device buffer of `Elements` values starting at T{42} with stride T{2}.
template <typename T>
static void stepped(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  // Output buffer only; iota produces every element, so skip initialization.
  thrust::device_vector<T> output(num_elements, thrust::no_init);

  caching_allocator_t alloc{};

  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(1);
  state.add_global_memory_writes<T>(num_elements);

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               cuda::std::iota(cuda_policy(alloc, launch), output.begin(), output.end(), T{42}, T{2});
             });
}

NVBENCH_BENCH_TYPES(stepped, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("stepped")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___PSTL_IOTA_H
12+
#define _CUDA_STD___PSTL_IOTA_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#if !_CCCL_COMPILER(NVRTC)
25+
26+
# include <cuda/__iterator/counting_iterator.h>
27+
# include <cuda/__iterator/strided_iterator.h>
28+
# include <cuda/__nvtx/nvtx.h>
29+
# include <cuda/std/__concepts/concept_macros.h>
30+
# include <cuda/std/__execution/policy.h>
31+
# include <cuda/std/__functional/not_fn.h>
32+
# include <cuda/std/__iterator/concepts.h>
33+
# include <cuda/std/__iterator/incrementable_traits.h>
34+
# include <cuda/std/__iterator/iterator_traits.h>
35+
# include <cuda/std/__numeric/iota.h>
36+
# include <cuda/std/__pstl/dispatch.h>
37+
# include <cuda/std/__type_traits/always_false.h>
38+
# include <cuda/std/__type_traits/is_arithmetic.h>
39+
# include <cuda/std/__type_traits/is_convertible.h>
40+
# include <cuda/std/__type_traits/is_execution_policy.h>
41+
# include <cuda/std/__type_traits/is_integral.h>
42+
# include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
43+
# include <cuda/std/__type_traits/is_same.h>
44+
# include <cuda/std/__utility/move.h>
45+
46+
# if _CCCL_HAS_BACKEND_CUDA()
47+
# include <cuda/std/__pstl/cuda/transform.h>
48+
# endif // _CCCL_HAS_BACKEND_CUDA()
49+
50+
# include <cuda/std/__cccl/prologue.h>
51+
52+
_CCCL_BEGIN_NAMESPACE_CUDA_STD
53+
54+
// Detects whether `T + uint64_t` is well-formed and convertible back to T.
template <class _Tp>
_CCCL_CONCEPT __can_operator_plus_integral = _CCCL_REQUIRES_EXPR((_Tp), const _Tp& __val, const uint64_t __index)(
  requires(is_convertible_v<decltype(__val + __index), _Tp>));

// Fallback detection: `T + static_cast<T>(uint64_t)` is well-formed and convertible back to T.
template <class _Tp>
_CCCL_CONCEPT __can_operator_plus_conversion = _CCCL_REQUIRES_EXPR((_Tp), const _Tp& __val, const uint64_t __index)(
  requires(is_convertible_v<decltype(__val + static_cast<_Tp>(__index)), _Tp>));

// Transformation functor mapping a flat element index to `init + index`.
// Used to implement the unit-step iota overload on top of the parallel transform backend.
template <class _Tp>
struct __iota_init_fn
{
  _Tp __init_; // first value of the generated sequence

  _CCCL_API constexpr __iota_init_fn(const _Tp& __init) noexcept(is_nothrow_copy_constructible_v<_Tp>)
      : __init_(__init)
  {}

  // Note: `constexpr _Tp` ordering kept consistent with __iota_init_step_fn::operator()
  [[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE constexpr _Tp operator()(const uint64_t __index) const
  {
# if _CCCL_HAS_NVFP16()
    // We cannot rely on operator+ and constructors from integers to be available for the extended fp types
    if constexpr (is_same_v<_Tp, __half>)
    {
      return ::__hadd(__init_, ::__ull2half_rn(__index));
    }
    else
# endif // _CCCL_HAS_NVFP16()
# if _CCCL_HAS_NVBF16()
      if constexpr (is_same_v<_Tp, __nv_bfloat16>)
    {
      return ::__hadd(__init_, ::__ull2bfloat16_rn(__index));
    }
    else
# endif // _CCCL_HAS_NVBF16()
      if constexpr (is_arithmetic_v<_Tp>)
    { // avoid warnings about integer conversions
      return static_cast<_Tp>(__init_ + static_cast<_Tp>(__index));
    }
    else if constexpr (__can_operator_plus_integral<_Tp>)
    {
      return __init_ + __index;
    }
    else if constexpr (__can_operator_plus_conversion<_Tp>)
    {
      return __init_ + static_cast<_Tp>(__index);
    }
    else
    {
      static_assert(__always_false_v<_Tp>, "cuda::std::iota(iter, iter, init) requires that T supports operator+");
    }
  }
};
106+
107+
// Detects whether `T + T * uint64_t` is well-formed and convertible back to T.
template <class _Tp>
_CCCL_CONCEPT __can_operator_plus_times_integral = _CCCL_REQUIRES_EXPR((_Tp), const _Tp& __val, const uint64_t __index)(
  requires(is_convertible_v<decltype(__val + __val * __index), _Tp>));

// Fallback detection: `T + T * static_cast<T>(uint64_t)` is well-formed and convertible back to T.
template <class _Tp>
_CCCL_CONCEPT __can_operator_plus_times_conversion =
  _CCCL_REQUIRES_EXPR((_Tp), const _Tp& __val, const uint64_t __index) //
  (requires(is_convertible_v<decltype(__val + __val * static_cast<_Tp>(__index)), _Tp>));

// Transformation functor mapping a flat element index to `init + step * index`.
// Used to implement the stepped iota overload on top of the parallel transform backend.
template <class _Tp>
struct __iota_init_step_fn
{
  _Tp __init_; // first value of the generated sequence
  _Tp __step_; // distance between consecutive values

  _CCCL_API constexpr __iota_init_step_fn(const _Tp& __init,
                                          const _Tp& __step) noexcept(is_nothrow_copy_constructible_v<_Tp>)
      : __init_(__init)
      , __step_(__step)
  {}

  [[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE constexpr _Tp operator()(const uint64_t __index) const
  {
# if _CCCL_HAS_NVFP16()
    // The extended fp types do not reliably provide operator+ / operator* or constructors
    // from integers, so go through the dedicated device intrinsics instead.
    if constexpr (is_same_v<_Tp, __half>)
    {
      return ::__hadd(__init_, ::__hmul(__step_, ::__ull2half_rn(__index)));
    }
    else
# endif // _CCCL_HAS_NVFP16()
# if _CCCL_HAS_NVBF16()
      if constexpr (is_same_v<_Tp, __nv_bfloat16>)
    {
      return ::__hadd(__init_, ::__hmul(__step_, ::__ull2bfloat16_rn(__index)));
    }
    else
# endif // _CCCL_HAS_NVBF16()
      if constexpr (is_arithmetic_v<_Tp>)
    {
      // Cast the index first to avoid warnings about integer conversions.
      const auto __offset = __step_ * static_cast<_Tp>(__index);
      return static_cast<_Tp>(__init_ + __offset);
    }
    else if constexpr (__can_operator_plus_times_integral<_Tp>)
    {
      return __init_ + __step_ * __index;
    }
    else if constexpr (__can_operator_plus_times_conversion<_Tp>)
    {
      return __init_ + __step_ * static_cast<_Tp>(__index);
    }
    else
    {
      static_assert(__always_false_v<_Tp>,
                    "cuda::std::iota(iter, iter, init, step) requires that T supports operator+ and operator*");
    }
  }
};
164+
165+
_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
166+
167+
// Parallel iota: fills [__first, __last) with sequentially increasing values starting at
// __init, i.e. *(first + i) = init + i. Dispatches to the parallel transform backend
// selected by the execution policy; if no backend was selected the program is ill-formed
// (see the static_assert in the else branch).
_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _Tp = iter_value_t<_InputIterator>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>)
_CCCL_HOST_API void
iota([[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, const _Tp& __init = _Tp{})
{
  static_assert(indirectly_writable<_InputIterator, _Tp>,
                "cuda::std::iota requires InputIterator to be indirectly writable with T");

  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__transform, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::iota");

    // Nothing to do for an empty range
    if (__first == __last)
    {
      return;
    }

    // Note: using a different offset type than uint64_t degrades performance considerably for larger integer types
    const auto __count = static_cast<uint64_t>(::cuda::std::distance(__first, __last));
    // For whatever reason __iota_init_step_fn is much faster for int64_t and __int128
    if constexpr (is_arithmetic_v<_Tp>)
    {
      [[maybe_unused]] auto __res = __dispatch(
        __policy,
        ::cuda::counting_iterator<uint64_t>{0},
        ::cuda::counting_iterator<uint64_t>{__count},
        ::cuda::std::move(__first),
        __iota_init_step_fn{__init, _Tp{1}});
    }
    else
    {
      [[maybe_unused]] auto __res = __dispatch(
        __policy,
        ::cuda::counting_iterator<uint64_t>{0},
        ::cuda::counting_iterator<uint64_t>{__count},
        ::cuda::std::move(__first),
        __iota_init_fn{__init});
    }
  }
  else
  {
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::iota requires at least one selected backend");
    // Unreachable in a well-formed program (the static_assert above fires first); kept so the
    // signature stays usable for tooling that instantiates without a backend.
    return ::cuda::std::iota(::cuda::std::move(__first), ::cuda::std::move(__last), __init);
  }
}
214+
215+
// Parallel stepped iota: fills [__first, __last) with *(first + i) = init + step * i.
// Dispatches to the parallel transform backend selected by the execution policy; if no
// backend was selected the program is ill-formed (see the static_assert in the else branch).
_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _Tp = iter_value_t<_InputIterator>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>)
_CCCL_HOST_API void
iota([[maybe_unused]] const _Policy& __policy,
     _InputIterator __first,
     _InputIterator __last,
     const _Tp& __init,
     const _Tp& __step)
{
  static_assert(indirectly_writable<_InputIterator, _Tp>,
                "cuda::std::iota requires InputIterator to be indirectly writable with T");

  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__transform, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::iota");

    // Nothing to do for an empty range
    if (__first == __last)
    {
      return;
    }

    // Note: using a different offset type than uint64_t degrades performance considerably for larger integer types
    const auto __count = static_cast<uint64_t>(::cuda::std::distance(__first, __last));
    [[maybe_unused]] auto __res = __dispatch(
      __policy,
      ::cuda::counting_iterator<uint64_t>{0},
      ::cuda::counting_iterator<uint64_t>{__count},
      ::cuda::std::move(__first),
      __iota_init_step_fn{__init, __step});
  }
  else
  {
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::iota requires at least one selected backend");
    // TODO(miscco): Consider adding that overload to serial iota
    return ::cuda::std::iota(::cuda::std::move(__first), ::cuda::std::move(__last), __init /*, __step*/);
  }
}
254+
255+
_CCCL_END_NAMESPACE_ARCH_DEPENDENT
256+
257+
_CCCL_END_NAMESPACE_CUDA_STD
258+
259+
# include <cuda/std/__cccl/epilogue.h>
260+
261+
#endif // !_CCCL_COMPILER(NVRTC)
262+
263+
#endif // _CUDA_STD___PSTL_IOTA_H

libcudacxx/include/cuda/std/__pstl_algorithm

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#include <cuda/std/__pstl/generate.h>
4343
#include <cuda/std/__pstl/generate_n.h>
4444
#include <cuda/std/__pstl/inclusive_scan.h>
45+
#include <cuda/std/__pstl/iota.h>
4546
#include <cuda/std/__pstl/merge.h>
4647
#include <cuda/std/__pstl/mismatch.h>
4748
#include <cuda/std/__pstl/none_of.h>

0 commit comments

Comments
 (0)