diff --git a/libcudacxx/benchmarks/bench/iota/basic.cu b/libcudacxx/benchmarks/bench/iota/basic.cu new file mode 100644 index 00000000000..8b953852814 --- /dev/null +++ b/libcudacxx/benchmarks/bench/iota/basic.cu @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include +#include +#include + +#include "nvbench_helper.cuh" + +template +static void basic(nvbench::state& state, nvbench::type_list) +{ + const auto elements = static_cast(state.get_int64("Elements")); + + thrust::device_vector in(elements, thrust::no_init); + + state.add_element_count(elements); + state.add_global_memory_reads(1); + state.add_global_memory_writes(elements); + + caching_allocator_t alloc{}; + + state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { + cuda::std::iota(cuda_policy(alloc, launch), in.begin(), in.end(), T{42}); + }); +} + +NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("base") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)); + +template +static void stepped(nvbench::state& state, nvbench::type_list) +{ + const auto elements = static_cast(state.get_int64("Elements")); + + thrust::device_vector in(elements, thrust::no_init); + + state.add_element_count(elements); + state.add_global_memory_reads(1); + state.add_global_memory_writes(elements); + + caching_allocator_t alloc{}; + + state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { + cuda::std::iota(cuda_policy(alloc, launch), in.begin(), in.end(), T{42}, T{2}); + }); +} + +NVBENCH_BENCH_TYPES(stepped, NVBENCH_TYPE_AXES(fundamental_types)) + .set_name("stepped") + .set_type_axes_names({"T{ct}"}) + .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)); diff --git a/libcudacxx/include/cuda/std/__pstl/iota.h b/libcudacxx/include/cuda/std/__pstl/iota.h new file mode 100644 index 00000000000..160b1c12b13 --- /dev/null +++ b/libcudacxx/include/cuda/std/__pstl/iota.h @@ -0,0 +1,263 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___PSTL_IOTA_H +#define _CUDA_STD___PSTL_IOTA_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if !_CCCL_COMPILER(NVRTC) + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# if _CCCL_HAS_BACKEND_CUDA() +# include +# endif // _CCCL_HAS_BACKEND_CUDA() + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +template +_CCCL_CONCEPT __can_operator_plus_integral = _CCCL_REQUIRES_EXPR((_Tp), const _Tp& __val, const uint64_t __index)( + requires(is_convertible_v)); + +template +_CCCL_CONCEPT __can_operator_plus_conversion = _CCCL_REQUIRES_EXPR((_Tp), const _Tp& __val, const uint64_t __index)( + requires(is_convertible_v(__index)), _Tp>)); + +template +struct __iota_init_fn +{ + _Tp __init_; + + _CCCL_API constexpr __iota_init_fn(const _Tp& __init) noexcept(is_nothrow_copy_constructible_v<_Tp>) + : __init_(__init) + {} + + [[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE _Tp constexpr operator()(const uint64_t __index) const + { +# if _LIBCUDACXX_HAS_NVFP16() + // We cannot rely on operator+ and constructors from integers to be available for the extended fp types + if constexpr (is_same_v<_Tp, __half>) + { + return ::__hadd(__init_, ::__ull2half_rn(__index)); + } + else +# endif // _LIBCUDACXX_HAS_NVFP16() +# if _LIBCUDACXX_HAS_NVBF16() + if constexpr (is_same_v<_Tp, __nv_bfloat16>) + { + return ::__hadd(__init_, ::__ull2bfloat16_rn(__index)); + } + else +# endif // _LIBCUDACXX_HAS_NVBF16() + if constexpr (is_arithmetic_v<_Tp>) + { // avoid warnings about integer conversions + return static_cast<_Tp>(__init_ + static_cast<_Tp>(__index)); + } + else if constexpr (__can_operator_plus_integral<_Tp>) + { + return __init_ + __index; + } + else if constexpr (__can_operator_plus_conversion<_Tp>) + { + return __init_ + static_cast<_Tp>(__index); + } + else + { + static_assert(__always_false_v<_Tp>, "cuda::std::iota(iter, iter, init) requires that T supports operator+"); + } + } +}; + +template +_CCCL_CONCEPT __can_operator_plus_times_integral = _CCCL_REQUIRES_EXPR((_Tp), const _Tp& __val, const uint64_t __index)( + requires(is_convertible_v)); + +template +_CCCL_CONCEPT __can_operator_plus_times_conversion = + _CCCL_REQUIRES_EXPR((_Tp), const _Tp& __val, const uint64_t __index) // + (requires(is_convertible_v(__index)), _Tp>)); + +template +struct __iota_init_step_fn +{ + _Tp __init_; + _Tp __step_; + + _CCCL_API constexpr __iota_init_step_fn(const _Tp& __init, + const _Tp& __step) noexcept(is_nothrow_copy_constructible_v<_Tp>) + : __init_(__init) + , __step_(__step) + {} + + [[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE constexpr _Tp operator()(const uint64_t __index) const + { +# if _LIBCUDACXX_HAS_NVFP16() + // We cannot rely on operator+ and constructors from integers to be available for the extended fp types + if constexpr (is_same_v<_Tp, __half>) + { + return ::__hadd(__init_, ::__hmul(__step_, ::__ull2half_rn(__index))); + } + else +# endif // _LIBCUDACXX_HAS_NVFP16() +# if _LIBCUDACXX_HAS_NVBF16() + if constexpr (is_same_v<_Tp, __nv_bfloat16>) + { + return ::__hadd(__init_, ::__hmul(__step_, ::__ull2bfloat16_rn(__index))); + } + else +# endif // _LIBCUDACXX_HAS_NVBF16() + if constexpr (is_arithmetic_v<_Tp>) + { // avoid warnings about integer conversions + return static_cast<_Tp>(__init_ + __step_ * static_cast<_Tp>(__index)); + } + else if constexpr (__can_operator_plus_times_integral<_Tp>) + { + return __init_ + __step_ * __index; + } + else if constexpr (__can_operator_plus_times_conversion<_Tp>) + { + return __init_ + __step_ * static_cast<_Tp>(__index); + } + else + { + static_assert(__always_false_v<_Tp>, + "cuda::std::iota(iter, iter, init, step) requires that T supports operator+ and operator*"); + } + } +}; + +_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT + +_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _Tp = iter_value_t<_InputIterator>) +_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>) +_CCCL_HOST_API void +iota([[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, const _Tp& __init = _Tp{}) +{ + static_assert(indirectly_writable<_InputIterator, _Tp>, + "cuda::std::iota requires InputIterator to be indirectly writable with T"); + + [[maybe_unused]] auto __dispatch = + ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__transform, _Policy>(); + if constexpr (::cuda::std::execution::__pstl_can_dispatch) + { + _CCCL_NVTX_RANGE_SCOPE("cuda::std::iota"); + + if (__first == __last) + { + return; + } + + // Note: using a different offset type than uint64_t degrades performance considerably for larger integer types + const auto __count = static_cast(::cuda::std::distance(__first, __last)); + // For whatever reason __iota_init_step_fn is much faster for int64_t and __int128 + if constexpr (is_arithmetic_v<_Tp>) + { + (void) __dispatch( + __policy, + ::cuda::counting_iterator{0}, + ::cuda::counting_iterator{__count}, + ::cuda::std::move(__first), + __iota_init_step_fn{__init, _Tp{1}}); + } + else + { + (void) __dispatch( + __policy, + ::cuda::counting_iterator{0}, + ::cuda::counting_iterator{static_cast(__count)}, + ::cuda::std::move(__first), + __iota_init_fn{__init}); + } + } + else + { + static_assert(__always_false_v<_Policy>, "Parallel cuda::std::iota requires at least one selected backend"); + return ::cuda::std::iota(::cuda::std::move(__first), ::cuda::std::move(__last), __init); + } +} + +_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _Tp = iter_value_t<_InputIterator>) +_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>) +_CCCL_HOST_API void +iota([[maybe_unused]] const _Policy& __policy, + _InputIterator __first, + _InputIterator __last, + const _Tp& __init, + const _Tp& __step) +{ + static_assert(indirectly_writable<_InputIterator, _Tp>, + "cuda::std::iota requires InputIterator to be indirectly writable with T"); + + [[maybe_unused]] auto __dispatch = + ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__transform, _Policy>(); + if constexpr (::cuda::std::execution::__pstl_can_dispatch) + { + _CCCL_NVTX_RANGE_SCOPE("cuda::std::iota"); + + if (__first == __last) + { + return; + } + + // Note: using a different offset type than uint64_t degrades performance considerably for larger integer types + const auto __count = static_cast(::cuda::std::distance(__first, __last)); + (void) __dispatch( + __policy, + ::cuda::counting_iterator{0}, + ::cuda::counting_iterator{__count}, + ::cuda::std::move(__first), + __iota_init_step_fn{__init, __step}); + } + else + { + static_assert(__always_false_v<_Policy>, "Parallel cuda::std::iota requires at least one selected backend"); + // TODO(miscco): Consider adding that overload to serial iota + return ::cuda::std::iota(::cuda::std::move(__first), ::cuda::std::move(__last), __init /*, __step*/); + } +} + +_CCCL_END_NAMESPACE_ARCH_DEPENDENT + +_CCCL_END_NAMESPACE_CUDA_STD + +# include + +#endif // !_CCCL_COMPILER(NVRTC) + +#endif // _CUDA_STD___PSTL_IOTA_H diff --git a/libcudacxx/include/cuda/std/__pstl_algorithm b/libcudacxx/include/cuda/std/__pstl_algorithm index f3108051c70..9b39487f0dd 100644 --- a/libcudacxx/include/cuda/std/__pstl_algorithm +++ b/libcudacxx/include/cuda/std/__pstl_algorithm @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/numeric.iota/pstl_iota.cu b/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/numeric.iota/pstl_iota.cu new file mode 100644 index 00000000000..c88adf3dea6 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/numeric.ops/numeric.iota/pstl_iota.cu @@ -0,0 +1,88 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// template > +// void iota(const Policy& policy, +// InputIterator first, +// InputIterator last, +// const T& init = T{}); + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +inline constexpr int size = 1000; + +template +void test_iota(const Policy& policy, thrust::device_vector& output) +{ + cuda::std::fill(policy, output.begin(), output.end(), -1); + { // With default value + cuda::std::iota(policy, output.begin(), output.end()); + CHECK(cuda::std::equal(output.begin(), output.end(), cuda::counting_iterator{0})); + } + + cuda::std::fill(policy, output.begin(), output.end(), -1); + { // With init + cuda::std::iota(policy, output.begin(), output.end(), 42); + CHECK(cuda::std::equal(output.begin(), output.end(), cuda::counting_iterator{42})); + } + + cuda::std::fill(policy, output.begin(), output.end(), -1); + { // With init and step + cuda::std::iota(policy, output.begin(), output.end(), 42, 2); + CHECK(cuda::std::equal(output.begin(), output.end(), cuda::strided_iterator{cuda::counting_iterator{42}, 2})); + } +} + +C2H_TEST("cuda::std::iota", "[parallel algorithm]") +{ + thrust::device_vector output(size, thrust::no_init); + + SECTION("with default stream") + { + const auto policy = cuda::execution::__cub_par_unseq; + + test_iota(policy, output); + } + + SECTION("with provided stream") + { + cuda::stream stream{cuda::device_ref{0}}; + const auto policy = cuda::execution::__cub_par_unseq.with_stream(stream); + + test_iota(policy, output); + } + + SECTION("with provided memory_resource") + { + cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(cuda::device_ref{0}); + const auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(device_resource); + + test_iota(policy, output); + } + + SECTION("with provided stream and memory_resource") + { + cuda::stream stream{cuda::device_ref{0}}; + cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(stream.device()); + const auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(device_resource).with_stream(stream); + + test_iota(policy, output); + } +}