Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions libcudacxx/benchmarks/bench/iota/basic.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/stream>

#include "nvbench_helper.cuh"

template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  // Element count is driven by the "Elements" power-of-two axis.
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  // Destination buffer; thrust::no_init skips the default fill so only iota is timed.
  thrust::device_vector<T> output(num_elements, thrust::no_init);

  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(1);
  state.add_global_memory_writes<T>(num_elements);

  caching_allocator_t alloc{};

  // Time the parallel iota filling the buffer with 42, 43, 44, ...
  const auto do_iota = [&](nvbench::launch& launch) {
    cuda::std::iota(cuda_policy(alloc, launch), output.begin(), output.end(), T{42});
  };
  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, do_iota);
}

// Register the unit-step benchmark for every fundamental type, sweeping the
// element count over powers of two: 2^16, 2^20, 2^24, 2^28.
NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));

template <typename T>
static void stepped(nvbench::state& state, nvbench::type_list<T>)
{
  // Number of values to generate, taken from the "Elements" axis.
  const auto n = static_cast<std::size_t>(state.get_int64("Elements"));

  // Output buffer; thrust::no_init avoids paying for an initial fill.
  thrust::device_vector<T> output(n, thrust::no_init);

  state.add_element_count(n);
  state.add_global_memory_reads<T>(1);
  state.add_global_memory_writes<T>(n);

  caching_allocator_t alloc{};

  // Time the strided iota overload: writes 42, 44, 46, ... into the buffer.
  const auto do_iota = [&](nvbench::launch& launch) {
    cuda::std::iota(cuda_policy(alloc, launch), output.begin(), output.end(), T{42}, T{2});
  };
  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, do_iota);
}

// Register the strided-step benchmark for every fundamental type, sweeping the
// element count over powers of two: 2^16, 2^20, 2^24, 2^28.
NVBENCH_BENCH_TYPES(stepped, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("stepped")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
263 changes: 263 additions & 0 deletions libcudacxx/include/cuda/std/__pstl/iota.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_IOTA_H
#define _CUDA_STD___PSTL_IOTA_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if !_CCCL_COMPILER(NVRTC)

# include <cuda/__iterator/counting_iterator.h>
# include <cuda/__iterator/strided_iterator.h>
# include <cuda/__nvtx/nvtx.h>
# include <cuda/std/__concepts/concept_macros.h>
# include <cuda/std/__execution/policy.h>
# include <cuda/std/__functional/not_fn.h>
# include <cuda/std/__iterator/concepts.h>
# include <cuda/std/__iterator/incrementable_traits.h>
# include <cuda/std/__iterator/iterator_traits.h>
# include <cuda/std/__numeric/iota.h>
# include <cuda/std/__pstl/dispatch.h>
# include <cuda/std/__type_traits/always_false.h>
# include <cuda/std/__type_traits/is_arithmetic.h>
# include <cuda/std/__type_traits/is_convertible.h>
# include <cuda/std/__type_traits/is_execution_policy.h>
# include <cuda/std/__type_traits/is_integral.h>
# include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
# include <cuda/std/__type_traits/is_same.h>
# include <cuda/std/__utility/move.h>

# if _CCCL_HAS_BACKEND_CUDA()
# include <cuda/std/__pstl/cuda/transform.h>
# endif // _CCCL_HAS_BACKEND_CUDA()

# include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

// Satisfied when `val + index` (index being a raw uint64_t) is convertible back
// to _Tp, i.e. _Tp supports addition with an unsigned integer directly.
template <class _Tp>
_CCCL_CONCEPT __can_operator_plus_integral = _CCCL_REQUIRES_EXPR((_Tp), const _Tp& __val, const uint64_t __index)(
  requires(is_convertible_v<decltype(__val + __index), _Tp>));

// Fallback detection: satisfied when _Tp cannot be added to a raw integer but
// `val + _Tp(index)` works, i.e. the index must first be converted to _Tp.
template <class _Tp>
_CCCL_CONCEPT __can_operator_plus_conversion = _CCCL_REQUIRES_EXPR((_Tp), const _Tp& __val, const uint64_t __index)(
  requires(is_convertible_v<decltype(__val + static_cast<_Tp>(__index)), _Tp>));

// Functor mapping a flat 64-bit index to `init + index`. Applied to a
// cuda::counting_iterator via the transform backend to implement the
// unit-step parallel iota.
template <class _Tp>
struct __iota_init_fn
{
  _Tp __init_; // starting value of the sequence

  _CCCL_API constexpr __iota_init_fn(const _Tp& __init) noexcept(is_nothrow_copy_constructible_v<_Tp>)
      : __init_(__init)
  {}

  // Returns the __index-th element of the sequence, i.e. `__init_ + __index`
  // converted to _Tp. Specifier order fixed to `constexpr _Tp` (was
  // `_Tp constexpr`) for consistency with __iota_init_step_fn::operator().
  [[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE constexpr _Tp operator()(const uint64_t __index) const
  {
#  if _LIBCUDACXX_HAS_NVFP16()
    // We cannot rely on operator+ and constructors from integers to be available for the extended fp types
    if constexpr (is_same_v<_Tp, __half>)
    {
      return ::__hadd(__init_, ::__ull2half_rn(__index));
    }
    else
#  endif // _LIBCUDACXX_HAS_NVFP16()
#  if _LIBCUDACXX_HAS_NVBF16()
      if constexpr (is_same_v<_Tp, __nv_bfloat16>)
    {
      return ::__hadd(__init_, ::__ull2bfloat16_rn(__index));
    }
    else
#  endif // _LIBCUDACXX_HAS_NVBF16()
      if constexpr (is_arithmetic_v<_Tp>)
    { // avoid warnings about integer conversions
      return static_cast<_Tp>(__init_ + static_cast<_Tp>(__index));
    }
    else if constexpr (__can_operator_plus_integral<_Tp>)
    {
      // _Tp can be added to a raw integer index directly.
      return __init_ + __index;
    }
    else if constexpr (__can_operator_plus_conversion<_Tp>)
    {
      // _Tp only supports addition after converting the index to _Tp.
      return __init_ + static_cast<_Tp>(__index);
    }
    else
    {
      static_assert(__always_false_v<_Tp>, "cuda::std::iota(iter, iter, init) requires that T supports operator+");
    }
  }
};

// Satisfied when `val + val * index` (index being a raw uint64_t) is
// convertible back to _Tp, i.e. _Tp supports multiplication by an unsigned
// integer directly. Used by the strided iota functor below.
template <class _Tp>
_CCCL_CONCEPT __can_operator_plus_times_integral = _CCCL_REQUIRES_EXPR((_Tp), const _Tp& __val, const uint64_t __index)(
  requires(is_convertible_v<decltype(__val + __val * __index), _Tp>));

// Fallback detection: satisfied when the index must first be converted to _Tp
// before `val + val * _Tp(index)` is well-formed.
template <class _Tp>
_CCCL_CONCEPT __can_operator_plus_times_conversion =
  _CCCL_REQUIRES_EXPR((_Tp), const _Tp& __val, const uint64_t __index) //
  (requires(is_convertible_v<decltype(__val + __val * static_cast<_Tp>(__index)), _Tp>));

// Functor mapping a flat 64-bit index to `init + step * index`. Applied to a
// cuda::counting_iterator via the transform backend to implement the strided
// parallel iota (and, for arithmetic types, the unit-step one with step == 1).
template <class _Tp>
struct __iota_init_step_fn
{
  _Tp __init_; // starting value of the sequence
  _Tp __step_; // increment between consecutive elements

  _CCCL_API constexpr __iota_init_step_fn(const _Tp& __init,
                                          const _Tp& __step) noexcept(is_nothrow_copy_constructible_v<_Tp>)
      : __init_(__init)
      , __step_(__step)
  {}

  // Returns the __index-th element of the sequence: `__init_ + __step_ * __index`.
  [[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE constexpr _Tp operator()(const uint64_t __index) const
  {
#  if _LIBCUDACXX_HAS_NVFP16()
    // We cannot rely on operator+ and constructors from integers to be available for the extended fp types
    if constexpr (is_same_v<_Tp, __half>)
    {
      return ::__hadd(__init_, ::__hmul(__step_, ::__ull2half_rn(__index)));
    }
    else
#  endif // _LIBCUDACXX_HAS_NVFP16()
#  if _LIBCUDACXX_HAS_NVBF16()
      if constexpr (is_same_v<_Tp, __nv_bfloat16>)
    {
      return ::__hadd(__init_, ::__hmul(__step_, ::__ull2bfloat16_rn(__index)));
    }
    else
#  endif // _LIBCUDACXX_HAS_NVBF16()
      if constexpr (is_arithmetic_v<_Tp>)
    { // avoid warnings about integer conversions
      return static_cast<_Tp>(__init_ + __step_ * static_cast<_Tp>(__index));
    }
    else if constexpr (__can_operator_plus_times_integral<_Tp>)
    {
      // _Tp can be multiplied by a raw integer index directly.
      return __init_ + __step_ * __index;
    }
    else if constexpr (__can_operator_plus_times_conversion<_Tp>)
    {
      // _Tp only supports the computation after converting the index to _Tp.
      return __init_ + __step_ * static_cast<_Tp>(__index);
    }
    else
    {
      static_assert(__always_false_v<_Tp>,
                    "cuda::std::iota(iter, iter, init, step) requires that T supports operator+ and operator*");
    }
  }
};

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

// Parallel iota: writes init, init + 1, init + 2, ... into [first, last)
// using the selected PSTL backend's transform over a counting_iterator.
// Falls back to a static_assert if no backend is available for _Policy.
_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _Tp = iter_value_t<_InputIterator>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>)
_CCCL_HOST_API void
iota([[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, const _Tp& __init = _Tp{})
{
  static_assert(indirectly_writable<_InputIterator, _Tp>,
                "cuda::std::iota requires InputIterator to be indirectly writable with T");

  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__transform, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::iota");

    // Empty range: nothing to write.
    if (__first == __last)
    {
      return;
    }

    // Note: using a different offset type than uint64_t degrades performance considerably for larger integer types
    const auto __count = static_cast<uint64_t>(::cuda::std::distance(__first, __last));
    // For whatever reason __iota_init_step_fn is much faster for int64_t and __int128
    if constexpr (is_arithmetic_v<_Tp>)
    {
      (void) __dispatch(
        __policy,
        ::cuda::counting_iterator<uint64_t>{0},
        ::cuda::counting_iterator<uint64_t>{__count},
        ::cuda::std::move(__first),
        __iota_init_step_fn{__init, _Tp{1}});
    }
    else
    {
      (void) __dispatch(
        __policy,
        ::cuda::counting_iterator<uint64_t>{0},
        ::cuda::counting_iterator<uint64_t>{__count}, // __count is already uint64_t; redundant cast removed
        ::cuda::std::move(__first),
        __iota_init_fn{__init});
    }
  }
  else
  {
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::iota requires at least one selected backend");
    // Unreachable once the static_assert fires; kept so the signature stays well-formed.
    return ::cuda::std::iota(::cuda::std::move(__first), ::cuda::std::move(__last), __init);
  }
}

// Parallel strided iota (extension beyond std::iota): writes init,
// init + step, init + 2 * step, ... into [first, last) via the transform
// backend. NOTE(review): a PR discussion suggests this overload may be
// renamed (e.g. cuda::sequence) since std::iota has no step parameter —
// confirm the final spelling before relying on it.
_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _Tp = iter_value_t<_InputIterator>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>)
_CCCL_HOST_API void
iota([[maybe_unused]] const _Policy& __policy,
     _InputIterator __first,
     _InputIterator __last,
     const _Tp& __init,
     const _Tp& __step)
{
  static_assert(indirectly_writable<_InputIterator, _Tp>,
                "cuda::std::iota requires InputIterator to be indirectly writable with T");

  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__transform, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::iota");

    // Empty range: nothing to write.
    if (__first == __last)
    {
      return;
    }

    // Note: using a different offset type than uint64_t degrades performance considerably for larger integer types
    const auto __count = static_cast<uint64_t>(::cuda::std::distance(__first, __last));
    (void) __dispatch(
      __policy,
      ::cuda::counting_iterator<uint64_t>{0},
      ::cuda::counting_iterator<uint64_t>{__count},
      ::cuda::std::move(__first),
      __iota_init_step_fn{__init, __step});
  }
  else
  {
    // Unreachable once the static_assert fires: serial iota has no step
    // overload yet, so __step is deliberately dropped here.
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::iota requires at least one selected backend");
    // TODO(miscco): Consider adding that overload to serial iota
    return ::cuda::std::iota(::cuda::std::move(__first), ::cuda::std::move(__last), __init /*, __step*/);
  }
}
}

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD

# include <cuda/std/__cccl/epilogue.h>

#endif // !_CCCL_COMPILER(NVRTC)

#endif // _CUDA_STD___PSTL_IOTA_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/std/__pstl_algorithm
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#include <cuda/std/__pstl/generate.h>
#include <cuda/std/__pstl/generate_n.h>
#include <cuda/std/__pstl/inclusive_scan.h>
#include <cuda/std/__pstl/iota.h>
#include <cuda/std/__pstl/merge.h>
#include <cuda/std/__pstl/mismatch.h>
#include <cuda/std/__pstl/none_of.h>
Expand Down
Loading