Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions libcudacxx/benchmarks/bench/unique_copy/basic.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sequence.h>
#include <thrust/transform.h>

#include <cuda/memory_pool>
#include <cuda/std/__pstl_algorithm>
#include <cuda/stream_ref>

#include "nvbench_helper.cuh"

// Fills `in` with `elements` values forming runs of length two:
// 0,0,1,1,2,2,...  Every adjacent pair compares equal, so unique_copy
// keeps exactly one element per pair.
template <typename T>
static void make_unique_input(thrust::device_vector<T>& in, std::size_t elements)
{
  in.resize(elements);
  const auto first = thrust::counting_iterator<std::size_t>{0};
  const auto last  = thrust::counting_iterator<std::size_t>{elements};
  thrust::transform(first, last, in.begin(), [] __device__(std::size_t idx) {
    return static_cast<T>(idx / 2);
  });
}

// Benchmarks cuda::std::unique_copy with the implicit (default) equality
// predicate over the segment-size-2 input produced by make_unique_input.
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> input;
  make_unique_input(input, num_elements);
  thrust::device_vector<T> output(num_elements, thrust::no_init);

  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(num_elements);
  // unique_copy writes at most num_elements values.
  // NOTE(review): with the segment-size-2 input the actual write count is
  // (num_elements + 1) / 2 — consider using that for tighter bandwidth
  // reporting.
  state.add_global_memory_writes<T>(num_elements);

  caching_allocator_t alloc{};

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               do_not_optimize(
                 cuda::std::unique_copy(cuda_policy(alloc, launch), input.begin(), input.end(), output.begin()));
             });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));

// Benchmarks cuda::std::unique_copy with an explicit comparator
// (cuda::std::equal_to<T>) to measure any overhead of the predicate overload.
template <typename T>
static void with_comp(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));

  thrust::device_vector<T> input;
  make_unique_input(input, num_elements);
  thrust::device_vector<T> output(num_elements, thrust::no_init);

  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(num_elements);
  // unique_copy writes at most num_elements values.
  // NOTE(review): with the segment-size-2 input the actual write count is
  // (num_elements + 1) / 2 — consider using that for tighter bandwidth
  // reporting.
  state.add_global_memory_writes<T>(num_elements);

  caching_allocator_t alloc{};

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               do_not_optimize(cuda::std::unique_copy(
                 cuda_policy(alloc, launch), input.begin(), input.end(), output.begin(), cuda::std::equal_to<T>{}));
             });
}

NVBENCH_BENCH_TYPES(with_comp, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("with_comp")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
177 changes: 177 additions & 0 deletions libcudacxx/include/cuda/std/__pstl/cuda/unique_copy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_CUDA_UNIQUE_COPY_H
#define _CUDA_STD___PSTL_CUDA_UNIQUE_COPY_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if _CCCL_HAS_BACKEND_CUDA()

_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow")
_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef")
_CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage)

# include <cub/device/device_select.cuh>

_CCCL_DIAG_POP

# include <cuda/__execution/policy.h>
# include <cuda/__memory_pool/device_memory_pool.h>
# include <cuda/__memory_resource/get_memory_resource.h>
# include <cuda/__runtime/api_wrapper.h>
# include <cuda/__stream/get_stream.h>
# include <cuda/__stream/stream_ref.h>
# include <cuda/std/__algorithm/unique_copy.h>
# include <cuda/std/__exception/cuda_error.h>
# include <cuda/std/__exception/exception_macros.h>
# include <cuda/std/__execution/env.h>
# include <cuda/std/__execution/policy.h>
# include <cuda/std/__iterator/incrementable_traits.h>
# include <cuda/std/__iterator/iterator_traits.h>
# include <cuda/std/__iterator/next.h>
# include <cuda/std/__pstl/cuda/temporary_storage.h>
# include <cuda/std/__pstl/dispatch.h>
# include <cuda/std/__type_traits/always_false.h>
# include <cuda/std/__utility/move.h>
# include <cuda/std/cstdint>

# include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

// CUDA backend of cuda::std::unique_copy: forwards to
// cub::DeviceSelect::Unique and returns an iterator one past the last
// element written to the output range.
template <>
struct __pstl_dispatch<__pstl_algorithm::__unique_copy, __execution_backend::__cuda>
{
  // Runs cub::DeviceSelect::Unique on the stream and memory resource taken
  // from __policy's environment (falling back to the per-thread default
  // stream and the stream device's default memory pool). Synchronizes the
  // stream before returning so the host-side result count is valid.
  template <class _Policy, class _InputIterator, class _OutputIterator, class _BinaryPredicate>
  [[nodiscard]] _CCCL_HOST_API static _OutputIterator __par_impl(
    const _Policy& __policy,
    _InputIterator __first,
    _InputIterator __last,
    _OutputIterator __result,
    _BinaryPredicate __pred)
  {
    // DeviceSelect always uses int64_t
    using _OffsetType = ::cuda::std::int64_t;
    // Number of elements actually selected; filled in by the async copy
    // below and only valid after __stream.sync().
    _OffsetType __ret;

    const auto __count = static_cast<_OffsetType>(::cuda::std::distance(__first, __last));

    // Determine temporary device storage requirements for DeviceSelect::Unique
    // (null temp-storage pointer turns this call into a pure size query; no
    // kernel is launched, so stream 0 here is harmless).
    size_t __num_bytes = 0;
    _CCCL_TRY_CUDA_API(
      CUB_NS_QUALIFIER::DeviceSelect::Unique,
      "__pstl_cuda_unique_copy: determination of device storage for cub::DeviceSelect::Unique failed",
      static_cast<void*>(nullptr),
      __num_bytes,
      __first,
      __result,
      static_cast<_OffsetType*>(nullptr),
      __count,
      __pred,
      0);

    // Prefer the stream / memory resource carried by the policy environment,
    // otherwise use the defaults.
    auto __stream   = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy);
    auto __resource = ::cuda::__call_or(
      ::cuda::mr::get_memory_resource, ::cuda::device_default_memory_pool(__stream.device()), __policy);

    {
      // __storage bundles the CUB scratch space and a device-side slot for
      // the selected-element count. It is scoped so it is released before
      // the sync below; presumably the release is stream-ordered on
      // __stream — TODO(review) confirm against __temporary_storage.
      __temporary_storage<_OffsetType, decltype(__resource)> __storage{__stream, __resource, __num_bytes};

      _CCCL_TRY_CUDA_API(
        CUB_NS_QUALIFIER::DeviceSelect::Unique,
        "__pstl_cuda_unique_copy: kernel launch of cub::DeviceSelect::Unique failed",
        __storage.__get_temp_storage(),
        __num_bytes,
        ::cuda::std::move(__first),
        __result,
        __storage.__get_result_iter(),
        __count,
        ::cuda::std::move(__pred),
        __stream.get());

      // Bring the device-side count back to the host; ordered on __stream
      // after the selection above, completed by the sync below.
      _CCCL_TRY_CUDA_API(
        ::cudaMemcpyAsync,
        "__pstl_cuda_unique_copy: copy of result from device to host failed",
        ::cuda::std::addressof(__ret),
        __storage.__res_,
        sizeof(_OffsetType),
        cudaMemcpyDefault,
        __stream.get());
    }

    // Wait for the selection kernel and the count copy; only after this is
    // __ret safe to read on the host.
    __stream.sync();
    return __result + static_cast<iter_difference_t<_OutputIterator>>(__ret);
  }

  // Dispatch entry point. Requires random-access input and output iterators
  // for the parallel path; anything weaker hard-errors via the else branch.
  _CCCL_TEMPLATE(class _Policy, class _InputIterator, class _OutputIterator, class _BinaryPredicate)
  _CCCL_REQUIRES(__has_forward_traversal<_OutputIterator>)
  [[nodiscard]] _CCCL_HOST_API _OutputIterator operator()(
    [[maybe_unused]] const _Policy& __policy,
    _InputIterator __first,
    _InputIterator __last,
    _OutputIterator __result,
    _BinaryPredicate __pred) const
  {
    if constexpr (::cuda::std::__has_random_access_traversal<_InputIterator>
                  && ::cuda::std::__has_random_access_traversal<_OutputIterator>)
    {
      try
      {
        return __par_impl(
          __policy,
          ::cuda::std::move(__first),
          ::cuda::std::move(__last),
          ::cuda::std::move(__result),
          ::cuda::std::move(__pred));
      }
      catch (const ::cuda::cuda_error& __err)
      {
        // Device allocation failure is surfaced to callers as std::bad_alloc,
        // matching the standard allocator contract; all other CUDA errors
        // propagate unchanged.
        if (__err.status() == ::cudaErrorMemoryAllocation)
        {
          _CCCL_THROW(::std::bad_alloc);
        }
        else
        {
          // NOTE(review): `throw;` would rethrow without copying; `throw __err`
          // copies the caught object (no slicing here since the catch type is
          // exact).
          throw __err;
        }
      }
    }
    else
    {
      // Unreachable fallback: the static_assert fires at compile time, the
      // serial call below merely keeps the return-type deduction well-formed.
      static_assert(__always_false_v<_Policy>,
                    "__pstl_dispatch: CUDA backend of cuda::std::unique_copy requires at least random access "
                    "iterators");
      return ::cuda::std::unique_copy(
        ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__result), ::cuda::std::move(__pred));
    }
  }
};

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION

# include <cuda/std/__cccl/epilogue.h>

#endif // _CCCL_HAS_BACKEND_CUDA()

#endif // _CUDA_STD___PSTL_CUDA_UNIQUE_COPY_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/std/__pstl/dispatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ enum class __pstl_algorithm
__remove_if,
__transform,
__transform_reduce,
__unique_copy,
};

//! @brief tag type to indicate that we cannot dispatch to a parallel algorithm and should run the algorithm serially
Expand Down
104 changes: 104 additions & 0 deletions libcudacxx/include/cuda/std/__pstl/unique_copy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_STD___PSTL_UNIQUE_COPY_H
#define _CUDA_STD___PSTL_UNIQUE_COPY_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if !_CCCL_COMPILER(NVRTC)

# include <cuda/__nvtx/nvtx.h>
# include <cuda/std/__algorithm/unique_copy.h>
# include <cuda/std/__execution/policy.h>
# include <cuda/std/__functional/not_fn.h>
# include <cuda/std/__functional/operations.h>
# include <cuda/std/__iterator/concepts.h>
# include <cuda/std/__iterator/distance.h>
# include <cuda/std/__iterator/iterator_traits.h>
# include <cuda/std/__iterator/next.h>
# include <cuda/std/__iterator/prev.h>
# include <cuda/std/__pstl/dispatch.h>
# include <cuda/std/__type_traits/always_false.h>
# include <cuda/std/__type_traits/is_execution_policy.h>
# include <cuda/std/__type_traits/is_nothrow_copy_constructible.h>
# include <cuda/std/__utility/move.h>
# include <cuda/std/tuple>

# if _CCCL_HAS_BACKEND_CUDA()
# include <cuda/std/__pstl/cuda/unique_copy.h>
# endif // _CCCL_HAS_BACKEND_CUDA()

# include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_STD

_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT

//! @brief Parallel unique_copy: copies [__first, __last) to __result while
//! dropping consecutive elements that compare equal under __pred.
//! @param __policy execution policy selecting the parallel backend
//! @param __first, __last the input range
//! @param __result start of the output range
//! @param __pred binary equality predicate (defaults to equal_to)
//! @return iterator one past the last element written
_CCCL_TEMPLATE(class _Policy,
               class _InputIterator,
               class _OutputIterator,
               class _BinaryPredicate = equal_to<iter_value_t<_InputIterator>>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator> _CCCL_AND
               is_execution_policy_v<_Policy>)
_CCCL_HOST_API _OutputIterator unique_copy(
  [[maybe_unused]] const _Policy& __policy,
  _InputIterator __first,
  _InputIterator __last,
  _OutputIterator __result,
  _BinaryPredicate __pred = {})
{
  static_assert(indirect_binary_predicate<_BinaryPredicate, _InputIterator, _InputIterator>,
                "cuda::std::unique_copy: BinaryPredicate must satisfy "
                "indirect_binary_predicate<BinaryPredicate, InputIterator, InputIterator>");

  [[maybe_unused]] auto __impl =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__unique_copy, _Policy>();
  if constexpr (!::cuda::std::execution::__pstl_can_dispatch<decltype(__impl)>)
  {
    // No backend selected: hard error at compile time; the serial call only
    // keeps return-type deduction well-formed.
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::unique_copy requires at least one selected backend");
    return ::cuda::std::unique_copy(
      ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__result), ::cuda::std::move(__pred));
  }
  else
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::unique_copy");

    // Empty input: nothing to copy, skip the backend entirely.
    if (__first == __last)
    {
      return __result;
    }

    return __impl(
      __policy,
      ::cuda::std::move(__first),
      ::cuda::std::move(__last),
      ::cuda::std::move(__result),
      ::cuda::std::move(__pred));
  }
}

_CCCL_END_NAMESPACE_ARCH_DEPENDENT

_CCCL_END_NAMESPACE_CUDA_STD

# include <cuda/std/__cccl/epilogue.h>

#endif // !_CCCL_COMPILER(NVRTC)

#endif // _CUDA_STD___PSTL_UNIQUE_COPY_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/std/__pstl_algorithm
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
#include <cuda/std/__pstl/reverse_copy.h>
#include <cuda/std/__pstl/transform.h>
#include <cuda/std/__pstl/transform_reduce.h>
#include <cuda/std/__pstl/unique_copy.h>

// [algorithm.syn]
#include <cuda/std/initializer_list>
Expand Down
Loading
Loading