diff --git a/libcudacxx/benchmarks/bench/unique_copy/basic.cu b/libcudacxx/benchmarks/bench/unique_copy/basic.cu
new file mode 100644
index 00000000000..c48d3d0101b
--- /dev/null
+++ b/libcudacxx/benchmarks/bench/unique_copy/basic.cu
@@ -0,0 +1,89 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE(review): every '<...>' span in this patch (include targets, template
+// parameter and argument lists) was stripped by a text-extraction step. The
+// angle-bracketed text below is a best-effort reconstruction -- confirm each
+// reconstructed span against the upstream commit before applying.
+#include <cuda/std/__pstl_algorithm> // TODO(review): confirm exact header set
+#include <cuda/std/cstddef>
+#include <cuda/std/functional>
+#include <cuda/stream_ref>
+
+#include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/transform.h>
+
+#include "nvbench_helper.cuh"
+
+// Input with runs of equal elements: 0,0,1,1,2,2,... (segment size 2)
+template <typename T> // reconstructed: template head was stripped
+static void make_unique_input(thrust::device_vector<T>& in, std::size_t elements)
+{
+  in.resize(elements);
+  thrust::transform(
+    thrust::counting_iterator<std::size_t>(0),
+    thrust::counting_iterator<std::size_t>(elements),
+    in.begin(),
+    [] __device__(std::size_t i) {
+      return static_cast<T>(i / 2);
+    });
+}
+
+// Benchmarks cuda::std::unique_copy with the default equality predicate.
+template <typename T>
+static void basic(nvbench::state& state, nvbench::type_list<T>)
+{
+  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));
+
+  thrust::device_vector<T> in;
+  make_unique_input(in, elements);
+  thrust::device_vector<T> out(elements, thrust::no_init);
+
+  state.add_element_count(elements);
+  state.add_global_memory_reads<T>(elements);
+  // unique_copy writes at most `elements` items; report the upper bound
+  state.add_global_memory_writes<T>(elements);
+
+  caching_allocator_t alloc{};
+
+  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) {
+               do_not_optimize(cuda::std::unique_copy(cuda_policy(alloc, launch), in.begin(), in.end(), out.begin()));
+             });
+}
+
+NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
+  .set_name("base")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
+
+// Same as `basic`, but passes an explicit equality predicate.
+template <typename T>
+static void with_comp(nvbench::state& state, nvbench::type_list<T>)
+{
+  const auto elements = static_cast<std::size_t>(state.get_int64("Elements"));
+
+  thrust::device_vector<T> in;
+  make_unique_input(in, elements);
+  thrust::device_vector<T> out(elements, thrust::no_init);
+
+  state.add_element_count(elements);
+  state.add_global_memory_reads<T>(elements);
+  // unique_copy writes at most `elements` items; report the upper bound
+  state.add_global_memory_writes<T>(elements);
+
+  caching_allocator_t alloc{};
+
+  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) {
+               do_not_optimize(cuda::std::unique_copy(
+                 cuda_policy(alloc, launch), in.begin(), in.end(), out.begin(), cuda::std::equal_to{}));
+             });
+}
+
+NVBENCH_BENCH_TYPES(with_comp, NVBENCH_TYPE_AXES(fundamental_types))
+  .set_name("with_comp")
+  .set_type_axes_names({"T{ct}"})
+  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4));
diff --git a/libcudacxx/include/cuda/std/__pstl/cuda/unique_copy.h b/libcudacxx/include/cuda/std/__pstl/cuda/unique_copy.h
new file mode 100644
index 00000000000..8a313d06428
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__pstl/cuda/unique_copy.h
@@ -0,0 +1,177 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___PSTL_CUDA_UNIQUE_COPY_H
+#define _CUDA_STD___PSTL_CUDA_UNIQUE_COPY_H
+
+#include <cuda/std/detail/__config> // NOTE(review): include target stripped; reconstructed
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_HAS_BACKEND_CUDA()
+
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_CLANG("-Wshadow")
+_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef")
+_CCCL_DIAG_SUPPRESS_NVHPC(attribute_requires_external_linkage)
+
+// NOTE(review): CUB include target stripped; DeviceSelect lives here.
+#  include <cub/device/device_select.cuh>
+
+_CCCL_DIAG_POP
+
+// NOTE(review): the original detail includes lost their targets during
+// extraction; the list below is a best-effort reconstruction from the names
+// used in this header -- restore the exact list from the upstream commit.
+#  include <cuda/__memory_resource/get_memory_resource.h>
+#  include <cuda/__stream/get_stream.h>
+#  include <cuda/__stream/stream_ref.h>
+#  include <cuda/std/__exception/cuda_error.h>
+#  include <cuda/std/__iterator/distance.h>
+#  include <cuda/std/__iterator/iterator_traits.h>
+#  include <cuda/std/__memory/addressof.h>
+#  include <cuda/std/__pstl/dispatch.h>
+#  include <cuda/std/__utility/move.h>
+#  include <cuda/std/cstdint>
+
+#  include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_EXECUTION
+
+_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
+
+template <>
+struct __pstl_dispatch<__pstl_algorithm::__unique_copy, __execution_backend::__cuda>
+{
+  //! Parallel implementation: runs cub::DeviceSelect::Unique on the policy's
+  //! stream, allocating temporary storage from the policy's memory resource,
+  //! then copies the number of selected items back to the host.
+  template <class _Policy, class _InputIterator, class _OutputIterator, class _BinaryPredicate>
+  [[nodiscard]] _CCCL_HOST_API static _OutputIterator __par_impl(
+    const _Policy& __policy,
+    _InputIterator __first,
+    _InputIterator __last,
+    _OutputIterator __result,
+    _BinaryPredicate __pred)
+  {
+    // DeviceSelect always uses int64_t
+    using _OffsetType = ::cuda::std::int64_t;
+    _OffsetType __ret;
+
+    const auto __count = static_cast<_OffsetType>(::cuda::std::distance(__first, __last));
+
+    // Determine temporary device storage requirements for DeviceSelect::Unique
+    size_t __num_bytes = 0;
+    _CCCL_TRY_CUDA_API(
+      CUB_NS_QUALIFIER::DeviceSelect::Unique,
+      "__pstl_cuda_unique_copy: determination of device storage for cub::DeviceSelect::Unique failed",
+      static_cast<void*>(nullptr), // reconstructed: the '<void*>' span was stripped
+      __num_bytes,
+      __first,
+      __result,
+      static_cast<_OffsetType*>(nullptr),
+      __count,
+      __pred,
+      0);
+
+    auto __stream   = ::cuda::__call_or(::cuda::get_stream, ::cuda::stream_ref{cudaStreamPerThread}, __policy);
+    auto __resource = ::cuda::__call_or(
+      ::cuda::mr::get_memory_resource, ::cuda::device_default_memory_pool(__stream.device()), __policy);
+
+    {
+      __temporary_storage<_OffsetType, decltype(__resource)> __storage{__stream, __resource, __num_bytes};
+
+      _CCCL_TRY_CUDA_API(
+        CUB_NS_QUALIFIER::DeviceSelect::Unique,
+        "__pstl_cuda_unique_copy: kernel launch of cub::DeviceSelect::Unique failed",
+        __storage.__get_temp_storage(),
+        __num_bytes,
+        ::cuda::std::move(__first),
+        __result,
+        __storage.__get_result_iter(),
+        __count,
+        ::cuda::std::move(__pred),
+        __stream.get());
+
+      // Bring the selected-element count back to the host (completes at sync)
+      _CCCL_TRY_CUDA_API(
+        ::cudaMemcpyAsync,
+        "__pstl_cuda_unique_copy: copy of result from device to host failed",
+        ::cuda::std::addressof(__ret),
+        __storage.__res_,
+        sizeof(_OffsetType),
+        cudaMemcpyDefault,
+        __stream.get());
+    }
+
+    __stream.sync();
+    // reconstructed: the difference-type cast was partially stripped
+    return __result + static_cast<__iter_diff_t<_OutputIterator>>(__ret);
+  }
+
+  _CCCL_TEMPLATE(class _Policy, class _InputIterator, class _OutputIterator, class _BinaryPredicate)
+  _CCCL_REQUIRES(__has_forward_traversal<_OutputIterator>)
+  [[nodiscard]] _CCCL_HOST_API _OutputIterator operator()(
+    [[maybe_unused]] const _Policy& __policy,
+    _InputIterator __first,
+    _InputIterator __last,
+    _OutputIterator __result,
+    _BinaryPredicate __pred) const
+  {
+    if constexpr (::cuda::std::__has_random_access_traversal<_InputIterator>
+                  && ::cuda::std::__has_random_access_traversal<_OutputIterator>)
+    {
+      try
+      {
+        return __par_impl(
+          __policy,
+          ::cuda::std::move(__first),
+          ::cuda::std::move(__last),
+          ::cuda::std::move(__result),
+          ::cuda::std::move(__pred));
+      }
+      catch (const ::cuda::cuda_error& __err)
+      {
+        // Surface allocation failures as bad_alloc; rethrow everything else
+        if (__err.status() == ::cudaErrorMemoryAllocation)
+        {
+          _CCCL_THROW(::std::bad_alloc);
+        }
+        else
+        {
+          throw __err;
+        }
+      }
+    }
+    else
+    {
+      static_assert(__always_false_v<_Policy>,
+                    "__pstl_dispatch: CUDA backend of cuda::std::unique_copy requires at least random access "
+                    "iterators");
+      return ::cuda::std::unique_copy(
+        ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__result), ::cuda::std::move(__pred));
+    }
+  }
+};
+
+_CCCL_END_NAMESPACE_ARCH_DEPENDENT
+
+_CCCL_END_NAMESPACE_CUDA_STD_EXECUTION
+
+#  include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CCCL_HAS_BACKEND_CUDA()
+
+#endif // _CUDA_STD___PSTL_CUDA_UNIQUE_COPY_H
diff --git a/libcudacxx/include/cuda/std/__pstl/dispatch.h b/libcudacxx/include/cuda/std/__pstl/dispatch.h
index 44a627e6b8d..267e82824de 100644
--- a/libcudacxx/include/cuda/std/__pstl/dispatch.h
+++ b/libcudacxx/include/cuda/std/__pstl/dispatch.h
@@ -45,6 +45,7 @@ enum class __pstl_algorithm
   __remove_if,
   __transform,
   __transform_reduce,
+  __unique_copy,
 };
 
 //! @brief tag type to indicate that we cannot dispatch to a parallel algorithm and should run the algorithm serially
diff --git a/libcudacxx/include/cuda/std/__pstl/unique_copy.h b/libcudacxx/include/cuda/std/__pstl/unique_copy.h
new file mode 100644
index 00000000000..ed02f1394d3
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__pstl/unique_copy.h
@@ -0,0 +1,104 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___PSTL_UNIQUE_COPY_H
+#define _CUDA_STD___PSTL_UNIQUE_COPY_H
+
+#include <cuda/std/detail/__config> // NOTE(review): include target stripped; reconstructed
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if !_CCCL_COMPILER(NVRTC)
+
+// NOTE(review): the original detail includes lost their targets during
+// extraction; reconstructed from the names used below -- restore the exact
+// list from the upstream commit.
+#  include <cuda/std/__algorithm/unique_copy.h>
+#  include <cuda/std/__execution/policy.h>
+#  include <cuda/std/__functional/operations.h>
+#  include <cuda/std/__iterator/iterator_traits.h>
+#  include <cuda/std/__iterator/projected.h>
+#  include <cuda/std/__pstl/dispatch.h>
+#  include <cuda/std/__type_traits/always_false.h>
+#  include <cuda/std/__utility/move.h>
+
+#  if _CCCL_HAS_BACKEND_CUDA()
+#    include <cuda/std/__pstl/cuda/unique_copy.h>
+#  endif // _CCCL_HAS_BACKEND_CUDA()
+
+#  include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD
+
+_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
+
+//! Parallel unique_copy front-end: validates the predicate, selects a backend
+//! via __pstl_select_dispatch, and forwards to it; an empty range returns
+//! __result without touching the backend.
+_CCCL_TEMPLATE(class _Policy,
+               class _InputIterator,
+               class _OutputIterator,
+               class _BinaryPredicate = equal_to<__iter_value_type<_InputIterator>>) // reconstructed default -- confirm
+_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND __has_forward_traversal<_OutputIterator> _CCCL_AND
+                 is_execution_policy_v<_Policy>)
+_CCCL_HOST_API _OutputIterator unique_copy(
+  [[maybe_unused]] const _Policy& __policy,
+  _InputIterator __first,
+  _InputIterator __last,
+  _OutputIterator __result,
+  _BinaryPredicate __pred = {})
+{
+  static_assert(indirect_binary_predicate<_BinaryPredicate, _InputIterator, _InputIterator>,
+                "cuda::std::unique_copy: BinaryPredicate must satisfy "
+                "indirect_binary_predicate");
+
+  [[maybe_unused]] auto __dispatch =
+    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__unique_copy, _Policy>();
+  // reconstructed: the template argument of __pstl_can_dispatch was stripped
+  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cuda::std::unique_copy");
+
+    // Empty range: nothing to copy, avoid invoking the backend at all
+    if (__first == __last)
+    {
+      return __result;
+    }
+
+    return __dispatch(
+      __policy,
+      ::cuda::std::move(__first),
+      ::cuda::std::move(__last),
+      ::cuda::std::move(__result),
+      ::cuda::std::move(__pred));
+  }
+  else
+  {
+    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::unique_copy requires at least one selected backend");
+    return ::cuda::std::unique_copy(
+      ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__result), ::cuda::std::move(__pred));
+  }
+}
+
+_CCCL_END_NAMESPACE_ARCH_DEPENDENT
+
+_CCCL_END_NAMESPACE_CUDA_STD
+
+#  include <cuda/std/__cccl/epilogue.h>
+
+#endif // !_CCCL_COMPILER(NVRTC)
+
+#endif // _CUDA_STD___PSTL_UNIQUE_COPY_H
diff --git a/libcudacxx/include/cuda/std/__pstl_algorithm b/libcudacxx/include/cuda/std/__pstl_algorithm
index 222540a2d12..defd013720b 100644
--- a/libcudacxx/include/cuda/std/__pstl_algorithm
+++ b/libcudacxx/include/cuda/std/__pstl_algorithm
@@ -58,6 +58,7 @@
 #include <cuda/std/__pstl/remove_if.h>
 #include <cuda/std/__pstl/transform.h>
 #include <cuda/std/__pstl/transform_reduce.h>
+#include <cuda/std/__pstl/unique_copy.h>
 
 // [algorithm.syn]
 #include <cuda/std/__algorithm/adjacent_find.h>
diff --git a/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.unique/pstl_unique_copy.cu b/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.unique/pstl_unique_copy.cu
new file mode 100644
index 00000000000..6a7dd4b2f5c
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/algorithms/alg.modifying/alg.unique/pstl_unique_copy.cu
@@ -0,0 +1,108 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// template <class ExecutionPolicy, class InputIterator, class OutputIterator>
+// OutputIterator unique_copy(ExecutionPolicy&& policy,
+//                            InputIterator first,
+//                            InputIterator last,
+//                            OutputIterator result);
+
+// NOTE(review): all '<...>' spans (include targets, template arguments) were
+// stripped from this patch during extraction; reconstructed below on a
+// best-effort basis -- confirm against the upstream commit.
+#include <cuda/devices>
+#include <cuda/memory_resource>
+#include <cuda/std/__pstl_algorithm>
+#include <cuda/std/iterator>
+#include <cuda/stream>
+
+#include <thrust/device_vector.h>
+#include <thrust/equal.h>
+#include <thrust/fill.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/sequence.h>
+#include <thrust/transform.h>
+
+#include <c2h/catch2_test_helper.h>
+#include <cuda/experimental/execution.cuh>
+
+inline constexpr int size = 1000;
+
+// Exercises cuda::std::unique_copy with the given execution policy against
+// the shared input/output vectors (each section gets fresh vectors).
+template <class Policy> // reconstructed: template head was stripped
+void test_unique_copy(const Policy& policy,
+                      [[maybe_unused]] thrust::device_vector<int>& input,
+                      [[maybe_unused]] thrust::device_vector<int>& output)
+{
+  { // empty should not access anything
+    const auto res = cuda::std::unique_copy(
+      policy, static_cast<int*>(nullptr), static_cast<int*>(nullptr), static_cast<int*>(nullptr));
+    CHECK(res == nullptr);
+  }
+
+  cuda::std::fill(output.begin(), output.end(), -1);
+  { // no duplicates
+    const auto res = cuda::std::unique_copy(policy, input.begin(), input.end(), output.begin());
+    CHECK(input == output);
+    CHECK(res == output.end());
+    CHECK(cuda::std::equal(policy, output.begin(), output.end(), input.begin()));
+  }
+
+  cuda::std::fill(output.begin(), output.end(), -1);
+  { // one match,
+    input[43] = input[42];
+    const auto res = cuda::std::unique_copy(policy, input.begin(), input.end(), output.begin());
+    CHECK(res == cuda::std::prev(output.end()));
+
+    auto mid_in  = input.begin() + 43;
+    auto mid_out = output.begin() + 42;
+    CHECK(cuda::std::equal(policy, output.begin(), mid_out, input.begin()));
+    // FIX(review): the original compared [mid_out, output.end()) against
+    // mid_in. Only size - 1 output elements were written, and walking input
+    // from index 43 for that many elements reads one past input.end().
+    // Compare only the written tail [mid_out, res).
+    CHECK(cuda::std::equal(policy, mid_out, res, mid_in));
+  }
+
+  cuda::std::fill(output.begin(), output.end(), -1);
+  { // all equal, continuous iterator
+    cuda::std::fill(input.begin(), input.end(), 42);
+    const auto res = cuda::std::unique_copy(policy, input.begin(), input.end(), output.begin());
+    CHECK(res == cuda::std::next(output.begin()));
+    CHECK(output[0] == 42);
+  }
+}
+
+C2H_TEST("cuda::std::unique_copy", "[parallel algorithm]")
+{
+  thrust::device_vector<int> input(size, thrust::no_init);
+  thrust::device_vector<int> output(size, thrust::no_init);
+  thrust::sequence(input.begin(), input.end(), 0);
+
+  SECTION("with default stream")
+  {
+    const auto policy = cuda::execution::__cub_par_unseq;
+    test_unique_copy(policy, input, output);
+  }
+
+  SECTION("with provided stream")
+  {
+    cuda::stream stream{cuda::device_ref{0}};
+    const auto policy = cuda::execution::__cub_par_unseq.with_stream(stream);
+    test_unique_copy(policy, input, output);
+  }
+
+  SECTION("with provided memory_resource")
+  {
+    cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(cuda::device_ref{0});
+    const auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(device_resource);
+    test_unique_copy(policy, input, output);
+  }
+
+  SECTION("with provided stream and memory_resource")
+  {
+    cuda::stream stream{cuda::device_ref{0}};
+    cuda::device_memory_pool_ref device_resource = cuda::device_default_memory_pool(stream.device());
+    const auto policy = cuda::execution::__cub_par_unseq.with_memory_resource(device_resource).with_stream(stream);
+    test_unique_copy(policy, input, output);
+  }
+}