Skip to content

Commit d27508b

Browse files
authored
Merge branch 'main' into stf_remove_unused_exec_affinity_include
2 parents bbb6f73 + 2e90f09 commit d27508b

File tree

5 files changed

+347
-0
lines changed

5 files changed

+347
-0
lines changed
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <thrust/device_vector.h>
12+
#include <thrust/sequence.h>
13+
14+
#include <cuda/memory_pool>
15+
#include <cuda/std/__pstl_algorithm>
16+
#include <cuda/std/algorithm>
17+
#include <cuda/stream>
18+
19+
#include "nvbench_helper.cuh"
20+
21+
// Benchmark cuda::std::adjacent_find with the default equality predicate.
// The "MismatchAt" axis controls the relative position of the first equal
// adjacent pair, exercising the algorithm's early-exit behavior.
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements   = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto mismatch_ratio = state.get_float64("MismatchAt");
  const auto mismatch_idx   = cuda::std::clamp<std::size_t>(num_elements * mismatch_ratio, 0, num_elements - 2);

  // A strictly increasing sequence has no equal neighbors; duplicate one value
  // so the first equal pair sits exactly at mismatch_idx.
  thrust::device_vector<T> data(num_elements, thrust::no_init);
  thrust::sequence(data.begin(), data.end(), 0);
  data[mismatch_idx] = data[mismatch_idx + 1];

  // Account only the prefix up to the match as read (the search stops there).
  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(mismatch_idx);
  state.add_global_memory_writes<T>(0);

  caching_allocator_t alloc;
  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               do_not_optimize(cuda::std::adjacent_find(cuda_policy(alloc, launch), data.cbegin(), data.cend()));
             });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
  .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.01});
48+
49+
// Benchmark cuda::std::adjacent_find with an explicit comparator (greater<T>).
// The "MismatchAt" axis controls the relative position of the first pair that
// satisfies the predicate, exercising the algorithm's early-exit behavior.
template <typename T>
static void with_comp(nvbench::state& state, nvbench::type_list<T>)
{
  const auto elements       = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto common_prefix  = state.get_float64("MismatchAt");
  const auto mismatch_point = cuda::std::clamp<std::size_t>(elements * common_prefix, 0, elements - 2);

  thrust::device_vector<T> in(elements, thrust::no_init);
  thrust::sequence(in.begin(), in.end(), 0);
  // The predicate is greater<T>, so an *equal* adjacent pair (as used by the
  // "basic" benchmark) would never match and the whole range would always be
  // scanned, invalidating the read accounting below. Swap the two neighbors
  // instead so that in[mismatch_point] > in[mismatch_point + 1] and the search
  // stops at mismatch_point as intended.
  const T left_value      = in[mismatch_point];
  in[mismatch_point]      = in[mismatch_point + 1];
  in[mismatch_point + 1]  = left_value;

  // Account only the prefix up to the first matching pair as read.
  state.add_element_count(elements);
  state.add_global_memory_reads<T>(mismatch_point);
  state.add_global_memory_writes<T>(0);

  caching_allocator_t alloc;
  state.exec(
    nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
      do_not_optimize(
        cuda::std::adjacent_find(cuda_policy(alloc, launch), in.cbegin(), in.cend(), ::cuda::std::greater<T>{}));
    });
}

NVBENCH_BENCH_TYPES(with_comp, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("with_comp")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
  .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.01});
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___PSTL_ADJACENT_FIND_H
12+
#define _CUDA_STD___PSTL_ADJACENT_FIND_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#if !_CCCL_COMPILER(NVRTC)
25+
26+
# include <cuda/__iterator/zip_function.h>
27+
# include <cuda/__iterator/zip_iterator.h>
28+
# include <cuda/__nvtx/nvtx.h>
29+
# include <cuda/std/__algorithm/adjacent_find.h>
30+
# include <cuda/std/__concepts/concept_macros.h>
31+
# include <cuda/std/__execution/policy.h>
32+
# include <cuda/std/__functional/operations.h>
33+
# include <cuda/std/__iterator/concepts.h>
34+
# include <cuda/std/__iterator/next.h>
35+
# include <cuda/std/__iterator/prev.h>
36+
# include <cuda/std/__pstl/dispatch.h>
37+
# include <cuda/std/__type_traits/always_false.h>
38+
# include <cuda/std/__type_traits/is_execution_policy.h>
39+
# include <cuda/std/__utility/move.h>
40+
41+
# if _CCCL_HAS_BACKEND_CUDA()
42+
# include <cuda/std/__pstl/cuda/find_if.h>
43+
# endif // _CCCL_HAS_BACKEND_CUDA()
44+
45+
# include <cuda/std/__cccl/prologue.h>
46+
47+
_CCCL_BEGIN_NAMESPACE_CUDA_STD
48+
49+
_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
50+
51+
// Parallel adjacent_find: returns the first iterator __it in [__first, __last)
// for which __pred(*__it, *(__it + 1)) holds, or __last if no such pair of
// neighboring elements exists. __pred defaults to equality.
_CCCL_TEMPLATE(class _Policy,
               class _InputIterator,
               class _BinaryPredicate = ::cuda::std::equal_to<iter_value_t<_InputIterator>>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>)
[[nodiscard]] _CCCL_HOST_API _InputIterator adjacent_find(
  [[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _BinaryPredicate __pred = {})
{
  // adjacent_find is implemented as a parallel find_if over pairs of
  // neighboring elements; select the find_if backend for the given policy.
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__find_if, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::adjacent_find");

    // An empty range has no adjacent pair.
    if (__first == __last)
    {
      return __first;
    }

    // Zip each element with its successor: the zipped range holds the pairs
    // (__first, __first + 1), ..., (__last - 2, __last - 1); zip_function
    // unpacks each tuple of neighbors into __pred.
    const auto __zipped_first = ::cuda::zip_iterator{__first, ::cuda::std::next(__first)};
    const auto __zipped_last  = ::cuda::zip_iterator{::cuda::std::prev(__last), __last};
    auto __zipped_ret = __dispatch(__policy, __zipped_first, __zipped_last, ::cuda::zip_function{::cuda::std::move(__pred)});
    // When no pair matches (this includes a one-element input range, whose
    // zipped range is empty), find_if yields the end of the zipped range,
    // whose first component is prev(__last) — but adjacent_find must return
    // __last in that case.
    if (__zipped_ret == __zipped_last)
    {
      return __last;
    }
    return ::cuda::std::get<0>(__zipped_ret.__iterators());
  }
  else
  {
    // No parallel backend was enabled at configure time: fail loudly at
    // compile time. The serial call below is never reached; it only keeps the
    // function body well-formed for return-type purposes.
    static_assert(__always_false_v<_Policy>,
                  "Parallel cuda::std::adjacent_find requires at least one selected backend");
    return ::cuda::std::adjacent_find(::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred));
  }
}
83+
84+
_CCCL_END_NAMESPACE_ARCH_DEPENDENT
85+
86+
_CCCL_END_NAMESPACE_CUDA_STD
87+
88+
# include <cuda/std/__cccl/epilogue.h>
89+
90+
#endif // !_CCCL_COMPILER(NVRTC)
91+
92+
#endif // _CUDA_STD___PSTL_ADJACENT_FIND_H

libcudacxx/include/cuda/std/__pstl_algorithm

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#endif // no system header
2323

2424
#include <cuda/std/__pstl/adjacent_difference.h>
25+
#include <cuda/std/__pstl/adjacent_find.h>
2526
#include <cuda/std/__pstl/all_of.h>
2627
#include <cuda/std/__pstl/any_of.h>
2728
#include <cuda/std/__pstl/copy.h>
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
// template<class Policy, class InputIterator, class BinaryPredicate>
12+
// InputIterator adjacent_find(Policy policy,
13+
// InputIterator first,
14+
// InputIterator last,
15+
// BinaryPredicate pred)
16+
17+
#include <thrust/device_vector.h>
18+
#include <thrust/sequence.h>
19+
20+
#include <cuda/memory_pool>
21+
#include <cuda/std/__pstl_algorithm>
22+
#include <cuda/std/execution>
23+
#include <cuda/std/functional>
24+
#include <cuda/stream>
25+
26+
#include <testing.cuh>
27+
#include <utility.cuh>
28+
29+
#include "test_iterators.h"
30+
#include "test_macros.h"
31+
32+
inline constexpr int size = 1000;
33+
34+
// Runs cuda::std::adjacent_find (default equality predicate) with the given
// execution policy over `input` and verifies the returned position. The
// fixture (see C2H_TEST below) places the first equal adjacent pair at
// indices 41/42.
template <class Policy>
void test_adjacent_find(const Policy& policy, const thrust::device_vector<int>& input)
{
  { // empty should not access anything
    auto res = cuda::std::adjacent_find(policy, static_cast<int*>(nullptr), static_cast<int*>(nullptr));
    CHECK(res == static_cast<int*>(nullptr));
  }

  { // contiguous input: compare iterators, not values — the two elements of
    // the matching pair are equal, so a value comparison could not distinguish
    // a correct result of 41 from an incorrect result of 42
    auto res = cuda::std::adjacent_find(policy, input.begin(), input.end());
    CHECK(res == input.begin() + 41);
  }

  { // non contiguous input
    auto* inptr = thrust::raw_pointer_cast(input.data());
    auto res = cuda::std::adjacent_find(policy, random_access_iterator{inptr}, random_access_iterator{inptr + size});
    CHECK(res == random_access_iterator{inptr + 41});
  }
}
53+
54+
C2H_TEST("cuda::std::adjacent_find(Iter, Iter)", "[parallel algorithm]")
{
  // Ascending sequence 1..size with one duplicated value: input[41] == input[42],
  // so the first equal adjacent pair sits at index 41.
  thrust::device_vector<int> input(size);
  thrust::sequence(input.begin(), input.end(), 1);
  input[42] = 42;

  SECTION("with default stream")
  {
    test_adjacent_find(cuda::execution::__cub_par_unseq, input);
  }

  SECTION("with provided stream")
  {
    cuda::stream str{cuda::device_ref{0}};
    test_adjacent_find(cuda::execution::__cub_par_unseq.with_stream(str), input);
  }

  SECTION("with provided memory_resource")
  {
    cuda::device_memory_pool_ref pool = cuda::device_default_memory_pool(cuda::device_ref{0});
    test_adjacent_find(cuda::execution::__cub_par_unseq.with_memory_resource(pool), input);
  }

  SECTION("with provided stream and memory_resource")
  {
    cuda::stream str{cuda::device_ref{0}};
    cuda::device_memory_pool_ref pool = cuda::device_default_memory_pool(str.device());
    test_adjacent_find(cuda::execution::__cub_par_unseq.with_stream(str).with_memory_resource(pool), input);
  }
}
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
// template<class Policy, class InputIterator, class BinaryPredicate>
12+
// InputIterator adjacent_find(Policy policy,
13+
// InputIterator first,
14+
// InputIterator last,
15+
// BinaryPredicate pred)
16+
17+
#include <thrust/device_vector.h>
18+
#include <thrust/execution_policy.h>
19+
#include <thrust/sequence.h>
20+
21+
#include <cuda/iterator>
22+
#include <cuda/memory_pool>
23+
#include <cuda/std/__pstl_algorithm>
24+
#include <cuda/std/execution>
25+
#include <cuda/std/functional>
26+
#include <cuda/stream>
27+
28+
#include <testing.cuh>
29+
#include <utility.cuh>
30+
31+
#include "test_iterators.h"
32+
#include "test_macros.h"
33+
34+
inline constexpr int size = 100;
35+
36+
// Runs cuda::std::adjacent_find with an explicit comparator (greater<>) and
// the given execution policy over `input`, verifying the returned position.
// The fixture (see C2H_TEST below) makes greater<> first hold for the pair at
// indices 42/43.
template <class Policy>
void test_adjacent_find(const Policy& policy, const thrust::device_vector<int>& input)
{
  { // empty should not access anything
    auto res =
      cuda::std::adjacent_find(policy, static_cast<int*>(nullptr), static_cast<int*>(nullptr), cuda::std::greater<>{});
    CHECK(res == static_cast<int*>(nullptr));
  }

  { // contiguous input: compare iterators rather than dereferencing — this
    // pins the exact position and avoids dereferencing end() should the
    // algorithm (incorrectly) report no match
    auto res = cuda::std::adjacent_find(policy, input.begin(), input.end(), cuda::std::greater<>{});
    CHECK(res == input.begin() + 42);
  }

  { // non contiguous input
    auto* inptr = thrust::raw_pointer_cast(input.data());
    auto res = cuda::std::adjacent_find(
      policy, random_access_iterator{inptr}, random_access_iterator{inptr + size}, cuda::std::greater<>{});
    CHECK(res == random_access_iterator{inptr + 42});
  }
}
57+
58+
C2H_TEST("cuda::std::adjacent_find(Iter, Iter, comp)", "[parallel algorithm]")
{
  // Ascending sequence 1..size with one oversized element at index 42:
  // greater<> first holds for the pair (input[42], input[43]) = (1337, 44).
  thrust::device_vector<int> input(size);
  thrust::sequence(input.begin(), input.end(), 1);
  input[42] = 1337;

  SECTION("with default stream")
  {
    test_adjacent_find(cuda::execution::__cub_par_unseq, input);
  }

  SECTION("with provided stream")
  {
    cuda::stream str{cuda::device_ref{0}};
    test_adjacent_find(cuda::execution::__cub_par_unseq.with_stream(str), input);
  }

  SECTION("with provided memory_resource")
  {
    cuda::device_memory_pool_ref pool = cuda::device_default_memory_pool(cuda::device_ref{0});
    test_adjacent_find(cuda::execution::__cub_par_unseq.with_memory_resource(pool), input);
  }

  SECTION("with provided stream and memory_resource")
  {
    cuda::stream str{cuda::device_ref{0}};
    cuda::device_memory_pool_ref pool = cuda::device_default_memory_pool(str.device());
    test_adjacent_find(cuda::execution::__cub_par_unseq.with_stream(str).with_memory_resource(pool), input);
  }
}

0 commit comments

Comments
 (0)