Skip to content

Commit d27508b

Browse files
authored
Merge branch 'main' into stf_remove_unused_exec_affinity_include
2 parents bbb6f73 + 2e90f09 commit d27508b

File tree

5 files changed

+347
-0
lines changed

5 files changed

+347
-0
lines changed
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDA Experimental in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <thrust/device_vector.h>
12+
#include <thrust/sequence.h>
13+
14+
#include <cuda/memory_pool>
15+
#include <cuda/std/__pstl_algorithm>
16+
#include <cuda/std/algorithm>
17+
#include <cuda/stream>
18+
19+
#include "nvbench_helper.cuh"
20+
21+
// Benchmark cuda::std::adjacent_find with the default equality predicate.
// The "MismatchAt" axis controls the relative position of the first equal
// adjacent pair, exercising the algorithm's early-exit behavior.
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  const auto num_elements   = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto mismatch_ratio = state.get_float64("MismatchAt");
  const auto mismatch_idx   = cuda::std::clamp<std::size_t>(num_elements * mismatch_ratio, 0, num_elements - 2);

  // A strictly increasing sequence has no equal neighbors; duplicate one value
  // so the first equal pair sits exactly at mismatch_idx.
  thrust::device_vector<T> data(num_elements, thrust::no_init);
  thrust::sequence(data.begin(), data.end(), 0);
  data[mismatch_idx] = data[mismatch_idx + 1];

  // Account only the prefix up to the match as read (the search stops there).
  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(mismatch_idx);
  state.add_global_memory_writes<T>(0);

  caching_allocator_t alloc;
  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               do_not_optimize(cuda::std::adjacent_find(cuda_policy(alloc, launch), data.cbegin(), data.cend()));
             });
}

NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
  .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.01});
48+
49+
// Benchmark cuda::std::adjacent_find with an explicit comparator (greater<T>).
// The "MismatchAt" axis controls the relative position of the first pair that
// satisfies the predicate, exercising the algorithm's early-exit behavior.
template <typename T>
static void with_comp(nvbench::state& state, nvbench::type_list<T>)
{
  const auto elements       = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto common_prefix  = state.get_float64("MismatchAt");
  const auto mismatch_point = cuda::std::clamp<std::size_t>(elements * common_prefix, 0, elements - 2);

  thrust::device_vector<T> in(elements, thrust::no_init);
  thrust::sequence(in.begin(), in.end(), 0);
  // The predicate is greater<T>, so an *equal* adjacent pair (as used by the
  // "basic" benchmark) would never match and the whole range would always be
  // scanned, invalidating the read accounting below. Swap the two neighbors
  // instead so that in[mismatch_point] > in[mismatch_point + 1] and the search
  // stops at mismatch_point as intended.
  const T left_value      = in[mismatch_point];
  in[mismatch_point]      = in[mismatch_point + 1];
  in[mismatch_point + 1]  = left_value;

  // Account only the prefix up to the first matching pair as read.
  state.add_element_count(elements);
  state.add_global_memory_reads<T>(mismatch_point);
  state.add_global_memory_writes<T>(0);

  caching_allocator_t alloc;
  state.exec(
    nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
      do_not_optimize(
        cuda::std::adjacent_find(cuda_policy(alloc, launch), in.cbegin(), in.cend(), ::cuda::std::greater<T>{}));
    });
}

NVBENCH_BENCH_TYPES(with_comp, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("with_comp")
  .set_type_axes_names({"T{ct}"})
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
  .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.01});
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___PSTL_ADJACENT_FIND_H
12+
#define _CUDA_STD___PSTL_ADJACENT_FIND_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#if !_CCCL_COMPILER(NVRTC)
25+
26+
# include <cuda/__iterator/zip_function.h>
27+
# include <cuda/__iterator/zip_iterator.h>
28+
# include <cuda/__nvtx/nvtx.h>
29+
# include <cuda/std/__algorithm/adjacent_find.h>
30+
# include <cuda/std/__concepts/concept_macros.h>
31+
# include <cuda/std/__execution/policy.h>
32+
# include <cuda/std/__functional/operations.h>
33+
# include <cuda/std/__iterator/concepts.h>
34+
# include <cuda/std/__iterator/next.h>
35+
# include <cuda/std/__iterator/prev.h>
36+
# include <cuda/std/__pstl/dispatch.h>
37+
# include <cuda/std/__type_traits/always_false.h>
38+
# include <cuda/std/__type_traits/is_execution_policy.h>
39+
# include <cuda/std/__utility/move.h>
40+
41+
# if _CCCL_HAS_BACKEND_CUDA()
42+
# include <cuda/std/__pstl/cuda/find_if.h>
43+
# endif // _CCCL_HAS_BACKEND_CUDA()
44+
45+
# include <cuda/std/__cccl/prologue.h>
46+
47+
_CCCL_BEGIN_NAMESPACE_CUDA_STD
48+
49+
_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
50+
51+
// Parallel adjacent_find: returns the first iterator __it in [__first, __last)
// for which __pred(*__it, *(__it + 1)) holds, or __last if no such pair of
// neighboring elements exists. __pred defaults to equality.
_CCCL_TEMPLATE(class _Policy,
               class _InputIterator,
               class _BinaryPredicate = ::cuda::std::equal_to<iter_value_t<_InputIterator>>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>)
[[nodiscard]] _CCCL_HOST_API _InputIterator adjacent_find(
  [[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _BinaryPredicate __pred = {})
{
  // adjacent_find is implemented as a parallel find_if over pairs of
  // neighboring elements; select the find_if backend for the given policy.
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__find_if, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::adjacent_find");

    // An empty range has no adjacent pair.
    if (__first == __last)
    {
      return __first;
    }

    // Zip each element with its successor: the zipped range holds the pairs
    // (__first, __first + 1), ..., (__last - 2, __last - 1); zip_function
    // unpacks each tuple of neighbors into __pred.
    const auto __zipped_first = ::cuda::zip_iterator{__first, ::cuda::std::next(__first)};
    const auto __zipped_last  = ::cuda::zip_iterator{::cuda::std::prev(__last), __last};
    auto __zipped_ret = __dispatch(__policy, __zipped_first, __zipped_last, ::cuda::zip_function{::cuda::std::move(__pred)});
    // When no pair matches (this includes a one-element input range, whose
    // zipped range is empty), find_if yields the end of the zipped range,
    // whose first component is prev(__last) — but adjacent_find must return
    // __last in that case.
    if (__zipped_ret == __zipped_last)
    {
      return __last;
    }
    return ::cuda::std::get<0>(__zipped_ret.__iterators());
  }
  else
  {
    // No parallel backend was enabled at configure time: fail loudly at
    // compile time. The serial call below is never reached; it only keeps the
    // function body well-formed for return-type purposes.
    static_assert(__always_false_v<_Policy>,
                  "Parallel cuda::std::adjacent_find requires at least one selected backend");
    return ::cuda::std::adjacent_find(::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred));
  }
}
83+
84+
_CCCL_END_NAMESPACE_ARCH_DEPENDENT
85+
86+
_CCCL_END_NAMESPACE_CUDA_STD
87+
88+
# include <cuda/std/__cccl/epilogue.h>
89+
90+
#endif // !_CCCL_COMPILER(NVRTC)
91+
92+
#endif // _CUDA_STD___PSTL_ADJACENT_FIND_H

libcudacxx/include/cuda/std/__pstl_algorithm

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#endif // no system header
2323

2424
#include <cuda/std/__pstl/adjacent_difference.h>
25+
#include <cuda/std/__pstl/adjacent_find.h>
2526
#include <cuda/std/__pstl/all_of.h>
2627
#include <cuda/std/__pstl/any_of.h>
2728
#include <cuda/std/__pstl/copy.h>
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
// template<class Policy, class InputIterator, class BinaryPredicate>
12+
// InputIterator adjacent_find(Policy policy,
13+
// InputIterator first,
14+
// InputIterator last,
15+
// BinaryPredicate pred)
16+
17+
#include <thrust/device_vector.h>
18+
#include <thrust/sequence.h>
19+
20+
#include <cuda/memory_pool>
21+
#include <cuda/std/__pstl_algorithm>
22+
#include <cuda/std/execution>
23+
#include <cuda/std/functional>
24+
#include <cuda/stream>
25+
26+
#include <testing.cuh>
27+
#include <utility.cuh>
28+
29+
#include "test_iterators.h"
30+
#include "test_macros.h"
31+
32+
inline constexpr int size = 1000;
33+
34+
// Runs cuda::std::adjacent_find (default equality predicate) with the given
// execution policy over `input` and verifies the returned position. The
// fixture (see C2H_TEST below) places the first equal adjacent pair at
// indices 41/42.
template <class Policy>
void test_adjacent_find(const Policy& policy, const thrust::device_vector<int>& input)
{
  { // empty should not access anything
    auto res = cuda::std::adjacent_find(policy, static_cast<int*>(nullptr), static_cast<int*>(nullptr));
    CHECK(res == static_cast<int*>(nullptr));
  }

  { // contiguous input: compare iterators, not values — the two elements of
    // the matching pair are equal, so a value comparison could not distinguish
    // a correct result of 41 from an incorrect result of 42
    auto res = cuda::std::adjacent_find(policy, input.begin(), input.end());
    CHECK(res == input.begin() + 41);
  }

  { // non contiguous input
    auto* inptr = thrust::raw_pointer_cast(input.data());
    auto res = cuda::std::adjacent_find(policy, random_access_iterator{inptr}, random_access_iterator{inptr + size});
    CHECK(res == random_access_iterator{inptr + 41});
  }
}
53+
54+
C2H_TEST("cuda::std::adjacent_find(Iter, Iter)", "[parallel algorithm]")
{
  // Ascending sequence 1..size with one duplicated value: input[41] == input[42],
  // so the first equal adjacent pair sits at index 41.
  thrust::device_vector<int> input(size);
  thrust::sequence(input.begin(), input.end(), 1);
  input[42] = 42;

  SECTION("with default stream")
  {
    test_adjacent_find(cuda::execution::__cub_par_unseq, input);
  }

  SECTION("with provided stream")
  {
    cuda::stream str{cuda::device_ref{0}};
    test_adjacent_find(cuda::execution::__cub_par_unseq.with_stream(str), input);
  }

  SECTION("with provided memory_resource")
  {
    cuda::device_memory_pool_ref pool = cuda::device_default_memory_pool(cuda::device_ref{0});
    test_adjacent_find(cuda::execution::__cub_par_unseq.with_memory_resource(pool), input);
  }

  SECTION("with provided stream and memory_resource")
  {
    cuda::stream str{cuda::device_ref{0}};
    cuda::device_memory_pool_ref pool = cuda::device_default_memory_pool(str.device());
    test_adjacent_find(cuda::execution::__cub_par_unseq.with_stream(str).with_memory_resource(pool), input);
  }
}
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
// template<class Policy, class InputIterator, class BinaryPredicate>
12+
// InputIterator adjacent_find(Policy policy,
13+
// InputIterator first,
14+
// InputIterator last,
15+
// BinaryPredicate pred)
16+
17+
#include <thrust/device_vector.h>
18+
#include <thrust/execution_policy.h>
19+
#include <thrust/sequence.h>
20+
21+
#include <cuda/iterator>
22+
#include <cuda/memory_pool>
23+
#include <cuda/std/__pstl_algorithm>
24+
#include <cuda/std/execution>
25+
#include <cuda/std/functional>
26+
#include <cuda/stream>
27+
28+
#include <testing.cuh>
29+
#include <utility.cuh>
30+
31+
#include "test_iterators.h"
32+
#include "test_macros.h"
33+
34+
inline constexpr int size = 100;
35+
36+
// Runs cuda::std::adjacent_find with an explicit comparator (greater<>) and
// the given execution policy over `input`, verifying the returned position.
// The fixture (see C2H_TEST below) makes greater<> first hold for the pair at
// indices 42/43.
template <class Policy>
void test_adjacent_find(const Policy& policy, const thrust::device_vector<int>& input)
{
  { // empty should not access anything
    auto res =
      cuda::std::adjacent_find(policy, static_cast<int*>(nullptr), static_cast<int*>(nullptr), cuda::std::greater<>{});
    CHECK(res == static_cast<int*>(nullptr));
  }

  { // contiguous input: compare iterators rather than dereferencing — this
    // pins the exact position and avoids dereferencing end() should the
    // algorithm (incorrectly) report no match
    auto res = cuda::std::adjacent_find(policy, input.begin(), input.end(), cuda::std::greater<>{});
    CHECK(res == input.begin() + 42);
  }

  { // non contiguous input
    auto* inptr = thrust::raw_pointer_cast(input.data());
    auto res = cuda::std::adjacent_find(
      policy, random_access_iterator{inptr}, random_access_iterator{inptr + size}, cuda::std::greater<>{});
    CHECK(res == random_access_iterator{inptr + 42});
  }
}
57+
58+
C2H_TEST("cuda::std::adjacent_find(Iter, Iter, comp)", "[parallel algorithm]")
{
  // Ascending sequence 1..size with one oversized element at index 42:
  // greater<> first holds for the pair (input[42], input[43]) = (1337, 44).
  thrust::device_vector<int> input(size);
  thrust::sequence(input.begin(), input.end(), 1);
  input[42] = 1337;

  SECTION("with default stream")
  {
    test_adjacent_find(cuda::execution::__cub_par_unseq, input);
  }

  SECTION("with provided stream")
  {
    cuda::stream str{cuda::device_ref{0}};
    test_adjacent_find(cuda::execution::__cub_par_unseq.with_stream(str), input);
  }

  SECTION("with provided memory_resource")
  {
    cuda::device_memory_pool_ref pool = cuda::device_default_memory_pool(cuda::device_ref{0});
    test_adjacent_find(cuda::execution::__cub_par_unseq.with_memory_resource(pool), input);
  }

  SECTION("with provided stream and memory_resource")
  {
    cuda::stream str{cuda::device_ref{0}};
    cuda::device_memory_pool_ref pool = cuda::device_default_memory_pool(str.device());
    test_adjacent_find(cuda::execution::__cub_par_unseq.with_stream(str).with_memory_resource(pool), input);
  }
}

0 commit comments

Comments
 (0)