-
Notifications
You must be signed in to change notification settings - Fork 359
Expand file tree
/
Copy pathcommon.h
More file actions
98 lines (89 loc) · 4.04 KB
/
common.h
File metadata and controls
98 lines (89 loc) · 4.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
#pragma once
// keep checks at the top so compilation of discarded variants fails really fast
#include <cub/device/dispatch/dispatch_transform.cuh>
#if !TUNE_BASE
# if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
# error "When tuning, this benchmark does not support being compiled for multiple architectures"
# endif
# if TUNE_ALGORITHM == 3
# if (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 3 (ublkcp) below sm90"
# endif
# endif // TUNE_ALGORITHM == 3
#endif // !TUNE_BASE
#include <cub/util_namespace.cuh>
#include <cuda/__numeric/narrow.h>
#include <cuda/std/type_traits>
#include <stdexcept>
#include <nvbench_helper.cuh>
#if !TUNE_BASE
// Selects the CUB transform policy for the single architecture this tuning
// variant was compiled for. The arch_id parameter is deliberately ignored:
// when tuning, compilation is restricted to exactly one architecture (enforced
// by the _CCCL_PP_COUNT check at the top of this file), so __CUDA_ARCH_LIST__
// holds that single value and is used directly.
struct policy_selector
{
  _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::transform::transform_policy
  {
    // Start from the architecture's default bytes-in-flight and apply the tuning bias.
    const int min_bytes_in_flight =
      cub::detail::transform::arch_to_min_bytes_in_flight(::cuda::arch_id{__CUDA_ARCH_LIST__ / 10}) + TUNE_BIF_BIAS;
#  if TUNE_ALGORITHM == 0 || TUNE_ALGORITHM == 1
    // setup prefetch, since it's either used directly or the fallback to vectorized
    auto algorithm   = cub::detail::transform::Algorithm::prefetch;
    auto pref_policy = cub::detail::transform::prefetch_policy{};
    pref_policy.block_threads = TUNE_THREADS;
    pref_policy.unroll_factor = TUNE_UNROLL_FACTOR;
#    ifdef TUNE_PREFETCH_MULT
    pref_policy.prefetch_byte_stride = 32 * TUNE_PREFETCH_MULT;
#    endif // TUNE_PREFETCH_MULT
#    ifdef TUNE_ITEMS_PER_THREAD_NO_INPUT
    pref_policy.items_per_thread_no_input = TUNE_ITEMS_PER_THREAD_NO_INPUT;
#    endif // TUNE_ITEMS_PER_THREAD_NO_INPUT
    // setup vectorized if requested
    auto vec_policy = cub::detail::transform::vectorized_policy{};
#    if TUNE_ALGORITHM == 1
    algorithm                = cub::detail::transform::Algorithm::vectorized;
    vec_policy.block_threads = TUNE_THREADS;
    vec_policy.vec_size      = (1 << TUNE_VEC_SIZE_POW2);
    // Fix: the original referenced `policy.vec_size`, but no variable named
    // `policy` exists in this preprocessor branch (it is only declared in the
    // mutually exclusive TUNE_ALGORITHM == 2/3 branches below), which fails to
    // compile. The intended source is the vec_size set just above.
    vec_policy.items_per_thread = vec_policy.vec_size * TUNE_UNROLL_FACTOR;
#    endif
    return {min_bytes_in_flight, algorithm, pref_policy, vec_policy, {}};
#  elif TUNE_ALGORITHM == 2
    constexpr auto algorithm = cub::detail::transform::Algorithm::memcpy_async;
    auto policy              = cub::detail::transform::async_copy_policy{};
    policy.block_threads       = TUNE_THREADS;
    policy.bulk_copy_alignment = cub::detail::transform::ldgsts_size_and_align;
    policy.unroll_factor       = TUNE_UNROLL_FACTOR;
    return {min_bytes_in_flight, algorithm, {}, {}, policy};
#  elif TUNE_ALGORITHM == 3
    constexpr auto algorithm = cub::detail::transform::Algorithm::ublkcp;
    auto policy              = cub::detail::transform::async_copy_policy{};
    policy.block_threads = TUNE_THREADS;
    // Alignment depends on the target architecture (sm90+ only, checked at the top of this file).
    policy.bulk_copy_alignment = cub::detail::transform::bulk_copy_alignment(::cuda::arch_id{__CUDA_ARCH_LIST__ / 10});
    policy.unroll_factor       = TUNE_UNROLL_FACTOR;
    return {min_bytes_in_flight, algorithm, {}, {}, policy};
#  else // TUNE_ALGORITHM
#    error Policy hub does not yet implement the specified value for algorithm
#  endif // TUNE_ALGORITHM
  }
};
#endif // !TUNE_BASE
// Benchmark driver: runs cub::DeviceTransform::Transform on the given inputs
// inside nvbench's measured region. When tuning (!TUNE_BASE), the tuned policy
// is injected via the execution environment so CUB uses the parameters under
// test instead of its defaults.
template <typename OffsetT, typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
void bench_transform(nvbench::state& state,
                     ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
                     RandomAccessIteratorOut output,
                     OffsetT num_items,
                     TransformOp transform_op)
{
  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](const nvbench::launch& launch) {
    // Run on the stream nvbench hands us for this measurement.
    const ::cuda::stream_ref stream{launch.get_stream().get_stream()};
#if TUNE_BASE
    const auto env = cuda::std::execution::env{stream};
#else // !TUNE_BASE
    const auto env = cuda::std::execution::env{stream, cuda::execution::__tune(policy_selector{})};
#endif // TUNE_BASE
    cub::DeviceTransform::Transform(inputs, output, num_items, transform_op, env);
  });
}