-
Notifications
You must be signed in to change notification settings - Fork 359
Expand file tree
/
Copy pathcommon.h
More file actions
98 lines (89 loc) · 4.04 KB
/
common.h
File metadata and controls
98 lines (89 loc) · 4.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
#pragma once
// keep checks at the top so compilation of discarded variants fails really fast
#include <cub/device/dispatch/dispatch_transform.cuh>
#if !TUNE_BASE
# if _CCCL_PP_COUNT(__CUDA_ARCH_LIST__) != 1
# error "When tuning, this benchmark does not support being compiled for multiple architectures"
# endif
# if TUNE_ALGORITHM == 3
# if (__CUDA_ARCH_LIST__) < 900
# error "Cannot compile algorithm 3 (ublkcp) below sm90"
# endif
# endif // TUNE_ALGORITHM == 3
#endif // !TUNE_BASE
#include <cub/util_namespace.cuh>
#include <cuda/__numeric/narrow.h>
#include <cuda/std/type_traits>
#include <stdexcept>
#include <nvbench_helper.cuh>
#if !TUNE_BASE
// Selects the CUB transform policy for the single architecture this tuning
// variant was compiled for. The arch_id parameter is deliberately ignored:
// when tuning, compilation is restricted to exactly one architecture (enforced
// by the _CCCL_PP_COUNT check at the top of this file), so __CUDA_ARCH_LIST__
// holds that single value and is used directly.
struct policy_selector
{
  _CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::transform::transform_policy
  {
    // Start from the architecture's default bytes-in-flight and apply the tuning bias.
    const int min_bytes_in_flight =
      cub::detail::transform::arch_to_min_bytes_in_flight(::cuda::arch_id{__CUDA_ARCH_LIST__ / 10}) + TUNE_BIF_BIAS;
#  if TUNE_ALGORITHM == 0 || TUNE_ALGORITHM == 1
    // setup prefetch, since it's either used directly or the fallback to vectorized
    auto algorithm   = cub::detail::transform::Algorithm::prefetch;
    auto pref_policy = cub::detail::transform::prefetch_policy{};
    pref_policy.block_threads = TUNE_THREADS;
    pref_policy.unroll_factor = TUNE_UNROLL_FACTOR;
#    ifdef TUNE_PREFETCH_MULT
    pref_policy.prefetch_byte_stride = 32 * TUNE_PREFETCH_MULT;
#    endif // TUNE_PREFETCH_MULT
#    ifdef TUNE_ITEMS_PER_THREAD_NO_INPUT
    pref_policy.items_per_thread_no_input = TUNE_ITEMS_PER_THREAD_NO_INPUT;
#    endif // TUNE_ITEMS_PER_THREAD_NO_INPUT
    // setup vectorized if requested
    auto vec_policy = cub::detail::transform::vectorized_policy{};
#    if TUNE_ALGORITHM == 1
    algorithm                = cub::detail::transform::Algorithm::vectorized;
    vec_policy.block_threads = TUNE_THREADS;
    vec_policy.vec_size      = (1 << TUNE_VEC_SIZE_POW2);
    // Fix: the original referenced `policy.vec_size`, but no variable named
    // `policy` exists in this preprocessor branch (it is only declared in the
    // mutually exclusive TUNE_ALGORITHM == 2/3 branches below), which fails to
    // compile. The intended source is the vec_size set just above.
    vec_policy.items_per_thread = vec_policy.vec_size * TUNE_UNROLL_FACTOR;
#    endif
    return {min_bytes_in_flight, algorithm, pref_policy, vec_policy, {}};
#  elif TUNE_ALGORITHM == 2
    constexpr auto algorithm = cub::detail::transform::Algorithm::memcpy_async;
    auto policy              = cub::detail::transform::async_copy_policy{};
    policy.block_threads       = TUNE_THREADS;
    policy.bulk_copy_alignment = cub::detail::transform::ldgsts_size_and_align;
    policy.unroll_factor       = TUNE_UNROLL_FACTOR;
    return {min_bytes_in_flight, algorithm, {}, {}, policy};
#  elif TUNE_ALGORITHM == 3
    constexpr auto algorithm = cub::detail::transform::Algorithm::ublkcp;
    auto policy              = cub::detail::transform::async_copy_policy{};
    policy.block_threads = TUNE_THREADS;
    // Alignment depends on the target architecture (sm90+ only, checked at the top of this file).
    policy.bulk_copy_alignment = cub::detail::transform::bulk_copy_alignment(::cuda::arch_id{__CUDA_ARCH_LIST__ / 10});
    policy.unroll_factor       = TUNE_UNROLL_FACTOR;
    return {min_bytes_in_flight, algorithm, {}, {}, policy};
#  else // TUNE_ALGORITHM
#    error Policy hub does not yet implement the specified value for algorithm
#  endif // TUNE_ALGORITHM
  }
};
#endif // !TUNE_BASE
// Benchmark driver: runs cub::DeviceTransform::Transform on the given inputs
// inside nvbench's measured region. When tuning (!TUNE_BASE), the tuned policy
// is injected via the execution environment so CUB uses the parameters under
// test instead of its defaults.
template <typename OffsetT, typename... RandomAccessIteratorsIn, typename RandomAccessIteratorOut, typename TransformOp>
void bench_transform(nvbench::state& state,
                     ::cuda::std::tuple<RandomAccessIteratorsIn...> inputs,
                     RandomAccessIteratorOut output,
                     OffsetT num_items,
                     TransformOp transform_op)
{
  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](const nvbench::launch& launch) {
    // Run on the stream nvbench hands us for this measurement.
    const ::cuda::stream_ref stream{launch.get_stream().get_stream()};
#if TUNE_BASE
    const auto env = cuda::std::execution::env{stream};
#else // !TUNE_BASE
    const auto env = cuda::std::execution::env{stream, cuda::execution::__tune(policy_selector{})};
#endif // TUNE_BASE
    cub::DeviceTransform::Transform(inputs, output, num_items, transform_op, env);
  });
}