|
3 | 3 |
|
4 | 4 | #include <cub/device/device_reduce.cuh> |
5 | 5 | #include <cub/device/dispatch/dispatch_streaming_reduce.cuh> |
| 6 | +#include <cub/device/dispatch/tuning/tuning_reduce.cuh> |
6 | 7 |
|
| 8 | +#include <cuda/__device/arch_id.h> |
7 | 9 | #include <cuda/std/limits> |
8 | 10 | #include <cuda/std/type_traits> |
9 | 11 |
|
|
14 | 16 | // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 |
15 | 17 |
|
#if !TUNE_BASE
// Policy selector passed to the streaming arg-reduce dispatch when tuning is enabled.
// Builds an agent policy from the %RANGE% tuning knobs (threads per block, items per
// thread, items per vectorized load) and returns a reduce_policy triple: the same
// policy is reused for the single-tile and full reduce paths, plus a third variant
// that switches the block algorithm to the nondeterministic warp-reduction flavor.
struct tuned_policy_selector
{
  [[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id) const -> cub::detail::reduce::reduce_policy
  {
    cub::detail::reduce::agent_reduce_policy rp{
      TUNE_THREADS_PER_BLOCK,
      TUNE_ITEMS_PER_THREAD,
      1 << TUNE_ITEMS_PER_VEC_LOAD_POW2, // items per vectorized load, swept as a power of two
      cub::BLOCK_REDUCE_WARP_REDUCTIONS,
      cub::LOAD_DEFAULT};
    // Same configuration, but with the nondeterministic block-reduce algorithm.
    auto rp_nondet            = rp;
    rp_nondet.block_algorithm = cub::BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC;
    return {rp, rp, rp_nondet};
  }
};
#endif // !TUNE_BASE
40 | 35 |
|
// Benchmark entry point for cub's streaming arg-reduce (ArgMin/ArgMax over T).
// Queries temp-storage size, allocates it once, then times the dispatch on the
// benchmark stream. When tuning (!TUNE_BASE), forwards tuned_policy_selector to
// the dispatch so the swept policy is used instead of the default tuning.
template <typename T, typename OpT>
void arg_reduce(nvbench::state& state, nvbench::type_list<T, OpT>)
{
  // Offset type used within the kernel and to index within one partition
  using per_partition_offset_t = int;

  // Offset type used to index within the total input in the range [d_in, d_in + num_items)
  using global_offset_t = ::cuda::std::int64_t;

  // Iterator providing the values being reduced
  using values_it_t = T*;

  // Type used for the final result
  using output_tuple_t = cub::KeyValuePair<global_offset_t, T>;

  // Neutral initial value for the comparison: ArgMin starts from max(), ArgMax from lowest()
  auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin>
                    ? ::cuda::std::numeric_limits<T>::max()
                    : ::cuda::std::numeric_limits<T>::lowest();

  // Retrieve axis parameters
  const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
  thrust::device_vector<T> in = generate(elements);
  thrust::device_vector<output_tuple_t> out(1);

  values_it_t d_in      = thrust::raw_pointer_cast(in.data());
  output_tuple_t* d_out = thrust::raw_pointer_cast(out.data());
  auto const num_items  = static_cast<global_offset_t>(elements);

  // Enable throughput calculations and add "Size" column to results.
  state.add_element_count(elements);
  state.add_global_memory_reads<T>(elements, "Size");
  state.add_global_memory_writes<output_tuple_t>(1);

  // First call with a null temp pointer only queries the required temp storage size.
  std::size_t temp_size;
  cub::detail::reduce::dispatch_streaming_arg_reduce<per_partition_offset_t>(
    nullptr,
    temp_size,
    d_in,
    d_out,
    num_items,
    OpT{},
    init,
    0 /* stream */
#if !TUNE_BASE
    ,
    tuned_policy_selector{}
#endif // TUNE_BASE
  );

  thrust::device_vector<nvbench::uint8_t> temp(temp_size);
  auto* temp_storage = thrust::raw_pointer_cast(temp.data());

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
    cub::detail::reduce::dispatch_streaming_arg_reduce<per_partition_offset_t>(
      temp_storage,
      temp_size,
      d_in,
      d_out,
      num_items,
      OpT{},
      init,
      launch.get_stream()
#if !TUNE_BASE
      ,
      tuned_policy_selector{}
#endif // TUNE_BASE
    );
  });
}
// The two arg-reduction operators benchmarked over every fundamental value type.
using op_types = nvbench::type_list<cub::ArgMin, cub::ArgMax>;

NVBENCH_BENCH_TYPES(arg_reduce, NVBENCH_TYPE_AXES(fundamental_types, op_types))
  .set_name("base")
  .set_type_axes_names({"T{ct}", "Operation{ct}"})
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));