Skip to content

Commit 821523a

Browse files
Implement the new tuning API for detail::reduce::dispatch_streaming_arg_reduce_t
Fixes: #7645
1 parent b189da8 commit 821523a

File tree

3 files changed

+290
-354
lines changed

3 files changed

+290
-354
lines changed

cub/benchmarks/bench/reduce/arg_extrema.cu

Lines changed: 88 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33

44
#include <cub/device/device_reduce.cuh>
55
#include <cub/device/dispatch/dispatch_streaming_reduce.cuh>
6+
#include <cub/device/dispatch/tuning/tuning_reduce.cuh>
67

8+
#include <cuda/__device/arch_id.h>
79
#include <cuda/std/limits>
810
#include <cuda/std/type_traits>
911

@@ -14,100 +16,97 @@
1416
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
1517

1618
#if !TUNE_BASE
17-
# error "Cannot tune until https://github.com/NVIDIA/cccl/pull/7807 is merged"
18-
# define TUNE_ITEMS_PER_VEC_LOAD (1 << TUNE_ITEMS_PER_VEC_LOAD_POW2)
19-
template <typename AccumT, typename OffsetT>
20-
struct policy_hub_t
19+
struct tuned_policy_selector
2120
{
22-
struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
21+
[[nodiscard]] _CCCL_API constexpr auto operator()(::cuda::arch_id) const -> cub::detail::reduce::reduce_policy
2322
{
24-
static constexpr int threads_per_block = TUNE_THREADS_PER_BLOCK;
25-
static constexpr int items_per_thread = TUNE_ITEMS_PER_THREAD;
26-
static constexpr int items_per_vec_load = TUNE_ITEMS_PER_VEC_LOAD;
27-
28-
using ReducePolicy =
29-
cub::AgentReducePolicy<threads_per_block,
30-
items_per_thread,
31-
AccumT,
32-
items_per_vec_load,
33-
cub::BLOCK_REDUCE_WARP_REDUCTIONS,
34-
cub::LOAD_DEFAULT>;
35-
36-
using SingleTilePolicy = ReducePolicy;
37-
using SegmentedReducePolicy = ReducePolicy;
38-
};
23+
cub::detail::reduce::agent_reduce_policy rp{
24+
TUNE_THREADS_PER_BLOCK,
25+
TUNE_ITEMS_PER_THREAD,
26+
1 << TUNE_ITEMS_PER_VEC_LOAD_POW2,
27+
cub::BLOCK_REDUCE_WARP_REDUCTIONS,
28+
cub::LOAD_DEFAULT};
29+
auto rp_nondet = rp;
30+
rp_nondet.block_algorithm = cub::BLOCK_REDUCE_WARP_REDUCTIONS_NONDETERMINISTIC;
31+
return {rp, rp, rp_nondet};
32+
}
33+
};
3934
#endif // !TUNE_BASE
4035

41-
template <typename T, typename OpT>
42-
void arg_reduce(nvbench::state& state, nvbench::type_list<T, OpT>)
43-
{
44-
// Offset type used within the kernel and to index within one partition
45-
using per_partition_offset_t = int;
46-
47-
// Offset type used to index within the total input in the range [d_in, d_in + num_items)
48-
using global_offset_t = ::cuda::std::int64_t;
49-
50-
// The value type of the KeyValuePair<global_offset_t, output_value_t> returned by the ArgIndexInputIterator
51-
using output_value_t = T;
52-
53-
// Iterator providing the values being reduced
54-
using values_it_t = T*;
55-
56-
// Iterator providing the input items for the reduction
57-
using input_it_t = values_it_t;
58-
59-
// Type used for the final result
60-
using output_tuple_t = cub::KeyValuePair<global_offset_t, T>;
61-
62-
auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin>
63-
? ::cuda::std::numeric_limits<T>::max()
64-
: ::cuda::std::numeric_limits<T>::lowest();
65-
36+
template <typename T, typename OpT>
37+
void arg_reduce(nvbench::state& state, nvbench::type_list<T, OpT>)
38+
{
39+
// Offset type used within the kernel and to index within one partition
40+
using per_partition_offset_t = int;
41+
42+
// Offset type used to index within the total input in the range [d_in, d_in + num_items)
43+
using global_offset_t = ::cuda::std::int64_t;
44+
45+
// Iterator providing the values being reduced
46+
using values_it_t = T*;
47+
48+
// Type used for the final result
49+
using output_tuple_t = cub::KeyValuePair<global_offset_t, T>;
50+
51+
auto const init = ::cuda::std::is_same_v<OpT, cub::ArgMin>
52+
? ::cuda::std::numeric_limits<T>::max()
53+
: ::cuda::std::numeric_limits<T>::lowest();
54+
55+
// Retrieve axis parameters
56+
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
57+
thrust::device_vector<T> in = generate(elements);
58+
thrust::device_vector<output_tuple_t> out(1);
59+
60+
values_it_t d_in = thrust::raw_pointer_cast(in.data());
61+
output_tuple_t* d_out = thrust::raw_pointer_cast(out.data());
62+
auto const num_items = static_cast<global_offset_t>(elements);
63+
64+
// Enable throughput calculations and add "Size" column to results.
65+
state.add_element_count(elements);
66+
state.add_global_memory_reads<T>(elements, "Size");
67+
state.add_global_memory_writes<output_tuple_t>(1);
68+
69+
// Allocate temporary storage
70+
std::size_t temp_size;
71+
cub::detail::reduce::dispatch_streaming_arg_reduce<per_partition_offset_t>(
72+
nullptr,
73+
temp_size,
74+
d_in,
75+
d_out,
76+
num_items,
77+
OpT{},
78+
init,
79+
0 /* stream */
6680
#if !TUNE_BASE
67-
using policy_t = policy_hub_t<output_tuple_t, per_partition_offset_t>;
68-
using dispatch_t = cub::detail::reduce::dispatch_streaming_arg_reduce_t<
69-
input_it_t,
70-
output_tuple_t*,
71-
per_partition_offset_t,
72-
global_offset_t,
73-
OpT,
74-
T,
75-
policy_t>;
76-
#else // TUNE_BASE
77-
using dispatch_t = cub::detail::reduce::
78-
dispatch_streaming_arg_reduce_t<input_it_t, output_tuple_t*, per_partition_offset_t, global_offset_t, OpT, T>;
81+
,
82+
tuned_policy_selector{}
7983
#endif // TUNE_BASE
84+
);
85+
86+
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
87+
auto* temp_storage = thrust::raw_pointer_cast(temp.data());
88+
89+
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
90+
cub::detail::reduce::dispatch_streaming_arg_reduce<per_partition_offset_t>(
91+
temp_storage,
92+
temp_size,
93+
d_in,
94+
d_out,
95+
num_items,
96+
OpT{},
97+
init,
98+
launch.get_stream()
99+
#if !TUNE_BASE
100+
,
101+
tuned_policy_selector{}
102+
#endif // TUNE_BASE
103+
);
104+
});
105+
}
80106

81-
// Retrieve axis parameters
82-
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
83-
thrust::device_vector<T> in = generate(elements);
84-
thrust::device_vector<output_tuple_t> out(1);
85-
86-
values_it_t d_in = thrust::raw_pointer_cast(in.data());
87-
output_tuple_t* d_out = thrust::raw_pointer_cast(out.data());
88-
89-
// Enable throughput calculations and add "Size" column to results.
90-
state.add_element_count(elements);
91-
state.add_global_memory_reads<T>(elements, "Size");
92-
state.add_global_memory_writes<output_tuple_t>(1);
93-
94-
// Allocate temporary storage:
95-
std::size_t temp_size;
96-
dispatch_t::Dispatch(
97-
nullptr, temp_size, d_in, d_out, static_cast<global_offset_t>(elements), OpT{}, init, 0 /* stream */);
98-
99-
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
100-
auto* temp_storage = thrust::raw_pointer_cast(temp.data());
101-
102-
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
103-
dispatch_t::Dispatch(
104-
temp_storage, temp_size, d_in, d_out, static_cast<global_offset_t>(elements), OpT{}, init, launch.get_stream());
105-
});
106-
}
107-
108-
using op_types = nvbench::type_list<cub::ArgMin, cub::ArgMax>;
107+
using op_types = nvbench::type_list<cub::ArgMin, cub::ArgMax>;
109108

110-
NVBENCH_BENCH_TYPES(arg_reduce, NVBENCH_TYPE_AXES(fundamental_types, op_types))
111-
.set_name("base")
112-
.set_type_axes_names({"T{ct}", "Operation{ct}"})
113-
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
109+
NVBENCH_BENCH_TYPES(arg_reduce, NVBENCH_TYPE_AXES(fundamental_types, op_types))
110+
.set_name("base")
111+
.set_type_axes_names({"T{ct}", "Operation{ct}"})
112+
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));

0 commit comments

Comments (0)