Skip to content

Commit d5b60e6

Browse files
Implement the new tuning API for DeviceAdjacentDifference part 2 (#7918)
The work in #7524 was incomplete. Here is the rest. Fixes: #7522 Co-authored-by: Nader Al Awar <naderalawar@gmail.com>
1 parent 18025d6 commit d5b60e6

File tree

4 files changed

+55
-88
lines changed

4 files changed

+55
-88
lines changed

cub/benchmarks/bench/adjacent_difference/subtract_left.cu

Lines changed: 37 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -9,19 +9,17 @@
99
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
1010

1111
#if !TUNE_BASE
12-
struct policy_hub_t
12+
struct policy_selector_t
1313
{
14-
struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy500>
14+
_CCCL_API constexpr auto operator()(::cuda::arch_id) const
15+
-> cub::detail::adjacent_difference::adjacent_difference_policy
1516
{
16-
using AdjacentDifferencePolicy =
17-
cub::AgentAdjacentDifferencePolicy<TUNE_THREADS_PER_BLOCK,
18-
TUNE_ITEMS_PER_THREAD,
19-
cub::BLOCK_LOAD_WARP_TRANSPOSE,
20-
cub::LOAD_CA,
21-
cub::BLOCK_STORE_WARP_TRANSPOSE>;
22-
};
23-
24-
using MaxPolicy = Policy500;
17+
return {TUNE_THREADS_PER_BLOCK,
18+
TUNE_ITEMS_PER_THREAD,
19+
cub::BLOCK_LOAD_WARP_TRANSPOSE,
20+
cub::LOAD_CA,
21+
cub::BLOCK_STORE_WARP_TRANSPOSE};
22+
}
2523
};
2624
#endif // !TUNE_BASE
2725

@@ -33,25 +31,6 @@ void left(nvbench::state& state, nvbench::type_list<T, OffsetT>)
3331
using difference_op_t = ::cuda::std::minus<>;
3432
using offset_t = cub::detail::choose_offset_t<OffsetT>;
3533

36-
#if !TUNE_BASE
37-
using dispatch_t = cub::DispatchAdjacentDifference<
38-
input_it_t,
39-
output_it_t,
40-
difference_op_t,
41-
offset_t,
42-
cub::MayAlias::No,
43-
cub::ReadOption::Left,
44-
policy_hub_t>;
45-
#else
46-
using dispatch_t = cub::DispatchAdjacentDifference<
47-
input_it_t,
48-
output_it_t,
49-
difference_op_t,
50-
offset_t,
51-
cub::MayAlias::No,
52-
cub::ReadOption::Left>;
53-
#endif // TUNE_BASE
54-
5534
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
5635
thrust::device_vector<T> in = generate(elements);
5736
thrust::device_vector<T> out(elements);
@@ -64,20 +43,40 @@ void left(nvbench::state& state, nvbench::type_list<T, OffsetT>)
6443
state.add_global_memory_writes<T>(elements);
6544

6645
std::size_t temp_storage_bytes{};
67-
dispatch_t::Dispatch(nullptr, temp_storage_bytes, d_in, d_out, static_cast<offset_t>(elements), difference_op_t{}, 0);
6846

69-
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
70-
std::uint8_t* d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
71-
72-
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
73-
dispatch_t::Dispatch(
74-
d_temp_storage,
47+
cub::detail::adjacent_difference::
48+
dispatch<cub::MayAlias::No, cub::ReadOption::Left, input_it_t, output_it_t, offset_t, difference_op_t>(
49+
nullptr,
7550
temp_storage_bytes,
7651
d_in,
7752
d_out,
7853
static_cast<offset_t>(elements),
7954
difference_op_t{},
80-
launch.get_stream());
55+
0
56+
#if !TUNE_BASE
57+
,
58+
policy_selector_t{}
59+
#endif // TUNE_BASE
60+
);
61+
62+
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes, thrust::no_init);
63+
std::uint8_t* d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
64+
65+
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
66+
cub::detail::adjacent_difference::
67+
dispatch<cub::MayAlias::No, cub::ReadOption::Left, input_it_t, output_it_t, offset_t, difference_op_t>(
68+
d_temp_storage,
69+
temp_storage_bytes,
70+
d_in,
71+
d_out,
72+
static_cast<offset_t>(elements),
73+
difference_op_t{},
74+
launch.get_stream()
75+
#if !TUNE_BASE
76+
,
77+
policy_selector_t{}
78+
#endif // TUNE_BASE
79+
);
8180
});
8281
}
8382

cub/cub/device/device_adjacent_difference.cuh

Lines changed: 12 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -84,32 +84,6 @@ CUB_NAMESPACE_BEGIN
8484
//! @endrst
8585
struct DeviceAdjacentDifference
8686
{
87-
private:
88-
template <MayAlias AliasOpt,
89-
ReadOption ReadOpt,
90-
typename NumItemsT,
91-
typename InputIteratorT,
92-
typename OutputIteratorT,
93-
typename DifferenceOpT>
94-
static CUB_RUNTIME_FUNCTION cudaError_t AdjacentDifference(
95-
void* d_temp_storage,
96-
size_t& temp_storage_bytes,
97-
InputIteratorT d_input,
98-
OutputIteratorT d_output,
99-
NumItemsT num_items,
100-
DifferenceOpT difference_op,
101-
cudaStream_t stream)
102-
{
103-
using OffsetT = detail::choose_offset_t<NumItemsT>;
104-
105-
using DispatchT =
106-
DispatchAdjacentDifference<InputIteratorT, OutputIteratorT, DifferenceOpT, OffsetT, AliasOpt, ReadOpt>;
107-
108-
return DispatchT::Dispatch(
109-
d_temp_storage, temp_storage_bytes, d_input, d_output, static_cast<OffsetT>(num_items), difference_op, stream);
110-
}
111-
112-
public:
11387
//! @rst
11488
//! Subtracts the left element of each adjacent pair of elements residing within device-accessible memory
11589
//!
@@ -234,9 +208,9 @@ public:
234208
cudaStream_t stream = 0)
235209
{
236210
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractLeftCopy");
237-
238-
return AdjacentDifference<MayAlias::No, ReadOption::Left>(
239-
d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream);
211+
using OffsetT = detail::choose_offset_t<NumItemsT>;
212+
return detail::adjacent_difference::dispatch<MayAlias::No, ReadOption::Left>(
213+
d_temp_storage, temp_storage_bytes, d_input, d_output, static_cast<OffsetT>(num_items), difference_op, stream);
240214
}
241215

242216
//! @rst
@@ -345,9 +319,9 @@ public:
345319
cudaStream_t stream = 0)
346320
{
347321
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractLeft");
348-
349-
return AdjacentDifference<MayAlias::Yes, ReadOption::Left>(
350-
d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream);
322+
using OffsetT = detail::choose_offset_t<NumItemsT>;
323+
return detail::adjacent_difference::dispatch<MayAlias::Yes, ReadOption::Left>(
324+
d_temp_storage, temp_storage_bytes, d_input, d_input, static_cast<OffsetT>(num_items), difference_op, stream);
351325
}
352326

353327
//! @rst
@@ -475,9 +449,9 @@ public:
475449
cudaStream_t stream = 0)
476450
{
477451
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractRightCopy");
478-
479-
return AdjacentDifference<MayAlias::No, ReadOption::Right>(
480-
d_temp_storage, temp_storage_bytes, d_input, d_output, num_items, difference_op, stream);
452+
using OffsetT = detail::choose_offset_t<NumItemsT>;
453+
return detail::adjacent_difference::dispatch<MayAlias::No, ReadOption::Right>(
454+
d_temp_storage, temp_storage_bytes, d_input, d_output, static_cast<OffsetT>(num_items), difference_op, stream);
481455
}
482456

483457
//! @rst
@@ -575,9 +549,9 @@ public:
575549
cudaStream_t stream = 0)
576550
{
577551
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceAdjacentDifference::SubtractRight");
578-
579-
return AdjacentDifference<MayAlias::Yes, ReadOption::Right>(
580-
d_temp_storage, temp_storage_bytes, d_input, d_input, num_items, difference_op, stream);
552+
using OffsetT = detail::choose_offset_t<NumItemsT>;
553+
return detail::adjacent_difference::dispatch<MayAlias::Yes, ReadOption::Right>(
554+
d_temp_storage, temp_storage_bytes, d_input, d_input, static_cast<OffsetT>(num_items), difference_op, stream);
581555
}
582556
};
583557

cub/cub/device/dispatch/dispatch_adjacent_difference.cuh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -313,12 +313,12 @@ struct DispatchAdjacentDifference
313313

314314
namespace detail::adjacent_difference
315315
{
316-
template <typename InputIteratorT,
316+
template <MayAlias AliasOpt,
317+
ReadOption ReadOpt,
318+
typename InputIteratorT,
317319
typename OutputIteratorT,
318-
typename DifferenceOpT,
319320
typename OffsetT,
320-
MayAlias AliasOpt,
321-
ReadOption ReadOpt,
321+
typename DifferenceOpT,
322322
typename PolicySelector = policy_selector_from_types<InputIteratorT, AliasOpt == MayAlias::Yes>,
323323
typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY>
324324
#if _CCCL_HAS_CONCEPTS()

thrust/thrust/system/cuda/detail/adjacent_difference.h

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,10 @@ cudaError_t THRUST_RUNTIME_FUNCTION doit_step(
6565
return cudaSuccess;
6666
}
6767

68-
constexpr cub::ReadOption read_left = cub::ReadOption::Left;
69-
70-
using Dispatch32 = cub::DispatchAdjacentDifference<InputIt, OutputIt, BinaryOp, std::int32_t, AliasOpt, read_left>;
71-
using Dispatch64 = cub::DispatchAdjacentDifference<InputIt, OutputIt, BinaryOp, std::int64_t, AliasOpt, read_left>;
72-
7368
cudaError_t status;
74-
THRUST_INDEX_TYPE_DISPATCH2(
69+
THRUST_INDEX_TYPE_DISPATCH(
7570
status,
76-
Dispatch32::Dispatch,
77-
Dispatch64::Dispatch,
71+
(cub::detail::adjacent_difference::dispatch<AliasOpt, cub::ReadOption::Left>),
7872
num_items,
7973
(d_temp_storage, temp_storage_bytes, first, result, num_items_fixed, binary_op, stream));
8074
return status;

0 commit comments

Comments
 (0)