Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 16 additions & 56 deletions cub/benchmarks/bench/radix_sort/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,82 +14,42 @@
template <typename T, typename OffsetT>
void radix_sort_keys(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
using offset_t = cub::detail::choose_offset_t<OffsetT>;

constexpr cub::SortOrder sort_order = cub::SortOrder::Ascending;
constexpr bool is_overwrite_ok = false;
using key_t = T;
using value_t = cub::NullType;

if constexpr (!fits_in_default_shared_memory<T, value_t, offset_t, sort_order>())
using key_t = T;
using value_t = cub::NullType;
if constexpr (!fits_in_default_shared_memory<T, value_t, OffsetT, cub::SortOrder::Ascending>())
{
return;
}

constexpr int begin_bit = 0;
constexpr int end_bit = sizeof(key_t) * 8;

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));

thrust::device_vector<T> buffer_1 = generate(elements, entropy);
thrust::device_vector<T> buffer_2(elements);
thrust::device_vector<T> buffer_2(elements, thrust::no_init);

key_t* d_buffer_1 = thrust::raw_pointer_cast(buffer_1.data());
key_t* d_buffer_2 = thrust::raw_pointer_cast(buffer_2.data());

cub::DoubleBuffer<key_t> d_keys(d_buffer_1, d_buffer_2);
cub::DoubleBuffer<value_t> d_values;
const key_t* d_buffer_1 = thrust::raw_pointer_cast(buffer_1.data());
key_t* d_buffer_2 = thrust::raw_pointer_cast(buffer_2.data());

// Enable throughput calculations and add "Size" column to results.
state.add_element_count(elements);
state.add_global_memory_reads<T>(elements, "Size");
state.add_global_memory_writes<T>(elements);

// Allocate temporary storage:
std::size_t temp_size{};

cub::detail::radix_sort::dispatch<sort_order>(
nullptr,
temp_size,
d_keys,
d_values,
static_cast<offset_t>(elements),
begin_bit,
end_bit,
is_overwrite_ok,
0 /* stream */
#if !TUNE_BASE
,
cub::detail::identity_decomposer_t{},
policy_selector<key_t, value_t, offset_t>{}
#endif // !TUNE_BASE
);

thrust::device_vector<nvbench::uint8_t> temp(temp_size, thrust::no_init);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());

auto mr = cub::detail::device_memory_resource{};
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
cub::DoubleBuffer<key_t> keys = d_keys;
cub::DoubleBuffer<value_t> values = d_values;

cub::detail::radix_sort::dispatch<sort_order>(
temp_storage,
temp_size,
keys,
values,
static_cast<offset_t>(elements),
begin_bit,
end_bit,
is_overwrite_ok,
launch.get_stream()
cub::DeviceRadixSort::SortKeys(
d_buffer_1,
d_buffer_2,
static_cast<OffsetT>(elements),
cuda::std::execution::env{
::cuda::stream_ref{launch.get_stream().get_stream()},
mr,
#if !TUNE_BASE
,
cub::detail::identity_decomposer_t{},
policy_selector<KeyT>{}
cuda::execution::__tune(policy_selector<key_t, value_t, offset_t>{})
#endif // !TUNE_BASE
);
});
});
}

Expand Down
83 changes: 23 additions & 60 deletions cub/benchmarks/bench/radix_sort/pairs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,37 +16,26 @@
template <typename KeyT, typename ValueT, typename OffsetT>
void radix_sort_values(nvbench::state& state, nvbench::type_list<KeyT, ValueT, OffsetT>)
{
using offset_t = cub::detail::choose_offset_t<OffsetT>;

constexpr cub::SortOrder sort_order = cub::SortOrder::Ascending;
constexpr bool is_overwrite_ok = false;
using key_t = KeyT;
using value_t = ValueT;

if constexpr (!fits_in_default_shared_memory<key_t, value_t, offset_t, sort_order>())
using key_t = KeyT;
using value_t = ValueT;
if constexpr (!fits_in_default_shared_memory<key_t, value_t, OffsetT, cub::SortOrder::Ascending>())
{
return;
}

constexpr int begin_bit = 0;
constexpr int end_bit = sizeof(key_t) * 8;

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));

thrust::device_vector<key_t> keys_buffer_1 = generate(elements, entropy);
thrust::device_vector<value_t> values_buffer_1 = generate(elements);
thrust::device_vector<key_t> keys_buffer_2(elements);
thrust::device_vector<value_t> values_buffer_2(elements);
thrust::device_vector<key_t> keys_in = generate(elements, entropy);
thrust::device_vector<key_t> keys_out(elements, thrust::no_init);
thrust::device_vector<value_t> values_in = generate(elements);
thrust::device_vector<value_t> values_out(elements, thrust::no_init);

key_t* d_keys_buffer_1 = thrust::raw_pointer_cast(keys_buffer_1.data());
key_t* d_keys_buffer_2 = thrust::raw_pointer_cast(keys_buffer_2.data());
value_t* d_values_buffer_1 = thrust::raw_pointer_cast(values_buffer_1.data());
value_t* d_values_buffer_2 = thrust::raw_pointer_cast(values_buffer_2.data());

cub::DoubleBuffer<key_t> d_keys(d_keys_buffer_1, d_keys_buffer_2);
cub::DoubleBuffer<value_t> d_values(d_values_buffer_1, d_values_buffer_2);
const key_t* d_keys_in = thrust::raw_pointer_cast(keys_in.data());
key_t* d_keys_out = thrust::raw_pointer_cast(keys_out.data());
const value_t* d_values_in = thrust::raw_pointer_cast(values_in.data());
value_t* d_values_out = thrust::raw_pointer_cast(values_out.data());

// Enable throughput calculations and add "Size" column to results.
state.add_element_count(elements);
Expand All @@ -55,48 +44,22 @@ void radix_sort_values(nvbench::state& state, nvbench::type_list<KeyT, ValueT, O
state.add_global_memory_writes<KeyT>(elements);
state.add_global_memory_writes<ValueT>(elements);

// Allocate temporary storage:
std::size_t temp_size{};
cub::detail::radix_sort::dispatch<sort_order>(
nullptr,
temp_size,
d_keys,
d_values,
static_cast<offset_t>(elements),
begin_bit,
end_bit,
is_overwrite_ok,
0 /* stream */
#if !TUNE_BASE
,
cub::detail::identity_decomposer_t{},
policy_selector<KeyT>{}
#endif // !TUNE_BASE
);

thrust::device_vector<nvbench::uint8_t> temp(temp_size, thrust::no_init);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());

auto mr = cub::detail::device_memory_resource{};
state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
cub::DoubleBuffer<key_t> keys = d_keys;
cub::DoubleBuffer<value_t> values = d_values;

cub::detail::radix_sort::dispatch<sort_order>(
temp_storage,
temp_size,
keys,
values,
static_cast<offset_t>(elements),
begin_bit,
end_bit,
is_overwrite_ok,
launch.get_stream()
cub::DeviceRadixSort::SortPairs(
d_keys_in,
d_keys_out,
d_values_in,
d_values_out,
static_cast<OffsetT>(elements),
cuda::std::execution::env{
::cuda::stream_ref{launch.get_stream().get_stream()},
mr,
#if !TUNE_BASE
,
cub::detail::identity_decomposer_t{},
policy_selector<KeyT>{}
cuda::execution::__tune2(policy_selector<key_t, value_t, offset_t>{})
#endif // !TUNE_BASE
);
});
});
}

Expand Down
35 changes: 28 additions & 7 deletions cub/benchmarks/bench/radix_sort/policy_selector.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,33 @@ struct policy_selector
template <typename KeyT, typename ValueT, typename OffsetT, cub::SortOrder SortOrder>
constexpr std::size_t max_onesweep_temp_storage_size()
{
using portion_offset = int;
using onesweep_policy = typename policy_hub_t<KeyT, ValueT, OffsetT>::policy_t::OnesweepPolicy;
using portion_offset = int;

constexpr auto active_policy = policy_selector<KeyT, ValueT, OffsetT>{}(cuda::arch_id{});

constexpr auto onesweep = active_policy.onesweep;
using onesweep_policy_t = AgentRadixSortOnesweepPolicy<
0,
0,
void,
onesweep.rank_num_parts,
onesweep.rank_algorith,
onesweep.scan_algorithm,
onesweep.store_algorithm,
onesweep.radix_bits,
NoScaling<onesweep.block_threads, onesweep.items_per_thread>>;

using agent_radix_sort_onesweep_t =
cub::AgentRadixSortOnesweep<onesweep_policy, SortOrder, KeyT, ValueT, OffsetT, portion_offset>;
cub::AgentRadixSortOnesweep<onesweep_policy_t, SortOrder, KeyT, ValueT, OffsetT, portion_offset>;

using hist_policy = typename policy_hub_t<KeyT, ValueT, OffsetT>::policy_t::HistogramPolicy;
using hist_agent = cub::AgentRadixSortHistogram<hist_policy, SortOrder, KeyT, OffsetT>;
constexpr auto histogram = active_policy.histogram;
using histogram_policy_t =
AgentRadixSortHistogramPolicy<histogram.block_threads,
histogram.items_per_thread,
histogram.num_parts,
void,
histogram.radix_bits>;
using hist_agent = cub::AgentRadixSortHistogram<histogram_policy_t, SortOrder, KeyT, OffsetT>;

return cuda::std::max(sizeof(typename agent_radix_sort_onesweep_t::TempStorage),
sizeof(typename hist_agent::TempStorage));
Expand All @@ -91,10 +111,11 @@ constexpr std::size_t max_onesweep_temp_storage_size()
template <typename KeyT, typename ValueT, typename OffsetT, cub::SortOrder SortOrder>
constexpr std::size_t max_temp_storage_size()
{
using policy_t = typename policy_hub_t<KeyT, ValueT, OffsetT>::policy_t;
using offset_t = cub::detail::choose_offset_t<OffsetT>;
using policy_t = typename policy_hub_t<KeyT, ValueT, offset_t>::policy_t;

static_assert(policy_t::ONESWEEP);
return max_onesweep_temp_storage_size<KeyT, ValueT, OffsetT, SortOrder>();
return max_onesweep_temp_storage_size<KeyT, ValueT, offset_t, SortOrder>();
}

template <typename KeyT, typename ValueT, typename OffsetT, cub::SortOrder SortOrder>
Expand Down
3 changes: 1 addition & 2 deletions cub/benchmarks/bench/transform/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@
#include <nvbench_helper.cuh>

#if !TUNE_BASE
// TODO(bgruber): can we get by without the base class?
struct policy_selector : cub::detail::transform::tuning<policy_selector>
struct policy_selector
{
_CCCL_API constexpr auto operator()(cuda::arch_id) const -> cub::detail::transform::transform_policy
{
Expand Down
Loading