Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions cub/cub/agent/agent_batched_topk.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,9 @@ struct agent_batched_topk_worker_per_segment

// Resolve Segment Parameters
const auto segment_size = segment_sizes.get_param(segment_id);
const auto k = k_param.get_param(segment_id);
const auto direction = select_directions.get_param(segment_id);
const auto k = ::cuda::std::min(
k_param.get_param(segment_id), static_cast<decltype(k_param.get_param(segment_id))>(segment_size));
const auto direction = select_directions.get_param(segment_id);

// Determine padding key based on direction
const key_t padding_key =
Expand Down
21 changes: 7 additions & 14 deletions cub/cub/device/dispatch/dispatch_batched_topk.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -198,14 +198,10 @@ struct dispatch_batched_topk
static constexpr bool keys_only = ::cuda::std::is_same_v<ValueInputItItT, NullType**>;

template <typename ActiveWorkerPerSegmentPolicyTPolicyT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_fixed_segment_size()
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_one_worker_per_segment()
{
using max_policy_t = typename SelectedPolicy::max_policy;

// Currently, only uniform segment sizes are supported
static_assert(!params::is_per_segment_param_v<SegmentSizeParameterT>,
"Only uniform segment sizes are currently supported.");

// Instantiate the kernel with the selected policy and check shared memory requirements
using topk_policy_t = ActiveWorkerPerSegmentPolicyTPolicyT;

Expand Down Expand Up @@ -281,17 +277,14 @@ struct dispatch_batched_topk
SelectDirectionParameterT,
NumSegmentsParameterT>;

// Currently, we only support fixed-size segments that fit into shared memory
// Currently, we only support segments that fit into shared memory
// TODO (elstehle): extend support for variable-size segments
static_assert(
!params::is_per_segment_param_v<SegmentSizeParameterT>
&& find_smallest_covering_policy_t::supports_one_worker_per_segment,
"Currently only small, fixed-size segments are supported, where each segment can be processed by a single thread "
"block.");
if constexpr (!params::is_per_segment_param_v<SegmentSizeParameterT>
&& find_smallest_covering_policy_t::supports_one_worker_per_segment)
static_assert(find_smallest_covering_policy_t::supports_one_worker_per_segment,
"Currently only small segments are supported, where each segment can be processed by a single thread "
"block.");
if constexpr (find_smallest_covering_policy_t::supports_one_worker_per_segment)
{
return invoke_fixed_segment_size<typename find_smallest_covering_policy_t::worker_per_segment_policy_t>();
return invoke_one_worker_per_segment<typename find_smallest_covering_policy_t::worker_per_segment_policy_t>();
}
else
{
Expand Down
105 changes: 103 additions & 2 deletions cub/test/catch2_test_device_segmented_topk_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <thrust/count.h>
#include <thrust/detail/raw_pointer_cast.h>
#include <thrust/scan.h>

#include <cuda/iterator>
#include <cuda/std/__algorithm/min.h>
Expand Down Expand Up @@ -168,11 +169,111 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments",
cub::detail::batched_topk::num_segments_uniform<>{num_segments},
cub::detail::batched_topk::total_num_items_guarantee{num_segments * segment_size});
// Prepare expected results
segmented_sort_keys(expected_keys, num_segments, segment_size, direction);
fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction);
compact_sorted_keys_to_topk(expected_keys, segment_size, k);

// Since the results of top-k are unordered, sort output segments before comparison.
segmented_sort_keys(keys_out_buffer, num_segments, k, direction);
fixed_size_segmented_sort_keys(keys_out_buffer, num_segments, k, direction);

REQUIRE(expected_keys == keys_out_buffer);
}

C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segments",
"[keys][segmented][topk][device]",
key_types,
max_segment_size_list,
max_num_k_list)
{
using segment_size_t = cuda::std::int64_t;
using segment_index_t = cuda::std::int64_t;

using key_t = c2h::get<0, TestType>;

// Statically constrained maximum segment size and k
constexpr segment_size_t static_max_segment_size = c2h::get<1, TestType>::value;
constexpr segment_size_t static_max_k = c2h::get<2, TestType>::value;

// Test both directions (as runtime value)
const auto direction = GENERATE_COPY(cub::detail::topk::select::min, cub::detail::topk::select::max);

constexpr segment_size_t min_items = 1;
constexpr segment_size_t max_items = 1000000;

// Number of items
const segment_size_t num_items = GENERATE_COPY(
take(2, random(min_items, max_items)),
values({
min_items,
max_items,
}));

// Generate segment sizes
constexpr segment_size_t min_segment_size = 1;
constexpr auto max_segment_size = static_max_segment_size;
c2h::device_vector<segment_size_t> segment_offsets =
c2h::gen_uniform_offsets<segment_size_t>(C2H_SEED(3), num_items, min_segment_size, max_segment_size);
const segment_index_t num_segments = static_cast<segment_index_t>(segment_offsets.size() - 1);
auto segment_offsets_it = thrust::raw_pointer_cast(segment_offsets.data());
auto segment_size_it = cuda::make_transform_iterator(
cuda::make_counting_iterator(segment_index_t{0}), segment_size_op<segment_size_t*>{segment_offsets_it});

// Set the k value
const segment_size_t k =
GENERATE_COPY(values({segment_size_t{1}, static_max_k}), take(3, random(segment_size_t{1}, static_max_k)));

// Capture test parameters
CAPTURE(c2h::type_name<key_t>(),
c2h::type_name<segment_size_t>(),
c2h::type_name<segment_index_t>(),
static_max_segment_size,
static_max_k,
k,
num_segments,
direction);

// Compute compacted output offsets:
// Each output segment holds exactly min(k, segment_size[i]) items, tightly packed.
auto compacted_output_sizes_it = cuda::make_transform_iterator(
cuda::make_counting_iterator(segment_index_t{0}),
get_output_size_op{segment_offsets.cbegin(), cuda::constant_iterator(k)});
c2h::device_vector<segment_size_t> compacted_offsets(num_segments + 1, thrust::no_init);
thrust::exclusive_scan(
compacted_output_sizes_it, compacted_output_sizes_it + num_segments + 1, compacted_offsets.begin());
segment_size_t total_output_size = compacted_offsets.back();

// Prepare keys input & output
c2h::device_vector<key_t> keys_in_buffer(num_items, thrust::no_init);
c2h::device_vector<key_t> keys_out_buffer(total_output_size, thrust::no_init);
const int num_key_seeds = 1;
c2h::gen(C2H_SEED(num_key_seeds), keys_in_buffer);
auto d_keys_in_ptr = thrust::raw_pointer_cast(keys_in_buffer.data());
auto d_keys_out_ptr = thrust::raw_pointer_cast(keys_out_buffer.data());
auto d_keys_in =
cuda::make_permutation_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_offsets.cbegin());
auto d_keys_out =
cuda::make_permutation_iterator(cuda::make_counting_iterator(d_keys_out_ptr), compacted_offsets.cbegin());

// Copy input for verification
c2h::device_vector<key_t> expected_keys(keys_in_buffer);

// Run the top-k algorithm
batched_topk_keys(
d_keys_in,
d_keys_out,
cub::detail::batched_topk::segment_size_per_segment<decltype(segment_size_it), 1, static_max_segment_size>{
segment_size_it},
cub::detail::batched_topk::k_uniform<1, static_max_k>{k},
cub::detail::batched_topk::select_direction_uniform{direction},
cub::detail::batched_topk::num_segments_uniform<>{num_segments},
cub::detail::batched_topk::total_num_items_guarantee{num_items});

// Verify keys are returned correctly: sort each segment of the expected input, then compact the top-k
segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction);
expected_keys = compact_to_topk_batched(expected_keys, segment_offsets, k);

// Since the results of top-k are unordered, sort compacted output segments before comparison
segmented_sort_keys(
keys_out_buffer, num_segments, compacted_offsets.cbegin(), compacted_offsets.cbegin() + 1, direction);

REQUIRE(expected_keys == keys_out_buffer);
}
Expand Down
Loading