Skip to content

Commit 49cdfe7

Browse files
authored
Work around NVCC constant argument type mismatch (#9382)
Default cuda::args::constant value_type through remove_cvref_t so dependent constant expressions do not produce different host-stub and device-registration types. Restore the CUB segmented top-k launch wrappers to pass cuda::args::constant directly, add the regression coverage, and lower the CTK12.0/GCC7 CUB host-launch CI shard parallelism to avoid runner OOM.
1 parent be047dd commit 49cdfe7

5 files changed

Lines changed: 60 additions & 55 deletions

File tree

ci/matrix.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,12 @@ workflows:
2525

2626
pull_request:
2727
# Old CTK: Oldest/newest supported host compilers:
28-
- {jobs: ['build'], std: 'minmax', ctk: '12.0', cxx: ['gcc7', 'gcc12', 'clang14', 'msvc2019', 'msvc14.39']}
28+
- {jobs: ['build'], std: 'minmax', ctk: '12.0', cxx: ['gcc12', 'clang14', 'msvc2019', 'msvc14.39']}
29+
- {jobs: ['build'], project: ['libcudacxx', 'thrust'], std: 'minmax', ctk: '12.0', cxx: 'gcc7'}
30+
- {jobs: ['build_nolid', 'build_lid1', 'build_lid2'], project: 'cub', std: 'minmax', ctk: '12.0', cxx: 'gcc7'}
31+
# CTK12.0/GCC7 CUB host-launch builds are memory-heavy with benchmarks enabled; keep this shard below
32+
# the 61 GiB linux-amd64-cpu16 runner limit.
33+
- {jobs: ['build_lid0'], project: 'cub', std: 'minmax', ctk: '12.0', cxx: 'gcc7', environment: ['PARALLEL_LEVEL=8']}
2934
- {jobs: ['build'], std: 'minmax', ctk: '12.X', cxx: ['gcc7', 'gcc14', 'clang14', 'clang19', 'msvc2019', 'msvc2022' ]}
3035
- {jobs: ['build'], std: 'minmax', ctk: '13.0', cxx: ['gcc11', 'gcc15', 'clang15', 'clang20', 'msvc2019', 'msvc2022' ]}
3136
# Old CTK: cudax has a different support matrix:

cub/test/catch2_test_device_segmented_topk_keys.cu

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@ struct is_minus_zero
2727
}
2828
};
2929

30-
template <cub::detail::topk::select Direction,
31-
typename KeyInputItItT,
30+
template <typename KeyInputItItT,
3231
typename KeyOutputItItT,
3332
typename SegmentSizeParamT,
3433
typename KParamT,
34+
typename SelectDirectionT,
3535
typename NumSegmentsParameterT,
3636
typename TotalNumItemsGuaranteeT>
3737
CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys(
@@ -41,6 +41,7 @@ CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys(
4141
KeyOutputItItT d_key_segments_out_it,
4242
SegmentSizeParamT segment_sizes,
4343
KParamT k,
44+
SelectDirectionT select_direction,
4445
NumSegmentsParameterT num_segments,
4546
TotalNumItemsGuaranteeT total_num_items_guarantee,
4647
cudaStream_t stream = nullptr)
@@ -55,15 +56,14 @@ CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys(
5556
values_it,
5657
segment_sizes,
5758
k,
58-
cuda::args::constant<Direction>{},
59+
select_direction,
5960
num_segments,
6061
total_num_items_guarantee,
6162
stream);
6263
}
6364

6465
// %PARAM% TEST_LAUNCH lid 0:1:2
65-
DECLARE_TMPL_LAUNCH_WRAPPER(
66-
dispatch_batched_topk_keys, batched_topk_keys, cub::detail::topk::select Direction, Direction);
66+
DECLARE_LAUNCH_WRAPPER(dispatch_batched_topk_keys, batched_topk_keys);
6767

6868
// Total segment size
6969
using max_segment_size_list = c2h::enum_type_list<cuda::std::size_t, 4 * 1024>;
@@ -163,11 +163,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments",
163163
c2h::device_vector<key_t> expected_keys(keys_in_buffer);
164164

165165
// Run the top-k algorithm
166-
batched_topk_keys<direction>(
166+
batched_topk_keys(
167167
d_keys_in,
168168
d_keys_out,
169169
cuda::args::immediate{segment_size, cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
170170
cuda::args::immediate{k, cuda::args::bounds<segment_size_t{1}, static_max_k>()},
171+
cuda::args::constant<direction>{},
171172
cuda::args::immediate{num_segments},
172173
cuda::args::immediate{num_segments * segment_size});
173174
// Prepare expected results
@@ -260,11 +261,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment
260261
c2h::device_vector<key_t> expected_keys(keys_in_buffer);
261262

262263
// Run the top-k algorithm
263-
batched_topk_keys<direction>(
264+
batched_topk_keys(
264265
d_keys_in,
265266
d_keys_out,
266267
cuda::args::deferred_sequence{segment_size_it, cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
267268
cuda::args::immediate{k, cuda::args::bounds<segment_size_t{1}, static_max_k>()},
269+
cuda::args::constant<direction>{},
268270
cuda::args::immediate{num_segments},
269271
cuda::args::immediate{num_items});
270272

@@ -357,12 +359,13 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with fixed-size segments and per
357359
c2h::device_vector<key_t> expected_keys(keys_in_buffer);
358360

359361
// Run the top-k algorithm with a per-segment k passed as a deferred sequence
360-
batched_topk_keys<direction>(
362+
batched_topk_keys(
361363
d_keys_in,
362364
d_keys_out,
363365
cuda::args::immediate{segment_size, cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
364366
cuda::args::deferred_sequence{
365367
thrust::raw_pointer_cast(segment_k.data()), cuda::args::bounds<segment_size_t{1}, static_max_k>()},
368+
cuda::args::constant<direction>{},
366369
cuda::args::immediate{num_segments},
367370
cuda::args::immediate{num_segments * segment_size});
368371

@@ -456,12 +459,13 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with variable-size segments and
456459
c2h::device_vector<key_t> expected_keys(keys_in_buffer);
457460

458461
// Run the top-k algorithm with a per-segment k passed as a deferred sequence
459-
batched_topk_keys<direction>(
462+
batched_topk_keys(
460463
d_keys_in,
461464
d_keys_out,
462465
cuda::args::deferred_sequence{segment_size_it, cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
463466
cuda::args::deferred_sequence{
464467
thrust::raw_pointer_cast(segment_k.data()), cuda::args::bounds<segment_size_t{1}, static_max_k>()},
468+
cuda::args::constant<direction>{},
465469
cuda::args::immediate{num_segments},
466470
cuda::args::immediate{num_items});
467471

@@ -500,11 +504,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys preserve -0.0f in output",
500504
auto d_keys_out_it =
501505
cuda::make_strided_iterator(cuda::make_counting_iterator(thrust::raw_pointer_cast(d_keys_out.data())), k);
502506

503-
batched_topk_keys<direction>(
507+
batched_topk_keys(
504508
d_keys_in_it,
505509
d_keys_out_it,
506510
cuda::args::immediate{segment_size, cuda::args::bounds<cuda::std::int64_t{1}, max_segment_size>()},
507511
cuda::args::immediate{k, cuda::args::bounds<cuda::std::int64_t{1}, k>()},
512+
cuda::args::constant<direction>{},
508513
cuda::args::immediate{num_segments},
509514
cuda::args::immediate{num_segments * segment_size});
510515

cub/test/catch2_test_device_segmented_topk_pairs.cu

Lines changed: 9 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -47,46 +47,8 @@ struct flag_intra_segment_duplicates
4747
template <typename ItemItT, typename SegIdItT>
4848
flag_intra_segment_duplicates(ItemItT, SegIdItT) -> flag_intra_segment_duplicates<ItemItT, SegIdItT>;
4949

50-
template <cub::detail::topk::select Direction,
51-
typename KeyInputItItT,
52-
typename KeyOutputItItT,
53-
typename ValueInputItItT,
54-
typename ValueOutputItItT,
55-
typename SegmentSizeParameterT,
56-
typename KParameterT,
57-
typename NumSegmentsParameterT,
58-
typename TotalNumItemsGuaranteeT>
59-
CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_pairs(
60-
void* d_temp_storage,
61-
size_t& temp_storage_bytes,
62-
KeyInputItItT d_key_segments_it,
63-
KeyOutputItItT d_key_segments_out_it,
64-
ValueInputItItT d_value_segments_it,
65-
ValueOutputItItT d_value_segments_out_it,
66-
SegmentSizeParameterT segment_sizes,
67-
KParameterT k,
68-
NumSegmentsParameterT num_segments,
69-
TotalNumItemsGuaranteeT total_num_items_guarantee,
70-
cudaStream_t stream = nullptr)
71-
{
72-
return cub::detail::batched_topk::dispatch(
73-
d_temp_storage,
74-
temp_storage_bytes,
75-
d_key_segments_it,
76-
d_key_segments_out_it,
77-
d_value_segments_it,
78-
d_value_segments_out_it,
79-
segment_sizes,
80-
k,
81-
cuda::args::constant<Direction>{},
82-
num_segments,
83-
total_num_items_guarantee,
84-
stream);
85-
}
86-
8750
// %PARAM% TEST_LAUNCH lid 0:1:2
88-
DECLARE_TMPL_LAUNCH_WRAPPER(
89-
dispatch_batched_topk_pairs, batched_topk_pairs, cub::detail::topk::select Direction, Direction);
51+
DECLARE_LAUNCH_WRAPPER(cub::detail::batched_topk::dispatch, batched_topk_pairs);
9052

9153
// Total segment size
9254
using max_segment_size_list = c2h::enum_type_list<cuda::std::size_t, 4 * 1024>;
@@ -262,13 +224,14 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small fixed-size segments"
262224
c2h::device_vector<key_t> expected_keys(keys_in_buffer);
263225

264226
// Run the top-k algorithm
265-
batched_topk_pairs<direction>(
227+
batched_topk_pairs(
266228
d_keys_in,
267229
d_keys_out,
268230
d_values_in,
269231
d_values_out,
270232
cuda::args::immediate{segment_size, cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
271233
cuda::args::immediate{k, cuda::args::bounds<segment_size_t{1}, static_max_k>()},
234+
cuda::args::constant<direction>{},
272235
cuda::args::immediate{num_segments},
273236
cuda::args::immediate{num_segments * segment_size});
274237

@@ -382,13 +345,14 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen
382345
c2h::device_vector<key_t> expected_keys(keys_in_buffer);
383346

384347
// Run the top-k algorithm
385-
batched_topk_pairs<direction>(
348+
batched_topk_pairs(
386349
d_keys_in,
387350
d_keys_out,
388351
d_values_in,
389352
d_values_out,
390353
cuda::args::deferred_sequence{segment_size_it, cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
391354
cuda::args::immediate{k, cuda::args::bounds<segment_size_t{1}, static_max_k>()},
355+
cuda::args::constant<direction>{},
392356
cuda::args::immediate{num_segments},
393357
cuda::args::immediate{num_items});
394358

@@ -499,14 +463,15 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with fixed-size segments and pe
499463
c2h::device_vector<key_t> expected_keys(keys_in_buffer);
500464

501465
// Run the top-k algorithm with a per-segment k passed as a deferred sequence
502-
batched_topk_pairs<direction>(
466+
batched_topk_pairs(
503467
d_keys_in,
504468
d_keys_out,
505469
d_values_in,
506470
d_values_out,
507471
cuda::args::immediate{segment_size, cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
508472
cuda::args::deferred_sequence{
509473
thrust::raw_pointer_cast(segment_k.data()), cuda::args::bounds<segment_size_t{1}, static_max_k>()},
474+
cuda::args::constant<direction>{},
510475
cuda::args::immediate{num_segments},
511476
cuda::args::immediate{num_segments * segment_size});
512477

@@ -619,14 +584,15 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with variable-size segments and
619584
c2h::device_vector<key_t> expected_keys(keys_in_buffer);
620585

621586
// Run the top-k algorithm with a per-segment k passed as a deferred sequence
622-
batched_topk_pairs<direction>(
587+
batched_topk_pairs(
623588
d_keys_in,
624589
d_keys_out,
625590
d_values_in,
626591
d_values_out,
627592
cuda::args::deferred_sequence{segment_size_it, cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
628593
cuda::args::deferred_sequence{
629594
thrust::raw_pointer_cast(segment_k.data()), cuda::args::bounds<segment_size_t{1}, static_max_k>()},
595+
cuda::args::constant<direction>{},
630596
cuda::args::immediate{num_segments},
631597
cuda::args::immediate{num_items});
632598

libcudacxx/include/cuda/__argument/argument.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ inline constexpr bool __is_sequence_v =
9797
// spelling carries that intent.
9898

9999
//! @brief Wraps a compile-time constant argument value.
100-
template <auto _Value, class _Tp = decltype(_Value)>
100+
template <auto _Value, class _Tp = ::cuda::std::remove_cvref_t<decltype(_Value)>>
101101
class constant
102102
{
103103
public:

libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,30 @@ struct non_sequence_value
2020
int payload;
2121
};
2222

23+
enum class dependent_direction
24+
{
25+
min,
26+
max
27+
};
28+
29+
template <dependent_direction Value>
30+
struct dependent_direction_tag
31+
{
32+
static constexpr auto value = Value;
33+
};
34+
35+
template <class Tag>
36+
TEST_FUNC void test_dependent_constant_type()
37+
{
38+
constexpr auto direction = Tag::value;
39+
using constant_t = cuda::args::constant<direction>;
40+
41+
// Regression: NVCC bug generated a host stub using a cv/ref-qualified constant type while device registration used
42+
// the unqualified type, causing cudaErrorInvalidDeviceFunction when launching the kernel.
43+
static_assert(cuda::std::is_same_v<typename constant_t::value_type, dependent_direction>);
44+
static_assert(cuda::std::is_same_v<constant_t, cuda::args::constant<Tag::value, dependent_direction>>);
45+
}
46+
2347
TEST_FUNC void test()
2448
{
2549
// Basic value
@@ -47,6 +71,11 @@ TEST_FUNC void test()
4771
static_assert(cuda::args::__unwrap(sa_neg) == -1);
4872
}
4973

74+
// Dependent value
75+
{
76+
test_dependent_constant_type<dependent_direction_tag<dependent_direction::max>>();
77+
}
78+
5079
#if TEST_HAS_CLASS_NTTP
5180
// Non-sequence values are accepted without scalar-only restrictions
5281
{

0 commit comments

Comments
 (0)