Work around NVCC constant argument type mismatch (#9382)

pciolkosz · web-flow · commit 49cdfe7e0b8e · 2026-06-22T19:05:44.000-07:00
Compute the default `value_type` of `cuda::args::constant` as `remove_cvref_t<decltype(_Value)>`, so that a dependent constant expression cannot yield one type in the NVCC host stub and a different type in the device registration.

Restore the CUB segmented top-k launch wrappers so that they pass `cuda::args::constant` directly, add regression coverage for the dependent-constant case, and lower the parallelism of the CTK 12.0/GCC 7 CUB host-launch CI shard to avoid out-of-memory failures on the runner.
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
@@ -25,7 +25,12 @@ workflows:
 
   pull_request:
     # Old CTK: Oldest/newest supported host compilers:
-    - {jobs: ['build'], std: 'minmax', ctk: '12.0', cxx: ['gcc7',  'gcc12', 'clang14',            'msvc2019', 'msvc14.39']}
+    - {jobs: ['build'], std: 'minmax', ctk: '12.0', cxx: ['gcc12', 'clang14',            'msvc2019', 'msvc14.39']}
+    - {jobs: ['build'], project: ['libcudacxx', 'thrust'], std: 'minmax', ctk: '12.0', cxx: 'gcc7'}
+    - {jobs: ['build_nolid', 'build_lid1', 'build_lid2'], project: 'cub', std: 'minmax', ctk: '12.0', cxx: 'gcc7'}
+    # CTK12.0/GCC7 CUB host-launch builds are memory-heavy with benchmarks enabled; keep this shard below
+    # the 61 GiB linux-amd64-cpu16 runner limit.
+    - {jobs: ['build_lid0'], project: 'cub', std: 'minmax', ctk: '12.0', cxx: 'gcc7', environment: ['PARALLEL_LEVEL=8']}
     - {jobs: ['build'], std: 'minmax', ctk: '12.X', cxx: ['gcc7',  'gcc14',   'clang14', 'clang19', 'msvc2019', 'msvc2022' ]}
     - {jobs: ['build'], std: 'minmax', ctk: '13.0', cxx: ['gcc11', 'gcc15',   'clang15', 'clang20', 'msvc2019', 'msvc2022' ]}
     # Old CTK: cudax has a different support matrix:
diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu
@@ -27,11 +27,11 @@ struct is_minus_zero
   }
 };
 
-template <cub::detail::topk::select Direction,
-          typename KeyInputItItT,
+template <typename KeyInputItItT,
           typename KeyOutputItItT,
           typename SegmentSizeParamT,
           typename KParamT,
+          typename SelectDirectionT,
           typename NumSegmentsParameterT,
           typename TotalNumItemsGuaranteeT>
 CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys(
@@ -41,6 +41,7 @@ CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys(
   KeyOutputItItT d_key_segments_out_it,
   SegmentSizeParamT segment_sizes,
   KParamT k,
+  SelectDirectionT select_direction,
   NumSegmentsParameterT num_segments,
   TotalNumItemsGuaranteeT total_num_items_guarantee,
   cudaStream_t stream = nullptr)
@@ -55,15 +56,14 @@ CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys(
     values_it,
     segment_sizes,
     k,
-    cuda::args::constant<Direction>{},
+    select_direction,
     num_segments,
     total_num_items_guarantee,
     stream);
 }
 
 // %PARAM% TEST_LAUNCH lid 0:1:2
-DECLARE_TMPL_LAUNCH_WRAPPER(
-  dispatch_batched_topk_keys, batched_topk_keys, cub::detail::topk::select Direction, Direction);
+DECLARE_LAUNCH_WRAPPER(dispatch_batched_topk_keys, batched_topk_keys);
 
 // Total segment size
 using max_segment_size_list = c2h::enum_type_list<cuda::std::size_t, 4 * 1024>;
@@ -163,11 +163,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments",
   c2h::device_vector<key_t> expected_keys(keys_in_buffer);
 
   // Run the top-k algorithm
-  batched_topk_keys<direction>(
+  batched_topk_keys(
     d_keys_in,
     d_keys_out,
     cuda::args::immediate{segment_size, cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
     cuda::args::immediate{k, cuda::args::bounds<segment_size_t{1}, static_max_k>()},
+    cuda::args::constant<direction>{},
     cuda::args::immediate{num_segments},
     cuda::args::immediate{num_segments * segment_size});
   // Prepare expected results
@@ -260,11 +261,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment
   c2h::device_vector<key_t> expected_keys(keys_in_buffer);
 
   // Run the top-k algorithm
-  batched_topk_keys<direction>(
+  batched_topk_keys(
     d_keys_in,
     d_keys_out,
     cuda::args::deferred_sequence{segment_size_it, cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
     cuda::args::immediate{k, cuda::args::bounds<segment_size_t{1}, static_max_k>()},
+    cuda::args::constant<direction>{},
     cuda::args::immediate{num_segments},
     cuda::args::immediate{num_items});
 
@@ -357,12 +359,13 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with fixed-size segments and per
   c2h::device_vector<key_t> expected_keys(keys_in_buffer);
 
   // Run the top-k algorithm with a per-segment k passed as a deferred sequence
-  batched_topk_keys<direction>(
+  batched_topk_keys(
     d_keys_in,
     d_keys_out,
     cuda::args::immediate{segment_size, cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
     cuda::args::deferred_sequence{
       thrust::raw_pointer_cast(segment_k.data()), cuda::args::bounds<segment_size_t{1}, static_max_k>()},
+    cuda::args::constant<direction>{},
     cuda::args::immediate{num_segments},
     cuda::args::immediate{num_segments * segment_size});
 
@@ -456,12 +459,13 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with variable-size segments and
   c2h::device_vector<key_t> expected_keys(keys_in_buffer);
 
   // Run the top-k algorithm with a per-segment k passed as a deferred sequence
-  batched_topk_keys<direction>(
+  batched_topk_keys(
     d_keys_in,
     d_keys_out,
     cuda::args::deferred_sequence{segment_size_it, cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
     cuda::args::deferred_sequence{
       thrust::raw_pointer_cast(segment_k.data()), cuda::args::bounds<segment_size_t{1}, static_max_k>()},
+    cuda::args::constant<direction>{},
     cuda::args::immediate{num_segments},
     cuda::args::immediate{num_items});
 
@@ -500,11 +504,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys preserve -0.0f in output",
   auto d_keys_out_it =
     cuda::make_strided_iterator(cuda::make_counting_iterator(thrust::raw_pointer_cast(d_keys_out.data())), k);
 
-  batched_topk_keys<direction>(
+  batched_topk_keys(
     d_keys_in_it,
     d_keys_out_it,
     cuda::args::immediate{segment_size, cuda::args::bounds<cuda::std::int64_t{1}, max_segment_size>()},
     cuda::args::immediate{k, cuda::args::bounds<cuda::std::int64_t{1}, k>()},
+    cuda::args::constant<direction>{},
     cuda::args::immediate{num_segments},
     cuda::args::immediate{num_segments * segment_size});
 
diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu
@@ -47,46 +47,8 @@ struct flag_intra_segment_duplicates
 template <typename ItemItT, typename SegIdItT>
 flag_intra_segment_duplicates(ItemItT, SegIdItT) -> flag_intra_segment_duplicates<ItemItT, SegIdItT>;
 
-template <cub::detail::topk::select Direction,
-          typename KeyInputItItT,
-          typename KeyOutputItItT,
-          typename ValueInputItItT,
-          typename ValueOutputItItT,
-          typename SegmentSizeParameterT,
-          typename KParameterT,
-          typename NumSegmentsParameterT,
-          typename TotalNumItemsGuaranteeT>
-CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_pairs(
-  void* d_temp_storage,
-  size_t& temp_storage_bytes,
-  KeyInputItItT d_key_segments_it,
-  KeyOutputItItT d_key_segments_out_it,
-  ValueInputItItT d_value_segments_it,
-  ValueOutputItItT d_value_segments_out_it,
-  SegmentSizeParameterT segment_sizes,
-  KParameterT k,
-  NumSegmentsParameterT num_segments,
-  TotalNumItemsGuaranteeT total_num_items_guarantee,
-  cudaStream_t stream = nullptr)
-{
-  return cub::detail::batched_topk::dispatch(
-    d_temp_storage,
-    temp_storage_bytes,
-    d_key_segments_it,
-    d_key_segments_out_it,
-    d_value_segments_it,
-    d_value_segments_out_it,
-    segment_sizes,
-    k,
-    cuda::args::constant<Direction>{},
-    num_segments,
-    total_num_items_guarantee,
-    stream);
-}
-
 // %PARAM% TEST_LAUNCH lid 0:1:2
-DECLARE_TMPL_LAUNCH_WRAPPER(
-  dispatch_batched_topk_pairs, batched_topk_pairs, cub::detail::topk::select Direction, Direction);
+DECLARE_LAUNCH_WRAPPER(cub::detail::batched_topk::dispatch, batched_topk_pairs);
 
 // Total segment size
 using max_segment_size_list = c2h::enum_type_list<cuda::std::size_t, 4 * 1024>;
@@ -262,13 +224,14 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small fixed-size segments"
   c2h::device_vector<key_t> expected_keys(keys_in_buffer);
 
   // Run the top-k algorithm
-  batched_topk_pairs<direction>(
+  batched_topk_pairs(
     d_keys_in,
     d_keys_out,
     d_values_in,
     d_values_out,
     cuda::args::immediate{segment_size, cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
     cuda::args::immediate{k, cuda::args::bounds<segment_size_t{1}, static_max_k>()},
+    cuda::args::constant<direction>{},
     cuda::args::immediate{num_segments},
     cuda::args::immediate{num_segments * segment_size});
 
@@ -382,13 +345,14 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen
   c2h::device_vector<key_t> expected_keys(keys_in_buffer);
 
   // Run the top-k algorithm
-  batched_topk_pairs<direction>(
+  batched_topk_pairs(
     d_keys_in,
     d_keys_out,
     d_values_in,
     d_values_out,
     cuda::args::deferred_sequence{segment_size_it, cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
     cuda::args::immediate{k, cuda::args::bounds<segment_size_t{1}, static_max_k>()},
+    cuda::args::constant<direction>{},
     cuda::args::immediate{num_segments},
     cuda::args::immediate{num_items});
 
@@ -499,14 +463,15 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with fixed-size segments and pe
   c2h::device_vector<key_t> expected_keys(keys_in_buffer);
 
   // Run the top-k algorithm with a per-segment k passed as a deferred sequence
-  batched_topk_pairs<direction>(
+  batched_topk_pairs(
     d_keys_in,
     d_keys_out,
     d_values_in,
     d_values_out,
     cuda::args::immediate{segment_size, cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
     cuda::args::deferred_sequence{
       thrust::raw_pointer_cast(segment_k.data()), cuda::args::bounds<segment_size_t{1}, static_max_k>()},
+    cuda::args::constant<direction>{},
     cuda::args::immediate{num_segments},
     cuda::args::immediate{num_segments * segment_size});
 
@@ -619,14 +584,15 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with variable-size segments and
   c2h::device_vector<key_t> expected_keys(keys_in_buffer);
 
   // Run the top-k algorithm with a per-segment k passed as a deferred sequence
-  batched_topk_pairs<direction>(
+  batched_topk_pairs(
     d_keys_in,
     d_keys_out,
     d_values_in,
     d_values_out,
     cuda::args::deferred_sequence{segment_size_it, cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
     cuda::args::deferred_sequence{
       thrust::raw_pointer_cast(segment_k.data()), cuda::args::bounds<segment_size_t{1}, static_max_k>()},
+    cuda::args::constant<direction>{},
     cuda::args::immediate{num_segments},
     cuda::args::immediate{num_items});
 
diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h
@@ -97,7 +97,7 @@ inline constexpr bool __is_sequence_v =
 // spelling carries that intent.
 
 //! @brief Wraps a compile-time constant argument value.
-template <auto _Value, class _Tp = decltype(_Value)>
+template <auto _Value, class _Tp = ::cuda::std::remove_cvref_t<decltype(_Value)>>
 class constant
 {
 public:
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp
@@ -20,6 +20,30 @@ struct non_sequence_value
   int payload;
 };
 
+enum class dependent_direction
+{
+  min,
+  max
+};
+
+template <dependent_direction Value>
+struct dependent_direction_tag
+{
+  static constexpr auto value = Value;
+};
+
+template <class Tag>
+TEST_FUNC void test_dependent_constant_type()
+{
+  constexpr auto direction = Tag::value;
+  using constant_t         = cuda::args::constant<direction>;
+
+  // Regression: NVCC bug generated a host stub using a cv/ref-qualified constant type while device registration used
+  // the unqualified type, causing cudaErrorInvalidDeviceFunction when launching the kernel.
+  static_assert(cuda::std::is_same_v<typename constant_t::value_type, dependent_direction>);
+  static_assert(cuda::std::is_same_v<constant_t, cuda::args::constant<Tag::value, dependent_direction>>);
+}
+
 TEST_FUNC void test()
 {
   // Basic value
@@ -47,6 +71,11 @@ TEST_FUNC void test()
     static_assert(cuda::args::__unwrap(sa_neg) == -1);
   }
 
+  // Dependent value
+  {
+    test_dependent_constant_type<dependent_direction_tag<dependent_direction::max>>();
+  }
+
 #if TEST_HAS_CLASS_NTTP
   // Non-sequence values are accepted without scalar-only restrictions
   {

Original file line number	Diff line number	Diff line change
`@@ -97,7 +97,7 @@ inline constexpr bool __is_sequence_v =`
`97`	`97`	`// spelling carries that intent.`
`98`	`98`
`99`	`99`	`//! @brief Wraps a compile-time constant argument value.`
`100`		`-template <auto _Value, class _Tp = decltype(_Value)>`
	`100`	`+template <auto _Value, class _Tp = ::cuda::std::remove_cvref_t<decltype(_Value)>>`
`101`	`101`	`class constant`
`102`	`102`	`{`
`103`	`103`	`public:`