Skip to content

Commit cb65d4f

Browse files
authored
Merge branch 'main' into stf_unstable_unique_utility_move
2 parents ce6a160 + 722c25c commit cb65d4f

31 files changed

+316
-297
lines changed

cub/benchmarks/CMakeLists.txt

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,10 @@ function(add_bench_dir bench_dir)
9797
add_bench(base_bench_target ${base_bench_name} "${bench_src}")
9898
add_dependencies(${benches_meta_target} ${base_bench_target})
9999
target_compile_definitions(${base_bench_target} PRIVATE TUNE_BASE=1)
100-
target_compile_options(${base_bench_target} PRIVATE "--extended-lambda")
100+
target_compile_options(
101+
${base_bench_target}
102+
PRIVATE "$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>"
103+
)
101104

102105
if (CUB_ENABLE_TUNING)
103106
# tuning
@@ -115,7 +118,7 @@ function(add_bench_dir bench_dir)
115118
target_compile_options(
116119
${bench_target}
117120
PRIVATE #
118-
"--extended-lambda"
121+
"$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>"
119122
"-include${tuning_path}"
120123
)
121124
else()

cub/cub/agent/agent_histogram.cuh

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -226,12 +226,14 @@ struct AgentHistogram
226226
_TempStorage& temp_storage;
227227
WrappedSampleIteratorT d_wrapped_samples; // with cache modifier applied, if possible
228228
SampleT* d_native_samples; // possibly nullptr if unavailable
229-
int* num_output_bins; // one for each channel
230-
int* num_privatized_bins; // one for each channel
229+
const int* num_output_bins; // one for each channel
230+
const int* num_privatized_bins; // one for each channel
231231
CounterT* d_privatized_histograms[NumActiveChannels]; // one for each channel
232232
CounterT** d_output_histograms; // in global memory
233-
OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each channel
234-
PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each channel
233+
const OutputDecodeOpT* output_decode_op; // determines output bin-id from privatized counter index, one for each
234+
// channel
235+
const PrivatizedDecodeOpT* privatized_decode_op; // determines privatized counter index from sample, one for each
236+
// channel
235237
bool prefer_smem; // for privatized counterss
236238

237239
template <typename TwoDimSubscriptableCounterT>
@@ -587,12 +589,12 @@ struct AgentHistogram
587589
_CCCL_DEVICE _CCCL_FORCEINLINE AgentHistogram(
588590
TempStorage& temp_storage,
589591
SampleIteratorT d_samples,
590-
int* num_output_bins,
591-
int* num_privatized_bins,
592+
const int* num_output_bins,
593+
const int* num_privatized_bins,
592594
CounterT** d_output_histograms,
593595
CounterT** d_privatized_histograms,
594-
OutputDecodeOpT* output_decode_op,
595-
PrivatizedDecodeOpT* privatized_decode_op)
596+
const OutputDecodeOpT* output_decode_op,
597+
const PrivatizedDecodeOpT* privatized_decode_op)
596598
: temp_storage(temp_storage.Alias())
597599
, d_wrapped_samples(d_samples)
598600
, d_native_samples(NativePointer(d_wrapped_samples))

cub/cub/detail/warpspeed/look_ahead.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ _CCCL_DEVICE_API void warpLoadLookback(
177177
// warp-uniform.
178178
//
179179
template <int numTileStatesPerThread, typename AccumT, typename ScanOpT>
180-
[[nodiscard]] _CCCL_DEVICE_API AccumT warpIncrementalLookback(
180+
[[nodiscard]] _CCCL_DEVICE_API _CCCL_FORCEINLINE AccumT warpIncrementalLookback(
181181
SpecialRegisters specialRegisters,
182182
tile_state_t<AccumT>* ptrTileStates,
183183
const int idxTilePrev,

cub/cub/device/dispatch/dispatch_adjacent_difference.cuh

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,11 @@ CUB_NAMESPACE_BEGIN
3939
namespace detail::adjacent_difference
4040
{
4141
template <typename AgentDifferenceInitT, typename InputIteratorT, typename InputT, typename OffsetT>
42-
CUB_DETAIL_KERNEL_ATTRIBUTES void
43-
DeviceAdjacentDifferenceInitKernel(InputIteratorT first, InputT* result, OffsetT num_tiles, int items_per_tile)
42+
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceInitKernel(
43+
_CCCL_GRID_CONSTANT const InputIteratorT first,
44+
_CCCL_GRID_CONSTANT InputT* const result,
45+
_CCCL_GRID_CONSTANT const OffsetT num_tiles,
46+
_CCCL_GRID_CONSTANT const int items_per_tile)
4447
{
4548
const int tile_idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
4649
AgentDifferenceInitT::Process(tile_idx, first, result, num_tiles, items_per_tile);
@@ -55,11 +58,11 @@ template <typename PolicySelector,
5558
bool MayAlias,
5659
bool ReadLeft>
5760
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel(
58-
InputIteratorT input,
59-
InputT* first_tile_previous,
60-
OutputIteratorT result,
61+
_CCCL_GRID_CONSTANT const InputIteratorT input,
62+
_CCCL_GRID_CONSTANT InputT* const first_tile_previous,
63+
_CCCL_GRID_CONSTANT const OutputIteratorT result,
6164
DifferenceOpT difference_op,
62-
OffsetT num_items)
65+
_CCCL_GRID_CONSTANT const OffsetT num_items)
6366
{
6467
static_assert(::cuda::std::is_empty_v<PolicySelector>);
6568
static constexpr adjacent_difference_policy policy = PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10});

cub/cub/device/dispatch/dispatch_batch_memcpy.cuh

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ template <typename BufferOffsetScanTileStateT, typename BlockOffsetScanTileState
6161
CUB_DETAIL_KERNEL_ATTRIBUTES void InitTileStateKernel(
6262
BufferOffsetScanTileStateT buffer_offset_scan_tile_state,
6363
BlockOffsetScanTileStateT block_offset_scan_tile_state,
64-
TileOffsetT num_tiles)
64+
_CCCL_GRID_CONSTANT const TileOffsetT num_tiles)
6565
{
6666
// Initialize tile status
6767
buffer_offset_scan_tile_state.InitializeStatus(num_tiles);
@@ -83,12 +83,12 @@ template <typename ChainedPolicyT,
8383
CopyAlg MemcpyOpt>
8484
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS))
8585
CUB_DETAIL_KERNEL_ATTRIBUTES void MultiBlockBatchMemcpyKernel(
86-
InputBufferIt input_buffer_it,
87-
OutputBufferIt output_buffer_it,
88-
BufferSizeIteratorT buffer_sizes,
89-
BufferTileOffsetItT buffer_tile_offsets,
86+
_CCCL_GRID_CONSTANT const InputBufferIt input_buffer_it,
87+
_CCCL_GRID_CONSTANT const OutputBufferIt output_buffer_it,
88+
_CCCL_GRID_CONSTANT const BufferSizeIteratorT buffer_sizes,
89+
_CCCL_GRID_CONSTANT const BufferTileOffsetItT buffer_tile_offsets,
9090
TileT buffer_offset_tile,
91-
TileOffsetT last_tile_offset)
91+
_CCCL_GRID_CONSTANT const TileOffsetT last_tile_offset)
9292
{
9393
using StatusWord = typename TileT::StatusWord;
9494
using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT;
@@ -210,16 +210,16 @@ template <typename ChainedPolicyT,
210210
CopyAlg MemcpyOpt>
211211
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS))
212212
CUB_DETAIL_KERNEL_ATTRIBUTES void BatchMemcpyKernel(
213-
InputBufferIt input_buffer_it,
214-
OutputBufferIt output_buffer_it,
215-
BufferSizeIteratorT buffer_sizes,
216-
BufferOffsetT num_buffers,
217-
BlevBufferSrcsOutItT blev_buffer_srcs,
218-
BlevBufferDstsOutItT blev_buffer_dsts,
219-
BlevBufferSizesOutItT blev_buffer_sizes,
220-
BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
221-
BLevBufferOffsetTileState blev_buffer_scan_state,
222-
BLevBlockOffsetTileState blev_block_scan_state)
213+
_CCCL_GRID_CONSTANT const InputBufferIt input_buffer_it,
214+
_CCCL_GRID_CONSTANT const OutputBufferIt output_buffer_it,
215+
_CCCL_GRID_CONSTANT const BufferSizeIteratorT buffer_sizes,
216+
_CCCL_GRID_CONSTANT const BufferOffsetT num_buffers,
217+
_CCCL_GRID_CONSTANT const BlevBufferSrcsOutItT blev_buffer_srcs,
218+
_CCCL_GRID_CONSTANT const BlevBufferDstsOutItT blev_buffer_dsts,
219+
_CCCL_GRID_CONSTANT const BlevBufferSizesOutItT blev_buffer_sizes,
220+
_CCCL_GRID_CONSTANT const BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets,
221+
_CCCL_GRID_CONSTANT const BLevBufferOffsetTileState blev_buffer_scan_state,
222+
_CCCL_GRID_CONSTANT const BLevBlockOffsetTileState blev_block_scan_state)
223223
{
224224
// Internal type used for storing a buffer's size
225225
using BufferSizeT = it_value_t<BufferSizeIteratorT>;

cub/cub/device/dispatch/dispatch_merge.cuh

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -89,11 +89,11 @@ template <typename PolicySelector,
8989
typename Offset,
9090
typename CompareOp>
9191
CUB_DETAIL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel(
92-
KeyIt1 keys1,
93-
Offset keys1_count,
94-
KeyIt2 keys2,
95-
Offset keys2_count,
96-
Offset num_diagonals,
92+
_CCCL_GRID_CONSTANT const KeyIt1 keys1,
93+
_CCCL_GRID_CONSTANT const Offset keys1_count,
94+
_CCCL_GRID_CONSTANT const KeyIt2 keys2,
95+
_CCCL_GRID_CONSTANT const Offset keys2_count,
96+
_CCCL_GRID_CONSTANT const Offset num_diagonals,
9797
Offset* key1_beg_offsets,
9898
CompareOp compare_op)
9999
{
@@ -137,14 +137,14 @@ __launch_bounds__(
137137
Offset,
138138
CompareOp>::type::block_threads)
139139
CUB_DETAIL_KERNEL_ATTRIBUTES void device_merge_kernel(
140-
KeyIt1 keys1,
141-
ValueIt1 items1,
142-
Offset num_keys1,
143-
KeyIt2 keys2,
144-
ValueIt2 items2,
145-
Offset num_keys2,
146-
KeyIt3 keys_result,
147-
ValueIt3 items_result,
140+
_CCCL_GRID_CONSTANT const KeyIt1 keys1,
141+
_CCCL_GRID_CONSTANT const ValueIt1 items1,
142+
_CCCL_GRID_CONSTANT const Offset num_keys1,
143+
_CCCL_GRID_CONSTANT const KeyIt2 keys2,
144+
_CCCL_GRID_CONSTANT const ValueIt2 items2,
145+
_CCCL_GRID_CONSTANT const Offset num_keys2,
146+
_CCCL_GRID_CONSTANT const KeyIt3 keys_result,
147+
_CCCL_GRID_CONSTANT const ValueIt3 items_result,
148148
CompareOp compare_op,
149149
Offset* key1_beg_offsets,
150150
vsmem_t global_temp_storage)

cub/cub/device/dispatch/dispatch_reduce_by_key.cuh

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -190,16 +190,16 @@ template <typename PolicySelector,
190190
#endif
191191
__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads))
192192
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceByKeyKernel(
193-
KeysInputIteratorT d_keys_in,
194-
UniqueOutputIteratorT d_unique_out,
195-
ValuesInputIteratorT d_values_in,
196-
AggregatesOutputIteratorT d_aggregates_out,
197-
NumRunsOutputIteratorT d_num_runs_out,
193+
_CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in,
194+
_CCCL_GRID_CONSTANT const UniqueOutputIteratorT d_unique_out,
195+
_CCCL_GRID_CONSTANT const ValuesInputIteratorT d_values_in,
196+
_CCCL_GRID_CONSTANT const AggregatesOutputIteratorT d_aggregates_out,
197+
_CCCL_GRID_CONSTANT const NumRunsOutputIteratorT d_num_runs_out,
198198
ScanTileStateT tile_state,
199-
int start_tile,
199+
_CCCL_GRID_CONSTANT const int start_tile,
200200
EqualityOpT equality_op,
201201
ReductionOpT reduction_op,
202-
OffsetT num_items,
202+
_CCCL_GRID_CONSTANT const OffsetT num_items,
203203
_CCCL_GRID_CONSTANT const StreamingContextT streaming_context,
204204
vsmem_t vsmem)
205205
{

cub/cub/device/dispatch/dispatch_rle.cuh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -178,14 +178,14 @@ template <typename PolicySelector,
178178
#endif // _CCCL_HAS_CONCEPTS()
179179
__launch_bounds__(int(PolicySelector{}(::cuda::arch_id{CUB_PTX_ARCH / 10}).block_threads))
180180
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRleSweepKernel(
181-
InputIteratorT d_in,
182-
OffsetsOutputIteratorT d_offsets_out,
183-
LengthsOutputIteratorT d_lengths_out,
184-
NumRunsOutputIteratorT d_num_runs_out,
181+
_CCCL_GRID_CONSTANT const InputIteratorT d_in,
182+
_CCCL_GRID_CONSTANT const OffsetsOutputIteratorT d_offsets_out,
183+
_CCCL_GRID_CONSTANT const LengthsOutputIteratorT d_lengths_out,
184+
_CCCL_GRID_CONSTANT const NumRunsOutputIteratorT d_num_runs_out,
185185
ScanTileStateT tile_status,
186186
EqualityOpT equality_op,
187-
OffsetT num_items,
188-
int num_tiles,
187+
_CCCL_GRID_CONSTANT const OffsetT num_items,
188+
_CCCL_GRID_CONSTANT const int num_tiles,
189189
_CCCL_GRID_CONSTANT const StreamingContextT streaming_context)
190190
{
191191
static constexpr non_trivial_runs::rle_non_trivial_runs_policy policy =

cub/cub/device/dispatch/dispatch_scan_by_key.cuh

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -121,16 +121,16 @@ template <typename ChainedPolicyT,
121121
typename KeyT = cub::detail::it_value_t<KeysInputIteratorT>>
122122
__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS))
123123
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyKernel(
124-
KeysInputIteratorT d_keys_in,
125-
KeyT* d_keys_prev_in,
126-
ValuesInputIteratorT d_values_in,
127-
ValuesOutputIteratorT d_values_out,
124+
_CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in,
125+
_CCCL_GRID_CONSTANT KeyT* const d_keys_prev_in,
126+
_CCCL_GRID_CONSTANT const ValuesInputIteratorT d_values_in,
127+
_CCCL_GRID_CONSTANT const ValuesOutputIteratorT d_values_out,
128128
ScanByKeyTileStateT tile_state,
129-
int start_tile,
129+
_CCCL_GRID_CONSTANT const int start_tile,
130130
EqualityOp equality_op,
131-
ScanOpT scan_op,
132-
InitValueT init_value,
133-
OffsetT num_items)
131+
_CCCL_GRID_CONSTANT const ScanOpT scan_op,
132+
_CCCL_GRID_CONSTANT const InitValueT init_value,
133+
_CCCL_GRID_CONSTANT const OffsetT num_items)
134134
{
135135
using ScanByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT;
136136

@@ -157,10 +157,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THRE
157157
template <typename ScanTileStateT, typename KeysInputIteratorT, typename OffsetT>
158158
CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyInitKernel(
159159
ScanTileStateT tile_state,
160-
KeysInputIteratorT d_keys_in,
160+
_CCCL_GRID_CONSTANT const KeysInputIteratorT d_keys_in,
161161
cub::detail::it_value_t<KeysInputIteratorT>* d_keys_prev_in,
162-
OffsetT items_per_tile,
163-
int num_tiles)
162+
_CCCL_GRID_CONSTANT const OffsetT items_per_tile,
163+
_CCCL_GRID_CONSTANT const int num_tiles)
164164
{
165165
// Initialize tile status
166166
tile_state.InitializeStatus(num_tiles);

cub/cub/device/dispatch/dispatch_segmented_sort.cuh

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -183,25 +183,25 @@ template <typename LargeKernelT,
183183
typename EndOffsetIteratorT,
184184
typename KernelLauncherFactory>
185185
__launch_bounds__(1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortContinuationKernel(
186-
LargeKernelT large_kernel,
187-
SmallKernelT small_kernel,
188-
local_segment_index_t num_segments,
189-
KeyT* d_current_keys,
190-
KeyT* d_final_keys,
186+
_CCCL_GRID_CONSTANT const LargeKernelT large_kernel,
187+
_CCCL_GRID_CONSTANT const SmallKernelT small_kernel,
188+
_CCCL_GRID_CONSTANT const local_segment_index_t num_segments,
189+
_CCCL_GRID_CONSTANT KeyT* const d_current_keys,
190+
_CCCL_GRID_CONSTANT KeyT* const d_final_keys,
191191
device_double_buffer<KeyT> d_keys_double_buffer,
192-
ValueT* d_current_values,
193-
ValueT* d_final_values,
192+
_CCCL_GRID_CONSTANT ValueT* const d_current_values,
193+
_CCCL_GRID_CONSTANT ValueT* const d_final_values,
194194
device_double_buffer<ValueT> d_values_double_buffer,
195-
BeginOffsetIteratorT d_begin_offsets,
196-
EndOffsetIteratorT d_end_offsets,
197-
local_segment_index_t* group_sizes,
198-
local_segment_index_t* large_and_medium_segments_indices,
199-
local_segment_index_t* small_segments_indices,
200-
KernelLauncherFactory launcher_factory,
201-
int large_block_threads,
202-
int small_block_threads,
203-
int medium_segments_per_block,
204-
int small_segments_per_block)
195+
_CCCL_GRID_CONSTANT const BeginOffsetIteratorT d_begin_offsets,
196+
_CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets,
197+
_CCCL_GRID_CONSTANT local_segment_index_t* const group_sizes,
198+
_CCCL_GRID_CONSTANT local_segment_index_t* const large_and_medium_segments_indices,
199+
_CCCL_GRID_CONSTANT local_segment_index_t* const small_segments_indices,
200+
_CCCL_GRID_CONSTANT const KernelLauncherFactory launcher_factory,
201+
_CCCL_GRID_CONSTANT const int large_block_threads,
202+
_CCCL_GRID_CONSTANT const int small_block_threads,
203+
_CCCL_GRID_CONSTANT const int medium_segments_per_block,
204+
_CCCL_GRID_CONSTANT const int small_segments_per_block)
205205
{
206206
// In case of CDP:
207207
// 1. each CTA has a different main stream

0 commit comments

Comments
 (0)