Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ Checks:
- 'bugprone-*'
- '-bugprone-forward-declaration-namespace'
- '-bugprone-narrowing-conversions'
- '-bugprone-misplaced-widening-cast'
- '-bugprone-suspicious-include'
- '-bugprone-exception-escape'
- '-bugprone-crtp-constructor-accessibility'
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes)
// Bytes after `out_chars_aligned` to the last VectorT-aligned
// address at (or before) `out_begin` + `num_bytes`
uint32_t out_end_aligned{};
if (in_end_padding_req + alignment_offset > num_bytes)
if (in_end_padding_req + alignment_offset > num_bytes) // NOLINT(bugprone-misplaced-widening-cast)
{
out_end_aligned = out_start_aligned;
}
Expand Down
3 changes: 2 additions & 1 deletion cub/cub/agent/agent_for.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ struct agent_block_striped_t
_CCCL_PRAGMA_UNROLL_FULL()
for (int item = 0; item < items_per_thread; item++)
{
const auto idx = static_cast<OffsetT>(threads_per_block * item + threadIdx.x);
const auto idx =
static_cast<OffsetT>(threads_per_block * item + threadIdx.x); // NOLINT(bugprone-misplaced-widening-cast)

if (IsFullTile || idx < items_in_tile)
{
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ struct agent_t
struct alignas(cub::detail::LoadToSharedBufferAlignBytes<ValueT>()) buffer_t
{
// Need extra bytes of padding for TMA because this static buffer has to hold the two dynamically sized buffers.
static constexpr int bytes_needed = cub::detail::LoadToSharedBufferSizeBytes<ValueT>(items_per_tile + 1)
static constexpr int bytes_needed = cub::detail::LoadToSharedBufferSizeBytes<ValueT>(items_per_tile + 1ULL)
+ (alignof(ValueT) < bl2sh_minimum_align ? 2 * bl2sh_minimum_align : 0);

char c_array[bytes_needed];
Expand Down
8 changes: 6 additions & 2 deletions cub/cub/agent/agent_radix_sort_downsweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,9 @@ struct AgentRadixSortDownsweep

key = bit_ordered_conversion::from_bit_ordered(decomposer, key);

if (FULL_TILE || (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
if (FULL_TILE
|| (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) // NOLINT(bugprone-misplaced-widening-cast)
< valid_items))
{
d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key;
}
Expand Down Expand Up @@ -291,7 +293,9 @@ struct AgentRadixSortDownsweep
{
ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];

if (FULL_TILE || (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
if (FULL_TILE
|| (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) // NOLINT(bugprone-misplaced-widening-cast)
< valid_items))
{
d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value;
}
Expand Down
3 changes: 2 additions & 1 deletion cub/cub/agent/agent_radix_sort_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,8 @@ struct AgentRadixSortHistogram
// Process the tiles.
OffsetT portion_offset = portion * MAX_PORTION_SIZE;
OffsetT portion_size = ::cuda::std::min(MAX_PORTION_SIZE, num_items - portion_offset);
for (OffsetT offset = blockIdx.x * TILE_ITEMS; offset < portion_size; offset += TILE_ITEMS * gridDim.x)
for (OffsetT offset = static_cast<OffsetT>(blockIdx.x) * TILE_ITEMS; offset < portion_size;
offset += static_cast<OffsetT>(TILE_ITEMS) * gridDim.x)
{
OffsetT tile_offset = portion_offset + offset;
bit_ordered_type keys[ITEMS_PER_THREAD];
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/agent/agent_radix_sort_onesweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ struct AgentRadixSortOnesweep
{
// gather and scatter the values
ValueT values[ITEMS_PER_THREAD];
LoadValues(block_idx * TILE_ITEMS, values);
LoadValues(block_idx * TILE_ITEMS, values); // NOLINT(bugprone-misplaced-widening-cast)
if (full_block)
{
StoreDirectWarpStriped(threadIdx.x, d_values_out + global_offset, values);
Expand Down Expand Up @@ -598,7 +598,7 @@ struct AgentRadixSortOnesweep

// load values
ValueT values[ITEMS_PER_THREAD];
LoadValues(block_idx * TILE_ITEMS, values);
LoadValues(block_idx * TILE_ITEMS, values); // NOLINT(bugprone-misplaced-widening-cast)

// scatter values
__syncthreads();
Expand All @@ -618,7 +618,7 @@ struct AgentRadixSortOnesweep
// if warp1 < warp2, all elements of warp1 occur before those of warp2
// in the source array
bit_ordered_type keys[ITEMS_PER_THREAD];
LoadKeys(block_idx * TILE_ITEMS, keys);
LoadKeys(block_idx * TILE_ITEMS, keys); // NOLINT(bugprone-misplaced-widening-cast)

// rank keys
int ranks[ITEMS_PER_THREAD];
Expand Down
5 changes: 3 additions & 2 deletions cub/cub/agent/agent_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -331,14 +331,15 @@ struct AgentReduceImpl
// Read first item
if (IsFirstTile && (thread_offset < valid_items))
{
thread_aggregate = transform_op(d_wrapped_in[block_offset + thread_offset]);
thread_aggregate =
transform_op(d_wrapped_in[block_offset + thread_offset]); // NOLINT(bugprone-misplaced-widening-cast)
thread_offset += NumThreads;
}

// Continue reading items (block-striped)
while (thread_offset < valid_items)
{
InputT item(d_wrapped_in[block_offset + thread_offset]);
InputT item(d_wrapped_in[block_offset + thread_offset]); // NOLINT(bugprone-misplaced-widening-cast)

thread_aggregate = reduction_op(thread_aggregate, transform_op(item));
thread_offset += NumThreads;
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -432,8 +432,8 @@ struct AgentReduceByKey
for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
{
KeyValuePairT pair = temp_storage.raw_exchange.Alias()[item];
d_unique_out[num_tile_segments_prefix + item] = pair.key;
d_aggregates_out[num_tile_segments_prefix + item] = pair.value;
d_unique_out[num_tile_segments_prefix + item] = pair.key; // NOLINT(bugprone-misplaced-widening-cast)
d_aggregates_out[num_tile_segments_prefix + item] = pair.value; // NOLINT(bugprone-misplaced-widening-cast)
}
}

Expand Down
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ struct AgentRle
T tile_successor_item;
if (threadIdx.x == BLOCK_THREADS - 1)
{
tile_successor_item = d_in[tile_offset + TILE_ITEMS];
tile_successor_item = d_in[tile_offset + TILE_ITEMS]; // NOLINT(bugprone-misplaced-widening-cast)
}

BlockDiscontinuityT(temp_storage.aliasable.scan_storage.discontinuity)
Expand All @@ -388,7 +388,7 @@ struct AgentRle
T tile_successor_item;
if (threadIdx.x == BLOCK_THREADS - 1)
{
tile_successor_item = d_in[tile_offset + TILE_ITEMS];
tile_successor_item = d_in[tile_offset + TILE_ITEMS]; // NOLINT(bugprone-misplaced-widening-cast)
}

// Get the last item from the previous tile
Expand Down
3 changes: 2 additions & 1 deletion cub/cub/agent/agent_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,8 @@ struct AgentSelectIf

for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
{
*((d_selected_out + streaming_context.num_previously_selected()) + (num_selections_prefix + item)) =
*((d_selected_out + streaming_context.num_previously_selected())
+ (num_selections_prefix + item)) = // NOLINT(bugprone-misplaced-widening-cast)
temp_storage.raw_exchange.Alias()[item];
}
}
Expand Down
2 changes: 2 additions & 0 deletions cub/cub/agent/agent_three_way_partition.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -345,11 +345,13 @@ struct AgentThreeWayPartition
__syncthreads();

// Gather items from shared memory and scatter to global
// NOLINTBEGIN(bugprone-misplaced-widening-cast)
auto first_base =
d_first_part_out + (streaming_context.num_previously_selected_first() + num_first_selections_prefix);
auto second_base =
d_second_part_out + (streaming_context.num_previously_selected_second() + num_second_selections_prefix);
auto unselected_base = d_unselected_out + (streaming_context.num_previously_rejected() + num_rejected_prefix);
// NOLINTEND(bugprone-misplaced-widening-cast)
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
{
int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x;
Expand Down
5 changes: 3 additions & 2 deletions cub/cub/agent/agent_topk.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -360,13 +360,14 @@ struct AgentTopK
{
key_in_t thread_data[items_per_thread];

const OffsetT items_per_pass = tile_items * gridDim.x;
const OffsetT items_per_pass =
static_cast<OffsetT>(tile_items * gridDim.x); // NOLINT(bugprone-misplaced-widening-cast)
const OffsetT total_num_blocks = ::cuda::ceil_div(num_items, tile_items);

const OffsetT num_remaining_elements = num_items % tile_items;
const OffsetT last_block_id = (total_num_blocks - 1) % gridDim.x;

OffsetT tile_base = blockIdx.x * tile_items;
OffsetT tile_base = static_cast<OffsetT>(blockIdx.x * tile_items); // NOLINT(bugprone-misplaced-widening-cast)
OffsetT offset = threadIdx.x * items_per_thread + tile_base;

for (int i_block = blockIdx.x; i_block < total_num_blocks - 1; i_block += gridDim.x)
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_unique_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ struct AgentUniqueByKey
_CCCL_PRAGMA_NOUNROLL()
for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
{
items_out[num_selections_prefix + item] = GetShared(tag)[item];
items_out[num_selections_prefix + item] = GetShared(tag)[item]; // NOLINT(bugprone-misplaced-widening-cast)
}

__syncthreads();
Expand Down
10 changes: 6 additions & 4 deletions cub/cub/block/block_load.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_ite
_CCCL_PRAGMA_UNROLL_FULL()
for (int i = 0; i < ItemsPerThread; i++)
{
dst_items[i] = block_src_it[linear_tid * ItemsPerThread + i];
dst_items[i] = block_src_it[linear_tid * ItemsPerThread + i]; // NOLINT(bugprone-misplaced-widening-cast)
}
}

Expand Down Expand Up @@ -310,7 +310,7 @@ LoadDirectStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_ite
_CCCL_PRAGMA_UNROLL_FULL()
for (int i = 0; i < ItemsPerThread; i++)
{
dst_items[i] = block_src_it[linear_tid + i * ThreadsPerBlock];
dst_items[i] = block_src_it[linear_tid + i * ThreadsPerBlock]; // NOLINT(bugprone-misplaced-widening-cast)
}
}

Expand All @@ -323,7 +323,8 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped(
_CCCL_PRAGMA_UNROLL_FULL()
for (int i = 0; i < ItemsPerThread; i++)
{
dst_items[i] = transform_op(block_src_it[linear_tid + i * ThreadsPerBlock]);
dst_items[i] =
transform_op(block_src_it[linear_tid + i * ThreadsPerBlock]); // NOLINT(bugprone-misplaced-widening-cast)
}
}
} // namespace detail
Expand Down Expand Up @@ -481,7 +482,8 @@ LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst
_CCCL_PRAGMA_UNROLL_FULL()
for (int i = 0; i < ItemsPerThread; i++)
{
new (&dst_items[i]) T(block_src_it[warp_offset + tid + (i * detail::warp_threads)]);
new (&dst_items[i])
T(block_src_it[warp_offset + tid + (i * detail::warp_threads)]); // NOLINT(bugprone-misplaced-widening-cast)
}
}

Expand Down
8 changes: 4 additions & 4 deletions cub/cub/block/block_store.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ template <typename T, int ItemsPerThread, typename OutputIteratorT>
_CCCL_DEVICE _CCCL_FORCEINLINE void
StoreDirectBlocked(int linear_tid, OutputIteratorT block_itr, T (&items)[ItemsPerThread])
{
OutputIteratorT thread_itr = block_itr + (linear_tid * ItemsPerThread);
OutputIteratorT thread_itr = block_itr + (linear_tid * ItemsPerThread); // NOLINT(bugprone-misplaced-widening-cast)

// Store directly in thread-blocked order
_CCCL_PRAGMA_UNROLL_FULL()
Expand Down Expand Up @@ -104,7 +104,7 @@ template <typename T, int ItemsPerThread, typename OutputIteratorT>
_CCCL_DEVICE _CCCL_FORCEINLINE void
StoreDirectBlocked(int linear_tid, OutputIteratorT block_itr, T (&items)[ItemsPerThread], int valid_items)
{
OutputIteratorT thread_itr = block_itr + (linear_tid * ItemsPerThread);
OutputIteratorT thread_itr = block_itr + (linear_tid * ItemsPerThread); // NOLINT(bugprone-misplaced-widening-cast)

// Store directly in thread-blocked order
_CCCL_PRAGMA_UNROLL_FULL()
Expand Down Expand Up @@ -346,7 +346,7 @@ StoreDirectWarpStriped(int linear_tid, OutputIteratorT block_itr, T (&items)[Ite
_CCCL_PRAGMA_UNROLL_FULL()
for (int ITEM = 0; ITEM < ItemsPerThread; ITEM++)
{
thread_itr[(ITEM * detail::warp_threads)] = items[ITEM];
thread_itr[(ITEM * detail::warp_threads)] = items[ITEM]; // NOLINT(bugprone-misplaced-widening-cast)
}
}

Expand Down Expand Up @@ -402,7 +402,7 @@ StoreDirectWarpStriped(int linear_tid, OutputIteratorT block_itr, T (&items)[Ite
{
if (warp_offset + tid + (ITEM * detail::warp_threads) < valid_items)
{
thread_itr[(ITEM * detail::warp_threads)] = items[ITEM];
thread_itr[(ITEM * detail::warp_threads)] = items[ITEM]; // NOLINT(bugprone-misplaced-widening-cast)
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/detail/rfa.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ private:
}
else
{
return data[Fold + i];
return data[Fold + i]; // NOLINT(bugprone-misplaced-widening-cast)
}
}

Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/device_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -750,7 +750,7 @@ public:

if constexpr (sizeof(OffsetT) > sizeof(int))
{
if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX)
if ((static_cast<unsigned long long>(num_rows) * row_stride_bytes) < static_cast<unsigned long long>(INT_MAX))
{
// Down-convert OffsetT data type
return detail::histogram::dispatch_even<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
Expand Down Expand Up @@ -1450,7 +1450,7 @@ public:

if constexpr (sizeof(OffsetT) > sizeof(int))
{
if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX)
if ((static_cast<unsigned long long>(num_rows) * row_stride_bytes) < static_cast<unsigned long long>(INT_MAX))
{
// Down-convert OffsetT data type
return detail::histogram::dispatch_range<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ __launch_bounds__(int(current_policy<PolicySelector>().large_buffer.threads_per_

constexpr uint32_t BLOCK_THREADS = static_cast<uint32_t>(policy.threads_per_block);
constexpr uint32_t ITEMS_PER_THREAD = static_cast<uint32_t>(policy.bytes_per_thread);
constexpr BufferSizeT TILE_SIZE = static_cast<BufferSizeT>(BLOCK_THREADS * ITEMS_PER_THREAD);
constexpr BufferSizeT TILE_SIZE = static_cast<BufferSizeT>(BLOCK_THREADS) * ITEMS_PER_THREAD;

BufferOffsetT num_blev_buffers = buffer_offset_tile.LoadValid(last_tile_offset);

Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_for.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ invoke_static_block_size(OffsetT num_items, OpT op, cudaStream_t stream, ForPoli
{
const int threads_per_block = active_policy.threads_per_block;
const int items_per_thread = active_policy.items_per_thread;
const auto tile_size = static_cast<OffsetT>(threads_per_block * items_per_thread);
const auto tile_size = static_cast<OffsetT>(threads_per_block) * static_cast<OffsetT>(items_per_thread);
const auto num_tiles = ::cuda::ceil_div(num_items, tile_size);

#ifdef CUB_DEBUG_LOG
Expand Down
3 changes: 2 additions & 1 deletion cub/cub/device/dispatch/dispatch_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,8 @@ _CCCL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel(
ValueIt3,
Offset,
CompareOp>::type::items_per_tile;
const Offset diagonal_idx = static_cast<Offset>(blockDim.x * blockIdx.x + threadIdx.x);
const Offset diagonal_idx =
static_cast<Offset>(blockDim.x * blockIdx.x + threadIdx.x); // NOLINT(bugprone-misplaced-widening-cast)
Comment thread
Jacobfaib marked this conversation as resolved.
if (diagonal_idx < num_diagonals)
{
const Offset diagonal_num = (::cuda::std::min) (diagonal_idx * items_per_tile, keys1_count + keys2_count);
Expand Down
3 changes: 2 additions & 1 deletion cub/cub/device/dispatch/dispatch_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1097,7 +1097,8 @@ public:
}

// Force kernel code-generation in all compiler passes
if (num_items <= static_cast<OffsetT>(policy.single_tile.threads_per_block * policy.single_tile.items_per_thread))
if (num_items <= (static_cast<OffsetT>(policy.single_tile.threads_per_block)
* static_cast<OffsetT>(policy.single_tile.items_per_thread)))
{
// Small, single tile size
return __invoke_single_tile(kernel_source.RadixSortSingleTileKernel(), policy.single_tile);
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -782,8 +782,8 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
#endif // _CCCL_HOSTED() && defined(CUB_DEBUG_LOG)

// Check for small, single tile size
if (num_items
<= static_cast<OffsetT>(active_policy.single_tile.threads_per_block * active_policy.single_tile.items_per_thread))
if (num_items <= (static_cast<OffsetT>(active_policy.single_tile.threads_per_block)
* static_cast<OffsetT>(active_policy.single_tile.items_per_thread)))
{
// Return if the caller is simply requesting the size of the storage allocation
if (d_temp_storage == nullptr)
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -359,8 +359,8 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
}))
#endif // _CCCL_HOSTED() && defined(CUB_DEBUG_LOG)

const auto tile_items =
static_cast<OffsetT>(active_policy.single_tile.threads_per_block * active_policy.single_tile.items_per_thread);
const auto tile_items = static_cast<OffsetT>(active_policy.single_tile.threads_per_block)
* static_cast<OffsetT>(active_policy.single_tile.items_per_thread);

using deterministic_add_t = deterministic_sum_t<AccumT>;
using input_unwrapped_it_t = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<InputIteratorT>;
Expand Down
3 changes: 2 additions & 1 deletion cub/cub/device/dispatch/dispatch_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,8 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(

const int threads_per_block = active_policy.threads_per_block;
const int items_per_thread = active_policy.items_per_thread;
const auto tile_size = static_cast<global_offset_t>(threads_per_block * items_per_thread);
const auto tile_size =
static_cast<global_offset_t>(threads_per_block) * static_cast<global_offset_t>(items_per_thread);

auto capped_num_items_per_invocation = num_items;
if constexpr (use_streaming_invocation)
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_segmented_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -782,7 +782,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_fixed_size(
return cudaSuccess;
}

const auto num_segments_per_invocation =
constexpr auto num_segments_per_invocation =
static_cast<::cuda::std::int64_t>(::cuda::std::numeric_limits<::cuda::std::int32_t>::max());
const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);

Expand Down Expand Up @@ -812,7 +812,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_fixed_size(
return error;
}

d_in += num_segments_per_invocation * segment_size;
d_in += num_segments_per_invocation * segment_size; // NOLINT(bugprone-misplaced-widening-cast)
d_out += num_segments_per_invocation;

if (const auto error = CubDebug(cudaPeekAtLastError()))
Expand Down Expand Up @@ -923,7 +923,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_fixed_size(
return error;
}

d_in += num_segments_per_invocation * segment_size;
d_in += num_segments_per_invocation * segment_size; // NOLINT(bugprone-misplaced-widening-cast)
d_out += num_segments_per_invocation;

if (const auto error = CubDebug(cudaPeekAtLastError()))
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_segmented_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1360,7 +1360,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
auto large_segments_selector = kernel_source.LargeSegmentsSelector(
active_policy.medium_segment.items_per_tile(), d_begin_offsets, d_end_offsets);
auto small_segments_selector = kernel_source.SmallSegmentsSelector(
active_policy.small_segment.items_per_tile() + 1, d_begin_offsets, d_end_offsets);
static_cast<OffsetT>(active_policy.small_segment.items_per_tile()) + 1, d_begin_offsets, d_end_offsets);

auto device_partition_temp_storage = keys_slot->create_alias<uint8_t>();
if (partition_segments)
Expand Down
Loading
Loading