NVIDIA · Jacobfaib · May 26, 2026 · Jun 22, 2026
diff --git a/.clang-tidy b/.clang-tidy
@@ -32,7 +32,6 @@ Checks:
   - 'bugprone-*'
   - '-bugprone-forward-declaration-namespace'
   - '-bugprone-narrowing-conversions'
-  - '-bugprone-misplaced-widening-cast'
   - '-bugprone-suspicious-include'
   - '-bugprone-exception-escape'
   - '-bugprone-crtp-constructor-accessibility'

@@ -236,7 +236,7 @@ GetAlignedPtrs(const void* in_begin, void* out_begin, ByteOffsetT num_bytes)
   // Bytes after `out_chars_aligned` to the last VectorT-aligned
   // address at (or before) `out_begin` + `num_bytes`
   uint32_t out_end_aligned{};
-  if (in_end_padding_req + alignment_offset > num_bytes)
+  if (in_end_padding_req + alignment_offset > num_bytes) // NOLINT(bugprone-misplaced-widening-cast)
   {
     out_end_aligned = out_start_aligned;
   }

@@ -41,7 +41,8 @@ struct agent_block_striped_t
     _CCCL_PRAGMA_UNROLL_FULL()
     for (int item = 0; item < items_per_thread; item++)
     {
-      const auto idx = static_cast<OffsetT>(threads_per_block * item + threadIdx.x);
+      const auto idx =
+        static_cast<OffsetT>(threads_per_block * item + threadIdx.x); // NOLINT(bugprone-misplaced-widening-cast)
 
       if (IsFullTile || idx < items_in_tile)
       {

@@ -78,7 +78,7 @@ struct agent_t
   struct alignas(cub::detail::LoadToSharedBufferAlignBytes<ValueT>()) buffer_t
   {
     // Need extra bytes of padding for TMA because this static buffer has to hold the two dynamically sized buffers.
-    static constexpr int bytes_needed = cub::detail::LoadToSharedBufferSizeBytes<ValueT>(items_per_tile + 1)
+    static constexpr int bytes_needed = cub::detail::LoadToSharedBufferSizeBytes<ValueT>(items_per_tile + 1ULL)
                                       + (alignof(ValueT) < bl2sh_minimum_align ? 2 * bl2sh_minimum_align : 0);
 
     char c_array[bytes_needed];

@@ -257,7 +257,9 @@ struct AgentRadixSortDownsweep
 
       key = bit_ordered_conversion::from_bit_ordered(decomposer, key);
 
-      if (FULL_TILE || (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
+      if (FULL_TILE
+          || (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) // NOLINT(bugprone-misplaced-widening-cast)
+              < valid_items))
       {
         d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key;
       }
@@ -291,7 +293,9 @@ struct AgentRadixSortDownsweep
     {
       ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];
 
-      if (FULL_TILE || (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
+      if (FULL_TILE
+          || (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) // NOLINT(bugprone-misplaced-widening-cast)
+              < valid_items))
       {
         d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value;
       }

@@ -247,7 +247,8 @@ struct AgentRadixSortHistogram
       // Process the tiles.
       OffsetT portion_offset = portion * MAX_PORTION_SIZE;
       OffsetT portion_size   = ::cuda::std::min(MAX_PORTION_SIZE, num_items - portion_offset);
-      for (OffsetT offset = blockIdx.x * TILE_ITEMS; offset < portion_size; offset += TILE_ITEMS * gridDim.x)
+      for (OffsetT offset = static_cast<OffsetT>(blockIdx.x) * TILE_ITEMS; offset < portion_size;
+           offset += static_cast<OffsetT>(TILE_ITEMS) * gridDim.x)
       {
         OffsetT tile_offset = portion_offset + offset;
         bit_ordered_type keys[ITEMS_PER_THREAD];

@@ -398,7 +398,7 @@ struct AgentRadixSortOnesweep
     {
       // gather and scatter the values
       ValueT values[ITEMS_PER_THREAD];
-      LoadValues(block_idx * TILE_ITEMS, values);
+      LoadValues(block_idx * TILE_ITEMS, values); // NOLINT(bugprone-misplaced-widening-cast)
       if (full_block)
       {
         StoreDirectWarpStriped(threadIdx.x, d_values_out + global_offset, values);
@@ -598,7 +598,7 @@ struct AgentRadixSortOnesweep
 
     // load values
     ValueT values[ITEMS_PER_THREAD];
-    LoadValues(block_idx * TILE_ITEMS, values);
+    LoadValues(block_idx * TILE_ITEMS, values); // NOLINT(bugprone-misplaced-widening-cast)
 
     // scatter values
     __syncthreads();
@@ -618,7 +618,7 @@ struct AgentRadixSortOnesweep
     // if warp1 < warp2, all elements of warp1 occur before those of warp2
     // in the source array
     bit_ordered_type keys[ITEMS_PER_THREAD];
-    LoadKeys(block_idx * TILE_ITEMS, keys);
+    LoadKeys(block_idx * TILE_ITEMS, keys); // NOLINT(bugprone-misplaced-widening-cast)
 
     // rank keys
     int ranks[ITEMS_PER_THREAD];

@@ -331,14 +331,15 @@ struct AgentReduceImpl
     // Read first item
     if (IsFirstTile && (thread_offset < valid_items))
     {
-      thread_aggregate = transform_op(d_wrapped_in[block_offset + thread_offset]);
+      thread_aggregate =
+        transform_op(d_wrapped_in[block_offset + thread_offset]); // NOLINT(bugprone-misplaced-widening-cast)
       thread_offset += NumThreads;
     }
 
     // Continue reading items (block-striped)
     while (thread_offset < valid_items)
     {
-      InputT item(d_wrapped_in[block_offset + thread_offset]);
+      InputT item(d_wrapped_in[block_offset + thread_offset]); // NOLINT(bugprone-misplaced-widening-cast)
 
       thread_aggregate = reduction_op(thread_aggregate, transform_op(item));
       thread_offset += NumThreads;

@@ -432,8 +432,8 @@ struct AgentReduceByKey
     for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
     {
       KeyValuePairT pair                                = temp_storage.raw_exchange.Alias()[item];
-      d_unique_out[num_tile_segments_prefix + item]     = pair.key;
-      d_aggregates_out[num_tile_segments_prefix + item] = pair.value;
+      d_unique_out[num_tile_segments_prefix + item]     = pair.key; // NOLINT(bugprone-misplaced-widening-cast)
+      d_aggregates_out[num_tile_segments_prefix + item] = pair.value; // NOLINT(bugprone-misplaced-widening-cast)
     }
   }
 

@@ -362,7 +362,7 @@ struct AgentRle
       T tile_successor_item;
       if (threadIdx.x == BLOCK_THREADS - 1)
       {
-        tile_successor_item = d_in[tile_offset + TILE_ITEMS];
+        tile_successor_item = d_in[tile_offset + TILE_ITEMS]; // NOLINT(bugprone-misplaced-widening-cast)
       }
 
       BlockDiscontinuityT(temp_storage.aliasable.scan_storage.discontinuity)
@@ -388,7 +388,7 @@ struct AgentRle
       T tile_successor_item;
       if (threadIdx.x == BLOCK_THREADS - 1)
       {
-        tile_successor_item = d_in[tile_offset + TILE_ITEMS];
+        tile_successor_item = d_in[tile_offset + TILE_ITEMS]; // NOLINT(bugprone-misplaced-widening-cast)
       }
 
       // Get the last item from the previous tile

@@ -634,7 +634,8 @@ struct AgentSelectIf
 
     for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
     {
-      *((d_selected_out + streaming_context.num_previously_selected()) + (num_selections_prefix + item)) =
+      *((d_selected_out + streaming_context.num_previously_selected())
+        + (num_selections_prefix + item)) = // NOLINT(bugprone-misplaced-widening-cast)
         temp_storage.raw_exchange.Alias()[item];
     }
   }

@@ -345,11 +345,13 @@ struct AgentThreeWayPartition
     __syncthreads();
 
     // Gather items from shared memory and scatter to global
+    // NOLINTBEGIN(bugprone-misplaced-widening-cast)
     auto first_base =
       d_first_part_out + (streaming_context.num_previously_selected_first() + num_first_selections_prefix);
     auto second_base =
       d_second_part_out + (streaming_context.num_previously_selected_second() + num_second_selections_prefix);
     auto unselected_base = d_unselected_out + (streaming_context.num_previously_rejected() + num_rejected_prefix);
+    // NOLINTEND(bugprone-misplaced-widening-cast)
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
     {
       int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x;

@@ -360,13 +360,14 @@ struct AgentTopK
   {
     key_in_t thread_data[items_per_thread];
 
-    const OffsetT items_per_pass   = tile_items * gridDim.x;
+    const OffsetT items_per_pass =
+      static_cast<OffsetT>(tile_items * gridDim.x); // NOLINT(bugprone-misplaced-widening-cast)
     const OffsetT total_num_blocks = ::cuda::ceil_div(num_items, tile_items);
 
     const OffsetT num_remaining_elements = num_items % tile_items;
     const OffsetT last_block_id          = (total_num_blocks - 1) % gridDim.x;
 
-    OffsetT tile_base = blockIdx.x * tile_items;
+    OffsetT tile_base = static_cast<OffsetT>(blockIdx.x * tile_items); // NOLINT(bugprone-misplaced-widening-cast)
     OffsetT offset    = threadIdx.x * items_per_thread + tile_base;
 
     for (int i_block = blockIdx.x; i_block < total_num_blocks - 1; i_block += gridDim.x)

@@ -268,7 +268,7 @@ struct AgentUniqueByKey
     _CCCL_PRAGMA_NOUNROLL()
     for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
     {
-      items_out[num_selections_prefix + item] = GetShared(tag)[item];
+      items_out[num_selections_prefix + item] = GetShared(tag)[item]; // NOLINT(bugprone-misplaced-widening-cast)
     }
 
     __syncthreads();

@@ -66,7 +66,7 @@ LoadDirectBlocked(int linear_tid, RandomAccessIterator block_src_it, T (&dst_ite
   _CCCL_PRAGMA_UNROLL_FULL()
   for (int i = 0; i < ItemsPerThread; i++)
   {
-    dst_items[i] = block_src_it[linear_tid * ItemsPerThread + i];
+    dst_items[i] = block_src_it[linear_tid * ItemsPerThread + i]; // NOLINT(bugprone-misplaced-widening-cast)
   }
 }
 
@@ -310,7 +310,7 @@ LoadDirectStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst_ite
   _CCCL_PRAGMA_UNROLL_FULL()
   for (int i = 0; i < ItemsPerThread; i++)
   {
-    dst_items[i] = block_src_it[linear_tid + i * ThreadsPerBlock];
+    dst_items[i] = block_src_it[linear_tid + i * ThreadsPerBlock]; // NOLINT(bugprone-misplaced-widening-cast)
   }
 }
 
@@ -323,7 +323,8 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void load_transform_direct_striped(
   _CCCL_PRAGMA_UNROLL_FULL()
   for (int i = 0; i < ItemsPerThread; i++)
   {
-    dst_items[i] = transform_op(block_src_it[linear_tid + i * ThreadsPerBlock]);
+    dst_items[i] =
+      transform_op(block_src_it[linear_tid + i * ThreadsPerBlock]); // NOLINT(bugprone-misplaced-widening-cast)
   }
 }
 } // namespace detail
@@ -481,7 +482,8 @@ LoadDirectWarpStriped(int linear_tid, RandomAccessIterator block_src_it, T (&dst
   _CCCL_PRAGMA_UNROLL_FULL()
   for (int i = 0; i < ItemsPerThread; i++)
   {
-    new (&dst_items[i]) T(block_src_it[warp_offset + tid + (i * detail::warp_threads)]);
+    new (&dst_items[i])
+      T(block_src_it[warp_offset + tid + (i * detail::warp_threads)]); // NOLINT(bugprone-misplaced-widening-cast)
   }
 }
 

@@ -59,7 +59,7 @@ template <typename T, int ItemsPerThread, typename OutputIteratorT>
 _CCCL_DEVICE _CCCL_FORCEINLINE void
 StoreDirectBlocked(int linear_tid, OutputIteratorT block_itr, T (&items)[ItemsPerThread])
 {
-  OutputIteratorT thread_itr = block_itr + (linear_tid * ItemsPerThread);
+  OutputIteratorT thread_itr = block_itr + (linear_tid * ItemsPerThread); // NOLINT(bugprone-misplaced-widening-cast)
 
   // Store directly in thread-blocked order
   _CCCL_PRAGMA_UNROLL_FULL()
@@ -104,7 +104,7 @@ template <typename T, int ItemsPerThread, typename OutputIteratorT>
 _CCCL_DEVICE _CCCL_FORCEINLINE void
 StoreDirectBlocked(int linear_tid, OutputIteratorT block_itr, T (&items)[ItemsPerThread], int valid_items)
 {
-  OutputIteratorT thread_itr = block_itr + (linear_tid * ItemsPerThread);
+  OutputIteratorT thread_itr = block_itr + (linear_tid * ItemsPerThread); // NOLINT(bugprone-misplaced-widening-cast)
 
   // Store directly in thread-blocked order
   _CCCL_PRAGMA_UNROLL_FULL()
@@ -346,7 +346,7 @@ StoreDirectWarpStriped(int linear_tid, OutputIteratorT block_itr, T (&items)[Ite
   _CCCL_PRAGMA_UNROLL_FULL()
   for (int ITEM = 0; ITEM < ItemsPerThread; ITEM++)
   {
-    thread_itr[(ITEM * detail::warp_threads)] = items[ITEM];
+    thread_itr[(ITEM * detail::warp_threads)] = items[ITEM]; // NOLINT(bugprone-misplaced-widening-cast)
   }
 }
 
@@ -402,7 +402,7 @@ StoreDirectWarpStriped(int linear_tid, OutputIteratorT block_itr, T (&items)[Ite
   {
     if (warp_offset + tid + (ITEM * detail::warp_threads) < valid_items)
     {
-      thread_itr[(ITEM * detail::warp_threads)] = items[ITEM];
+      thread_itr[(ITEM * detail::warp_threads)] = items[ITEM]; // NOLINT(bugprone-misplaced-widening-cast)
     }
   }
 }

@@ -218,7 +218,7 @@ private:
     }
     else
     {
-      return data[Fold + i];
+      return data[Fold + i]; // NOLINT(bugprone-misplaced-widening-cast)
     }
   }
 

@@ -750,7 +750,7 @@ public:
 
     if constexpr (sizeof(OffsetT) > sizeof(int))
     {
-      if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX)
+      if ((static_cast<unsigned long long>(num_rows) * row_stride_bytes) < static_cast<unsigned long long>(INT_MAX))
       {
         // Down-convert OffsetT data type
         return detail::histogram::dispatch_even<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
@@ -1450,7 +1450,7 @@ public:
 
     if constexpr (sizeof(OffsetT) > sizeof(int))
     {
-      if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX)
+      if ((static_cast<unsigned long long>(num_rows) * row_stride_bytes) < static_cast<unsigned long long>(INT_MAX))
       {
         // Down-convert OffsetT data type
         return detail::histogram::dispatch_range<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(

@@ -103,7 +103,7 @@ __launch_bounds__(int(current_policy<PolicySelector>().large_buffer.threads_per_
 
   constexpr uint32_t BLOCK_THREADS    = static_cast<uint32_t>(policy.threads_per_block);
   constexpr uint32_t ITEMS_PER_THREAD = static_cast<uint32_t>(policy.bytes_per_thread);
-  constexpr BufferSizeT TILE_SIZE     = static_cast<BufferSizeT>(BLOCK_THREADS * ITEMS_PER_THREAD);
+  constexpr BufferSizeT TILE_SIZE     = static_cast<BufferSizeT>(BLOCK_THREADS) * ITEMS_PER_THREAD;
 
   BufferOffsetT num_blev_buffers = buffer_offset_tile.LoadValid(last_tile_offset);
 

@@ -81,7 +81,7 @@ invoke_static_block_size(OffsetT num_items, OpT op, cudaStream_t stream, ForPoli
 {
   const int threads_per_block = active_policy.threads_per_block;
   const int items_per_thread  = active_policy.items_per_thread;
-  const auto tile_size        = static_cast<OffsetT>(threads_per_block * items_per_thread);
+  const auto tile_size        = static_cast<OffsetT>(threads_per_block) * static_cast<OffsetT>(items_per_thread);
   const auto num_tiles        = ::cuda::ceil_div(num_items, tile_size);
 
 #ifdef CUB_DEBUG_LOG

@@ -107,7 +107,8 @@ _CCCL_KERNEL_ATTRIBUTES void device_partition_merge_path_kernel(
                        ValueIt3,
                        Offset,
                        CompareOp>::type::items_per_tile;
-  const Offset diagonal_idx = static_cast<Offset>(blockDim.x * blockIdx.x + threadIdx.x);
+  const Offset diagonal_idx =
+    static_cast<Offset>(blockDim.x * blockIdx.x + threadIdx.x); // NOLINT(bugprone-misplaced-widening-cast)
   if (diagonal_idx < num_diagonals)
   {
     const Offset diagonal_num      = (::cuda::std::min) (diagonal_idx * items_per_tile, keys1_count + keys2_count);

@@ -1097,7 +1097,8 @@ public:
     }
 
     // Force kernel code-generation in all compiler passes
-    if (num_items <= static_cast<OffsetT>(policy.single_tile.threads_per_block * policy.single_tile.items_per_thread))
+    if (num_items <= (static_cast<OffsetT>(policy.single_tile.threads_per_block)
+                      * static_cast<OffsetT>(policy.single_tile.items_per_thread)))
     {
       // Small, single tile size
       return __invoke_single_tile(kernel_source.RadixSortSingleTileKernel(), policy.single_tile);

@@ -782,8 +782,8 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
 #endif // _CCCL_HOSTED() && defined(CUB_DEBUG_LOG)
 
   // Check for small, single tile size
-  if (num_items
-      <= static_cast<OffsetT>(active_policy.single_tile.threads_per_block * active_policy.single_tile.items_per_thread))
+  if (num_items <= (static_cast<OffsetT>(active_policy.single_tile.threads_per_block)
+                    * static_cast<OffsetT>(active_policy.single_tile.items_per_thread)))
   {
     // Return if the caller is simply requesting the size of the storage allocation
     if (d_temp_storage == nullptr)

@@ -359,8 +359,8 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
                }))
 #endif // _CCCL_HOSTED() && defined(CUB_DEBUG_LOG)
 
-  const auto tile_items =
-    static_cast<OffsetT>(active_policy.single_tile.threads_per_block * active_policy.single_tile.items_per_thread);
+  const auto tile_items = static_cast<OffsetT>(active_policy.single_tile.threads_per_block)
+                        * static_cast<OffsetT>(active_policy.single_tile.items_per_thread);
 
   using deterministic_add_t  = deterministic_sum_t<AccumT>;
   using input_unwrapped_it_t = THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator_t<InputIteratorT>;

@@ -688,7 +688,8 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
 
   const int threads_per_block = active_policy.threads_per_block;
   const int items_per_thread  = active_policy.items_per_thread;
-  const auto tile_size        = static_cast<global_offset_t>(threads_per_block * items_per_thread);
+  const auto tile_size =
+    static_cast<global_offset_t>(threads_per_block) * static_cast<global_offset_t>(items_per_thread);
 
   auto capped_num_items_per_invocation = num_items;
   if constexpr (use_streaming_invocation)

@@ -782,7 +782,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_fixed_size(
       return cudaSuccess;
     }
 
-    const auto num_segments_per_invocation =
+    constexpr auto num_segments_per_invocation =
       static_cast<::cuda::std::int64_t>(::cuda::std::numeric_limits<::cuda::std::int32_t>::max());
     const ::cuda::std::int64_t num_invocations = ::cuda::ceil_div(num_segments, num_segments_per_invocation);
 
@@ -812,7 +812,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_fixed_size(
         return error;
       }
 
-      d_in += num_segments_per_invocation * segment_size;
+      d_in += num_segments_per_invocation * segment_size; // NOLINT(bugprone-misplaced-widening-cast)
       d_out += num_segments_per_invocation;
 
       if (const auto error = CubDebug(cudaPeekAtLastError()))
@@ -923,7 +923,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_fixed_size(
       return error;
     }
 
-    d_in += num_segments_per_invocation * segment_size;
+    d_in += num_segments_per_invocation * segment_size; // NOLINT(bugprone-misplaced-widening-cast)
     d_out += num_segments_per_invocation;
 
     if (const auto error = CubDebug(cudaPeekAtLastError()))

@@ -1360,7 +1360,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
     auto large_segments_selector = kernel_source.LargeSegmentsSelector(
       active_policy.medium_segment.items_per_tile(), d_begin_offsets, d_end_offsets);
     auto small_segments_selector = kernel_source.SmallSegmentsSelector(
-      active_policy.small_segment.items_per_tile() + 1, d_begin_offsets, d_end_offsets);
+      static_cast<OffsetT>(active_policy.small_segment.items_per_tile()) + 1, d_begin_offsets, d_end_offsets);
 
     auto device_partition_temp_storage = keys_slot->create_alias<uint8_t>();
     if (partition_segments)
@@ -218,7 +218,7 @@ private:
         }
         else
         {
-          return data[Fold + i];
+          return data[Fold + i]; // NOLINT(bugprone-misplaced-widening-cast)
         }
       }
@@ Expand Down @@