NVIDIA · Jacobfaib · May 27, 2026
diff --git a/.clang-tidy b/.clang-tidy
@@ -31,7 +31,6 @@ Checks:
   - 'clang-diagnostic-*'
   - 'bugprone-*'
   - '-bugprone-forward-declaration-namespace'
-  - '-bugprone-narrowing-conversions'
   - '-bugprone-misplaced-widening-cast'
   - '-bugprone-signed-char-misuse'
   - '-bugprone-suspicious-include'
@@ -209,7 +208,6 @@ Checks:
   - '-cppcoreguidelines-noexcept-move-operations'
   # REVIEW ME: This warns about any usage of operator[], suggesting usage of .at() instead. Could
   - '-cppcoreguidelines-pro-bounds-avoid-unchecked-container-access'
-  # Covered by bugprone-narrowing-conversions
   - '-cppcoreguidelines-narrowing-conversions'
   # Covered by misc-unconventional-assign-operator
   - '-cppcoreguidelines-c-copy-assignment-signature'

@@ -37,7 +37,7 @@ __device__ unsigned g_busy_sink;
 // commits its value.
 __global__ void slow_set_kernel(int* arr, int n, int value, int iters)
 {
-  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int tid = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
   if (tid >= n)
   {
     return;

@@ -17,8 +17,8 @@
 
 __global__ void axpy(int cnt, double a, const double* x, double* y)
 {
-  int tid      = blockIdx.x * blockDim.x + threadIdx.x;
-  int nthreads = gridDim.x * blockDim.x;
+  int tid      = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+  int nthreads = static_cast<int>(gridDim.x * blockDim.x);
 
   for (int i = tid; i < cnt; i += nthreads)
   {
@@ -48,8 +48,8 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]")
 
   for (size_t i = 0; i < N; i++)
   {
-    X[i] = X0(i);
-    Y[i] = Y0(i);
+    X[i] = X0(static_cast<int>(i));
+    Y[i] = Y0(static_cast<int>(i));
   }
 
   const double alpha = 3.14;

@@ -17,8 +17,8 @@
 
 __global__ void fill_kernel(int cnt, double* data, double value)
 {
-  int tid      = blockIdx.x * blockDim.x + threadIdx.x;
-  int nthreads = gridDim.x * blockDim.x;
+  int tid      = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+  int nthreads = static_cast<int>(gridDim.x * blockDim.x);
 
   for (int i = tid; i < cnt; i += nthreads)
   {
@@ -51,7 +51,7 @@ static void verify_callback(stf_host_launch_deps_handle deps)
   auto* data = static_cast<double*>(stf_host_launch_deps_get(deps, 0));
   for (size_t i = 0; i < v->N; i++)
   {
-    if (fabs(data[i] - (42.0 + i)) > 1e-10)
+    if (fabs(data[i] - (42.0 + static_cast<double>(i))) > 1e-10)
     {
       *v->passed = false;
       return;

@@ -23,7 +23,7 @@
 
 __global__ void scale_inplace(int n, float* data, float factor)
 {
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
   if (i < n)
   {
     data[i] *= factor;

@@ -18,8 +18,8 @@
 
 __global__ void scale_kernel(int cnt, double* data, double factor)
 {
-  const int tid      = blockIdx.x * blockDim.x + threadIdx.x;
-  const int nthreads = gridDim.x * blockDim.x;
+  const int tid      = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+  const int nthreads = static_cast<int>(gridDim.x * blockDim.x);
   for (int i = tid; i < cnt; i += nthreads)
   {
     data[i] *= factor;
@@ -28,8 +28,8 @@ __global__ void scale_kernel(int cnt, double* data, double factor)
 
 __global__ void increment_kernel(int cnt, double* data)
 {
-  const int tid      = blockIdx.x * blockDim.x + threadIdx.x;
-  const int nthreads = gridDim.x * blockDim.x;
+  const int tid      = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+  const int nthreads = static_cast<int>(gridDim.x * blockDim.x);
   for (int i = tid; i < cnt; i += nthreads)
   {
     data[i] += 1.0;

@@ -32,7 +32,7 @@ __device__ unsigned g_busy_sink;
 // ctx1 is still running when ctx2's kernel races in.
 __global__ void slow_set_kernel(int* arr, int n, int value, int iters)
 {
-  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const int tid = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
   if (tid >= n)
   {
     return;

@@ -18,8 +18,8 @@
 
 __global__ void scale_kernel(int cnt, double* data, double factor)
 {
-  const int tid      = blockIdx.x * blockDim.x + threadIdx.x;
-  const int nthreads = gridDim.x * blockDim.x;
+  const int tid      = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
+  const int nthreads = static_cast<int>(gridDim.x * blockDim.x);
   for (int i = tid; i < cnt; i += nthreads)
   {
     data[i] *= factor;

@@ -227,7 +227,7 @@ C2H_TEST("DeviceHistogram::HistogramEven API usage", "[histogram][device]")
   value_t<level_t> lower_level_val{lower_level};
   value_t<level_t> upper_level_val{upper_level};
 
-  size_t row_stride_samples = num_samples;
+  int64_t row_stride_samples = static_cast<int64_t>(num_samples);
 
   histogram_even(
     d_samples_ptr,

@@ -413,8 +413,8 @@ extern "C" __device__ void {0}(void* lhs_ptr, void* rhs_ptr, void* out_ptr) {{
 
   for (std::size_t i = 0; i < n_segments; ++i)
   {
-    auto segment_begin_it = host_input.begin() + segments[i];
-    auto segment_end_it   = host_input.begin() + segments[i + 1];
+    auto segment_begin_it = host_input.begin() + static_cast<std::ptrdiff_t>(segments[i]);
+    auto segment_end_it   = host_input.begin() + static_cast<std::ptrdiff_t>(segments[i + 1]);
     host_output[i]        = std::reduce(segment_begin_it, segment_end_it, v0, [](pair lhs, pair rhs) {
       return pair{static_cast<short>(lhs.a + rhs.a), lhs.b + rhs.b};
     });
@@ -484,8 +484,8 @@ extern "C" __device__ void {0}(void* lhs_ptr, void* rhs_ptr, void* out_ptr) {{
 
   for (std::size_t i = 0; i < n_segments; ++i)
   {
-    auto segment_begin_it = host_input.begin() + segments[i];
-    auto segment_end_it   = host_input.begin() + segments[i + 1];
+    auto segment_begin_it = host_input.begin() + static_cast<std::ptrdiff_t>(segments[i]);
+    auto segment_end_it   = host_input.begin() + static_cast<std::ptrdiff_t>(segments[i + 1]);
     host_output[i]        = std::reduce(segment_begin_it, segment_end_it, v0, [](pair lhs, pair rhs) {
       return pair{static_cast<short>(lhs.a + rhs.a), lhs.b + rhs.b};
     });

@@ -645,7 +645,7 @@ C2H_TEST("SegmentedSort works with variable segment sizes", "[segmented_sort][va
   for (std::size_t i = 0; i < n_segments; ++i)
   {
     start_offsets[i] = current_offset;
-    current_offset += segment_sizes[i];
+    current_offset += static_cast<SizeT>(segment_sizes[i]);
     end_offsets[i] = current_offset;
   }
 

@@ -46,7 +46,7 @@ inline std::string inspect_sass(const void* cubin, size_t cubin_size)
     throw std::runtime_error("Failed to create temporary file.");
   }
 
-  temp_in_file.write(static_cast<const char*>(cubin), cubin_size);
+  temp_in_file.write(static_cast<const char*>(cubin), static_cast<std::streamsize>(cubin_size));
   temp_in_file.close();
 
   std::string command = "nvdisasm -gi ";

@@ -55,7 +55,9 @@ class cpu_timer
 
   void print_elapsed_seconds(const std::string& label)
   {
-    printf("%0.6f s: %s\n", this->elapsed_us() / 1000000.f, label.c_str());
+    // Divide in double: printf promotes the argument to double anyway, and float's 24-bit
+    // significand cannot carry the six fractional digits "%0.6f" prints once the value passes ~10.
+    printf("%0.6f s: %s\n", static_cast<double>(this->elapsed_us()) / 1e6, label.c_str());
   }
 
   void print_elapsed_seconds_and_reset(const std::string& label)

@@ -100,10 +100,11 @@ generate_lhs_rhs(std::size_t num_items_lhs, std::size_t num_items_rhs, bit_entro
   // selected for lhs and *all* items after the pivot point.
   constexpr std::size_t num_pivot_points = 1;
   thrust::device_vector<offset_t> pivot_point(num_pivot_points);
-  auto counting_it = thrust::make_counting_iterator(offset_t{0});
+  auto counting_it            = thrust::make_counting_iterator(offset_t{0});
+  using counting_difference_t = typename decltype(counting_it)::difference_type;
   thrust::copy_if(
     counting_it,
-    counting_it + elements,
+    counting_it + static_cast<counting_difference_t>(elements),
     rnd_selector_val.begin(),
     cuda::make_tabulate_output_iterator(write_pivot_point_t<offset_t>{
       static_cast<offset_t>(num_items_lhs), thrust::raw_pointer_cast(pivot_point.data())}),

@@ -841,7 +841,7 @@ private:
     BufferSizeIteratorT tile_buffer_sizes,
     BlockBufferOffsetT num_wlev_buffers)
   {
-    const int32_t warp_id              = threadIdx.x / warp_threads;
+    const int32_t warp_id              = static_cast<int32_t>(threadIdx.x / warp_threads);
     constexpr uint32_t warps_per_block = BLOCK_THREADS / warp_threads;
 
     for (BlockBufferOffsetT buffer_offset = warp_id; buffer_offset < num_wlev_buffers; buffer_offset += warps_per_block)

@@ -239,7 +239,7 @@ struct AgentHistogram
     _CCCL_PRAGMA_UNROLL_FULL()
     for (int ch = 0; ch < NumActiveChannels; ++ch)
     {
-      for (int bin = threadIdx.x; bin < num_privatized_bins[ch]; bin += threads_per_block)
+      for (int bin = static_cast<int>(threadIdx.x); bin < num_privatized_bins[ch]; bin += threads_per_block)
       {
         privatized_histograms[ch][bin] = 0;
       }
@@ -262,7 +262,7 @@ struct AgentHistogram
     for (int ch = 0; ch < NumActiveChannels; ++ch)
     {
       const int channel_bins = num_privatized_bins[ch];
-      for (int bin = threadIdx.x; bin < channel_bins; bin += threads_per_block)
+      for (int bin = static_cast<int>(threadIdx.x); bin < channel_bins; bin += threads_per_block)
       {
         int output_bin       = -1;
         const CounterT count = privatized_histograms[ch][bin];
@@ -479,7 +479,7 @@ struct AgentHistogram
     ::cuda::std::true_type is_work_stealing)
   {
     int num_tiles                = num_rows * tiles_per_row;
-    int tile_idx                 = (blockIdx.y * gridDim.x) + blockIdx.x;
+    int tile_idx                 = static_cast<int>((blockIdx.y * gridDim.x) + blockIdx.x);
     OffsetT num_even_share_tiles = gridDim.x * gridDim.y;
 
     while (tile_idx < num_tiles)
@@ -530,7 +530,7 @@ struct AgentHistogram
   _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTiles(
     OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int, GridQueue<int>, ::cuda::std::false_type)
   {
-    for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
+    for (int row = static_cast<int>(blockIdx.y); row < num_rows; row += static_cast<int>(gridDim.y))
     {
       OffsetT row_begin   = row * row_stride_samples;
       OffsetT row_end     = row_begin + (num_row_pixels * NumChannels);
@@ -605,7 +605,7 @@ struct AgentHistogram
                                                : // prefer gmem privatized histograms
                       blockIdx.x & 1) // prefer blended privatized histograms
   {
-    const int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
+    const int blockId = static_cast<int>((blockIdx.y * gridDim.x) + blockIdx.x);
 
     // TODO(bgruber): d_privatized_histograms seems only used when !prefer_smem, can we skip it if prefer_smem?
     // Initialize the locations of this block's privatized histograms

@@ -153,7 +153,7 @@ struct AgentRadixSortHistogram
   {
     // Initialize bins to 0.
     _CCCL_PRAGMA_UNROLL_FULL()
-    for (int bin = threadIdx.x; bin < RADIX_DIGITS; bin += BLOCK_THREADS)
+    for (int bin = static_cast<int>(threadIdx.x); bin < RADIX_DIGITS; bin += BLOCK_THREADS)
     {
       _CCCL_PRAGMA_UNROLL_FULL()
       for (int pass = 0; pass < num_passes; ++pass)
@@ -213,7 +213,7 @@ struct AgentRadixSortHistogram
   _CCCL_DEVICE _CCCL_FORCEINLINE void AccumulateGlobalHistograms()
   {
     _CCCL_PRAGMA_UNROLL_FULL()
-    for (int bin = threadIdx.x; bin < RADIX_DIGITS; bin += BLOCK_THREADS)
+    for (int bin = static_cast<int>(threadIdx.x); bin < RADIX_DIGITS; bin += BLOCK_THREADS)
     {
       _CCCL_PRAGMA_UNROLL_FULL()
       for (int pass = 0; pass < num_passes; ++pass)

@@ -671,8 +671,8 @@ struct AgentRadixSortOnesweep
       , num_items(num_items)
       , current_bit(current_bit)
       , num_bits(num_bits)
-      , warp(threadIdx.x / WARP_THREADS)
-      , lane(::cuda::ptx::get_sreg_laneid())
+      , warp(static_cast<int>(threadIdx.x / WARP_THREADS))
+      , lane(static_cast<int>(::cuda::ptx::get_sreg_laneid()))
       , decomposer(decomposer)
   {
     // initialization

@@ -401,7 +401,7 @@ struct AgentRadixSortUpsweep
     for (int BIN_BASE = RADIX_DIGITS % BLOCK_THREADS; (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS;
          BIN_BASE += BLOCK_THREADS)
     {
-      int bin_idx       = BIN_BASE + threadIdx.x;
+      int bin_idx       = static_cast<int>(BIN_BASE + threadIdx.x);
       OffsetT bin_count = 0;
 
       _CCCL_PRAGMA_UNROLL_FULL()
@@ -421,7 +421,7 @@ struct AgentRadixSortUpsweep
     // Remainder
     if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS))
     {
-      int bin_idx       = threadIdx.x;
+      int bin_idx       = static_cast<int>(threadIdx.x);
       OffsetT bin_count = 0;
 
       _CCCL_PRAGMA_UNROLL_FULL()

@@ -429,7 +429,7 @@ struct AgentReduceByKey
 
     __syncthreads();
 
-    for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
+    for (int item = static_cast<int>(threadIdx.x); item < num_tile_segments; item += BLOCK_THREADS)
     {
       KeyValuePairT pair                                = temp_storage.raw_exchange.Alias()[item];
       d_unique_out[num_tile_segments_prefix + item]     = pair.key;
@@ -732,7 +732,7 @@ struct AgentReduceByKey
     // block
 
     // Current tile index
-    int tile_idx = start_tile + blockIdx.x;
+    int tile_idx = static_cast<int>(start_tile + blockIdx.x);
 
     // Global offset for the current tile
     OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx;

@@ -433,7 +433,7 @@ struct AgentRle
   {
     // Perform warpscans
     unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-    int lane_id          = ::cuda::ptx::get_sreg_laneid();
+    int lane_id          = static_cast<int>(::cuda::ptx::get_sreg_laneid());
 
     LengthOffsetPair identity;
     identity.key   = 0;
@@ -518,7 +518,7 @@ struct AgentRle
     ::cuda::std::true_type is_warp_time_slice)
   {
     unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-    int lane_id          = ::cuda::ptx::get_sreg_laneid();
+    int lane_id          = static_cast<int>(::cuda::ptx::get_sreg_laneid());
 
     // Locally compact items within the warp (first warp)
     if (warp_id == 0)
@@ -588,7 +588,7 @@ struct AgentRle
     ::cuda::std::false_type is_warp_time_slice)
   {
     unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-    int lane_id          = ::cuda::ptx::get_sreg_laneid();
+    int lane_id          = static_cast<int>(::cuda::ptx::get_sreg_laneid());
 
     // Unzip
     OffsetT run_offsets[ITEMS_PER_THREAD];
@@ -1012,7 +1012,7 @@ struct AgentRle
   ConsumeRange(int num_tiles, ScanTileStateT& tile_status, NumRunsIteratorT d_num_runs_out)
   {
     // Blocks are launched in increasing order, so just assign one tile per block
-    int tile_idx          = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
+    int tile_idx          = static_cast<int>((blockIdx.x * gridDim.y) + blockIdx.y); // Current tile index
     OffsetT tile_offset   = static_cast<OffsetT>(tile_idx) * static_cast<OffsetT>(TILE_ITEMS);
     OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile)
 

@@ -408,7 +408,7 @@ struct AgentScan
     // block
 
     // Current tile index
-    int tile_idx = start_tile + blockIdx.x;
+    int tile_idx = static_cast<int>(start_tile + blockIdx.x);
 
     // Global offset for the current tile
     OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx;

@@ -428,7 +428,7 @@ struct AgentScanByKey
    */
   _CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeRange(OffsetT num_items, ScanTileStateT& tile_state, int start_tile)
   {
-    int tile_idx          = blockIdx.x;
+    int tile_idx          = static_cast<int>(blockIdx.x);
     OffsetT tile_base     = OffsetT(ITEMS_PER_TILE) * tile_idx;
     OffsetT num_remaining = num_items - tile_base;
 

@@ -632,7 +632,7 @@ struct AgentSelectIf
 
     __syncthreads();
 
-    for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
+    for (int item = static_cast<int>(threadIdx.x); item < num_tile_selections; item += BLOCK_THREADS)
     {
       *((d_selected_out + streaming_context.num_previously_selected()) + (num_selections_prefix + item)) =
         temp_storage.raw_exchange.Alias()[item];
@@ -1040,11 +1040,11 @@ struct AgentSelectIf
     int tile_idx{};
     if constexpr (SELECT_METHOD != USE_DISCONTINUITY)
     {
-      tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
+      tile_idx = static_cast<int>((blockIdx.x * gridDim.y) + blockIdx.y); // Current tile index
     }
     else
     {
-      tile_idx = blockIdx.x; // Current tile index
+      tile_idx = static_cast<int>(blockIdx.x); // Current tile index
     }
     OffsetT tile_offset = static_cast<OffsetT>(tile_idx) * static_cast<OffsetT>(TILE_ITEMS);
 

@@ -553,7 +553,7 @@ struct AgentThreeWayPartition
   {
     // Blocks are launched in increasing order, so just assign one tile per block
     // Current tile index
-    const int tile_idx = blockIdx.x;
+    const int tile_idx = static_cast<int>(blockIdx.x);
 
     // Global offset for the current tile
     const OffsetT tile_offset = tile_idx * TILE_ITEMS;