Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ Checks:
- 'clang-diagnostic-*'
- 'bugprone-*'
- '-bugprone-forward-declaration-namespace'
- '-bugprone-narrowing-conversions'
- '-bugprone-misplaced-widening-cast'
- '-bugprone-signed-char-misuse'
- '-bugprone-suspicious-include'
Expand Down Expand Up @@ -209,7 +208,6 @@ Checks:
- '-cppcoreguidelines-noexcept-move-operations'
# REVIEW ME: This warns about any usage of operator[], suggesting usage of .at() instead. Could
- '-cppcoreguidelines-pro-bounds-avoid-unchecked-container-access'
# Covered by bugprone-narrowing-conversions
- '-cppcoreguidelines-narrowing-conversions'
# Covered by misc-unconventional-assign-operator
- '-cppcoreguidelines-c-copy-assignment-signature'
Expand Down
2 changes: 1 addition & 1 deletion c/experimental/stf/test/test_async_resources_handle.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ __device__ unsigned g_busy_sink;
// commits its value.
__global__ void slow_set_kernel(int* arr, int n, int value, int iters)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int tid = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
if (tid >= n)
{
return;
Expand Down
8 changes: 4 additions & 4 deletions c/experimental/stf/test/test_cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

__global__ void axpy(int cnt, double a, const double* x, double* y)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;
int tid = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
int nthreads = static_cast<int>(gridDim.x * blockDim.x);

for (int i = tid; i < cnt; i += nthreads)
{
Expand Down Expand Up @@ -48,8 +48,8 @@ C2H_TEST("axpy with stf cuda_kernel", "[cuda_kernel]")

for (size_t i = 0; i < N; i++)
{
X[i] = X0(i);
Y[i] = Y0(i);
X[i] = X0(static_cast<int>(i));
Y[i] = Y0(static_cast<int>(i));
}

const double alpha = 3.14;
Expand Down
6 changes: 3 additions & 3 deletions c/experimental/stf/test/test_host_launch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

__global__ void fill_kernel(int cnt, double* data, double value)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;
int tid = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
int nthreads = static_cast<int>(gridDim.x * blockDim.x);

for (int i = tid; i < cnt; i += nthreads)
{
Expand Down Expand Up @@ -51,7 +51,7 @@ static void verify_callback(stf_host_launch_deps_handle deps)
auto* data = static_cast<double*>(stf_host_launch_deps_get(deps, 0));
for (size_t i = 0; i < v->N; i++)
{
if (fabs(data[i] - (42.0 + i)) > 1e-10)
if (fabs(data[i] - (42.0 + static_cast<double>(i))) > 1e-10)
{
*v->passed = false;
return;
Expand Down
2 changes: 1 addition & 1 deletion c/experimental/stf/test/test_logical_data_with_place.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

__global__ void scale_inplace(int n, float* data, float factor)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
if (i < n)
{
data[i] *= factor;
Expand Down
8 changes: 4 additions & 4 deletions c/experimental/stf/test/test_stackable.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@

__global__ void scale_kernel(int cnt, double* data, double factor)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int nthreads = gridDim.x * blockDim.x;
const int tid = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
const int nthreads = static_cast<int>(gridDim.x * blockDim.x);
for (int i = tid; i < cnt; i += nthreads)
{
data[i] *= factor;
Expand All @@ -28,8 +28,8 @@ __global__ void scale_kernel(int cnt, double* data, double factor)

__global__ void increment_kernel(int cnt, double* data)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int nthreads = gridDim.x * blockDim.x;
const int tid = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
const int nthreads = static_cast<int>(gridDim.x * blockDim.x);
for (int i = tid; i < cnt; i += nthreads)
{
data[i] += 1.0;
Expand Down
2 changes: 1 addition & 1 deletion c/experimental/stf/test/test_stream_ctx_override.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ __device__ unsigned g_busy_sink;
// ctx1 is still running when ctx2's kernel races in.
__global__ void slow_set_kernel(int* arr, int n, int value, int iters)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int tid = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
if (tid >= n)
{
return;
Expand Down
4 changes: 2 additions & 2 deletions c/experimental/stf/test/test_task_get_graph.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@

__global__ void scale_kernel(int cnt, double* data, double factor)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
const int nthreads = gridDim.x * blockDim.x;
const int tid = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
const int nthreads = static_cast<int>(gridDim.x * blockDim.x);
for (int i = tid; i < cnt; i += nthreads)
{
data[i] *= factor;
Expand Down
2 changes: 1 addition & 1 deletion c/parallel/test/test_histogram.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ C2H_TEST("DeviceHistogram::HistogramEven API usage", "[histogram][device]")
value_t<level_t> lower_level_val{lower_level};
value_t<level_t> upper_level_val{upper_level};

size_t row_stride_samples = num_samples;
int64_t row_stride_samples = static_cast<int64_t>(num_samples);

histogram_even(
d_samples_ptr,
Expand Down
8 changes: 4 additions & 4 deletions c/parallel/test/test_segmented_reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -413,8 +413,8 @@ extern "C" __device__ void {0}(void* lhs_ptr, void* rhs_ptr, void* out_ptr) {{

for (std::size_t i = 0; i < n_segments; ++i)
{
auto segment_begin_it = host_input.begin() + segments[i];
auto segment_end_it = host_input.begin() + segments[i + 1];
auto segment_begin_it = host_input.begin() + static_cast<std::ptrdiff_t>(segments[i]);
auto segment_end_it = host_input.begin() + static_cast<std::ptrdiff_t>(segments[i + 1]);
host_output[i] = std::reduce(segment_begin_it, segment_end_it, v0, [](pair lhs, pair rhs) {
return pair{static_cast<short>(lhs.a + rhs.a), lhs.b + rhs.b};
});
Expand Down Expand Up @@ -484,8 +484,8 @@ extern "C" __device__ void {0}(void* lhs_ptr, void* rhs_ptr, void* out_ptr) {{

for (std::size_t i = 0; i < n_segments; ++i)
{
auto segment_begin_it = host_input.begin() + segments[i];
auto segment_end_it = host_input.begin() + segments[i + 1];
auto segment_begin_it = host_input.begin() + static_cast<std::ptrdiff_t>(segments[i]);
auto segment_end_it = host_input.begin() + static_cast<std::ptrdiff_t>(segments[i + 1]);
host_output[i] = std::reduce(segment_begin_it, segment_end_it, v0, [](pair lhs, pair rhs) {
return pair{static_cast<short>(lhs.a + rhs.a), lhs.b + rhs.b};
});
Expand Down
2 changes: 1 addition & 1 deletion c/parallel/test/test_segmented_sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -645,7 +645,7 @@ C2H_TEST("SegmentedSort works with variable segment sizes", "[segmented_sort][va
for (std::size_t i = 0; i < n_segments; ++i)
{
start_offsets[i] = current_offset;
current_offset += segment_sizes[i];
current_offset += static_cast<SizeT>(segment_sizes[i]);
end_offsets[i] = current_offset;
}

Expand Down
2 changes: 1 addition & 1 deletion c/parallel/test/test_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ inline std::string inspect_sass(const void* cubin, size_t cubin_size)
throw std::runtime_error("Failed to create temporary file.");
}

temp_in_file.write(static_cast<const char*>(cubin), cubin_size);
temp_in_file.write(static_cast<const char*>(cubin), static_cast<std::streamsize>(cubin_size));
temp_in_file.close();

std::string command = "nvdisasm -gi ";
Expand Down
2 changes: 1 addition & 1 deletion c2h/include/c2h/cpu_timer.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class cpu_timer

void print_elapsed_seconds(const std::string& label)
{
printf("%0.6f s: %s\n", this->elapsed_us() / 1000000.f, label.c_str());
printf("%0.6f s: %s\n", static_cast<float>(this->elapsed_us()) / 1000000.f, label.c_str());
Comment thread
Jacobfaib marked this conversation as resolved.
}

void print_elapsed_seconds_and_reset(const std::string& label)
Expand Down
5 changes: 3 additions & 2 deletions cub/benchmarks/bench/merge/merge_common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,11 @@ generate_lhs_rhs(std::size_t num_items_lhs, std::size_t num_items_rhs, bit_entro
// selected for lhs and *all* items after the pivot point.
constexpr std::size_t num_pivot_points = 1;
thrust::device_vector<offset_t> pivot_point(num_pivot_points);
auto counting_it = thrust::make_counting_iterator(offset_t{0});
auto counting_it = thrust::make_counting_iterator(offset_t{0});
using counting_difference_t = typename decltype(counting_it)::difference_type;
thrust::copy_if(
counting_it,
counting_it + elements,
counting_it + static_cast<counting_difference_t>(elements),
rnd_selector_val.begin(),
cuda::make_tabulate_output_iterator(write_pivot_point_t<offset_t>{
static_cast<offset_t>(num_items_lhs), thrust::raw_pointer_cast(pivot_point.data())}),
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -841,7 +841,7 @@ private:
BufferSizeIteratorT tile_buffer_sizes,
BlockBufferOffsetT num_wlev_buffers)
{
const int32_t warp_id = threadIdx.x / warp_threads;
const int32_t warp_id = static_cast<int32_t>(threadIdx.x / warp_threads);
constexpr uint32_t warps_per_block = BLOCK_THREADS / warp_threads;

for (BlockBufferOffsetT buffer_offset = warp_id; buffer_offset < num_wlev_buffers; buffer_offset += warps_per_block)
Expand Down
10 changes: 5 additions & 5 deletions cub/cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ struct AgentHistogram
_CCCL_PRAGMA_UNROLL_FULL()
for (int ch = 0; ch < NumActiveChannels; ++ch)
{
for (int bin = threadIdx.x; bin < num_privatized_bins[ch]; bin += threads_per_block)
for (int bin = static_cast<int>(threadIdx.x); bin < num_privatized_bins[ch]; bin += threads_per_block)
{
privatized_histograms[ch][bin] = 0;
}
Expand All @@ -262,7 +262,7 @@ struct AgentHistogram
for (int ch = 0; ch < NumActiveChannels; ++ch)
{
const int channel_bins = num_privatized_bins[ch];
for (int bin = threadIdx.x; bin < channel_bins; bin += threads_per_block)
for (int bin = static_cast<int>(threadIdx.x); bin < channel_bins; bin += threads_per_block)
{
int output_bin = -1;
const CounterT count = privatized_histograms[ch][bin];
Expand Down Expand Up @@ -479,7 +479,7 @@ struct AgentHistogram
::cuda::std::true_type is_work_stealing)
{
int num_tiles = num_rows * tiles_per_row;
int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x;
int tile_idx = static_cast<int>((blockIdx.y * gridDim.x) + blockIdx.x);
OffsetT num_even_share_tiles = gridDim.x * gridDim.y;

while (tile_idx < num_tiles)
Expand Down Expand Up @@ -530,7 +530,7 @@ struct AgentHistogram
_CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeTiles(
OffsetT num_row_pixels, OffsetT num_rows, OffsetT row_stride_samples, int, GridQueue<int>, ::cuda::std::false_type)
{
for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
for (int row = static_cast<int>(blockIdx.y); row < num_rows; row += static_cast<int>(gridDim.y))
{
OffsetT row_begin = row * row_stride_samples;
OffsetT row_end = row_begin + (num_row_pixels * NumChannels);
Expand Down Expand Up @@ -605,7 +605,7 @@ struct AgentHistogram
: // prefer gmem privatized histograms
blockIdx.x & 1) // prefer blended privatized histograms
{
const int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
const int blockId = static_cast<int>((blockIdx.y * gridDim.x) + blockIdx.x);

// TODO(bgruber): d_privatized_histograms seems only used when !prefer_smem, can we skip it if prefer_smem?
// Initialize the locations of this block's privatized histograms
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_radix_sort_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ struct AgentRadixSortHistogram
{
// Initialize bins to 0.
_CCCL_PRAGMA_UNROLL_FULL()
for (int bin = threadIdx.x; bin < RADIX_DIGITS; bin += BLOCK_THREADS)
for (int bin = static_cast<int>(threadIdx.x); bin < RADIX_DIGITS; bin += BLOCK_THREADS)
{
_CCCL_PRAGMA_UNROLL_FULL()
for (int pass = 0; pass < num_passes; ++pass)
Expand Down Expand Up @@ -213,7 +213,7 @@ struct AgentRadixSortHistogram
_CCCL_DEVICE _CCCL_FORCEINLINE void AccumulateGlobalHistograms()
{
_CCCL_PRAGMA_UNROLL_FULL()
for (int bin = threadIdx.x; bin < RADIX_DIGITS; bin += BLOCK_THREADS)
for (int bin = static_cast<int>(threadIdx.x); bin < RADIX_DIGITS; bin += BLOCK_THREADS)
{
_CCCL_PRAGMA_UNROLL_FULL()
for (int pass = 0; pass < num_passes; ++pass)
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_radix_sort_onesweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -671,8 +671,8 @@ struct AgentRadixSortOnesweep
, num_items(num_items)
, current_bit(current_bit)
, num_bits(num_bits)
, warp(threadIdx.x / WARP_THREADS)
, lane(::cuda::ptx::get_sreg_laneid())
, warp(static_cast<int>(threadIdx.x / WARP_THREADS))
, lane(static_cast<int>(::cuda::ptx::get_sreg_laneid()))
, decomposer(decomposer)
{
// initialization
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_radix_sort_upsweep.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ struct AgentRadixSortUpsweep
for (int BIN_BASE = RADIX_DIGITS % BLOCK_THREADS; (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS;
BIN_BASE += BLOCK_THREADS)
{
int bin_idx = BIN_BASE + threadIdx.x;
int bin_idx = static_cast<int>(BIN_BASE + threadIdx.x);
OffsetT bin_count = 0;

_CCCL_PRAGMA_UNROLL_FULL()
Expand All @@ -421,7 +421,7 @@ struct AgentRadixSortUpsweep
// Remainder
if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS))
{
int bin_idx = threadIdx.x;
int bin_idx = static_cast<int>(threadIdx.x);
OffsetT bin_count = 0;

_CCCL_PRAGMA_UNROLL_FULL()
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/agent/agent_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ struct AgentReduceByKey

__syncthreads();

for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
for (int item = static_cast<int>(threadIdx.x); item < num_tile_segments; item += BLOCK_THREADS)
{
KeyValuePairT pair = temp_storage.raw_exchange.Alias()[item];
d_unique_out[num_tile_segments_prefix + item] = pair.key;
Expand Down Expand Up @@ -732,7 +732,7 @@ struct AgentReduceByKey
// block

// Current tile index
int tile_idx = start_tile + blockIdx.x;
int tile_idx = static_cast<int>(start_tile + blockIdx.x);

// Global offset for the current tile
OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx;
Expand Down
8 changes: 4 additions & 4 deletions cub/cub/agent/agent_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,7 @@ struct AgentRle
{
// Perform warpscans
unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
int lane_id = ::cuda::ptx::get_sreg_laneid();
int lane_id = static_cast<int>(::cuda::ptx::get_sreg_laneid());

LengthOffsetPair identity;
identity.key = 0;
Expand Down Expand Up @@ -518,7 +518,7 @@ struct AgentRle
::cuda::std::true_type is_warp_time_slice)
{
unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
int lane_id = ::cuda::ptx::get_sreg_laneid();
int lane_id = static_cast<int>(::cuda::ptx::get_sreg_laneid());

// Locally compact items within the warp (first warp)
if (warp_id == 0)
Expand Down Expand Up @@ -588,7 +588,7 @@ struct AgentRle
::cuda::std::false_type is_warp_time_slice)
{
unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
int lane_id = ::cuda::ptx::get_sreg_laneid();
int lane_id = static_cast<int>(::cuda::ptx::get_sreg_laneid());

// Unzip
OffsetT run_offsets[ITEMS_PER_THREAD];
Expand Down Expand Up @@ -1012,7 +1012,7 @@ struct AgentRle
ConsumeRange(int num_tiles, ScanTileStateT& tile_status, NumRunsIteratorT d_num_runs_out)
{
// Blocks are launched in increasing order, so just assign one tile per block
int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
int tile_idx = static_cast<int>((blockIdx.x * gridDim.y) + blockIdx.y); // Current tile index
OffsetT tile_offset = static_cast<OffsetT>(tile_idx) * static_cast<OffsetT>(TILE_ITEMS);
OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile)

Expand Down
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ struct AgentScan
// block

// Current tile index
int tile_idx = start_tile + blockIdx.x;
int tile_idx = static_cast<int>(start_tile + blockIdx.x);

// Global offset for the current tile
OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx;
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ struct AgentScanByKey
*/
_CCCL_DEVICE _CCCL_FORCEINLINE void ConsumeRange(OffsetT num_items, ScanTileStateT& tile_state, int start_tile)
{
int tile_idx = blockIdx.x;
int tile_idx = static_cast<int>(blockIdx.x);
OffsetT tile_base = OffsetT(ITEMS_PER_TILE) * tile_idx;
OffsetT num_remaining = num_items - tile_base;

Expand Down
6 changes: 3 additions & 3 deletions cub/cub/agent/agent_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,7 @@ struct AgentSelectIf

__syncthreads();

for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
for (int item = static_cast<int>(threadIdx.x); item < num_tile_selections; item += BLOCK_THREADS)
{
*((d_selected_out + streaming_context.num_previously_selected()) + (num_selections_prefix + item)) =
temp_storage.raw_exchange.Alias()[item];
Expand Down Expand Up @@ -1040,11 +1040,11 @@ struct AgentSelectIf
int tile_idx{};
if constexpr (SELECT_METHOD != USE_DISCONTINUITY)
{
tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
tile_idx = static_cast<int>((blockIdx.x * gridDim.y) + blockIdx.y); // Current tile index
}
else
{
tile_idx = blockIdx.x; // Current tile index
tile_idx = static_cast<int>(blockIdx.x); // Current tile index
}
OffsetT tile_offset = static_cast<OffsetT>(tile_idx) * static_cast<OffsetT>(TILE_ITEMS);

Expand Down
2 changes: 1 addition & 1 deletion cub/cub/agent/agent_three_way_partition.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ struct AgentThreeWayPartition
{
// Blocks are launched in increasing order, so just assign one tile per block
// Current tile index
const int tile_idx = blockIdx.x;
const int tile_idx = static_cast<int>(blockIdx.x);

// Global offset for the current tile
const OffsetT tile_offset = tile_idx * TILE_ITEMS;
Expand Down
Loading
Loading