Skip to content

Commit 5ce3c31

Browse files
authored
refactor: change calls to deprecated CUB functions and fix compiler warnings about return values (#15)
# What ❔ This PR changes calls to deprecated CUB functions and fixes compiler warnings about return values.
1 parent 42a59a6 commit 5ce3c31

File tree

4 files changed

+28
-34
lines changed

4 files changed

+28
-34
lines changed

src/bellman-cuda-cub.cu

Lines changed: 16 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,32 +7,28 @@ namespace common {
77
using namespace cub;
88

99
cudaError_t sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, int num_items, int begin_bit,
10-
int end_bit, cudaStream_t stream, bool debug_synchronous) {
11-
return DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream, debug_synchronous);
10+
int end_bit, cudaStream_t stream) {
11+
return DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream);
1212
}
1313

1414
cudaError_t sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, const unsigned *d_values_in,
15-
unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) {
16-
return DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, stream,
17-
debug_synchronous);
15+
unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream) {
16+
return DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, stream);
1817
}
1918

2019
cudaError_t sort_pairs_descending(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out,
21-
const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream,
22-
bool debug_synchronous) {
20+
const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream) {
2321
return DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit,
24-
end_bit, stream, debug_synchronous);
22+
end_bit, stream);
2523
}
2624

2725
cudaError_t run_length_encode(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_unique_out, unsigned *d_counts_out,
28-
unsigned *d_num_runs_out, int num_items, cudaStream_t stream, bool debug_synchronous) {
29-
return DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream,
30-
debug_synchronous);
26+
unsigned *d_num_runs_out, int num_items, cudaStream_t stream) {
27+
return DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream);
3128
}
3229

33-
cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream,
34-
bool debug_synchronous) {
35-
return DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
30+
cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream) {
31+
return DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
3632
}
3733

3834
} // namespace common
@@ -51,21 +47,20 @@ struct fq_mul {
5147
__device__ __forceinline__ storage operator()(const storage &a, const storage &b) const { return fd_q::mul(a, b); }
5248
};
5349

54-
cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream,
55-
bool debug_synchronous) {
56-
return DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, fq_add(), fd_q::storage(), stream, debug_synchronous);
50+
cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream) {
51+
return DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, fq_add(), fd_q::storage(), stream);
5752
}
5853

5954
cudaError_t inclusive_prefix_product(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
60-
cudaStream_t stream, bool debug_synchronous) {
61-
return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, fq_mul(), num_items, stream, debug_synchronous);
55+
cudaStream_t stream) {
56+
return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, fq_mul(), num_items, stream);
6257
}
6358

6459
cudaError_t inclusive_prefix_product_reverse(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
65-
cudaStream_t stream, bool debug_synchronous) {
60+
cudaStream_t stream) {
6661
auto i_in = std::reverse_iterator(d_in + num_items);
6762
auto i_out = std::reverse_iterator(d_out + num_items);
68-
return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, i_in, i_out, fq_mul(), num_items, stream, debug_synchronous);
63+
return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, i_in, i_out, fq_mul(), num_items, stream);
6964
}
7065

7166
} // namespace ff

src/bellman-cuda-cub.cuh

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,33 +4,32 @@
44
namespace common {
55

66
cudaError_t sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, int num_items, int begin_bit = 0,
7-
int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr, bool debug_synchronous = false);
7+
int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr);
88

99
cudaError_t sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, const unsigned *d_values_in,
10-
unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr,
11-
bool debug_synchronous = false);
10+
unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr);
1211

1312
cudaError_t sort_pairs_descending(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out,
1413
const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8,
15-
cudaStream_t stream = nullptr, bool debug_synchronous = false);
14+
cudaStream_t stream = nullptr);
1615

1716
cudaError_t run_length_encode(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_unique_out, unsigned *d_counts_out,
18-
unsigned *d_num_runs_out, int num_items, cudaStream_t stream = nullptr, bool debug_synchronous = false);
17+
unsigned *d_num_runs_out, int num_items, cudaStream_t stream = nullptr);
1918

20-
cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream = nullptr,
21-
bool debug_synchronous = false);
19+
cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items,
20+
cudaStream_t stream = nullptr);
2221

2322
} // namespace common
2423

2524
namespace ff {
2625

27-
cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream = nullptr,
28-
bool debug_synchronous = false);
26+
cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
27+
cudaStream_t stream = nullptr);
2928

3029
cudaError_t inclusive_prefix_product(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
31-
cudaStream_t stream = nullptr, bool debug_synchronous = false);
30+
cudaStream_t stream = nullptr);
3231

3332
cudaError_t inclusive_prefix_product_reverse(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
34-
cudaStream_t stream = nullptr, bool debug_synchronous = false);
33+
cudaStream_t stream = nullptr);
3534

3635
} // namespace ff

src/memory.cuh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ template <typename T, ld_modifier MODIFIER> static constexpr __device__ __forcei
2323
case ld_modifier::cv:
2424
return __ldcv(ptr);
2525
}
26+
return *ptr;
2627
}
2728

2829
enum class st_modifier { none, wb, cg, cs, wt };

src/ntt_kernels.cu

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,8 @@ __device__ __forceinline__ typename FD::storage *index_to_addr(const per_device_
109109
// "addrs" passed from ntt_smem_stages_kernel should be in constant memory, which is dynamically indexable.
110110
// I guess nvcc moved ntt_smem_stages_kernel "inputs" and "outputs" to registers then tried to dynamically
111111
// index addrs.data here in index_to_addr. Smart :eyeroll: Whatever, switch statement works.
112-
} else {
113-
return addrs.data[0] + idx;
114112
}
113+
return addrs.data[0] + idx;
115114
}
116115

117116
// Carries out up to MAX_SMEM_STAGES - log_tile_sz C-T stages in shared memory.

0 commit comments

Comments
 (0)