refactor: change calls to deprecated CUB functions and fix compiler warnings about return values (#15)

robik75 · web-flow · commit 5ce3c31dbb13 · 2024-09-25T17:20:37.000+02:00
# What ❔

This PR changes calls to deprecated CUB functions and fix compiler
warnings about return values.
diff --git a/src/bellman-cuda-cub.cu b/src/bellman-cuda-cub.cu
@@ -7,32 +7,28 @@ namespace common {
 using namespace cub;
 
 cudaError_t sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, int num_items, int begin_bit,
-                      int end_bit, cudaStream_t stream, bool debug_synchronous) {
-  return DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream, debug_synchronous);
+                      int end_bit, cudaStream_t stream) {
+  return DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit, stream);
 }
 
 cudaError_t sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, const unsigned *d_values_in,
-                       unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous) {
-  return DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, stream,
-                                    debug_synchronous);
+                       unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream) {
+  return DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit, end_bit, stream);
 }
 
 cudaError_t sort_pairs_descending(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out,
-                                  const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream,
-                                  bool debug_synchronous) {
+                                  const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit, int end_bit, cudaStream_t stream) {
   return DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, num_items, begin_bit,
-                                              end_bit, stream, debug_synchronous);
+                                              end_bit, stream);
 }
 
 cudaError_t run_length_encode(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_unique_out, unsigned *d_counts_out,
-                              unsigned *d_num_runs_out, int num_items, cudaStream_t stream, bool debug_synchronous) {
-  return DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream,
-                                       debug_synchronous);
+                              unsigned *d_num_runs_out, int num_items, cudaStream_t stream) {
+  return DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream);
 }
 
-cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream,
-                          bool debug_synchronous) {
-  return DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream) {
+  return DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
 }
 
 } // namespace common
@@ -51,21 +47,20 @@ struct fq_mul {
   __device__ __forceinline__ storage operator()(const storage &a, const storage &b) const { return fd_q::mul(a, b); }
 };
 
-cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream,
-                bool debug_synchronous) {
-  return DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, fq_add(), fd_q::storage(), stream, debug_synchronous);
+cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream) {
+  return DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, fq_add(), fd_q::storage(), stream);
 }
 
 cudaError_t inclusive_prefix_product(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
-                                     cudaStream_t stream, bool debug_synchronous) {
-  return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, fq_mul(), num_items, stream, debug_synchronous);
+                                     cudaStream_t stream) {
+  return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, fq_mul(), num_items, stream);
 }
 
 cudaError_t inclusive_prefix_product_reverse(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
-                                             cudaStream_t stream, bool debug_synchronous) {
+                                             cudaStream_t stream) {
   auto i_in = std::reverse_iterator(d_in + num_items);
   auto i_out = std::reverse_iterator(d_out + num_items);
-  return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, i_in, i_out, fq_mul(), num_items, stream, debug_synchronous);
+  return DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, i_in, i_out, fq_mul(), num_items, stream);
 }
 
 } // namespace ff
diff --git a/src/bellman-cuda-cub.cuh b/src/bellman-cuda-cub.cuh
@@ -4,33 +4,32 @@
 namespace common {
 
 cudaError_t sort_keys(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, int num_items, int begin_bit = 0,
-                      int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr, bool debug_synchronous = false);
+                      int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr);
 
 cudaError_t sort_pairs(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out, const unsigned *d_values_in,
-                       unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr,
-                       bool debug_synchronous = false);
+                       unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8, cudaStream_t stream = nullptr);
 
 cudaError_t sort_pairs_descending(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_keys_in, unsigned *d_keys_out,
                                   const unsigned *d_values_in, unsigned *d_values_out, int num_items, int begin_bit = 0, int end_bit = sizeof(unsigned) * 8,
-                                  cudaStream_t stream = nullptr, bool debug_synchronous = false);
+                                  cudaStream_t stream = nullptr);
 
 cudaError_t run_length_encode(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_unique_out, unsigned *d_counts_out,
-                              unsigned *d_num_runs_out, int num_items, cudaStream_t stream = nullptr, bool debug_synchronous = false);
+                              unsigned *d_num_runs_out, int num_items, cudaStream_t stream = nullptr);
 
-cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items, cudaStream_t stream = nullptr,
-                          bool debug_synchronous = false);
+cudaError_t exclusive_sum(void *d_temp_storage, size_t &temp_storage_bytes, const unsigned *d_in, unsigned *d_out, int num_items,
+                          cudaStream_t stream = nullptr);
 
 } // namespace common
 
 namespace ff {
 
-cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items, cudaStream_t stream = nullptr,
-                bool debug_synchronous = false);
+cudaError_t sum(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
+                cudaStream_t stream = nullptr);
 
 cudaError_t inclusive_prefix_product(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
-                                     cudaStream_t stream = nullptr, bool debug_synchronous = false);
+                                     cudaStream_t stream = nullptr);
 
 cudaError_t inclusive_prefix_product_reverse(void *d_temp_storage, size_t &temp_storage_bytes, const fd_q::storage *d_in, fd_q::storage *d_out, int num_items,
-                                             cudaStream_t stream = nullptr, bool debug_synchronous = false);
+                                             cudaStream_t stream = nullptr);
 
 } // namespace ff
diff --git a/src/memory.cuh b/src/memory.cuh
@@ -23,6 +23,7 @@ template <typename T, ld_modifier MODIFIER> static constexpr __device__ __forcei
   case ld_modifier::cv:
     return __ldcv(ptr);
   }
+  return *ptr;
 }
 
 enum class st_modifier { none, wb, cg, cs, wt };
diff --git a/src/ntt_kernels.cu b/src/ntt_kernels.cu
@@ -109,9 +109,8 @@ __device__ __forceinline__ typename FD::storage *index_to_addr(const per_device_
     // "addrs" passed from ntt_smem_stages_kernel should be in constant memory, which is dynamically indexable.
     // I guess nvcc moved ntt_smem_stages_kernel "inputs" and "outputs" to registers then tried to dynamically
     // index addr.data here in index_to_addr. Smart :eyeroll: Whatever, switch statement works.
-  } else {
-    return addrs.data[0] + idx;
   }
+  return addrs.data[0] + idx;
 }
 
 // Carries out up to MAX_SMEM_STAGES - log_tile_sz C-T stages in shared memory.

Original file line number	Diff line number	Diff line change
`@@ -23,6 +23,7 @@ template <typename T, ld_modifier MODIFIER> static constexpr __device__ __forcei`
`23`	`23`	`case ld_modifier::cv:`
`24`	`24`	`return __ldcv(ptr);`
`25`	`25`	`}`
	`26`	`+ return *ptr;`
`26`	`27`	`}`
`27`	`28`
`28`	`29`	`enum class st_modifier { none, wb, cg, cs, wt };`
Original file line number	Diff line number	Diff line change
`@@ -109,9 +109,8 @@ __device__ __forceinline__ typename FD::storage *index_to_addr(const per_device_`
`109`	`109`	`// "addrs" passed from ntt_smem_stages_kernel should be in constant memory, which is dynamically indexable.`
`110`	`110`	`// I guess nvcc moved ntt_smem_stages_kernel "inputs" and "outputs" to registers then tried to dynamically`
`111`	`111`	`// index addr.data here in index_to_addr. Smart :eyeroll: Whatever, switch statement works.`
`112`		`- } else {`
`113`		`- return addrs.data[0] + idx;`
`114`	`112`	`}`
	`113`	`+ return addrs.data[0] + idx;`
`115`	`114`	`}`
`116`	`115`
`117`	`116`	`// Carries out up to MAX_SMEM_STAGES - log_tile_sz C-T stages in shared memory.`