Update to Thrust 1.17.0 (#11437)

bdice · web-flow · commit 0df617810ce6 · 2022-08-11T02:25:06.000Z
Updates the bundled version of Thrust to 1.17.0. I will run benchmarks and include results in a comment below. Depends on #11457. Supersedes #10489, #10577, #10586. Closes #10841. **This should be merged concurrently with rapidsai/rapids-cmake#231 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) - Robert Maynard (https://github.com/robertmaynard) URL: #11437
diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake
@@ -80,6 +80,6 @@ function(find_and_configure_thrust VERSION)
   endif()
 endfunction()
 
-set(CUDF_MIN_VERSION_Thrust 1.15.0)
+set(CUDF_MIN_VERSION_Thrust 1.17.0)
 
 find_and_configure_thrust(${CUDF_MIN_VERSION_Thrust})
diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch
@@ -1,83 +1,116 @@
-diff --git a/thrust/system/cuda/detail/sort.h b/thrust/system/cuda/detail/sort.h
-index 1ffeef0..5e80800 100644
---- a/thrust/system/cuda/detail/sort.h
-+++ b/thrust/system/cuda/detail/sort.h
-@@ -108,7 +108,7 @@ namespace __merge_sort {
-     key_type key2 = keys_shared[keys2_beg];
- 
- 
+diff --git a/cub/block/block_merge_sort.cuh b/cub/block/block_merge_sort.cuh
+index 4769df36..d86d6342 100644
+--- a/cub/block/block_merge_sort.cuh
++++ b/cub/block/block_merge_sort.cuh
+@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared,
+   KeyT key1 = keys_shared[keys1_beg];
+   KeyT key2 = keys_shared[keys2_beg];
+
 -#pragma unroll
 +#pragma unroll 1
-     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-     {
-       bool p = (keys2_beg < keys2_end) &&
-@@ -311,10 +311,10 @@ namespace __merge_sort {
-       void stable_odd_even_sort(key_type (&keys)[ITEMS_PER_THREAD],
-                                 item_type (&items)[ITEMS_PER_THREAD])
+   for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+   {
+     bool p = (keys2_beg < keys2_end) &&
+@@ -383,7 +383,7 @@ public:
+       //
+       KeyT max_key = oob_default;
+
+-      #pragma unroll
++      #pragma unroll 1
+       for (int item = 1; item < ITEMS_PER_THREAD; ++item)
        {
--#pragma unroll
-+#pragma unroll 1
-         for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-         {
--#pragma unroll
-+#pragma unroll 1
-           for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
-           {
-             if (compare_op(keys[j + 1], keys[j]))
-@@ -350,7 +350,7 @@ namespace __merge_sort {
-         // each thread has  sorted keys_loc
-         // merge sort keys_loc in shared memory
-         //
--#pragma unroll
-+#pragma unroll 1
-         for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2)
-         {
-           sync_threadblock();
-@@ -479,7 +479,7 @@ namespace __merge_sort {
-           // and fill the remainig keys with it
-           //
-           key_type max_key = keys_loc[0];
--#pragma unroll
-+#pragma unroll 1
-           for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
-           {
-             if (ITEMS_PER_THREAD * tid + ITEM < num_remaining)
-diff a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh
-index 41eb1d2..f2893b4 100644
+         if (ITEMS_PER_THREAD * linear_tid + item < valid_items)
+@@ -407,7 +407,7 @@ public:
+     // each thread has sorted keys
+     // merge sort keys in shared memory
+     //
+-    #pragma unroll
++    #pragma unroll 1
+     for (int target_merged_threads_number = 2;
+          target_merged_threads_number <= NUM_THREADS;
+          target_merged_threads_number *= 2)
+diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh
+index b188c75f..3f36656f 100644
 --- a/cub/device/dispatch/dispatch_radix_sort.cuh
 +++ b/cub/device/dispatch/dispatch_radix_sort.cuh
-@@ -723,7 +723,7 @@ struct DeviceRadixSortPolicy
- 
- 
+@@ -736,7 +736,7 @@ struct DeviceRadixSortPolicy
+
+
      /// SM60 (GP100)
 -    struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
 +    struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
      {
          enum {
              PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 6.9B 32b keys/s (Quadro P100)
-diff a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh
-index f6aee45..dd64301 100644
+diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh
+index e0470ccb..6a0c2ed6 100644
 --- a/cub/device/dispatch/dispatch_reduce.cuh
 +++ b/cub/device/dispatch/dispatch_reduce.cuh
-@@ -284,7 +284,7 @@ struct DeviceReducePolicy
+@@ -280,7 +280,7 @@ struct DeviceReducePolicy
      };
- 
+
      /// SM60
 -    struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
 +    struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
      {
          // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
          typedef AgentReducePolicy<
-diff a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh
-index c0c6d59..937ee31 100644
+diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh
+index c2d04588..ac2d10e0 100644
 --- a/cub/device/dispatch/dispatch_scan.cuh
 +++ b/cub/device/dispatch/dispatch_scan.cuh
-@@ -178,7 +178,7 @@ struct DeviceScanPolicy
+@@ -177,7 +177,7 @@ struct DeviceScanPolicy
      };
- 
+
      /// SM600
 -    struct Policy600 : ChainedPolicy<600, Policy600, Policy520>
 +    struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
      {
          typedef AgentScanPolicy<
                  128, 15,                                        ///< Threads per block, items per thread
+diff --git a/cub/thread/thread_sort.cuh b/cub/thread/thread_sort.cuh
+index 5d486789..b42fb5f0 100644
+--- a/cub/thread/thread_sort.cuh
++++ b/cub/thread/thread_sort.cuh
+@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD],
+ {
+   constexpr bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
+
+-  #pragma unroll
++  #pragma unroll 1
+   for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+   {
+-  #pragma unroll
++  #pragma unroll 1
+     for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
+     {
+       if (compare_op(keys[j + 1], keys[j]))
+diff --git a/thrust/system/cuda/detail/dispatch.h b/thrust/system/cuda/detail/dispatch.h
+index d0e3f94..76774b0 100644
+--- a/thrust/system/cuda/detail/dispatch.h
++++ b/thrust/system/cuda/detail/dispatch.h
+@@ -32,9 +32,8 @@
+         status = call arguments; \
+     } \
+     else { \
+-        auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
+-        status = call arguments; \
+-    }
++       throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
++    }
+
+ /**
+  * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
+@@ -52,10 +51,8 @@
+         status = call arguments; \
+     } \
+     else { \
+-        auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1); \
+-        auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2); \
+-        status = call arguments; \
+-    }
++       throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
++    }
+ /**
+  * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
+  * implementation. This version allows using different token sequences for callables