Skip to content

Commit 0df6178

Browse files
authored
Update to Thrust 1.17.0 (#11437)
Updates the bundled version of Thrust to 1.17.0. I will run benchmarks and include results in a comment below. Depends on #11457. Supersedes #10489, #10577, #10586. Closes #10841. **This should be merged concurrently with rapidsai/rapids-cmake#231.** Authors: - Bradley Dice (https://github.com/bdice) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) - Robert Maynard (https://github.com/robertmaynard) URL: #11437
1 parent 9257549 commit 0df6178

File tree

2 files changed

+88
-55
lines changed

2 files changed

+88
-55
lines changed

cpp/cmake/thirdparty/get_thrust.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,6 @@ function(find_and_configure_thrust VERSION)
8080
endif()
8181
endfunction()
8282

83-
set(CUDF_MIN_VERSION_Thrust 1.15.0)
83+
set(CUDF_MIN_VERSION_Thrust 1.17.0)
8484

8585
find_and_configure_thrust(${CUDF_MIN_VERSION_Thrust})

cpp/cmake/thrust.patch

Lines changed: 87 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,83 +1,116 @@
1-
diff --git a/thrust/system/cuda/detail/sort.h b/thrust/system/cuda/detail/sort.h
2-
index 1ffeef0..5e80800 100644
3-
--- a/thrust/system/cuda/detail/sort.h
4-
+++ b/thrust/system/cuda/detail/sort.h
5-
@@ -108,7 +108,7 @@ namespace __merge_sort {
6-
key_type key2 = keys_shared[keys2_beg];
7-
8-
1+
diff --git a/cub/block/block_merge_sort.cuh b/cub/block/block_merge_sort.cuh
2+
index 4769df36..d86d6342 100644
3+
--- a/cub/block/block_merge_sort.cuh
4+
+++ b/cub/block/block_merge_sort.cuh
5+
@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared,
6+
KeyT key1 = keys_shared[keys1_beg];
7+
KeyT key2 = keys_shared[keys2_beg];
8+
99
-#pragma unroll
1010
+#pragma unroll 1
11-
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
12-
{
13-
bool p = (keys2_beg < keys2_end) &&
14-
@@ -311,10 +311,10 @@ namespace __merge_sort {
15-
void stable_odd_even_sort(key_type (&keys)[ITEMS_PER_THREAD],
16-
item_type (&items)[ITEMS_PER_THREAD])
11+
for (int item = 0; item < ITEMS_PER_THREAD; ++item)
12+
{
13+
bool p = (keys2_beg < keys2_end) &&
14+
@@ -383,7 +383,7 @@ public:
15+
//
16+
KeyT max_key = oob_default;
17+
18+
- #pragma unroll
19+
+ #pragma unroll 1
20+
for (int item = 1; item < ITEMS_PER_THREAD; ++item)
1721
{
18-
-#pragma unroll
19-
+#pragma unroll 1
20-
for (int i = 0; i < ITEMS_PER_THREAD; ++i)
21-
{
22-
-#pragma unroll
23-
+#pragma unroll 1
24-
for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
25-
{
26-
if (compare_op(keys[j + 1], keys[j]))
27-
@@ -350,7 +350,7 @@ namespace __merge_sort {
28-
// each thread has sorted keys_loc
29-
// merge sort keys_loc in shared memory
30-
//
31-
-#pragma unroll
32-
+#pragma unroll 1
33-
for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2)
34-
{
35-
sync_threadblock();
36-
@@ -479,7 +479,7 @@ namespace __merge_sort {
37-
// and fill the remainig keys with it
38-
//
39-
key_type max_key = keys_loc[0];
40-
-#pragma unroll
41-
+#pragma unroll 1
42-
for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
43-
{
44-
if (ITEMS_PER_THREAD * tid + ITEM < num_remaining)
45-
diff a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh
46-
index 41eb1d2..f2893b4 100644
22+
if (ITEMS_PER_THREAD * linear_tid + item < valid_items)
23+
@@ -407,7 +407,7 @@ public:
24+
// each thread has sorted keys
25+
// merge sort keys in shared memory
26+
//
27+
- #pragma unroll
28+
+ #pragma unroll 1
29+
for (int target_merged_threads_number = 2;
30+
target_merged_threads_number <= NUM_THREADS;
31+
target_merged_threads_number *= 2)
32+
diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh
33+
index b188c75f..3f36656f 100644
4734
--- a/cub/device/dispatch/dispatch_radix_sort.cuh
4835
+++ b/cub/device/dispatch/dispatch_radix_sort.cuh
49-
@@ -723,7 +723,7 @@ struct DeviceRadixSortPolicy
50-
51-
36+
@@ -736,7 +736,7 @@ struct DeviceRadixSortPolicy
37+
38+
5239
/// SM60 (GP100)
5340
- struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
5441
+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
5542
{
5643
enum {
5744
PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100)
58-
diff a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh
59-
index f6aee45..dd64301 100644
45+
diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh
46+
index e0470ccb..6a0c2ed6 100644
6047
--- a/cub/device/dispatch/dispatch_reduce.cuh
6148
+++ b/cub/device/dispatch/dispatch_reduce.cuh
62-
@@ -284,7 +284,7 @@ struct DeviceReducePolicy
49+
@@ -280,7 +280,7 @@ struct DeviceReducePolicy
6350
};
64-
51+
6552
/// SM60
6653
- struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
6754
+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
6855
{
6956
// ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
7057
typedef AgentReducePolicy<
71-
diff a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh
72-
index c0c6d59..937ee31 100644
58+
diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh
59+
index c2d04588..ac2d10e0 100644
7360
--- a/cub/device/dispatch/dispatch_scan.cuh
7461
+++ b/cub/device/dispatch/dispatch_scan.cuh
75-
@@ -178,7 +178,7 @@ struct DeviceScanPolicy
62+
@@ -177,7 +177,7 @@ struct DeviceScanPolicy
7663
};
77-
64+
7865
/// SM600
7966
- struct Policy600 : ChainedPolicy<600, Policy600, Policy520>
8067
+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
8168
{
8269
typedef AgentScanPolicy<
8370
128, 15, ///< Threads per block, items per thread
71+
diff --git a/cub/thread/thread_sort.cuh b/cub/thread/thread_sort.cuh
72+
index 5d486789..b42fb5f0 100644
73+
--- a/cub/thread/thread_sort.cuh
74+
+++ b/cub/thread/thread_sort.cuh
75+
@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD],
76+
{
77+
constexpr bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
78+
79+
- #pragma unroll
80+
+ #pragma unroll 1
81+
for (int i = 0; i < ITEMS_PER_THREAD; ++i)
82+
{
83+
- #pragma unroll
84+
+ #pragma unroll 1
85+
for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
86+
{
87+
if (compare_op(keys[j + 1], keys[j]))
88+
diff --git a/thrust/system/cuda/detail/dispatch.h b/thrust/system/cuda/detail/dispatch.h
89+
index d0e3f94..76774b0 100644
90+
--- a/thrust/system/cuda/detail/dispatch.h
91+
+++ b/thrust/system/cuda/detail/dispatch.h
92+
@@ -32,9 +32,8 @@
93+
status = call arguments; \
94+
} \
95+
else { \
96+
- auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
97+
- status = call arguments; \
98+
- }
99+
+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
100+
+ }
101+
102+
/**
103+
* Dispatch between 32-bit and 64-bit index based versions of the same algorithm
104+
@@ -52,10 +51,8 @@
105+
status = call arguments; \
106+
} \
107+
else { \
108+
- auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1); \
109+
- auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2); \
110+
- status = call arguments; \
111+
- }
112+
+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
113+
+ }
114+
/**
115+
* Dispatch between 32-bit and 64-bit index based versions of the same algorithm
116+
* implementation. This version allows using different token sequences for callables

0 commit comments

Comments
 (0)