|
1 | | -diff --git a/thrust/system/cuda/detail/sort.h b/thrust/system/cuda/detail/sort.h |
2 | | -index 1ffeef0..5e80800 100644 |
3 | | ---- a/thrust/system/cuda/detail/sort.h |
4 | | -+++ b/thrust/system/cuda/detail/sort.h |
5 | | -@@ -108,7 +108,7 @@ namespace __merge_sort { |
6 | | - key_type key2 = keys_shared[keys2_beg]; |
7 | | - |
8 | | - |
| 1 | +diff --git a/cub/block/block_merge_sort.cuh b/cub/block/block_merge_sort.cuh |
| 2 | +index 4769df36..d86d6342 100644 |
| 3 | +--- a/cub/block/block_merge_sort.cuh |
| 4 | ++++ b/cub/block/block_merge_sort.cuh |
| 5 | +@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, |
| 6 | + KeyT key1 = keys_shared[keys1_beg]; |
| 7 | + KeyT key2 = keys_shared[keys2_beg]; |
| 8 | + |
9 | 9 | -#pragma unroll |
10 | 10 | +#pragma unroll 1 |
11 | | - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) |
12 | | - { |
13 | | - bool p = (keys2_beg < keys2_end) && |
14 | | -@@ -311,10 +311,10 @@ namespace __merge_sort { |
15 | | - void stable_odd_even_sort(key_type (&keys)[ITEMS_PER_THREAD], |
16 | | - item_type (&items)[ITEMS_PER_THREAD]) |
| 11 | + for (int item = 0; item < ITEMS_PER_THREAD; ++item) |
| 12 | + { |
| 13 | + bool p = (keys2_beg < keys2_end) && |
| 14 | +@@ -383,7 +383,7 @@ public: |
| 15 | + // |
| 16 | + KeyT max_key = oob_default; |
| 17 | + |
| 18 | +- #pragma unroll |
| 19 | ++ #pragma unroll 1 |
| 20 | + for (int item = 1; item < ITEMS_PER_THREAD; ++item) |
17 | 21 | { |
18 | | --#pragma unroll |
19 | | -+#pragma unroll 1 |
20 | | - for (int i = 0; i < ITEMS_PER_THREAD; ++i) |
21 | | - { |
22 | | --#pragma unroll |
23 | | -+#pragma unroll 1 |
24 | | - for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) |
25 | | - { |
26 | | - if (compare_op(keys[j + 1], keys[j])) |
27 | | -@@ -350,7 +350,7 @@ namespace __merge_sort { |
28 | | - // each thread has sorted keys_loc |
29 | | - // merge sort keys_loc in shared memory |
30 | | - // |
31 | | --#pragma unroll |
32 | | -+#pragma unroll 1 |
33 | | - for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2) |
34 | | - { |
35 | | - sync_threadblock(); |
36 | | -@@ -479,7 +479,7 @@ namespace __merge_sort { |
37 | | - // and fill the remainig keys with it |
38 | | - // |
39 | | - key_type max_key = keys_loc[0]; |
40 | | --#pragma unroll |
41 | | -+#pragma unroll 1 |
42 | | - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) |
43 | | - { |
44 | | - if (ITEMS_PER_THREAD * tid + ITEM < num_remaining) |
45 | | -diff a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh |
46 | | -index 41eb1d2..f2893b4 100644 |
| 22 | + if (ITEMS_PER_THREAD * linear_tid + item < valid_items) |
| 23 | +@@ -407,7 +407,7 @@ public: |
| 24 | + // each thread has sorted keys |
| 25 | + // merge sort keys in shared memory |
| 26 | + // |
| 27 | +- #pragma unroll |
| 28 | ++ #pragma unroll 1 |
| 29 | + for (int target_merged_threads_number = 2; |
| 30 | + target_merged_threads_number <= NUM_THREADS; |
| 31 | + target_merged_threads_number *= 2) |
| 32 | +diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh |
| 33 | +index b188c75f..3f36656f 100644 |
47 | 34 | --- a/cub/device/dispatch/dispatch_radix_sort.cuh |
48 | 35 | +++ b/cub/device/dispatch/dispatch_radix_sort.cuh |
49 | | -@@ -723,7 +723,7 @@ struct DeviceRadixSortPolicy |
50 | | - |
51 | | - |
| 36 | +@@ -736,7 +736,7 @@ struct DeviceRadixSortPolicy |
| 37 | + |
| 38 | + |
52 | 39 | /// SM60 (GP100) |
53 | 40 | - struct Policy600 : ChainedPolicy<600, Policy600, Policy500> |
54 | 41 | + struct Policy600 : ChainedPolicy<600, Policy600, Policy600> |
55 | 42 | { |
56 | 43 | enum { |
57 | 44 | PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) |
58 | | -diff a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh |
59 | | -index f6aee45..dd64301 100644 |
| 45 | +diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh |
| 46 | +index e0470ccb..6a0c2ed6 100644 |
60 | 47 | --- a/cub/device/dispatch/dispatch_reduce.cuh |
61 | 48 | +++ b/cub/device/dispatch/dispatch_reduce.cuh |
62 | | -@@ -284,7 +284,7 @@ struct DeviceReducePolicy |
| 49 | +@@ -280,7 +280,7 @@ struct DeviceReducePolicy |
63 | 50 | }; |
64 | | - |
| 51 | + |
65 | 52 | /// SM60 |
66 | 53 | - struct Policy600 : ChainedPolicy<600, Policy600, Policy350> |
67 | 54 | + struct Policy600 : ChainedPolicy<600, Policy600, Policy600> |
68 | 55 | { |
69 | 56 | // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) |
70 | 57 | typedef AgentReducePolicy< |
71 | | -diff a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh |
72 | | -index c0c6d59..937ee31 100644 |
| 58 | +diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh |
| 59 | +index c2d04588..ac2d10e0 100644 |
73 | 60 | --- a/cub/device/dispatch/dispatch_scan.cuh |
74 | 61 | +++ b/cub/device/dispatch/dispatch_scan.cuh |
75 | | -@@ -178,7 +178,7 @@ struct DeviceScanPolicy |
| 62 | +@@ -177,7 +177,7 @@ struct DeviceScanPolicy |
76 | 63 | }; |
77 | | - |
| 64 | + |
78 | 65 | /// SM600 |
79 | 66 | - struct Policy600 : ChainedPolicy<600, Policy600, Policy520> |
80 | 67 | + struct Policy600 : ChainedPolicy<600, Policy600, Policy600> |
81 | 68 | { |
82 | 69 | typedef AgentScanPolicy< |
83 | 70 | 128, 15, ///< Threads per block, items per thread |
| 71 | +diff --git a/cub/thread/thread_sort.cuh b/cub/thread/thread_sort.cuh |
| 72 | +index 5d486789..b42fb5f0 100644 |
| 73 | +--- a/cub/thread/thread_sort.cuh |
| 74 | ++++ b/cub/thread/thread_sort.cuh |
| 75 | +@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], |
| 76 | + { |
| 77 | + constexpr bool KEYS_ONLY = std::is_same<ValueT, NullType>::value; |
| 78 | + |
| 79 | +- #pragma unroll |
| 80 | ++ #pragma unroll 1 |
| 81 | + for (int i = 0; i < ITEMS_PER_THREAD; ++i) |
| 82 | + { |
| 83 | +- #pragma unroll |
| 84 | ++ #pragma unroll 1 |
| 85 | + for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) |
| 86 | + { |
| 87 | + if (compare_op(keys[j + 1], keys[j])) |
| 88 | +diff --git a/thrust/system/cuda/detail/dispatch.h b/thrust/system/cuda/detail/dispatch.h |
| 89 | +index d0e3f94..76774b0 100644 |
| 90 | +--- a/thrust/system/cuda/detail/dispatch.h |
| 91 | ++++ b/thrust/system/cuda/detail/dispatch.h |
| 92 | +@@ -32,9 +32,8 @@ |
| 93 | + status = call arguments; \ |
| 94 | + } \ |
| 95 | + else { \ |
| 96 | +- auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \ |
| 97 | +- status = call arguments; \ |
| 98 | +- } |
| 99 | ++ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ |
| 100 | ++ } |
| 101 | + |
| 102 | + /** |
| 103 | + * Dispatch between 32-bit and 64-bit index based versions of the same algorithm |
| 104 | +@@ -52,10 +51,8 @@ |
| 105 | + status = call arguments; \ |
| 106 | + } \ |
| 107 | + else { \ |
| 108 | +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1); \ |
| 109 | +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2); \ |
| 110 | +- status = call arguments; \ |
| 111 | +- } |
| 112 | ++ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ |
| 113 | ++ } |
| 114 | + /** |
| 115 | + * Dispatch between 32-bit and 64-bit index based versions of the same algorithm |
| 116 | + * implementation. This version allows using different token sequences for callables |
0 commit comments