|
| 1 | +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors |
| 2 | +// |
| 3 | +// SPDX-License-Identifier: BSD-3-Clause |
| 4 | + |
| 5 | +#ifndef GKO_OMP_COMPONENTS_PREFIX_SUM_HPP_ |
| 6 | +#define GKO_OMP_COMPONENTS_PREFIX_SUM_HPP_ |
| 7 | + |
| 8 | +#include <algorithm> |
| 9 | +#include <iterator> |
| 10 | +#include <limits> |
| 11 | +#include <string> |
| 12 | + |
| 13 | +#include <omp.h> |
| 14 | + |
| 15 | +#include "core/base/allocator.hpp" |
| 16 | +#include "core/base/iterator_factory.hpp" |
| 17 | + |
| 18 | + |
| 19 | +namespace gko { |
| 20 | +namespace kernels { |
| 21 | +namespace omp { |
| 22 | +namespace components { |
| 23 | + |
| 24 | + |
| 25 | +/* |
| 26 | + * Similar to prefix_sum, only reduces within runs of the same key value (each |
| 27 | + * key run must only occur once, otherwise the scan operation is not necessarily |
| 28 | + * associaive). It also doesn't ignore the last value! |
| 29 | + * Similar to thrust::exclusive_scan_by_key |
| 30 | + */ |
| 31 | +template <typename KeyIterator, typename Iterator, |
| 32 | + typename ScanOp = |
| 33 | + std::plus<typename std::iterator_traits<Iterator>::value_type>> |
| 34 | +void segmented_prefix_sum( |
| 35 | + std::shared_ptr<const OmpExecutor> exec, KeyIterator key, Iterator it, |
| 36 | + const size_type num_entries, |
| 37 | + typename std::iterator_traits<KeyIterator>::value_type key_init = {}, |
| 38 | + typename std::iterator_traits<Iterator>::value_type init = {}, |
| 39 | + ScanOp op = {}) |
| 40 | +{ |
| 41 | + using key_type = typename std::iterator_traits<KeyIterator>::value_type; |
| 42 | + using value_type = typename std::iterator_traits<Iterator>::value_type; |
| 43 | + // the operation only makes sense for arrays of size at least 2 |
| 44 | + if (num_entries < 2) { |
| 45 | + if (num_entries == 0) { |
| 46 | + return; |
| 47 | + } else { |
| 48 | + *it = init; |
| 49 | + return; |
| 50 | + } |
| 51 | + } |
| 52 | + |
| 53 | + const int nthreads = omp_get_max_threads(); |
| 54 | + vector<value_type> proc_sums(nthreads, init, {exec}); |
| 55 | + vector<key_type> proc_first_key(nthreads, key_init, {exec}); |
| 56 | + vector<key_type> proc_last_key(nthreads, key_init, {exec}); |
| 57 | + const size_type def_num_witems = (num_entries - 1) / nthreads + 1; |
| 58 | + |
| 59 | +#pragma omp parallel |
| 60 | + { |
| 61 | + const int thread_id = omp_get_thread_num(); |
| 62 | + const size_type startidx = thread_id * def_num_witems; |
| 63 | + const size_type endidx = |
| 64 | + std::min(num_entries, (thread_id + 1) * def_num_witems); |
| 65 | + |
| 66 | + auto partial_sum = init; |
| 67 | + auto cur_key = startidx < num_entries ? key[startidx] : key_init; |
| 68 | + proc_first_key[thread_id] = cur_key; |
| 69 | + for (size_type i = startidx; i < endidx; ++i) { |
| 70 | + auto value = it[i]; |
| 71 | + auto new_key = key[i]; |
| 72 | + if (cur_key != new_key) { |
| 73 | + partial_sum = init; |
| 74 | + cur_key = new_key; |
| 75 | + } |
| 76 | + it[i] = partial_sum; |
| 77 | + partial_sum = op(partial_sum, value); |
| 78 | + } |
| 79 | + |
| 80 | + proc_sums[thread_id] = partial_sum; |
| 81 | + proc_last_key[thread_id] = cur_key; |
| 82 | + |
| 83 | +#pragma omp barrier |
| 84 | + |
| 85 | +#pragma omp single |
| 86 | + { |
| 87 | + for (int i = 0; i < nthreads - 1; i++) { |
| 88 | + // the next block carries over the previous partial sum |
| 89 | + // if it starts and ends with the same key as the next one |
| 90 | + if (proc_last_key[i] == proc_first_key[i + 1] && |
| 91 | + proc_first_key[i + 1] == proc_last_key[i + 1]) { |
| 92 | + proc_sums[i + 1] = op(proc_sums[i], proc_sums[i + 1]); |
| 93 | + } |
| 94 | + } |
| 95 | + } |
| 96 | + |
| 97 | + if (thread_id > 0) { |
| 98 | + for (size_type i = startidx; i < endidx; i++) { |
| 99 | + if (key[i] == proc_last_key[thread_id - 1]) { |
| 100 | + it[i] = op(it[i], proc_sums[thread_id - 1]); |
| 101 | + } |
| 102 | + } |
| 103 | + } |
| 104 | + } |
| 105 | +} |
| 106 | + |
| 107 | + |
| 108 | +} // namespace components |
| 109 | +} // namespace omp |
| 110 | +} // namespace kernels |
| 111 | +} // namespace gko |
| 112 | + |
| 113 | +#endif // GKO_OMP_COMPONENTS_PREFIX_SUM_HPP_ |
0 commit comments