|
1 | | -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors |
| 1 | +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors |
2 | 2 | // |
3 | 3 | // SPDX-License-Identifier: BSD-3-Clause |
4 | 4 |
|
5 | 5 | #include "core/factorization/ilu_kernels.hpp" |
6 | 6 |
|
7 | 7 | #include <ginkgo/core/base/array.hpp> |
8 | 8 |
|
| 9 | +#include "common/cuda_hip/base/math.hpp" |
9 | 10 | #include "common/cuda_hip/base/runtime.hpp" |
10 | 11 | #include "common/cuda_hip/base/sparselib_bindings.hpp" |
11 | | - |
| 12 | +#include "common/cuda_hip/components/cooperative_groups.hpp" |
| 13 | +#include "common/cuda_hip/components/reduction.hpp" |
| 14 | +#include "common/cuda_hip/components/syncfree.hpp" |
| 15 | +#include "common/cuda_hip/components/thread_ids.hpp" |
| 16 | +#include "core/matrix/csr_lookup.hpp" |
12 | 17 |
|
13 | 18 | namespace gko { |
14 | 19 | namespace kernels { |
@@ -58,6 +63,130 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( |
58 | 63 | GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL); |
59 | 64 |
|
60 | 65 |
|
// Number of threads per block for kernel::factorize_on_both: it is used as
// the kernel's __launch_bounds__ limit, as the block-size template argument of
// the sync-free scheduler, and as the block size of the kernel launch.
| 66 | +constexpr static int default_block_size = 512; |
| 67 | + |
namespace kernel {


/**
 * Eliminates the entries left of the diagonal of one row, updating the factor
 * values `vals` and the tracked matrix values `matrix_vals` in place.
 *
 * The row to process is handed out by the sync-free scheduler; the lanes of a
 * warp cooperate on it by striding over the dependency row's entries with
 * step `config::warp_size`.
 *
 * For the current row, the kernel walks over the merged sequence of column
 * indices left of the diagonal of the factor row and of the matrix row
 * (the merge assumes ascending column indices within each row). For every
 * such column `dep` it
 *   1. waits until row `dep` has been marked ready,
 *   2. computes `scale = val / vals[diag_idxs[dep]]`, where `val` is the
 *      current row's entry in column `dep`,
 *   3. stores `scale` in the factor if the factor row has an entry in column
 *      `dep`,
 *   4. subtracts `scale * vals[upper_nz]` for every entry `upper_nz` of factor
 *      row `dep` past its diagonal from the entry with the same column in the
 *      current factor row and in the current matrix row, wherever such an
 *      entry exists.
 * Finally the row is marked ready so that dependent rows can proceed.
 *
 * Launch configuration: at most `default_block_size` threads per block
 * (enforced by `__launch_bounds__`); uses statically allocated shared memory
 * for the scheduler.
 *
 * @param row_ptrs  row pointers of the factor
 * @param cols  column indices of the factor
 * @param storage_offsets  lookup storage offsets of the factor
 * @param storage  lookup storage of the factor
 * @param row_descs  lookup row descriptors of the factor
 * @param diag_idxs  for every row, the index of its diagonal entry in
 *                   `cols`/`vals`
 * @param vals  values of the factor, updated in place
 * @param matrix_row_ptrs  row pointers of the matrix
 * @param matrix_cols  column indices of the matrix
 * @param matrix_storage_offsets  lookup storage offsets of the matrix
 * @param matrix_storage  lookup storage of the matrix
 * @param matrix_row_descs  lookup row descriptors of the matrix
 * @param matrix_vals  values of the matrix, updated in place
 * @param dep_storage  global storage used by the sync-free scheduler
 * @param num_rows  number of rows of the factor
 */
template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void factorize_on_both(
    const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols,
    const IndexType* __restrict__ storage_offsets,
    const int32* __restrict__ storage, const int64* __restrict__ row_descs,
    const IndexType* __restrict__ diag_idxs, ValueType* __restrict__ vals,
    const IndexType* __restrict__ matrix_row_ptrs,
    const IndexType* __restrict__ matrix_cols,
    const IndexType* __restrict__ matrix_storage_offsets,
    const int32* __restrict__ matrix_storage,
    const int64* __restrict__ matrix_row_descs,
    ValueType* __restrict__ matrix_vals, syncfree_storage dep_storage,
    size_type num_rows)
{
    using scheduler_t =
        syncfree_scheduler<default_block_size, config::warp_size, IndexType>;
    __shared__ typename scheduler_t::shared_storage sh_dep_storage;
    scheduler_t scheduler(dep_storage, sh_dep_storage);
    const auto row = scheduler.get_work_id();
    if (row >= num_rows) {
        return;
    }
    const auto warp =
        group::tiled_partition<config::warp_size>(group::this_thread_block());
    const auto lane = warp.thread_rank();
    const auto row_begin = row_ptrs[row];
    const auto row_diag = diag_idxs[row];
    const auto row_end = row_ptrs[row + 1];
    gko::matrix::csr::device_sparsity_lookup<IndexType> lookup{
        row_ptrs, cols,      storage_offsets,
        storage,  row_descs, static_cast<size_type>(row)};
    gko::matrix::csr::device_sparsity_lookup<IndexType> matrix_lookup{
        matrix_row_ptrs, matrix_cols,      matrix_storage_offsets,
        matrix_storage,  matrix_row_descs, static_cast<size_type>(row)};
    auto factor_nz = row_begin;
    const auto matrix_row_begin = matrix_row_ptrs[row];
    auto matrix_nz = matrix_row_begin;
    // lookup_unsafe does not check for presence, so this relies on the matrix
    // row containing a diagonal entry
    const auto matrix_row_diag = matrix_lookup.lookup_unsafe(row) + matrix_nz;
    // for each lower triangular entry of either row: eliminate with the
    // corresponding dependency row
    while (matrix_nz < matrix_row_diag || factor_nz < row_diag) {
        // an exhausted row reports the maximum index, so min() always picks
        // the next column of the row that still has lower entries left
        const auto dep_matrix = matrix_nz < matrix_row_diag
                                    ? matrix_cols[matrix_nz]
                                    : device_numeric_limits<IndexType>::max();
        const auto dep_factor = factor_nz < row_diag
                                    ? cols[factor_nz]
                                    : device_numeric_limits<IndexType>::max();
        const auto dep = min(dep_matrix, dep_factor);
        const bool in_factor = dep == dep_factor;
        const bool in_matrix = dep == dep_matrix;
        // we can load the value before synchronizing because the following
        // updates only go past the diagonal of the dependency row, i.e. at
        // least column dep + 1
        const auto val = in_factor ? vals[factor_nz] : matrix_vals[matrix_nz];
        const auto diag_idx = diag_idxs[dep];
        const auto dep_end = row_ptrs[dep + 1];
        scheduler.wait(dep);
        const auto diag = vals[diag_idx];
        const auto scale = val / diag;
        // only store the scaling factor if the factor row actually has an
        // entry in column dep; otherwise factor_nz refers to a different
        // entry (possibly the diagonal) that must not be overwritten
        if (lane == 0 && in_factor) {
            vals[factor_nz] = scale;
        }
        // subtract all entries past the diagonal of the dependency row.
        // Only the entries stored in the factor row `dep` are considered.
        for (auto upper_nz = diag_idx + 1 + lane; upper_nz < dep_end;
             upper_nz += config::warp_size) {
            const auto upper_col = cols[upper_nz];
            const auto upper_val = vals[upper_nz];

            const auto idx = lookup[upper_col];
            if (idx != invalid_index<IndexType>()) {
                vals[row_begin + idx] -= scale * upper_val;
            }
            // the matrix row receives the same row operation, so that the
            // values of entries that are not part of the factor row are
            // still tracked
            const auto matrix_idx = matrix_lookup[upper_col];
            if (matrix_idx != invalid_index<IndexType>()) {
                matrix_vals[matrix_row_begin + matrix_idx] -=
                    scale * upper_val;
            }
        }
        // advance whichever row(s) contained column dep
        matrix_nz += in_matrix;
        factor_nz += in_factor;
    }
    scheduler.mark_ready();
}

}  // namespace kernel
| 157 | + |
/**
 * Host-side entry point: launches kernel::factorize_on_both on the stream of
 * `exec` for the given factor and matrix. Does nothing if the factor has no
 * rows.
 *
 * @param exec  executor providing the stream for the kernel launch
 * @param lookup_offsets  lookup storage offsets of the factor
 * @param lookup_descs  lookup row descriptors of the factor
 * @param lookup_storage  lookup storage of the factor
 * @param diag_idxs  indices of the diagonal entries of the factor
 * @param factors  the factor; its values are modified by the kernel
 * @param matrix_lookup_offsets  lookup storage offsets of the matrix
 * @param matrix_lookup_descs  lookup row descriptors of the matrix
 * @param matrix_lookup_storage  lookup storage of the matrix
 * @param matrix  the matrix; its values are modified by the kernel
 * @param tmp_storage  temporary array handed to the sync-free storage
 */
template <typename ValueType, typename IndexType>
void factorize_on_both(std::shared_ptr<const DefaultExecutor> exec,
                       const IndexType* lookup_offsets,
                       const int64* lookup_descs, const int32* lookup_storage,
                       const IndexType* diag_idxs,
                       matrix::Csr<ValueType, IndexType>* factors,
                       const IndexType* matrix_lookup_offsets,
                       const int64* matrix_lookup_descs,
                       const int32* matrix_lookup_storage,
                       matrix::Csr<ValueType, IndexType>* matrix,
                       array<int>& tmp_storage)
{
    const auto num_rows = factors->get_size()[0];
    // nothing to factorize: skip the launch entirely
    if (!(num_rows > 0)) {
        return;
    }
    syncfree_storage storage(exec, tmp_storage, num_rows);
    // the grid is sized with default_block_size / warp_size rows per block
    const auto rows_per_block = default_block_size / config::warp_size;
    const auto grid_size = ceildiv(num_rows, rows_per_block);
    const auto stream = exec->get_stream();
    kernel::factorize_on_both<<<grid_size, default_block_size, 0, stream>>>(
        // factor: sparsity pattern, lookup data, diagonal indices, values
        factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
        lookup_offsets, lookup_storage, lookup_descs, diag_idxs,
        as_device_type(factors->get_values()),
        // matrix: sparsity pattern, lookup data, values
        matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
        matrix_lookup_offsets, matrix_lookup_storage, matrix_lookup_descs,
        as_device_type(matrix->get_values()),
        // scheduling state and problem size
        storage, num_rows);
}
| 185 | + |
| 186 | +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( |
| 187 | + GKO_DECLARE_ILU_FACTORIZE_ON_BOTH_KERNEL); |
| 188 | + |
| 189 | + |
61 | 190 | } // namespace ilu_factorization |
62 | 191 | } // namespace GKO_DEVICE_NAMESPACE |
63 | 192 | } // namespace kernels |
|
0 commit comments