Skip to content

Commit 37d40fc

Browse files
committed
add the later drop and process the matrix and factor together
1 parent 057e577 commit 37d40fc

10 files changed

Lines changed: 596 additions & 16 deletions

File tree

common/cuda_hip/factorization/ilu_kernels.cpp

Lines changed: 131 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,19 @@
1-
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
1+
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
22
//
33
// SPDX-License-Identifier: BSD-3-Clause
44

55
#include "core/factorization/ilu_kernels.hpp"
66

77
#include <ginkgo/core/base/array.hpp>
88

9+
#include "common/cuda_hip/base/math.hpp"
910
#include "common/cuda_hip/base/runtime.hpp"
1011
#include "common/cuda_hip/base/sparselib_bindings.hpp"
11-
12+
#include "common/cuda_hip/components/cooperative_groups.hpp"
13+
#include "common/cuda_hip/components/reduction.hpp"
14+
#include "common/cuda_hip/components/syncfree.hpp"
15+
#include "common/cuda_hip/components/thread_ids.hpp"
16+
#include "core/matrix/csr_lookup.hpp"
1217

1318
namespace gko {
1419
namespace kernels {
@@ -58,6 +63,130 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
5863
GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
5964

6065

66+
constexpr static int default_block_size = 512;
67+
68+
namespace kernel {
69+
70+
71+
template <typename ValueType, typename IndexType>
72+
__global__ __launch_bounds__(default_block_size) void factorize_on_both(
73+
const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols,
74+
const IndexType* __restrict__ storage_offsets,
75+
const int32* __restrict__ storage, const int64* __restrict__ row_descs,
76+
const IndexType* __restrict__ diag_idxs, ValueType* __restrict__ vals,
77+
const IndexType* __restrict__ matrix_row_ptrs,
78+
const IndexType* __restrict__ matrix_cols,
79+
const IndexType* __restrict__ matrix_storage_offsets,
80+
const int32* __restrict__ matrix_storage,
81+
const int64* __restrict__ matrix_row_descs,
82+
ValueType* __restrict__ matrix_vals, syncfree_storage dep_storage,
83+
size_type num_rows)
84+
{
85+
using scheduler_t =
86+
syncfree_scheduler<default_block_size, config::warp_size, IndexType>;
87+
__shared__ typename scheduler_t::shared_storage sh_dep_storage;
88+
scheduler_t scheduler(dep_storage, sh_dep_storage);
89+
const auto row = scheduler.get_work_id();
90+
if (row >= num_rows) {
91+
return;
92+
}
93+
const auto warp =
94+
group::tiled_partition<config::warp_size>(group::this_thread_block());
95+
const auto lane = warp.thread_rank();
96+
const auto row_begin = row_ptrs[row];
97+
const auto row_diag = diag_idxs[row];
98+
const auto row_end = row_ptrs[row + 1];
99+
gko::matrix::csr::device_sparsity_lookup<IndexType> lookup{
100+
row_ptrs, cols, storage_offsets,
101+
storage, row_descs, static_cast<size_type>(row)};
102+
gko::matrix::csr::device_sparsity_lookup<IndexType> matrix_lookup{
103+
matrix_row_ptrs, matrix_cols, matrix_storage_offsets,
104+
matrix_storage, matrix_row_descs, static_cast<size_type>(row)};
105+
auto factor_nz = row_begin;
106+
const auto matrix_row_begin = matrix_row_ptrs[row];
107+
auto matrix_nz = matrix_row_begin;
108+
const auto matrix_row_diag = matrix_lookup.lookup_unsafe(row) + matrix_nz;
109+
// for each lower triangular entry: eliminate with corresponding row
110+
while (matrix_nz < matrix_row_diag || factor_nz < row_diag) {
111+
auto dep_matrix = matrix_nz < matrix_row_diag
112+
? matrix_cols[matrix_nz]
113+
: device_numeric_limits<IndexType>::max();
114+
auto dep_factor = factor_nz < row_diag
115+
? cols[factor_nz]
116+
: device_numeric_limits<IndexType>::max();
117+
auto dep = min(dep_matrix, dep_factor);
118+
// we can load the value before synchronizing because the following
119+
// updates only go past the diagonal of the dependency row, i.e. at
120+
// least column dep + 1
121+
const auto val =
122+
(dep == dep_factor) ? vals[factor_nz] : matrix_vals[matrix_nz];
123+
const auto diag_idx = diag_idxs[dep];
124+
const auto dep_end = row_ptrs[dep + 1];
125+
scheduler.wait(dep);
126+
const auto diag = vals[diag_idx];
127+
const auto scale = val / diag;
128+
if (lane == 0) {
129+
vals[factor_nz] = scale;
130+
}
131+
// subtract all entries past the diagonal
132+
// we only need to consider the entries in the factor not entire
133+
// one.
134+
for (auto upper_nz = diag_idx + 1 + lane; upper_nz < dep_end;
135+
upper_nz += config::warp_size) {
136+
const auto upper_col = cols[upper_nz];
137+
const auto upper_val = vals[upper_nz];
138+
139+
const auto idx = lookup[upper_col];
140+
if (idx != invalid_index<IndexType>()) {
141+
vals[row_begin + idx] -= scale * upper_val;
142+
}
143+
// but we still need to operate on the matrix because we drop
144+
// the entries after row operation need to keep the track here.
145+
const auto matrix_idx = matrix_lookup[upper_col];
146+
if (matrix_idx != invalid_index<IndexType>()) {
147+
matrix_vals[matrix_row_begin + matrix_idx] -= scale * val;
148+
}
149+
}
150+
matrix_nz += (dep == dep_matrix);
151+
factor_nz += (dep == dep_factor);
152+
}
153+
scheduler.mark_ready();
154+
}
155+
156+
} // namespace kernel
157+
158+
template <typename ValueType, typename IndexType>
159+
void factorize_on_both(std::shared_ptr<const DefaultExecutor> exec,
160+
const IndexType* lookup_offsets,
161+
const int64* lookup_descs, const int32* lookup_storage,
162+
const IndexType* diag_idxs,
163+
matrix::Csr<ValueType, IndexType>* factors,
164+
const IndexType* matrix_lookup_offsets,
165+
const int64* matrix_lookup_descs,
166+
const int32* matrix_lookup_storage,
167+
matrix::Csr<ValueType, IndexType>* matrix,
168+
array<int>& tmp_storage)
169+
{
170+
const auto num_rows = factors->get_size()[0];
171+
if (num_rows > 0) {
172+
syncfree_storage storage(exec, tmp_storage, num_rows);
173+
const auto num_blocks =
174+
ceildiv(num_rows, default_block_size / config::warp_size);
175+
kernel::factorize_on_both<<<num_blocks, default_block_size, 0,
176+
exec->get_stream()>>>(
177+
factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
178+
lookup_offsets, lookup_storage, lookup_descs, diag_idxs,
179+
as_device_type(factors->get_values()), matrix->get_const_row_ptrs(),
180+
matrix->get_const_col_idxs(), matrix_lookup_offsets,
181+
matrix_lookup_storage, matrix_lookup_descs,
182+
as_device_type(matrix->get_values()), storage, num_rows);
183+
}
184+
}
185+
186+
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
187+
GKO_DECLARE_ILU_FACTORIZE_ON_BOTH_KERNEL);
188+
189+
61190
} // namespace ilu_factorization
62191
} // namespace GKO_DEVICE_NAMESPACE
63192
} // namespace kernels

core/device_hooks/common_kernels.inc.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,6 +1015,7 @@ namespace ilu_factorization {
10151015

10161016

10171017
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
1018+
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_FACTORIZE_ON_BOTH_KERNEL);
10181019

10191020

10201021
} // namespace ilu_factorization

core/factorization/ilu.cpp

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ GKO_REGISTER_OPERATION(initialize_l_u, factorization::initialize_l_u);
3838
GKO_REGISTER_OPERATION(fill_array, components::fill_array);
3939
GKO_REGISTER_OPERATION(initialize, lu_factorization::initialize);
4040
GKO_REGISTER_OPERATION(factorize, lu_factorization::factorize);
41+
GKO_REGISTER_OPERATION(factorize_on_both, ilu_factorization::factorize_on_both);
4142

4243

4344
} // anonymous namespace
@@ -145,10 +146,25 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
145146
}
146147
// run numerical factorization
147148
array<int> tmp{exec};
148-
exec->run(ilu_factorization::make_factorize(
149-
lookup.storage_offsets.get_const_data(),
150-
lookup.row_descs.get_const_data(), lookup.storage.get_const_data(),
151-
diag_idxs.get_const_data(), factors.get(), false, tmp));
149+
if (parameters_.later_drop) {
150+
auto copy_matrix = local_system_matrix->clone();
151+
const auto matrix_lookup =
152+
matrix::csr::build_lookup(copy_matrix.get());
153+
exec->run(ilu_factorization::make_factorize_on_both(
154+
lookup.storage_offsets.get_const_data(),
155+
lookup.row_descs.get_const_data(),
156+
lookup.storage.get_const_data(), diag_idxs.get_const_data(),
157+
factors.get(), matrix_lookup.storage_offsets.get_const_data(),
158+
matrix_lookup.row_descs.get_const_data(),
159+
matrix_lookup.storage.get_const_data(), copy_matrix.get(),
160+
tmp));
161+
} else {
162+
exec->run(ilu_factorization::make_factorize(
163+
lookup.storage_offsets.get_const_data(),
164+
lookup.row_descs.get_const_data(),
165+
lookup.storage.get_const_data(), diag_idxs.get_const_data(),
166+
factors.get(), false, tmp));
167+
}
152168
ilu = factors;
153169
} else {
154170
exec->run(

core/factorization/ilu_kernels.hpp

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
1+
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
22
//
33
// SPDX-License-Identifier: BSD-3-Clause
44

@@ -23,11 +23,22 @@ namespace kernels {
2323
#define GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL(ValueType, IndexType) \
2424
void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec, \
2525
matrix::Csr<ValueType, IndexType>* system_matrix)
26-
27-
28-
#define GKO_DECLARE_ALL_AS_TEMPLATES \
29-
template <typename ValueType, typename IndexType> \
30-
GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL(ValueType, IndexType)
26+
#define GKO_DECLARE_ILU_FACTORIZE_ON_BOTH_KERNEL(ValueType, IndexType) \
27+
void factorize_on_both( \
28+
std::shared_ptr<const DefaultExecutor> exec, \
29+
const IndexType* lookup_offsets, const int64* lookup_descs, \
30+
const int32* lookup_storage, const IndexType* diag_idxs, \
31+
matrix::Csr<ValueType, IndexType>* factors, \
32+
const IndexType* matrix_lookup_offsets, \
33+
const int64* matrix_lookup_descs, const int32* matrix_lookup_storage, \
34+
matrix::Csr<ValueType, IndexType>* matrix, array<int>& tmp_storage)
35+
36+
37+
#define GKO_DECLARE_ALL_AS_TEMPLATES \
38+
template <typename ValueType, typename IndexType> \
39+
GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL(ValueType, IndexType); \
40+
template <typename ValueType, typename IndexType> \
41+
GKO_DECLARE_ILU_FACTORIZE_ON_BOTH_KERNEL(ValueType, IndexType)
3142

3243

3344
GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(ilu_factorization,

dpcpp/factorization/ilu_kernels.dp.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
1+
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
22
//
33
// SPDX-License-Identifier: BSD-3-Clause
44

@@ -24,6 +24,21 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
2424
GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
2525

2626

27+
template <typename ValueType, typename IndexType>
28+
void factorize_on_both(std::shared_ptr<const DefaultExecutor> exec,
29+
const IndexType* lookup_offsets,
30+
const int64* lookup_descs, const int32* lookup_storage,
31+
const IndexType* diag_idxs,
32+
matrix::Csr<ValueType, IndexType>* factors,
33+
const IndexType* matrix_lookup_offsets,
34+
const int64* matrix_lookup_descs,
35+
const int32* matrix_lookup_storage,
36+
matrix::Csr<ValueType, IndexType>* matrix,
37+
array<int>& tmp_storage) GKO_NOT_IMPLEMENTED;
38+
39+
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
40+
GKO_DECLARE_ILU_FACTORIZE_ON_BOTH_KERNEL);
41+
2742
} // namespace ilu_factorization
2843
} // namespace dpcpp
2944
} // namespace kernels

include/ginkgo/core/factorization/ilu.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@ class Ilu : public Composition<ValueType> {
109109
*/
110110
incomplete_algorithm GKO_FACTORY_PARAMETER_SCALAR(
111111
algorithm, incomplete_algorithm::sparselib);
112+
113+
bool GKO_FACTORY_PARAMETER_SCALAR(later_drop, false);
112114
};
113115
GKO_ENABLE_LIN_OP_FACTORY(Ilu, parameters, Factory);
114116
GKO_ENABLE_BUILD_METHOD(Factory);

omp/factorization/ilu_kernels.cpp

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1-
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
1+
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
22
//
33
// SPDX-License-Identifier: BSD-3-Clause
44

55
#include "core/factorization/ilu_kernels.hpp"
66

7+
#include "core/matrix/csr_lookup.hpp"
8+
79

810
namespace gko {
911
namespace kernels {
@@ -24,6 +26,85 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
2426
GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
2527

2628

29+
template <typename ValueType, typename IndexType>
30+
void factorize_on_both(std::shared_ptr<const DefaultExecutor> exec,
31+
const IndexType* lookup_offsets,
32+
const int64* lookup_descs, const int32* lookup_storage,
33+
const IndexType* diag_idxs,
34+
matrix::Csr<ValueType, IndexType>* factors,
35+
const IndexType* matrix_lookup_offsets,
36+
const int64* matrix_lookup_descs,
37+
const int32* matrix_lookup_storage,
38+
matrix::Csr<ValueType, IndexType>* matrix,
39+
array<int>& tmp_storage)
40+
{
41+
const auto num_rows = factors->get_size()[0];
42+
const auto row_ptrs = factors->get_const_row_ptrs();
43+
const auto cols = factors->get_const_col_idxs();
44+
const auto vals = factors->get_values();
45+
// TODO parallelize
46+
for (size_type row = 0; row < num_rows; row++) {
47+
const auto row_begin = row_ptrs[row];
48+
const auto row_diag = diag_idxs[row];
49+
matrix::csr::device_sparsity_lookup<IndexType> lookup{
50+
row_ptrs, cols, lookup_offsets, lookup_storage, lookup_descs, row};
51+
matrix::csr::device_sparsity_lookup<IndexType> matrix_lookup{
52+
matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
53+
matrix_lookup_offsets, matrix_lookup_storage,
54+
matrix_lookup_descs, row};
55+
auto factor_nz = row_begin;
56+
const auto matrix_row_begin = matrix->get_const_row_ptrs()[row];
57+
auto matrix_nz = matrix_row_begin;
58+
const auto matrix_row_diag =
59+
matrix_lookup.lookup_unsafe(row) + matrix_nz;
60+
while (matrix_nz < matrix_row_diag || factor_nz < row_diag) {
61+
auto dep_matrix = matrix_nz < matrix_row_diag
62+
? matrix->get_const_col_idxs()[matrix_nz]
63+
: std::numeric_limits<IndexType>::max();
64+
auto dep_factor = factor_nz < row_diag
65+
? cols[factor_nz]
66+
: std::numeric_limits<IndexType>::max();
67+
auto dep = min(dep_matrix, dep_factor);
68+
const auto dep_diag_idx = diag_idxs[dep];
69+
const auto dep_diag = vals[dep_diag_idx];
70+
const auto dep_end = row_ptrs[dep + 1];
71+
const auto scale =
72+
((dep == dep_factor) ? vals[factor_nz]
73+
: matrix->get_const_values()[matrix_nz]) /
74+
dep_diag;
75+
if (dep == dep_factor) {
76+
vals[factor_nz] = scale;
77+
}
78+
if (dep == dep_matrix) {
79+
matrix->get_values()[matrix_nz] = scale;
80+
}
81+
// we only need to consider the entries in the factor not entire
82+
// one.
83+
for (auto dep_nz = dep_diag_idx + 1; dep_nz < dep_end; dep_nz++) {
84+
const auto col = cols[dep_nz];
85+
const auto val = vals[dep_nz];
86+
const auto idx = lookup[col];
87+
if (idx != invalid_index<IndexType>()) {
88+
vals[row_begin + idx] -= scale * val;
89+
}
90+
// but we still need to operate on the matrix because we drop
91+
// the entries after row operation need to keep the track here.
92+
const auto matrix_idx = matrix_lookup[col];
93+
if (matrix_idx != invalid_index<IndexType>()) {
94+
matrix->get_values()[matrix_row_begin + matrix_idx] -=
95+
scale * val;
96+
}
97+
}
98+
matrix_nz += (dep == dep_matrix);
99+
factor_nz += (dep == dep_factor);
100+
}
101+
}
102+
}
103+
104+
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
105+
GKO_DECLARE_ILU_FACTORIZE_ON_BOTH_KERNEL);
106+
107+
27108
} // namespace ilu_factorization
28109
} // namespace omp
29110
} // namespace kernels

0 commit comments

Comments
 (0)