add the later drop and process the matrix and factor together

yhmtsai · yhmtsai · commit 37d40fc927cc · 2025-04-16T18:57:09.000+02:00
diff --git a/common/cuda_hip/factorization/ilu_kernels.cpp b/common/cuda_hip/factorization/ilu_kernels.cpp
@@ -1,14 +1,19 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "core/factorization/ilu_kernels.hpp"
 
 #include <ginkgo/core/base/array.hpp>
 
+#include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
 #include "common/cuda_hip/base/sparselib_bindings.hpp"
-
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/syncfree.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/matrix/csr_lookup.hpp"
 
 namespace gko {
 namespace kernels {
@@ -58,6 +63,130 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
+// Threads per block for the factorize_on_both kernel: used both as its
+// __launch_bounds__ limit and as the block size of its launch.
+constexpr static int default_block_size = 512;
+
+namespace kernel {
+
+
+/**
+ * Numerical factorization of one row per warp that applies the row
+ * eliminations to the factor and to the matrix at the same time.
+ *
+ * The warp merges the entries left of the diagonal of its factor row and of
+ * its matrix row by column index (this relies on the column indices of a row
+ * being sorted). For every column `dep` of this union it
+ * 1. waits on row `dep` through the sync-free scheduler,
+ * 2. computes `scale = value(row, dep) / factor(dep, dep)` and stores it in
+ *    each of the two rows that actually contains the entry `(row, dep)`,
+ * 3. subtracts `scale * factor(dep, col)`, for all entries right of the
+ *    diagonal of factor row `dep`, from the entries `(row, col)` that are
+ *    present in the factor row and in the matrix row.
+ * If `(row, dep)` exists in both rows, the factor value is used for the
+ * scale.
+ *
+ * @param row_ptrs  the row pointers of the factor
+ * @param cols  the column indices of the factor
+ * @param storage_offsets  the sparsity lookup storage offsets of the factor
+ * @param storage  the sparsity lookup storage of the factor
+ * @param row_descs  the sparsity lookup row descriptors of the factor
+ * @param diag_idxs  for every row, the index of its diagonal entry inside
+ *                   the factor value array
+ * @param vals  the values of the factor, updated in place
+ * @param matrix_row_ptrs  the row pointers of the matrix
+ * @param matrix_cols  the column indices of the matrix
+ * @param matrix_storage_offsets  the sparsity lookup storage offsets of the
+ *                                matrix
+ * @param matrix_storage  the sparsity lookup storage of the matrix
+ * @param matrix_row_descs  the sparsity lookup row descriptors of the matrix
+ * @param matrix_vals  the values of the matrix, updated in place
+ * @param dep_storage  the storage backing the sync-free scheduler
+ * @param num_rows  the number of rows to process
+ */
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void factorize_on_both(
+    const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols,
+    const IndexType* __restrict__ storage_offsets,
+    const int32* __restrict__ storage, const int64* __restrict__ row_descs,
+    const IndexType* __restrict__ diag_idxs, ValueType* __restrict__ vals,
+    const IndexType* __restrict__ matrix_row_ptrs,
+    const IndexType* __restrict__ matrix_cols,
+    const IndexType* __restrict__ matrix_storage_offsets,
+    const int32* __restrict__ matrix_storage,
+    const int64* __restrict__ matrix_row_descs,
+    ValueType* __restrict__ matrix_vals, syncfree_storage dep_storage,
+    size_type num_rows)
+{
+    using scheduler_t =
+        syncfree_scheduler<default_block_size, config::warp_size, IndexType>;
+    __shared__ typename scheduler_t::shared_storage sh_dep_storage;
+    scheduler_t scheduler(dep_storage, sh_dep_storage);
+    const auto row = scheduler.get_work_id();
+    if (row >= num_rows) {
+        return;
+    }
+    const auto warp =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    const auto lane = warp.thread_rank();
+    const auto row_begin = row_ptrs[row];
+    const auto row_diag = diag_idxs[row];
+    gko::matrix::csr::device_sparsity_lookup<IndexType> lookup{
+        row_ptrs, cols,      storage_offsets,
+        storage,  row_descs, static_cast<size_type>(row)};
+    gko::matrix::csr::device_sparsity_lookup<IndexType> matrix_lookup{
+        matrix_row_ptrs, matrix_cols,      matrix_storage_offsets,
+        matrix_storage,  matrix_row_descs, static_cast<size_type>(row)};
+    auto factor_nz = row_begin;
+    const auto matrix_row_begin = matrix_row_ptrs[row];
+    auto matrix_nz = matrix_row_begin;
+    // lookup_unsafe returns the position relative to the row start
+    const auto matrix_row_diag = matrix_lookup.lookup_unsafe(row) + matrix_nz;
+    // for each lower triangular entry of the factor row or the matrix row:
+    // eliminate with the corresponding row
+    while (matrix_nz < matrix_row_diag || factor_nz < row_diag) {
+        // an exhausted row yields the maximal index, so it never wins the min
+        const auto dep_matrix = matrix_nz < matrix_row_diag
+                                    ? matrix_cols[matrix_nz]
+                                    : device_numeric_limits<IndexType>::max();
+        const auto dep_factor = factor_nz < row_diag
+                                    ? cols[factor_nz]
+                                    : device_numeric_limits<IndexType>::max();
+        const auto dep = min(dep_matrix, dep_factor);
+        const bool in_factor = dep == dep_factor;
+        const bool in_matrix = dep == dep_matrix;
+        // we can load the value before synchronizing because the following
+        // updates only go past the diagonal of the dependency row, i.e. at
+        // least column dep + 1
+        const auto val = in_factor ? vals[factor_nz] : matrix_vals[matrix_nz];
+        const auto diag_idx = diag_idxs[dep];
+        const auto dep_end = row_ptrs[dep + 1];
+        scheduler.wait(dep);
+        const auto diag = vals[diag_idx];
+        const auto scale = val / diag;
+        if (lane == 0) {
+            // store the scale only where the entry (row, dep) exists: if dep
+            // stems from the matrix row alone, factor_nz refers to a later
+            // factor entry (or the diagonal) that must not be overwritten
+            if (in_factor) {
+                vals[factor_nz] = scale;
+            }
+            if (in_matrix) {
+                matrix_vals[matrix_nz] = scale;
+            }
+        }
+        // subtract all entries past the diagonal of the dependency row;
+        // only the entries of the factor row dep take part in the update,
+        // not the entries of the matrix row dep
+        for (auto upper_nz = diag_idx + 1 + lane; upper_nz < dep_end;
+             upper_nz += config::warp_size) {
+            const auto upper_col = cols[upper_nz];
+            const auto upper_val = vals[upper_nz];
+
+            const auto idx = lookup[upper_col];
+            if (idx != invalid_index<IndexType>()) {
+                vals[row_begin + idx] -= scale * upper_val;
+            }
+            // the same update is applied to the matrix row, because entries
+            // outside the factor sparsity pattern are dropped only after the
+            // row operations and therefore have to be tracked here
+            const auto matrix_idx = matrix_lookup[upper_col];
+            if (matrix_idx != invalid_index<IndexType>()) {
+                matrix_vals[matrix_row_begin + matrix_idx] -=
+                    scale * upper_val;
+            }
+        }
+        matrix_nz += in_matrix;
+        factor_nz += in_factor;
+    }
+    scheduler.mark_ready();
+}
+
+}  // namespace kernel
+
+/**
+ * Launches kernel::factorize_on_both on the stream of the executor.
+ *
+ * The grid is sized such that every row of the factor gets one warp. Nothing
+ * is allocated or launched if the factor has no rows.
+ *
+ * @param exec  the executor providing the stream
+ * @param lookup_offsets  the sparsity lookup storage offsets of the factor
+ * @param lookup_descs  the sparsity lookup row descriptors of the factor
+ * @param lookup_storage  the sparsity lookup storage of the factor
+ * @param diag_idxs  the diagonal indices passed on to the kernel
+ * @param factors  the factor, its values are modified by the kernel
+ * @param matrix_lookup_offsets  the sparsity lookup storage offsets of the
+ *                               matrix
+ * @param matrix_lookup_descs  the sparsity lookup row descriptors of the
+ *                             matrix
+ * @param matrix_lookup_storage  the sparsity lookup storage of the matrix
+ * @param matrix  the matrix, its values are modified by the kernel
+ * @param tmp_storage  the array handed to the sync-free storage
+ */
+template <typename ValueType, typename IndexType>
+void factorize_on_both(std::shared_ptr<const DefaultExecutor> exec,
+                       const IndexType* lookup_offsets,
+                       const int64* lookup_descs, const int32* lookup_storage,
+                       const IndexType* diag_idxs,
+                       matrix::Csr<ValueType, IndexType>* factors,
+                       const IndexType* matrix_lookup_offsets,
+                       const int64* matrix_lookup_descs,
+                       const int32* matrix_lookup_storage,
+                       matrix::Csr<ValueType, IndexType>* matrix,
+                       array<int>& tmp_storage)
+{
+    const auto num_rows = factors->get_size()[0];
+    if (num_rows == 0) {
+        return;
+    }
+    // one warp per row
+    constexpr auto rows_per_block = default_block_size / config::warp_size;
+    const auto num_blocks = ceildiv(num_rows, rows_per_block);
+    syncfree_storage storage(exec, tmp_storage, num_rows);
+    const auto factor_row_ptrs = factors->get_const_row_ptrs();
+    const auto factor_cols = factors->get_const_col_idxs();
+    const auto factor_vals = as_device_type(factors->get_values());
+    const auto mtx_row_ptrs = matrix->get_const_row_ptrs();
+    const auto mtx_cols = matrix->get_const_col_idxs();
+    const auto mtx_vals = as_device_type(matrix->get_values());
+    kernel::factorize_on_both<<<num_blocks, default_block_size, 0,
+                                exec->get_stream()>>>(
+        factor_row_ptrs, factor_cols, lookup_offsets, lookup_storage,
+        lookup_descs, diag_idxs, factor_vals, mtx_row_ptrs, mtx_cols,
+        matrix_lookup_offsets, matrix_lookup_storage, matrix_lookup_descs,
+        mtx_vals, storage, num_rows);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ILU_FACTORIZE_ON_BOTH_KERNEL);
+
+
 }  // namespace ilu_factorization
 }  // namespace GKO_DEVICE_NAMESPACE
 }  // namespace kernels
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
@@ -1015,6 +1015,7 @@ namespace ilu_factorization {
 
 
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_FACTORIZE_ON_BOTH_KERNEL);
 
 
 }  // namespace ilu_factorization
diff --git a/core/factorization/ilu.cpp b/core/factorization/ilu.cpp
@@ -38,6 +38,7 @@ GKO_REGISTER_OPERATION(initialize_l_u, factorization::initialize_l_u);
 GKO_REGISTER_OPERATION(fill_array, components::fill_array);
 GKO_REGISTER_OPERATION(initialize, lu_factorization::initialize);
 GKO_REGISTER_OPERATION(factorize, lu_factorization::factorize);
+GKO_REGISTER_OPERATION(factorize_on_both, ilu_factorization::factorize_on_both);
 
 
 }  // anonymous namespace
@@ -145,10 +146,25 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
         }
         // run numerical factorization
         array<int> tmp{exec};
-        exec->run(ilu_factorization::make_factorize(
-            lookup.storage_offsets.get_const_data(),
-            lookup.row_descs.get_const_data(), lookup.storage.get_const_data(),
-            diag_idxs.get_const_data(), factors.get(), false, tmp));
+        if (parameters_.later_drop) {
+            auto copy_matrix = local_system_matrix->clone();
+            const auto matrix_lookup =
+                matrix::csr::build_lookup(copy_matrix.get());
+            exec->run(ilu_factorization::make_factorize_on_both(
+                lookup.storage_offsets.get_const_data(),
+                lookup.row_descs.get_const_data(),
+                lookup.storage.get_const_data(), diag_idxs.get_const_data(),
+                factors.get(), matrix_lookup.storage_offsets.get_const_data(),
+                matrix_lookup.row_descs.get_const_data(),
+                matrix_lookup.storage.get_const_data(), copy_matrix.get(),
+                tmp));
+        } else {
+            exec->run(ilu_factorization::make_factorize(
+                lookup.storage_offsets.get_const_data(),
+                lookup.row_descs.get_const_data(),
+                lookup.storage.get_const_data(), diag_idxs.get_const_data(),
+                factors.get(), false, tmp));
+        }
         ilu = factors;
     } else {
         exec->run(
diff --git a/core/factorization/ilu_kernels.hpp b/core/factorization/ilu_kernels.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -23,11 +23,22 @@ namespace kernels {
 #define GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL(ValueType, IndexType)  \
     void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec, \
                        matrix::Csr<ValueType, IndexType>* system_matrix)
-
-
-#define GKO_DECLARE_ALL_AS_TEMPLATES                  \
-    template <typename ValueType, typename IndexType> \
-    GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL(ValueType, IndexType)
+// Declares the factorize_on_both kernel. It receives the sparsity lookup
+// data (offsets, descriptors, storage) and the diagonal indices belonging to
+// `factors`, the sparsity lookup data belonging to `matrix`, both CSR
+// matrices as mutable pointers, and a temporary storage array.
+#define GKO_DECLARE_ILU_FACTORIZE_ON_BOTH_KERNEL(ValueType, IndexType)        \
+    void factorize_on_both(                                                   \
+        std::shared_ptr<const DefaultExecutor> exec,                          \
+        const IndexType* lookup_offsets, const int64* lookup_descs,           \
+        const int32* lookup_storage, const IndexType* diag_idxs,              \
+        matrix::Csr<ValueType, IndexType>* factors,                           \
+        const IndexType* matrix_lookup_offsets,                               \
+        const int64* matrix_lookup_descs, const int32* matrix_lookup_storage, \
+        matrix::Csr<ValueType, IndexType>* matrix, array<int>& tmp_storage)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                            \
+    template <typename ValueType, typename IndexType>           \
+    GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL(ValueType, IndexType); \
+    template <typename ValueType, typename IndexType>           \
+    GKO_DECLARE_ILU_FACTORIZE_ON_BOTH_KERNEL(ValueType, IndexType)
 
 
 GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(ilu_factorization,
diff --git a/dpcpp/factorization/ilu_kernels.dp.cpp b/dpcpp/factorization/ilu_kernels.dp.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -24,6 +24,21 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
+// The combined factor/matrix factorization has no DPC++ implementation:
+// the definition below is only the GKO_NOT_IMPLEMENTED stub.
+template <typename ValueType, typename IndexType>
+void factorize_on_both(std::shared_ptr<const DefaultExecutor> exec,
+                       const IndexType* lookup_offsets,
+                       const int64* lookup_descs, const int32* lookup_storage,
+                       const IndexType* diag_idxs,
+                       matrix::Csr<ValueType, IndexType>* factors,
+                       const IndexType* matrix_lookup_offsets,
+                       const int64* matrix_lookup_descs,
+                       const int32* matrix_lookup_storage,
+                       matrix::Csr<ValueType, IndexType>* matrix,
+                       array<int>& tmp_storage) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ILU_FACTORIZE_ON_BOTH_KERNEL);
+
 }  // namespace ilu_factorization
 }  // namespace dpcpp
 }  // namespace kernels
diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp
@@ -109,6 +109,8 @@ class Ilu : public Composition<ValueType> {
          */
         incomplete_algorithm GKO_FACTORY_PARAMETER_SCALAR(
             algorithm, incomplete_algorithm::sparselib);
+
+        /**
+         * Selects the kernel used for the numerical factorization in the
+         * code path that calls the factorize kernels directly: if true, a
+         * copy of the system matrix is processed together with the factor
+         * (factorize_on_both); if false, only the factor is processed
+         * (factorize). The default is false.
+         *
+         * NOTE(review): presumably this has no effect for
+         * incomplete_algorithm::sparselib -- confirm against
+         * Ilu::generate_l_u.
+         */
+        bool GKO_FACTORY_PARAMETER_SCALAR(later_drop, false);
     };
     GKO_ENABLE_LIN_OP_FACTORY(Ilu, parameters, Factory);
     GKO_ENABLE_BUILD_METHOD(Factory);
diff --git a/omp/factorization/ilu_kernels.cpp b/omp/factorization/ilu_kernels.cpp
@@ -1,9 +1,11 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
 #include "core/factorization/ilu_kernels.hpp"
 
+#include "core/matrix/csr_lookup.hpp"
+
 
 namespace gko {
 namespace kernels {
@@ -24,6 +26,85 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);
 
 
+/**
+ * Sequential numerical factorization that applies the row eliminations to
+ * the factor and to the matrix at the same time.
+ *
+ * For every row, the entries left of the diagonal of the factor row and of
+ * the matrix row are merged by column index. For each merged column `dep`,
+ * the scale `value(row, dep) / factor(dep, dep)` is stored in every row that
+ * contains the entry, and `scale * factor(dep, col)` is subtracted from the
+ * entries `(row, col)` of both rows for all `col` right of the diagonal of
+ * factor row `dep`.
+ *
+ * @param exec  the executor (not used by this implementation)
+ * @param lookup_offsets  the sparsity lookup storage offsets of the factor
+ * @param lookup_descs  the sparsity lookup row descriptors of the factor
+ * @param lookup_storage  the sparsity lookup storage of the factor
+ * @param diag_idxs  for every row, the index of its diagonal entry inside
+ *                   the factor value array
+ * @param factors  the factor, its values are updated in place
+ * @param matrix_lookup_offsets  the sparsity lookup storage offsets of the
+ *                               matrix
+ * @param matrix_lookup_descs  the sparsity lookup row descriptors of the
+ *                             matrix
+ * @param matrix_lookup_storage  the sparsity lookup storage of the matrix
+ * @param matrix  the matrix, its values are updated in place
+ * @param tmp_storage  temporary storage (not used by this implementation)
+ */
+template <typename ValueType, typename IndexType>
+void factorize_on_both(std::shared_ptr<const DefaultExecutor> exec,
+                       const IndexType* lookup_offsets,
+                       const int64* lookup_descs, const int32* lookup_storage,
+                       const IndexType* diag_idxs,
+                       matrix::Csr<ValueType, IndexType>* factors,
+                       const IndexType* matrix_lookup_offsets,
+                       const int64* matrix_lookup_descs,
+                       const int32* matrix_lookup_storage,
+                       matrix::Csr<ValueType, IndexType>* matrix,
+                       array<int>& tmp_storage)
+{
+    const auto num_rows = factors->get_size()[0];
+    const auto factor_row_ptrs = factors->get_const_row_ptrs();
+    const auto factor_cols = factors->get_const_col_idxs();
+    const auto factor_vals = factors->get_values();
+    const auto mtx_row_ptrs = matrix->get_const_row_ptrs();
+    const auto mtx_cols = matrix->get_const_col_idxs();
+    const auto mtx_vals = matrix->get_values();
+    // TODO parallelize
+    for (size_type row = 0; row < num_rows; row++) {
+        const auto factor_begin = factor_row_ptrs[row];
+        const auto factor_diag = diag_idxs[row];
+        matrix::csr::device_sparsity_lookup<IndexType> lookup{
+            factor_row_ptrs, factor_cols,  lookup_offsets,
+            lookup_storage,  lookup_descs, row};
+        matrix::csr::device_sparsity_lookup<IndexType> matrix_lookup{
+            mtx_row_ptrs,          mtx_cols,
+            matrix_lookup_offsets, matrix_lookup_storage,
+            matrix_lookup_descs,   row};
+        const auto mtx_begin = mtx_row_ptrs[row];
+        // lookup_unsafe yields the position relative to the row start
+        const auto mtx_diag = mtx_begin + matrix_lookup.lookup_unsafe(row);
+        auto factor_nz = factor_begin;
+        auto mtx_nz = mtx_begin;
+        // merge the lower triangular parts of both rows by column index
+        while (factor_nz < factor_diag || mtx_nz < mtx_diag) {
+            // an exhausted row contributes the maximal index
+            const auto dep_matrix = mtx_nz < mtx_diag
+                                        ? mtx_cols[mtx_nz]
+                                        : std::numeric_limits<IndexType>::max();
+            const auto dep_factor = factor_nz < factor_diag
+                                        ? factor_cols[factor_nz]
+                                        : std::numeric_limits<IndexType>::max();
+            const auto dep = min(dep_matrix, dep_factor);
+            const bool in_factor = dep == dep_factor;
+            const bool in_matrix = dep == dep_matrix;
+            const auto dep_diag_idx = diag_idxs[dep];
+            const auto dep_diag = factor_vals[dep_diag_idx];
+            const auto dep_end = factor_row_ptrs[dep + 1];
+            // the factor value takes precedence if both rows hold the entry
+            const auto lower_val =
+                in_factor ? factor_vals[factor_nz] : mtx_vals[mtx_nz];
+            const auto scale = lower_val / dep_diag;
+            if (in_factor) {
+                factor_vals[factor_nz] = scale;
+            }
+            if (in_matrix) {
+                mtx_vals[mtx_nz] = scale;
+            }
+            // only the entries of the factor row dep take part in the
+            // update, not the entries of the matrix row dep
+            for (auto dep_nz = dep_diag_idx + 1; dep_nz < dep_end; dep_nz++) {
+                const auto upper_col = factor_cols[dep_nz];
+                const auto upper_val = factor_vals[dep_nz];
+                const auto factor_idx = lookup[upper_col];
+                if (factor_idx != invalid_index<IndexType>()) {
+                    factor_vals[factor_begin + factor_idx] -= scale * upper_val;
+                }
+                // the matrix row receives the same update, because entries
+                // are dropped only after the row operations and have to be
+                // tracked here
+                const auto mtx_idx = matrix_lookup[upper_col];
+                if (mtx_idx != invalid_index<IndexType>()) {
+                    mtx_vals[mtx_begin + mtx_idx] -= scale * upper_val;
+                }
+            }
+            mtx_nz += in_matrix;
+            factor_nz += in_factor;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_ILU_FACTORIZE_ON_BOTH_KERNEL);
+
+
 }  // namespace ilu_factorization
 }  // namespace omp
 }  // namespace kernels
diff --git a/reference/factorization/ilu_kernels.cpp b/reference/factorization/ilu_kernels.cpp
diff --git a/reference/test/factorization/ilu_kernels.cpp b/reference/test/factorization/ilu_kernels.cpp
diff --git a/test/factorization/ilu_kernels.cpp b/test/factorization/ilu_kernels.cpp

Original file line number	Diff line number	Diff line change
`@@ -1015,6 +1015,7 @@ namespace ilu_factorization {`
`1015`	`1015`
`1016`	`1016`
`1017`	`1017`	`GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);`
	`1018`	`+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_FACTORIZE_ON_BOTH_KERNEL);`
`1018`	`1019`
`1019`	`1020`
`1020`	`1021`	`} // namespace ilu_factorization`