File tree Expand file tree Collapse file tree
common/cuda_hip/factorization Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1- // SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
1+ // SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
22//
33// SPDX-License-Identifier: BSD-3-Clause
44
@@ -113,14 +113,17 @@ __global__ __launch_bounds__(default_block_size) void factorize(
113113 // for each lower triangular entry: eliminate with corresponding row
114114 for (auto lower_nz = row_begin; lower_nz < row_diag; lower_nz++) {
115115 const auto dep = cols[lower_nz];
116- // we can load the value before synchronizing because the following
117- // updates only go past the diagonal of the dependency row, i.e. at
118- // least column dep + 1
119- const auto val = vals[lower_nz];
120116 const auto diag_idx = diag_idxs[dep];
121117 const auto dep_end = row_ptrs[dep + 1 ];
122118 scheduler.wait (dep);
119+ // We need to load vals[lower_nz] only after synchronizing:
120+ // the value at the next lower_nz may be modified if the dep row has an
121+ // entry in the same column as the next lower_nz.
122+ const auto val = vals[lower_nz];
123123 const auto diag = vals[diag_idx];
124+ // We need to sync so that all threads have loaded the data before
125+ // lane 0 overwrites vals[lower_nz] with scale.
126+ warp.sync ();
124127 const auto scale = val / diag;
125128 if (lane == 0 ) {
126129 vals[lower_nz] = scale;
You can’t perform that action at this time.
0 commit comments