ginkgo-project
diff --git a/common/cuda_hip/components/bitvector.hpp b/common/cuda_hip/components/bitvector.hpp
Lines changed: 1 addition & 1 deletion
diff --git a/common/cuda_hip/components/merging.hpp b/common/cuda_hip/components/merging.hpp
Lines changed: 4 additions & 3 deletions
diff --git a/common/cuda_hip/components/searching.hpp b/common/cuda_hip/components/searching.hpp
Lines changed: 5 additions & 4 deletions
diff --git a/common/cuda_hip/factorization/cholesky_kernels.cpp b/common/cuda_hip/factorization/cholesky_kernels.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/common/cuda_hip/factorization/factorization_kernels.cpp b/common/cuda_hip/factorization/factorization_kernels.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/common/cuda_hip/factorization/par_ict_kernels.cpp b/common/cuda_hip/factorization/par_ict_kernels.cpp
Lines changed: 8 additions & 6 deletions
diff --git a/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp b/common/cuda_hip/factorization/par_ilut_filter_kernels.hpp
Lines changed: 2 additions & 2 deletions
diff --git a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
Lines changed: 10 additions & 9 deletions
diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp
Lines changed: 2 additions & 1 deletion
diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp
Lines changed: 2 additions & 2 deletions
@@ -49,7 +49,7 @@ __global__ __launch_bounds__(default_block_size) void from_predicate(
         group::tiled_partition<block_size>(group::this_thread_block());
     const auto i = static_cast<IndexType>(subwarp_base + subwarp.thread_rank());
     const auto bit = i < size ? predicate(i) : false;
-    const auto mask = subwarp.ballot(bit);
+    const auto mask = group::ballot(subwarp, bit);
     if (subwarp.thread_rank() == 0) {
         bits[subwarp_id] = mask;
         popcounts[subwarp_id] = gko::detail::popcount(mask);
 
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -7,6 +7,7 @@
 
 
 #include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/cuda_hip/components/intrinsics.hpp"
 #include "common/cuda_hip/components/searching.hpp"
 #include "core/base/utils.hpp"
@@ -91,7 +92,7 @@ __forceinline__ __device__ detail::merge_result<ValueType> group_merge_step(
     auto a_val = group.shfl(a, a_idx);
     auto b_val = group.shfl(b, b_idx);
     auto cmp = a_val < b_val;
-    auto a_advance = popcnt(group.ballot(cmp));
+    auto a_advance = popcnt(group::ballot(group, cmp));
     auto b_advance = int(group.size()) - a_advance;
 
     return {a_val, b_val, a_idx, b_idx, a_advance, b_advance};
@@ -208,7 +209,7 @@ __forceinline__ __device__ void group_match(const ValueType* __restrict__ a,
         a, a_size, b, b_size, group,
         [&](IndexType a_idx, ValueType a_val, IndexType b_idx, ValueType b_val,
             IndexType, bool valid) {
-            auto matchmask = group.ballot(a_val == b_val && valid);
+            auto matchmask = group::ballot(group, a_val == b_val && valid);
             match_fn(a_val, a_idx, b_idx, matchmask, a_val == b_val && valid);
             return a_idx < a_size && b_idx < b_size;
         });
 
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -7,6 +7,7 @@
 
 
 #include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/cuda_hip/components/intrinsics.hpp"
 
 
@@ -168,7 +169,7 @@ __forceinline__ __device__ IndexType group_wide_search(IndexType offset,
      */
     auto base_idx = (group_pos - 1) * group.size() + 1;
     auto idx = base_idx + group.thread_rank();
-    auto pos = ffs(group.ballot(idx >= length || p(offset + idx))) - 1;
+    auto pos = ffs(group::ballot(group, idx >= length || p(offset + idx))) - 1;
     return offset + base_idx + pos;
 }
 
@@ -205,7 +206,7 @@ __forceinline__ __device__ IndexType group_ary_search(IndexType offset,
     while (length > group.size()) {
         auto stride = length / group.size();
         auto idx = offset + group.thread_rank() * stride;
-        auto mask = group.ballot(p(idx));
+        auto mask = group::ballot(group, p(idx));
         // if the mask is 0, the partition point is in the last block
         // if the mask is ~0, the partition point is in the first block
         // otherwise, we go to the last block that returned a 0.
@@ -217,7 +218,7 @@ __forceinline__ __device__ IndexType group_ary_search(IndexType offset,
     auto idx = offset + group.thread_rank();
     // if the mask is 0, the partition point is at the end
     // otherwise it is the first set bit
-    auto mask = group.ballot(idx >= end || p(idx));
+    auto mask = group::ballot(group, idx >= end || p(idx));
     auto pos = mask == 0 ? group.size() : ffs(mask) - 1;
     return offset + pos;
 }
 
@@ -143,7 +143,7 @@ __global__ __launch_bounds__(default_block_size) void symbolic_factorize(
         const auto next_node =
             nz < lower_end - 1 ? postorder_cols[nz + 1] : diag_postorder;
         bool pred = node < next_node;
-        auto mask = subwarp.ballot(pred);
+        auto mask = group::ballot(subwarp, pred);
         while (mask) {
             if (pred) {
                 const auto out_nz = out_base + popcnt(mask & prefix_mask);
@@ -152,7 +152,7 @@ __global__ __launch_bounds__(default_block_size) void symbolic_factorize(
                 pred = node < next_node;
             }
             out_base += popcnt(mask);
-            mask = subwarp.ballot(pred);
+            mask = group::ballot(subwarp, pred);
         }
     }
     // add diagonal entry
 
@@ -187,7 +187,7 @@ __launch_bounds__(default_block_size) void add_missing_diagonal_elements(
                     thread_is_active ? old_col_idxs[old_idx] : IndexType{};
                 // automatically false if thread is not active
                 bool diagonal_add_required = !diagonal_added && row < col_idx;
-                auto ballot = subwarp_grp.ballot(diagonal_add_required);
+                auto ballot = group::ballot(subwarp_grp, diagonal_add_required);
 
                 if (ballot) {
                     auto first_subwarp_idx = ffs(ballot) - 1;
 
@@ -12,6 +12,7 @@
 
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/cuda_hip/components/intrinsics.hpp"
 #include "common/cuda_hip/components/memory.hpp"
 #include "common/cuda_hip/components/merging.hpp"
@@ -75,8 +76,8 @@ __global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_nnz(
             IndexType llh_col, IndexType out_nz, bool valid) {
             auto col = min(a_col, llh_col);
             // count the number of unique elements being merged
-            count +=
-                popcnt(subwarp.ballot(col <= row && a_col != llh_col && valid));
+            count += popcnt(group::ballot(
+                subwarp, col <= row && a_col != llh_col && valid));
             return true;
         });
     if (subwarp.thread_rank() == 0) {
@@ -149,7 +150,8 @@ __global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init(
         auto llh_cur_val = subwarp.shfl(llh_val, merge_result.b_idx);
         auto valid = out_begin + lane < out_size;
         // check if the previous thread has matching columns
-        auto equal_mask = subwarp.ballot(a_cur_col == llh_cur_col && valid);
+        auto equal_mask =
+            group::ballot(subwarp, a_cur_col == llh_cur_col && valid);
         auto prev_equal_mask = equal_mask << 1 | skip_first;
         skip_first = bool(equal_mask >> (subwarp_size - 1));
         auto prev_equal = bool(prev_equal_mask & lanemask_eq);
@@ -179,7 +181,7 @@ __global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init(
         // determine which threads will write output to L
         auto use_l = l_cur_col == r_col;
         auto do_write = !prev_equal && valid && r_col <= row;
-        auto l_new_advance_mask = subwarp.ballot(do_write);
+        auto l_new_advance_mask = group::ballot(subwarp, do_write);
         // store values
         if (do_write) {
             auto diag = l_vals[l_row_ptrs[r_col + 1] - 1];
@@ -192,7 +194,7 @@ __global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init(
         // advance *_begin offsets
         auto a_advance = merge_result.a_advance;
         auto llh_advance = merge_result.b_advance;
-        auto l_advance = popcnt(subwarp.ballot(do_write && use_l));
+        auto l_advance = popcnt(group::ballot(subwarp, do_write && use_l));
         auto l_new_advance = popcnt(l_new_advance_mask);
         a_begin += a_advance;
         llh_begin += llh_advance;
@@ -295,7 +297,7 @@ __global__ __launch_bounds__(default_block_size) void ict_sweep(
                        conj(load_relaxed(l_vals + (lh_idx + lh_col_begin)));
             }
             // remember the transposed element
-            auto found_transp = subwarp.ballot(lh_row == row);
+            auto found_transp = group::ballot(subwarp, lh_row == row);
             if (found_transp) {
                 lh_nz =
                     subwarp.shfl(lh_idx + lh_col_begin, ffs(found_transp) - 1);
 
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -50,7 +50,7 @@ __device__ void abstract_filter_impl(const IndexType* row_ptrs,
     for (IndexType step = 0; step < num_steps; ++step) {
         auto idx = begin + lane + step * subwarp_size;
         auto keep = idx < end && pred(idx, begin, end);
-        auto mask = subwarp.ballot(keep);
+        auto mask = group::ballot(subwarp, keep);
         step_cb(row, idx, keep, popcnt(mask), popcnt(mask & lane_prefix_mask));
     }
     finish_cb(row, lane);
 
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -74,10 +74,10 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_nnz(
             IndexType out_nz, bool valid) {
             auto col = min(a_col, lu_col);
             // count the number of unique elements being merged
-            l_count +=
-                popcnt(subwarp.ballot(col <= row && a_col != lu_col && valid));
-            u_count +=
-                popcnt(subwarp.ballot(col >= row && a_col != lu_col && valid));
+            l_count += popcnt(
+                group::ballot(subwarp, col <= row && a_col != lu_col && valid));
+            u_count += popcnt(
+                group::ballot(subwarp, col >= row && a_col != lu_col && valid));
             return true;
         });
     if (subwarp.thread_rank() == 0) {
@@ -172,7 +172,8 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init(
         auto lu_cur_val = subwarp.shfl(lu_val, merge_result.b_idx);
         auto valid = out_begin + lane < out_size;
         // check if the previous thread has matching columns
-        auto equal_mask = subwarp.ballot(a_cur_col == lu_cur_col && valid);
+        auto equal_mask =
+            group::ballot(subwarp, a_cur_col == lu_cur_col && valid);
         auto prev_equal_mask = equal_mask << 1 | skip_first;
         skip_first = bool(equal_mask >> (subwarp_size - 1));
         auto prev_equal = bool(prev_equal_mask & lanemask_eq);
@@ -197,9 +198,9 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init(
         // determine which threads will write output to L or U
         auto use_lpu = lpu_cur_col == r_col;
         auto l_new_advance_mask =
-            subwarp.ballot(r_col <= row && !prev_equal && valid);
+            group::ballot(subwarp, r_col <= row && !prev_equal && valid);
         auto u_new_advance_mask =
-            subwarp.ballot(r_col >= row && !prev_equal && valid);
+            group::ballot(subwarp, r_col >= row && !prev_equal && valid);
         // store values
         if (!prev_equal && valid) {
             auto diag =
@@ -222,7 +223,7 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init(
         auto a_advance = merge_result.a_advance;
         auto lu_advance = merge_result.b_advance;
         auto lpu_advance =
-            popcnt(subwarp.ballot(use_lpu && !prev_equal && valid));
+            popcnt(group::ballot(subwarp, use_lpu && !prev_equal && valid));
         auto l_new_advance = popcnt(l_new_advance_mask);
         auto u_new_advance = popcnt(u_new_advance_mask);
         a_begin += a_advance;
 
@@ -10,6 +10,7 @@
 
 #include "common/cuda_hip/base/math.hpp"
 #include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
 #include "common/cuda_hip/components/intrinsics.hpp"
 #include "common/cuda_hip/components/memory.hpp"
 #include "common/cuda_hip/components/merging.hpp"
@@ -105,7 +106,7 @@ __global__ __launch_bounds__(default_block_size) void sweep(
                        load_relaxed(ut_vals + (ut_idx + ut_col_begin));
             }
             // remember the transposed element
-            auto found_transp = subwarp.ballot(ut_row == row);
+            auto found_transp = group::ballot(subwarp, ut_row == row);
             if (found_transp) {
                 ut_nz =
                     subwarp.shfl(ut_idx + ut_col_begin, ffs(found_transp) - 1);
 
@@ -609,7 +609,7 @@ __global__ __launch_bounds__(default_block_size) void spgeam_nnz(
         a_col_idxs + a_begin, a_size, b_col_idxs + b_begin, b_size, subwarp,
         [&](IndexType, IndexType a_col, IndexType, IndexType b_col, IndexType,
             bool valid) {
-            count += popcnt(subwarp.ballot(a_col != b_col && valid));
+            count += popcnt(group::ballot(subwarp, a_col != b_col && valid));
             return true;
         });
 
@@ -657,7 +657,7 @@ __global__ __launch_bounds__(default_block_size) void spgeam(
         [&](IndexType a_nz, IndexType a_col, IndexType b_nz, IndexType b_col,
             IndexType, bool valid) {
             auto c_col = min(a_col, b_col);
-            auto equal_mask = subwarp.ballot(a_col == b_col && valid);
+            auto equal_mask = group::ballot(subwarp, a_col == b_col && valid);
             // check if the elements in the previous merge step are
             // equal
             auto prev_equal_mask = equal_mask << 1 | skip_first;
Original file line number	Diff line number	Diff line change
`@@ -143,7 +143,7 @@ __global__ __launch_bounds__(default_block_size) void symbolic_factorize(`
`143`	`143`	`const auto next_node =`
`144`	`144`	`nz < lower_end - 1 ? postorder_cols[nz + 1] : diag_postorder;`
`145`	`145`	`bool pred = node < next_node;`
`146`		`- auto mask = subwarp.ballot(pred);`
	`146`	`+ auto mask = group::ballot(subwarp, pred);`
`147`	`147`	`while (mask) {`
`148`	`148`	`if (pred) {`
`149`	`149`	`const auto out_nz = out_base + popcnt(mask & prefix_mask);`
`@@ -152,7 +152,7 @@ __global__ __launch_bounds__(default_block_size) void symbolic_factorize(`
`152`	`152`	`pred = node < next_node;`
`153`	`153`	`}`
`154`	`154`	`out_base += popcnt(mask);`
`155`		`- mask = subwarp.ballot(pred);`
	`155`	`+ mask = group::ballot(subwarp, pred);`
`156`	`156`	`}`
`157`	`157`	`}`
`158`	`158`	`// add diagonal entry`