Skip to content

Commit 0bc4a4d

Browse files
committed
enable GPU solves
1 parent 6e3ba06 commit 0bc4a4d

9 files changed

Lines changed: 847 additions & 722 deletions

File tree

common/cuda_hip/distributed/preconditioner/bddc_kernels.cpp

Lines changed: 85 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,14 @@
66

77
#include <algorithm>
88

9+
#include "common/cuda_hip/base/config.hpp"
10+
#include "common/cuda_hip/base/runtime.hpp"
11+
#include "common/cuda_hip/base/types.hpp"
12+
#include "common/cuda_hip/components/thread_ids.hpp"
913
#include "core/base/allocator.hpp"
1014
#include "core/base/device_matrix_data_kernels.hpp"
1115
#include "core/base/iterator_factory.hpp"
16+
#include "core/base/utils.hpp"
1217
#include "core/components/prefix_sum_kernels.hpp"
1318
#include "reference/distributed/partition_helpers.hpp"
1419

@@ -19,6 +24,51 @@ namespace GKO_DEVICE_NAMESPACE {
1924
namespace bddc {
2025

2126

27+
constexpr int default_block_size = 512;
28+
29+
30+
namespace kernel {
31+
32+
33+
template <typename ValueType, typename IndexType>
34+
__global__ __launch_bounds__(default_block_size) void generate_constraints(
35+
const IndexType* interface_offsets, size_type n_inactive_idxs,
36+
size_type n_edges_faces, IndexType* row_idxs, IndexType* col_idxs,
37+
ValueType* values)
38+
{
39+
const auto interface_idx = thread::get_thread_id_flat();
40+
if (interface_idx < n_edges_faces) {
41+
const auto start = interface_offsets[interface_idx];
42+
const auto stop = interface_offsets[interface_idx + 1];
43+
const ValueType val =
44+
one<ValueType>() / static_cast<ValueType>(stop - start);
45+
for (size_type idx = start; idx < stop; idx++) {
46+
row_idxs[idx] = interface_idx;
47+
col_idxs[idx] = n_inactive_idxs + idx;
48+
values[idx] = val;
49+
}
50+
}
51+
}
52+
53+
54+
template <typename ValueType>
55+
__global__ __launch_bounds__(default_block_size) void fill_coarse_data(
56+
size_type n_edges_faces, size_type n_corners, size_type lambda_stride,
57+
size_type phi_stride, ValueType* lambda, ValueType* phi)
58+
{
59+
const auto i = thread::get_thread_id_flat();
60+
if (i < n_edges_faces) {
61+
lambda[i * lambda_stride + i] = one<ValueType>();
62+
}
63+
if (i < n_corners) {
64+
phi[i * phi_stride + n_edges_faces + i] = one<ValueType>();
65+
}
66+
}
67+
68+
69+
} // namespace kernel
70+
71+
2272
template <typename ValueType, typename IndexType>
2373
void classify_dofs(
2474
std::shared_ptr<const DefaultExecutor> exec,
@@ -38,11 +88,27 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_BASE(
3888

3989

4090
template <typename ValueType, typename IndexType>
41-
void generate_constraints(
42-
std::shared_ptr<const DefaultExecutor> exec,
43-
const matrix::Dense<ValueType>* labels, size_type n_inner_idxs,
44-
size_type n_edges_faces, const array<IndexType>& interface_sizes,
45-
device_matrix_data<ValueType, IndexType>& constraints) GKO_NOT_IMPLEMENTED;
91+
void generate_constraints(std::shared_ptr<const DefaultExecutor> exec,
92+
const matrix::Dense<ValueType>* labels,
93+
size_type n_inactive_idxs, size_type n_edges_faces,
94+
const array<IndexType>& interface_sizes,
95+
device_matrix_data<ValueType, IndexType>& constraints)
96+
{
97+
array<IndexType> interface_offsets{exec, n_edges_faces + 1};
98+
exec->copy(n_edges_faces, interface_sizes.get_const_data(),
99+
interface_offsets.get_data());
100+
components::prefix_sum_nonnegative(exec, interface_offsets.get_data(),
101+
n_edges_faces + 1);
102+
103+
const auto grid_dim = ceildiv(n_edges_faces, default_block_size);
104+
if (grid_dim > 0) {
105+
kernel::generate_constraints<<<grid_dim, default_block_size>>>(
106+
as_device_type(interface_offsets.get_const_data()), n_inactive_idxs,
107+
n_edges_faces, as_device_type(constraints.get_row_idxs()),
108+
as_device_type(constraints.get_col_idxs()),
109+
as_device_type(constraints.get_values()));
110+
}
111+
}
46112

47113
GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_BASE(
48114
GKO_DECLARE_GENERATE_CONSTRAINTS);
@@ -51,14 +117,27 @@ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE_BASE(
51117
template <typename ValueType>
52118
void fill_coarse_data(std::shared_ptr<const DefaultExecutor> exec,
53119
matrix::Dense<ValueType>* phi_P,
54-
matrix::Dense<ValueType>* lambda_rhs) GKO_NOT_IMPLEMENTED;
120+
matrix::Dense<ValueType>* lambda_rhs)
121+
{
122+
const auto n_edges_faces = lambda_rhs->get_size()[0];
123+
const auto n_corners = phi_P->get_size()[1];
124+
const auto grid_dim =
125+
ceildiv(std::max(n_edges_faces, n_corners), default_block_size);
126+
if (grid_dim > 0) {
127+
kernel::fill_coarse_data<<<grid_dim, default_block_size>>>(
128+
n_edges_faces, n_corners, lambda_rhs->get_stride(),
129+
phi_P->get_stride(), as_device_type(lambda_rhs->get_values()),
130+
as_device_type(phi_P->get_values()));
131+
}
132+
}
55133

56134
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_BASE(GKO_DECLARE_FILL_COARSE_DATA);
57135

58136

59137
template <typename ValueType, typename IndexType>
60138
void build_coarse_contribution(
61139
std::shared_ptr<const DefaultExecutor> exec,
140+
const array<experimental::distributed::preconditioner::dof_type>& dof_types,
62141
const array<remove_complex<ValueType>>& local_labels,
63142
const array<IndexType>& local_tags,
64143
const array<remove_complex<ValueType>>& global_labels,

core/distributed/assembly.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,17 +81,17 @@ device_matrix_data<ValueType, GlobalIndexType> assemble_rows_from_neighbors(
8181
recv_values.set_executor(exec->get_master());
8282
}
8383
auto row_req = comm.i_all_to_all_v(
84-
use_host_buffer ? exec : exec->get_master(),
84+
use_host_buffer ? exec->get_master() : exec,
8585
send_row_idxs.get_const_data(), send_sizes.get_data(),
8686
send_offsets.get_data(), recv_row_idxs.get_data(),
8787
recv_sizes.get_data(), recv_offsets.get_data());
8888
auto col_req = comm.i_all_to_all_v(
89-
use_host_buffer ? exec : exec->get_master(),
89+
use_host_buffer ? exec->get_master() : exec,
9090
send_col_idxs.get_const_data(), send_sizes.get_data(),
9191
send_offsets.get_data(), recv_col_idxs.get_data(),
9292
recv_sizes.get_data(), recv_offsets.get_data());
9393
auto val_req =
94-
comm.i_all_to_all_v(use_host_buffer ? exec : exec->get_master(),
94+
comm.i_all_to_all_v(use_host_buffer ? exec->get_master() : exec,
9595
send_values.get_const_data(), send_sizes.get_data(),
9696
send_offsets.get_data(), recv_values.get_data(),
9797
recv_sizes.get_data(), recv_offsets.get_data());

core/distributed/dd_matrix.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,6 @@ void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
121121
GKO_ASSERT_EQ(comm.size(), partition->get_num_parts());
122122
auto exec = this->get_executor();
123123
auto local_part = comm.rank();
124-
auto use_host_buffer = mpi::requires_host_buffer(exec, comm);
125124
auto tmp_partition = make_temporary_clone(exec, partition);
126125

127126
// set up LinOp sizes
@@ -163,9 +162,9 @@ void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
163162

164163
// Gather local sizes from all ranks and build the partition in the enriched
165164
// space.
166-
array<GlobalIndexType> range_bounds{
167-
use_host_buffer ? exec->get_master() : exec, num_parts + 1};
168-
comm.all_gather(exec, &local_num_rows, 1, range_bounds.get_data(), 1);
165+
array<GlobalIndexType> range_bounds{exec->get_master(), num_parts + 1};
166+
comm.all_gather(exec->get_master(), &local_num_rows, 1,
167+
range_bounds.get_data(), 1);
169168
range_bounds.set_executor(exec);
170169
exec->run(dd_matrix::make_prefix_sum_nonnegative(range_bounds.get_data(),
171170
num_parts + 1));

0 commit comments

Comments
 (0)