Skip to content

Commit d30c84f

Browse files
committed
Refactor Portable::MatrixFree by combining SharedData and the GPU data into a single Data object
1 parent 712e8ad commit d30c84f

File tree

9 files changed

+262
-238
lines changed

9 files changed

+262
-238
lines changed

examples/step-64/step-64.cc

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -155,11 +155,11 @@ namespace Step64
155155
const int q_point) const
156156
{
157157
const int cell_index = fe_eval->get_current_cell_index();
158-
const typename Portable::MatrixFree<dim, double>::Data *gpu_data =
158+
const typename Portable::MatrixFree<dim, double>::Data *data =
159159
fe_eval->get_matrix_free_data();
160160

161161
const unsigned int position =
162-
gpu_data->local_q_point_id(cell_index, n_q_points, q_point);
162+
data->local_q_point_id(cell_index, n_q_points, q_point);
163163
auto coeff = coef[position];
164164

165165
auto value = fe_eval->get_value(q_point);
@@ -191,11 +191,9 @@ namespace Step64
191191
{}
192192

193193
DEAL_II_HOST_DEVICE void
194-
operator()(const unsigned int cell,
195-
const typename Portable::MatrixFree<dim, double>::Data *gpu_data,
196-
Portable::SharedData<dim, double> *shared_data,
197-
const double *src,
198-
double *dst) const;
194+
operator()(const typename Portable::MatrixFree<dim, double>::Data *data,
195+
const double *src,
196+
double *dst) const;
199197

200198
private:
201199
double *coef;
@@ -209,14 +207,12 @@ namespace Step64
209207
// vector.
210208
template <int dim, int fe_degree>
211209
DEAL_II_HOST_DEVICE void LocalHelmholtzOperator<dim, fe_degree>::operator()(
212-
const unsigned int /*cell*/,
213-
const typename Portable::MatrixFree<dim, double>::Data *gpu_data,
214-
Portable::SharedData<dim, double> *shared_data,
210+
const typename Portable::MatrixFree<dim, double>::Data *data,
215211
const double *src,
216212
double *dst) const
217213
{
218214
Portable::FEEvaluation<dim, fe_degree, fe_degree + 1, 1, double> fe_eval(
219-
gpu_data, shared_data);
215+
data);
220216
fe_eval.read_dof_values(src);
221217
fe_eval.evaluate(EvaluationFlags::values | EvaluationFlags::gradients);
222218
fe_eval.apply_for_each_quad_point(

include/deal.II/matrix_free/portable_evaluation_kernels.h

Lines changed: 61 additions & 67 deletions
Large diffs are not rendered by default.

include/deal.II/matrix_free/portable_fe_evaluation.h

Lines changed: 60 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ namespace Portable
124124
* Constructor.
125125
*/
126126
DEAL_II_HOST_DEVICE
127-
FEEvaluation(const data_type *data, SharedData<dim, Number> *shdata);
127+
explicit FEEvaluation(const data_type *data);
128128

129129
/**
130130
* Return the index of the current cell.
@@ -264,9 +264,10 @@ namespace Portable
264264
apply_for_each_quad_point(const Functor &func);
265265

266266
private:
267-
const data_type *data;
268-
SharedData<dim, Number> *shared_data;
269-
int cell_id;
267+
const typename MatrixFree<dim, Number>::Data *dataa;
268+
const typename MatrixFree<dim, Number>::PrecomputedData *precomputed_data;
269+
SharedData<dim, Number> *shared_data;
270+
int cell_id;
270271
};
271272

272273

@@ -278,10 +279,11 @@ namespace Portable
278279
typename Number>
279280
DEAL_II_HOST_DEVICE
280281
FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
281-
FEEvaluation(const data_type *data, SharedData<dim, Number> *shdata)
282-
: data(data)
283-
, shared_data(shdata)
284-
, cell_id(shared_data->team_member.league_rank())
282+
FEEvaluation(const data_type *data)
283+
: dataa(data)
284+
, precomputed_data(dataa->precomputed_data)
285+
, shared_data(dataa->shared_data)
286+
, cell_id(data->team_member.league_rank())
285287
{}
286288

287289

@@ -313,7 +315,7 @@ namespace Portable
313315
FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
314316
get_matrix_free_data()
315317
{
316-
return data;
318+
return dataa;
317319
}
318320

319321

@@ -328,22 +330,22 @@ namespace Portable
328330
read_dof_values(const Number *src)
329331
{
330332
// Populate the scratch memory
331-
Kokkos::parallel_for(Kokkos::TeamThreadRange(shared_data->team_member,
333+
Kokkos::parallel_for(Kokkos::TeamThreadRange(dataa->team_member,
332334
tensor_dofs_per_component),
333335
[&](const int &i) {
334336
for (unsigned int c = 0; c < n_components_; ++c)
335337
shared_data->values(i, c) =
336-
src[data->local_to_global(
338+
src[precomputed_data->local_to_global(
337339
cell_id, i + tensor_dofs_per_component * c)];
338340
});
339-
shared_data->team_member.team_barrier();
341+
dataa->team_member.team_barrier();
340342

341343
for (unsigned int c = 0; c < n_components_; ++c)
342344
{
343345
internal::resolve_hanging_nodes<dim, fe_degree, false, Number>(
344-
shared_data->team_member,
345-
data->constraint_weights,
346-
data->constraint_mask(cell_id * n_components + c),
346+
dataa->team_member,
347+
precomputed_data->constraint_weights,
348+
precomputed_data->constraint_mask(cell_id * n_components + c),
347349
Kokkos::subview(shared_data->values, Kokkos::ALL, c));
348350
}
349351
}
@@ -362,32 +364,32 @@ namespace Portable
362364
for (unsigned int c = 0; c < n_components_; ++c)
363365
{
364366
internal::resolve_hanging_nodes<dim, fe_degree, true, Number>(
365-
shared_data->team_member,
366-
data->constraint_weights,
367-
data->constraint_mask(cell_id * n_components + c),
367+
dataa->team_member,
368+
precomputed_data->constraint_weights,
369+
precomputed_data->constraint_mask(cell_id * n_components + c),
368370
Kokkos::subview(shared_data->values, Kokkos::ALL, c));
369371
}
370372

371-
if (data->use_coloring)
373+
if (precomputed_data->use_coloring)
372374
{
373375
Kokkos::parallel_for(
374-
Kokkos::TeamThreadRange(shared_data->team_member,
376+
Kokkos::TeamThreadRange(dataa->team_member,
375377
tensor_dofs_per_component),
376378
[&](const int &i) {
377379
for (unsigned int c = 0; c < n_components_; ++c)
378-
dst[data->local_to_global(cell_id,
379-
i + tensor_dofs_per_component * c)] +=
380+
dst[precomputed_data->local_to_global(
381+
cell_id, i + tensor_dofs_per_component * c)] +=
380382
shared_data->values(i, c);
381383
});
382384
}
383385
else
384386
{
385387
Kokkos::parallel_for(
386-
Kokkos::TeamThreadRange(shared_data->team_member,
388+
Kokkos::TeamThreadRange(dataa->team_member,
387389
tensor_dofs_per_component),
388390
[&](const int &i) {
389391
for (unsigned int c = 0; c < n_components_; ++c)
390-
Kokkos::atomic_add(&dst[data->local_to_global(
392+
Kokkos::atomic_add(&dst[precomputed_data->local_to_global(
391393
cell_id, i + (tensor_dofs_per_component)*c)],
392394
shared_data->values(i, c));
393395
});
@@ -408,28 +410,29 @@ namespace Portable
408410
using ElementType = ::dealii::internal::MatrixFreeFunctions::ElementType;
409411

410412
if (fe_degree >= 0 && fe_degree + 1 == n_q_points_1d &&
411-
data->element_type == ElementType::tensor_symmetric_collocation)
413+
precomputed_data->element_type ==
414+
ElementType::tensor_symmetric_collocation)
412415
{
413416
internal::FEEvaluationImplCollocation<dim, fe_degree, Number>::evaluate(
414-
n_components, evaluation_flag, data, shared_data);
417+
n_components, evaluation_flag, dataa);
415418
}
416419
// '<=' on type means tensor_symmetric or tensor_symmetric_hermite, see
417420
// shape_info.h for more details
418421
else if (fe_degree >= 0 &&
419422
internal::use_collocation_evaluation(fe_degree, n_q_points_1d) &&
420-
data->element_type <= ElementType::tensor_symmetric)
423+
precomputed_data->element_type <= ElementType::tensor_symmetric)
421424
{
422425
internal::FEEvaluationImplTransformToCollocation<
423426
dim,
424427
fe_degree,
425428
n_q_points_1d,
426-
Number>::evaluate(n_components, evaluation_flag, data, shared_data);
429+
Number>::evaluate(n_components, evaluation_flag, dataa);
427430
}
428-
else if (fe_degree >= 0 &&
429-
data->element_type <= ElementType::tensor_symmetric_no_collocation)
431+
else if (fe_degree >= 0 && precomputed_data->element_type <=
432+
ElementType::tensor_symmetric_no_collocation)
430433
{
431434
internal::FEEvaluationImpl<dim, fe_degree, n_q_points_1d, Number>::
432-
evaluate(n_components, evaluation_flag, data, shared_data);
435+
evaluate(n_components, evaluation_flag, dataa);
433436
}
434437
else
435438
{
@@ -469,28 +472,29 @@ namespace Portable
469472
using ElementType = ::dealii::internal::MatrixFreeFunctions::ElementType;
470473

471474
if (fe_degree >= 0 && fe_degree + 1 == n_q_points_1d &&
472-
data->element_type == ElementType::tensor_symmetric_collocation)
475+
precomputed_data->element_type ==
476+
ElementType::tensor_symmetric_collocation)
473477
{
474478
internal::FEEvaluationImplCollocation<dim, fe_degree, Number>::
475-
integrate(n_components, integration_flag, data, shared_data);
479+
integrate(n_components, integration_flag, dataa);
476480
}
477481
// '<=' on type means tensor_symmetric or tensor_symmetric_hermite, see
478482
// shape_info.h for more details
479483
else if (fe_degree >= 0 &&
480484
internal::use_collocation_evaluation(fe_degree, n_q_points_1d) &&
481-
data->element_type <= ElementType::tensor_symmetric)
485+
precomputed_data->element_type <= ElementType::tensor_symmetric)
482486
{
483487
internal::FEEvaluationImplTransformToCollocation<
484488
dim,
485489
fe_degree,
486490
n_q_points_1d,
487-
Number>::integrate(n_components, integration_flag, data, shared_data);
491+
Number>::integrate(n_components, integration_flag, dataa);
488492
}
489-
else if (fe_degree >= 0 &&
490-
data->element_type <= ElementType::tensor_symmetric_no_collocation)
493+
else if (fe_degree >= 0 && precomputed_data->element_type <=
494+
ElementType::tensor_symmetric_no_collocation)
491495
{
492496
internal::FEEvaluationImpl<dim, fe_degree, n_q_points_1d, Number>::
493-
integrate(n_components, integration_flag, data, shared_data);
497+
integrate(n_components, integration_flag, dataa);
494498
}
495499
else
496500
{
@@ -588,13 +592,14 @@ namespace Portable
588592
Assert(q_point >= 0 && q_point < n_q_points, ExcInternalError());
589593
if constexpr (n_components_ == 1)
590594
{
591-
shared_data->values(q_point, 0) = val_in * data->JxW(cell_id, q_point);
595+
shared_data->values(q_point, 0) =
596+
val_in * precomputed_data->JxW(cell_id, q_point);
592597
}
593598
else
594599
{
595600
for (unsigned int c = 0; c < n_components; ++c)
596601
shared_data->values(q_point, c) =
597-
val_in[c] * data->JxW(cell_id, q_point);
602+
val_in[c] * precomputed_data->JxW(cell_id, q_point);
598603
}
599604
}
600605

@@ -645,8 +650,9 @@ namespace Portable
645650
{
646651
Number tmp = 0.;
647652
for (unsigned int d_2 = 0; d_2 < dim; ++d_2)
648-
tmp += data->inv_jacobian(cell_id, q_point, d_2, d_1) *
649-
shared_data->gradients(q_point, d_2, 0);
653+
tmp +=
654+
precomputed_data->inv_jacobian(cell_id, q_point, d_2, d_1) *
655+
shared_data->gradients(q_point, d_2, 0);
650656
grad[d_1] = tmp;
651657
}
652658
}
@@ -657,8 +663,9 @@ namespace Portable
657663
{
658664
Number tmp = 0.;
659665
for (unsigned int d_2 = 0; d_2 < dim; ++d_2)
660-
tmp += data->inv_jacobian(cell_id, q_point, d_2, d_1) *
661-
shared_data->gradients(q_point, d_2, c);
666+
tmp +=
667+
precomputed_data->inv_jacobian(cell_id, q_point, d_2, d_1) *
668+
shared_data->gradients(q_point, d_2, c);
662669
grad[c][d_1] = tmp;
663670
}
664671
}
@@ -685,9 +692,10 @@ namespace Portable
685692
Number tmp = 0.;
686693
for (unsigned int d_2 = 0; d_2 < dim; ++d_2)
687694
tmp +=
688-
data->inv_jacobian(cell_id, q_point, d_1, d_2) * grad_in[d_2];
695+
precomputed_data->inv_jacobian(cell_id, q_point, d_1, d_2) *
696+
grad_in[d_2];
689697
shared_data->gradients(q_point, d_1, 0) =
690-
tmp * data->JxW(cell_id, q_point);
698+
tmp * precomputed_data->JxW(cell_id, q_point);
691699
}
692700
}
693701
else
@@ -697,10 +705,11 @@ namespace Portable
697705
{
698706
Number tmp = 0.;
699707
for (unsigned int d_2 = 0; d_2 < dim; ++d_2)
700-
tmp += data->inv_jacobian(cell_id, q_point, d_1, d_2) *
701-
grad_in[c][d_2];
708+
tmp +=
709+
precomputed_data->inv_jacobian(cell_id, q_point, d_1, d_2) *
710+
grad_in[c][d_2];
702711
shared_data->gradients(q_point, d_1, c) =
703-
tmp * data->JxW(cell_id, q_point);
712+
tmp * precomputed_data->JxW(cell_id, q_point);
704713
}
705714
}
706715
}
@@ -717,10 +726,10 @@ namespace Portable
717726
FEEvaluation<dim, fe_degree, n_q_points_1d, n_components_, Number>::
718727
apply_for_each_quad_point(const Functor &func)
719728
{
720-
Kokkos::parallel_for(Kokkos::TeamThreadRange(shared_data->team_member,
729+
Kokkos::parallel_for(Kokkos::TeamThreadRange(dataa->team_member,
721730
n_q_points),
722731
[&](const int &i) { func(this, i); });
723-
shared_data->team_member.team_barrier();
732+
dataa->team_member.team_barrier();
724733
}
725734

726735

0 commit comments

Comments
 (0)