Skip to content

Commit a01326f

Browse files
committed
gen - skip mat load when assembling QFs
1 parent 41ece66 commit a01326f

File tree

4 files changed

+60
-32
lines changed

4 files changed

+60
-32
lines changed

backends/cuda-gen/ceed-cuda-gen-operator-build.cpp

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -180,11 +180,19 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
180180
static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i,
181181
CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse,
182182
CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
183-
bool use_3d_slices) {
184-
bool is_tensor = true;
183+
bool use_3d_slices, bool skip_active_load) {
184+
bool is_tensor = true, is_active = true;
185185
CeedBasis basis;
186+
186187
CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
187188
if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
189+
{
190+
CeedVector vec;
191+
192+
CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec));
193+
is_active = vec == CEED_VECTOR_ACTIVE;
194+
CeedCallBackend(CeedVectorDestroy(&vec));
195+
}
188196

189197
const char *field_name;
190198
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
@@ -256,15 +264,15 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
256264
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
257265
else data->B.outputs[i] = basis_data->d_interp_1d;
258266
}
259-
if (use_previous_field) {
267+
if (use_previous_field && !skip_active_load) {
260268
std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
261269

262270
code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
263271
} else {
264272
bool is_collocated = false;
265273

266274
CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
267-
if (is_collocated && !is_at_points) {
275+
if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
268276
code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
269277
} else {
270278
code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
@@ -295,15 +303,15 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
295303
else data->B.outputs[i] = basis_data->d_interp_1d;
296304
}
297305
if (is_tensor) {
298-
if (use_previous_field) {
306+
if (use_previous_field && !skip_active_load) {
299307
std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
300308

301309
code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
302310
} else {
303311
bool is_collocated = false;
304312

305313
CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
306-
if (is_collocated && !is_at_points) {
314+
if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
307315
code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
308316
} else {
309317
code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
@@ -315,10 +323,12 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
315323
if (use_3d_slices) {
316324
if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
317325
else data->G.outputs[i] = basis_data->d_collo_grad_1d;
318-
if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
326+
if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
319327
std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
320328

321329
code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
330+
} else if (is_active && skip_active_load) {
331+
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
322332
} else {
323333
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
324334
code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
@@ -329,19 +339,23 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
329339
if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
330340
else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
331341
if (has_collo_grad) {
332-
if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
342+
if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
333343
std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
334344

335345
code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
346+
} else if (is_active && skip_active_load) {
347+
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
336348
} else {
337349
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
338350
code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
339351
}
340352
} else {
341-
if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
353+
if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
342354
std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
343355

344356
code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
357+
} else if (is_active && skip_active_load) {
358+
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
345359
} else {
346360
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
347361
<< (is_tensor ? "" : var_suffix) << "];\n";
@@ -1453,12 +1467,12 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
14531467
code << "\n" << tab << "// Input field constants and basis data\n";
14541468
for (CeedInt i = 0; i < num_input_fields; i++) {
14551469
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1456-
max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1470+
max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
14571471
}
14581472
code << "\n" << tab << "// Output field constants and basis data\n";
14591473
for (CeedInt i = 0; i < num_output_fields; i++) {
14601474
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1461-
max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1475+
max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
14621476
}
14631477

14641478
// Loop over all elements
@@ -1819,12 +1833,12 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo
18191833
code << "\n" << tab << "// Input field constants and basis data\n";
18201834
for (CeedInt i = 0; i < num_input_fields; i++) {
18211835
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1822-
max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1836+
max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
18231837
}
18241838
code << "\n" << tab << "// Output field constants and basis data\n";
18251839
for (CeedInt i = 0; i < num_output_fields; i++) {
18261840
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1827-
max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1841+
max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
18281842
}
18291843

18301844
// Loop over all elements
@@ -2385,12 +2399,12 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera
23852399
code << "\n" << tab << "// Input field constants and basis data\n";
23862400
for (CeedInt i = 0; i < num_input_fields; i++) {
23872401
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
2388-
max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
2402+
max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true));
23892403
}
23902404
code << "\n" << tab << "// Output field constants and basis data\n";
23912405
for (CeedInt i = 0; i < num_output_fields; i++) {
23922406
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
2393-
max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
2407+
max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true));
23942408
}
23952409

23962410
// Loop over all elements

backends/hip-gen/ceed-hip-gen-operator-build.cpp

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -207,11 +207,19 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
207207
static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
208208
CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
209209
CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
210-
bool use_3d_slices) {
211-
bool is_tensor = true;
210+
bool use_3d_slices, bool skip_active_load) {
211+
bool is_tensor = true, is_active = true;
212212
CeedBasis basis;
213+
213214
CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
214215
if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
216+
{
217+
CeedVector vec;
218+
219+
CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec));
220+
is_active = vec == CEED_VECTOR_ACTIVE;
221+
CeedCallBackend(CeedVectorDestroy(&vec));
222+
}
215223

216224
const char *field_name;
217225
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
@@ -283,15 +291,15 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
283291
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
284292
else data->B.outputs[i] = basis_data->d_interp_1d;
285293
}
286-
if (use_previous_field) {
294+
if (use_previous_field && !skip_active_load) {
287295
std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
288296

289297
code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
290298
} else {
291299
bool is_collocated = false;
292300

293301
CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
294-
if (is_collocated && !is_at_points) {
302+
if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
295303
code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
296304
} else {
297305
code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
@@ -322,15 +330,15 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
322330
else data->B.outputs[i] = basis_data->d_interp_1d;
323331
}
324332
if (is_tensor) {
325-
if (use_previous_field) {
333+
if (use_previous_field && !skip_active_load) {
326334
std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
327335

328336
code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
329337
} else {
330338
bool is_collocated = false;
331339

332340
CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
333-
if (is_collocated && !is_at_points) {
341+
if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
334342
code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
335343
} else {
336344
code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
@@ -342,10 +350,12 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
342350
if (use_3d_slices) {
343351
if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
344352
else data->G.outputs[i] = basis_data->d_collo_grad_1d;
345-
if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
353+
if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
346354
std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
347355

348356
code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
357+
} else if (is_active && skip_active_load) {
358+
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
349359
} else {
350360
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
351361
code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
@@ -356,19 +366,23 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
356366
if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
357367
else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
358368
if (has_collo_grad) {
359-
if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
369+
if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
360370
std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
361371

362372
code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
373+
} else if (is_active && skip_active_load) {
374+
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
363375
} else {
364376
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
365377
code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
366378
}
367379
} else {
368-
if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
380+
if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
369381
std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
370382

371383
code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
384+
} else if (is_active && skip_active_load) {
385+
code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
372386
} else {
373387
code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
374388
<< (is_tensor ? "" : var_suffix) << "];\n";
@@ -1465,12 +1479,12 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
14651479
code << "\n" << tab << "// Input field constants and basis data\n";
14661480
for (CeedInt i = 0; i < num_input_fields; i++) {
14671481
CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1468-
max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1482+
max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
14691483
}
14701484
code << "\n" << tab << "// Output field constants and basis data\n";
14711485
for (CeedInt i = 0; i < num_output_fields; i++) {
14721486
CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1473-
max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1487+
max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
14741488
}
14751489

14761490
// Loop over all elements
@@ -1823,12 +1837,12 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool
18231837
code << "\n" << tab << "// Input field constants and basis data\n";
18241838
for (CeedInt i = 0; i < num_input_fields; i++) {
18251839
CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1826-
max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1840+
max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
18271841
}
18281842
code << "\n" << tab << "// Output field constants and basis data\n";
18291843
for (CeedInt i = 0; i < num_output_fields; i++) {
18301844
CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1831-
max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1845+
max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
18321846
}
18331847

18341848
// Loop over all elements
@@ -2380,12 +2394,12 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat
23802394
code << "\n" << tab << "// Input field constants and basis data\n";
23812395
for (CeedInt i = 0; i < num_input_fields; i++) {
23822396
CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
2383-
max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
2397+
max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true));
23842398
}
23852399
code << "\n" << tab << "// Output field constants and basis data\n";
23862400
for (CeedInt i = 0; i < num_output_fields; i++) {
23872401
CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
2388-
max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
2402+
max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true));
23892403
}
23902404

23912405
// Loop over all elements

0 commit comments

Comments
 (0)