Skip to content

Commit 9ee499e

Browse files
committed
hip - remove duplicate mats in gen
1 parent 0a2a649 commit 9ee499e

File tree

1 file changed

+153
-17
lines changed

1 file changed

+153
-17
lines changed

backends/hip-gen/ceed-hip-gen-operator-build.cpp

Lines changed: 153 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,8 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
154154
// Setup fields
155155
//------------------------------------------------------------------------------
156156
static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
157-
CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points,
158-
bool use_3d_slices) {
157+
CeedQFunctionField qf_field, CeedInt field_reuse[3], CeedInt Q_1d, bool is_input, bool is_tensor,
158+
bool is_at_points, bool use_3d_slices) {
159159
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
160160
std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
161161
std::string option_name = (is_input ? "inputs" : "outputs");
@@ -165,6 +165,12 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
165165
CeedBasis_Hip_shared *basis_data;
166166
CeedBasis basis;
167167

168+
// Field reuse info
169+
bool use_previous_field = field_reuse[0] != -1;
170+
bool reuse_input = field_reuse[1];
171+
CeedInt reuse_field = field_reuse[0];
172+
CeedEvalMode reuse_mode = (CeedEvalMode)field_reuse[2];
173+
168174
code << " // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";
169175

170176
// Get field data
@@ -215,8 +221,14 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
215221
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
216222
else data->B.outputs[i] = basis_data->d_interp_1d;
217223
}
218-
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
219-
code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
224+
if (use_previous_field) {
225+
std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
226+
227+
code << " CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
228+
} else {
229+
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
230+
code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
231+
}
220232
break;
221233
case CEED_EVAL_GRAD:
222234
if (is_at_points) {
@@ -241,27 +253,51 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
241253
else data->B.outputs[i] = basis_data->d_interp_1d;
242254
}
243255
if (is_tensor) {
244-
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
245-
code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
256+
if (use_previous_field) {
257+
std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
258+
259+
code << " CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
260+
} else {
261+
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
262+
code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
263+
}
246264
}
247265
if (is_at_points) break; // No G mat for AtPoints
248266
if (use_3d_slices) {
249267
if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
250268
else data->G.outputs[i] = basis_data->d_collo_grad_1d;
251-
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
252-
code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
269+
if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
270+
std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
271+
272+
code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
273+
} else {
274+
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
275+
code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
276+
}
253277
} else {
254278
bool has_collo_grad = basis_data->d_collo_grad_1d;
255279

256280
if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
257281
else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
258282
if (has_collo_grad) {
259-
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
260-
code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
283+
if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
284+
std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
285+
286+
code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
287+
} else {
288+
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
289+
code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
290+
}
261291
} else {
262-
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
263-
code << " LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
264-
<< var_suffix << ");\n";
292+
if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
293+
std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
294+
295+
code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
296+
} else {
297+
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
298+
code << " LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
299+
<< var_suffix << ");\n";
300+
}
265301
}
266302
}
267303
break;
@@ -1151,16 +1187,116 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
11511187
code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
11521188
code << " data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";
11531189

1190+
// -- Determine input mat reuse
1191+
CeedInt input_matrix_reuse[CEED_FIELD_MAX][3]; // field, is_input, eval_mode
1192+
1193+
for (CeedInt i = 0; i < num_input_fields; i++) {
1194+
input_matrix_reuse[i][0] = -1;
1195+
}
1196+
for (CeedInt i = 0; i < num_input_fields; i++) {
1197+
CeedEvalMode eval_mode_i;
1198+
CeedBasis basis_i;
1199+
1200+
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1201+
if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1202+
CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1203+
for (CeedInt j = 0; (input_matrix_reuse[i][0] == -1) && (j < i); j++) {
1204+
CeedEvalMode eval_mode_j;
1205+
CeedBasis basis_j;
1206+
1207+
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1208+
if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1209+
CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1210+
if (basis_i == basis_j) {
1211+
if (is_tensor) {
1212+
input_matrix_reuse[i][0] = j;
1213+
input_matrix_reuse[i][1] = true;
1214+
input_matrix_reuse[i][2] = eval_mode_j;
1215+
} else {
1216+
// For non-tensor can only re-use with the same eval mode
1217+
if (eval_mode_i == eval_mode_j) {
1218+
input_matrix_reuse[i][0] = j;
1219+
input_matrix_reuse[i][1] = true;
1220+
input_matrix_reuse[i][2] = eval_mode_j;
1221+
}
1222+
}
1223+
}
1224+
CeedCallBackend(CeedBasisDestroy(&basis_j));
1225+
}
1226+
CeedCallBackend(CeedBasisDestroy(&basis_i));
1227+
}
1228+
1229+
// -- Determine output mat reuse
1230+
CeedInt output_matrix_reuse[CEED_FIELD_MAX][3]; // field, is_input, eval_mode
1231+
1232+
for (CeedInt i = 0; i < num_output_fields; i++) {
1233+
output_matrix_reuse[i][0] = -1;
1234+
}
1235+
for (CeedInt i = 0; i < num_output_fields; i++) {
1236+
CeedEvalMode eval_mode_i;
1237+
CeedBasis basis_i;
1238+
1239+
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1240+
CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1241+
for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < num_input_fields); j++) {
1242+
CeedEvalMode eval_mode_j;
1243+
CeedBasis basis_j;
1244+
1245+
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1246+
if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1247+
CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1248+
if (basis_i == basis_j) {
1249+
if (is_tensor) {
1250+
output_matrix_reuse[i][0] = j;
1251+
output_matrix_reuse[i][1] = true;
1252+
output_matrix_reuse[i][2] = eval_mode_j;
1253+
} else {
1254+
// For non-tensor can only re-use with the same eval mode
1255+
if (eval_mode_i == eval_mode_j) {
1256+
output_matrix_reuse[i][0] = j;
1257+
output_matrix_reuse[i][1] = true;
1258+
output_matrix_reuse[i][2] = eval_mode_j;
1259+
}
1260+
}
1261+
}
1262+
CeedCallBackend(CeedBasisDestroy(&basis_j));
1263+
}
1264+
for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < i); j++) {
1265+
CeedEvalMode eval_mode_j;
1266+
CeedBasis basis_j;
1267+
1268+
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1269+
if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1270+
CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1271+
if (basis_i == basis_j) {
1272+
if (is_tensor) {
1273+
output_matrix_reuse[i][0] = j;
1274+
output_matrix_reuse[i][1] = false;
1275+
output_matrix_reuse[i][2] = eval_mode_j;
1276+
} else {
1277+
// For non-tensor can only re-use with the same eval mode
1278+
if (eval_mode_i == eval_mode_j) {
1279+
output_matrix_reuse[i][0] = j;
1280+
output_matrix_reuse[i][1] = false;
1281+
output_matrix_reuse[i][2] = eval_mode_j;
1282+
}
1283+
}
1284+
}
1285+
CeedCallBackend(CeedBasisDestroy(&basis_j));
1286+
}
1287+
CeedCallBackend(CeedBasisDestroy(&basis_i));
1288+
}
1289+
11541290
// Initialize constants, and matrices B and G
11551291
code << "\n // Input field constants and basis data\n";
11561292
for (CeedInt i = 0; i < num_input_fields; i++) {
1157-
CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_tensor,
1158-
is_at_points, use_3d_slices));
1293+
CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d, true,
1294+
is_tensor, is_at_points, use_3d_slices));
11591295
}
11601296
code << "\n // Output field constants and basis data\n";
11611297
for (CeedInt i = 0; i < num_output_fields; i++) {
1162-
CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
1163-
is_at_points, use_3d_slices));
1298+
CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d,
1299+
false, is_tensor, is_at_points, use_3d_slices));
11641300
}
11651301

11661302
// Loop over all elements

0 commit comments

Comments
 (0)