Skip to content

Commit a4c0ed8

Browse files
committed
hip - remove duplicate mats in gen
1 parent f31af4e commit a4c0ed8

File tree

1 file changed

+153
-17
lines changed

1 file changed

+153
-17
lines changed

backends/hip-gen/ceed-hip-gen-operator-build.cpp

Lines changed: 153 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,8 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
154154
// Setup fields
155155
//------------------------------------------------------------------------------
156156
static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
157-
CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points,
158-
bool use_3d_slices) {
157+
CeedQFunctionField qf_field, CeedInt field_reuse[3], CeedInt Q_1d, bool is_input, bool is_tensor,
158+
bool is_at_points, bool use_3d_slices) {
159159
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
160160
std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
161161
std::string option_name = (is_input ? "inputs" : "outputs");
@@ -165,6 +165,12 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
165165
CeedBasis_Hip_shared *basis_data;
166166
CeedBasis basis;
167167

168+
// Field reuse info
169+
bool use_previous_field = field_reuse[0] != -1;
170+
bool reuse_input = field_reuse[1];
171+
CeedInt reuse_field = field_reuse[0];
172+
CeedEvalMode reuse_mode = (CeedEvalMode)field_reuse[2];
173+
168174
code << " // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";
169175

170176
// Get field data
@@ -215,8 +221,14 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
215221
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
216222
else data->B.outputs[i] = basis_data->d_interp_1d;
217223
}
218-
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
219-
code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
224+
if (use_previous_field) {
225+
std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
226+
227+
code << " CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
228+
} else {
229+
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
230+
code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
231+
}
220232
break;
221233
case CEED_EVAL_GRAD:
222234
if (is_at_points) {
@@ -241,27 +253,51 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
241253
else data->B.outputs[i] = basis_data->d_interp_1d;
242254
}
243255
if (is_tensor) {
244-
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
245-
code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
256+
if (use_previous_field) {
257+
std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
258+
259+
code << " CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
260+
} else {
261+
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
262+
code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
263+
}
246264
}
247265
if (is_at_points) break; // No G mat for AtPoints
248266
if (use_3d_slices) {
249267
if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
250268
else data->G.outputs[i] = basis_data->d_collo_grad_1d;
251-
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
252-
code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
269+
if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
270+
std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
271+
272+
code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
273+
} else {
274+
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
275+
code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
276+
}
253277
} else {
254278
bool has_collo_grad = basis_data->d_collo_grad_1d;
255279

256280
if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
257281
else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
258282
if (has_collo_grad) {
259-
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
260-
code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
283+
if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
284+
std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
285+
286+
code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
287+
} else {
288+
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
289+
code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
290+
}
261291
} else {
262-
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
263-
code << " LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
264-
<< var_suffix << ");\n";
292+
if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
293+
std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
294+
295+
code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
296+
} else {
297+
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
298+
code << " LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
299+
<< var_suffix << ");\n";
300+
}
265301
}
266302
}
267303
break;
@@ -1092,16 +1128,116 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
10921128
code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
10931129
code << " data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";
10941130

1131+
// -- Determine input mat reuse
1132+
CeedInt input_matrix_reuse[CEED_FIELD_MAX][3]; // field, is_input, eval_mode
1133+
1134+
for (CeedInt i = 0; i < num_input_fields; i++) {
1135+
input_matrix_reuse[i][0] = -1;
1136+
}
1137+
for (CeedInt i = 0; i < num_input_fields; i++) {
1138+
CeedEvalMode eval_mode_i;
1139+
CeedBasis basis_i;
1140+
1141+
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1142+
if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1143+
CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1144+
for (CeedInt j = 0; (input_matrix_reuse[i][0] == -1) && (j < i); j++) {
1145+
CeedEvalMode eval_mode_j;
1146+
CeedBasis basis_j;
1147+
1148+
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1149+
if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1150+
CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1151+
if (basis_i == basis_j) {
1152+
if (is_tensor) {
1153+
input_matrix_reuse[i][0] = j;
1154+
input_matrix_reuse[i][1] = true;
1155+
input_matrix_reuse[i][2] = eval_mode_j;
1156+
} else {
1157+
// For non-tensor can only re-use with the same eval mode
1158+
if (eval_mode_i == eval_mode_j) {
1159+
input_matrix_reuse[i][0] = j;
1160+
input_matrix_reuse[i][1] = true;
1161+
input_matrix_reuse[i][2] = eval_mode_j;
1162+
}
1163+
}
1164+
}
1165+
CeedCallBackend(CeedBasisDestroy(&basis_j));
1166+
}
1167+
CeedCallBackend(CeedBasisDestroy(&basis_i));
1168+
}
1169+
1170+
// -- Determine output mat reuse
1171+
CeedInt output_matrix_reuse[CEED_FIELD_MAX][3]; // field, is_input, eval_mode
1172+
1173+
for (CeedInt i = 0; i < num_output_fields; i++) {
1174+
output_matrix_reuse[i][0] = -1;
1175+
}
1176+
for (CeedInt i = 0; i < num_output_fields; i++) {
1177+
CeedEvalMode eval_mode_i;
1178+
CeedBasis basis_i;
1179+
1180+
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1181+
CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1182+
for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < num_input_fields); j++) {
1183+
CeedEvalMode eval_mode_j;
1184+
CeedBasis basis_j;
1185+
1186+
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1187+
if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1188+
CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1189+
if (basis_i == basis_j) {
1190+
if (is_tensor) {
1191+
output_matrix_reuse[i][0] = j;
1192+
output_matrix_reuse[i][1] = true;
1193+
output_matrix_reuse[i][2] = eval_mode_j;
1194+
} else {
1195+
// For non-tensor can only re-use with the same eval mode
1196+
if (eval_mode_i == eval_mode_j) {
1197+
output_matrix_reuse[i][0] = j;
1198+
output_matrix_reuse[i][1] = true;
1199+
output_matrix_reuse[i][2] = eval_mode_j;
1200+
}
1201+
}
1202+
}
1203+
CeedCallBackend(CeedBasisDestroy(&basis_j));
1204+
}
1205+
for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < i); j++) {
1206+
CeedEvalMode eval_mode_j;
1207+
CeedBasis basis_j;
1208+
1209+
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1210+
if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1211+
CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1212+
if (basis_i == basis_j) {
1213+
if (is_tensor) {
1214+
output_matrix_reuse[i][0] = j;
1215+
output_matrix_reuse[i][1] = false;
1216+
output_matrix_reuse[i][2] = eval_mode_j;
1217+
} else {
1218+
// For non-tensor can only re-use with the same eval mode
1219+
if (eval_mode_i == eval_mode_j) {
1220+
output_matrix_reuse[i][0] = j;
1221+
output_matrix_reuse[i][1] = false;
1222+
output_matrix_reuse[i][2] = eval_mode_j;
1223+
}
1224+
}
1225+
}
1226+
CeedCallBackend(CeedBasisDestroy(&basis_j));
1227+
}
1228+
CeedCallBackend(CeedBasisDestroy(&basis_i));
1229+
}
1230+
10951231
// Initialize constants, and matrices B and G
10961232
code << "\n // Input field constants and basis data\n";
10971233
for (CeedInt i = 0; i < num_input_fields; i++) {
1098-
CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_tensor,
1099-
is_at_points, use_3d_slices));
1234+
CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d, true,
1235+
is_tensor, is_at_points, use_3d_slices));
11001236
}
11011237
code << "\n // Output field constants and basis data\n";
11021238
for (CeedInt i = 0; i < num_output_fields; i++) {
1103-
CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
1104-
is_at_points, use_3d_slices));
1239+
CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d,
1240+
false, is_tensor, is_at_points, use_3d_slices));
11051241
}
11061242

11071243
// Loop over all elements

0 commit comments

Comments
 (0)