@@ -154,8 +154,8 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
154154// Setup fields
155155// ------------------------------------------------------------------------------
156156static int CeedOperatorBuildKernelFieldData_Hip_gen (std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
157- CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input , bool is_tensor , bool is_at_points ,
158- bool use_3d_slices) {
157+ CeedQFunctionField qf_field, CeedInt field_reuse[ 3 ], CeedInt Q_1d , bool is_input , bool is_tensor ,
158+ bool is_at_points, bool use_3d_slices) {
159159 std::string var_suffix = (is_input ? " _in_" : " _out_" ) + std::to_string (i);
160160 std::string P_name = (is_tensor ? " P_1d" : " P" ) + var_suffix, Q_name = is_tensor ? " Q_1d" : " Q" ;
161161 std::string option_name = (is_input ? " inputs" : " outputs" );
@@ -165,6 +165,12 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
165165 CeedBasis_Hip_shared *basis_data;
166166 CeedBasis basis;
167167
168+ // Field reuse info
169+ bool use_previous_field = field_reuse[0 ] != -1 ;
170+ bool reuse_input = field_reuse[1 ];
171+ CeedInt reuse_field = field_reuse[0 ];
172+ CeedEvalMode reuse_mode = (CeedEvalMode)field_reuse[2 ];
173+
168174 code << " // -- " << (is_input ? " Input" : " Output" ) << " field " << i << " \n " ;
169175
170176 // Get field data
@@ -215,8 +221,14 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
215221 if (is_input) data->B .inputs [i] = basis_data->d_interp_1d ;
216222 else data->B .outputs [i] = basis_data->d_interp_1d ;
217223 }
218- code << " __shared__ CeedScalar s_B" << var_suffix << " [" << P_name << " *" << Q_name << " ];\n " ;
219- code << " LoadMatrix<" << P_name << " , " << Q_name << " >(data, B." << option_name << " [" << i << " ], s_B" << var_suffix << " );\n " ;
224+ if (use_previous_field) {
225+ std::string reuse_var = " s_B" + ((reuse_input ? " _in_" : " _out_" ) + std::to_string (reuse_field));
226+
227+ code << " CeedScalar *s_B" << var_suffix << " = " << reuse_var << " ;\n " ;
228+ } else {
229+ code << " __shared__ CeedScalar s_B" << var_suffix << " [" << P_name << " *" << Q_name << " ];\n " ;
230+ code << " LoadMatrix<" << P_name << " , " << Q_name << " >(data, B." << option_name << " [" << i << " ], s_B" << var_suffix << " );\n " ;
231+ }
220232 break ;
221233 case CEED_EVAL_GRAD:
222234 if (is_at_points) {
@@ -241,27 +253,51 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
241253 else data->B .outputs [i] = basis_data->d_interp_1d ;
242254 }
243255 if (is_tensor) {
244- code << " __shared__ CeedScalar s_B" << var_suffix << " [" << P_name << " *" << Q_name << " ];\n " ;
245- code << " LoadMatrix<" << P_name << " , " << Q_name << " >(data, B." << option_name << " [" << i << " ], s_B" << var_suffix << " );\n " ;
256+ if (use_previous_field) {
257+ std::string reuse_var = " s_B" + ((reuse_input ? " _in_" : " _out_" ) + std::to_string (reuse_field));
258+
259+ code << " CeedScalar *s_B" << var_suffix << " = " << reuse_var << " ;\n " ;
260+ } else {
261+ code << " __shared__ CeedScalar s_B" << var_suffix << " [" << P_name << " *" << Q_name << " ];\n " ;
262+ code << " LoadMatrix<" << P_name << " , " << Q_name << " >(data, B." << option_name << " [" << i << " ], s_B" << var_suffix << " );\n " ;
263+ }
246264 }
247265 if (is_at_points) break ; // No G mat for AtPoints
248266 if (use_3d_slices) {
249267 if (is_input) data->G .inputs [i] = basis_data->d_collo_grad_1d ;
250268 else data->G .outputs [i] = basis_data->d_collo_grad_1d ;
251- code << " __shared__ CeedScalar s_G" << var_suffix << " [" << Q_name << " *" << Q_name << " ];\n " ;
252- code << " LoadMatrix<" << Q_name << " , " << Q_name << " >(data, G." << option_name << " [" << i << " ], s_G" << var_suffix << " );\n " ;
269+ if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
270+ std::string reuse_var = " s_G" + ((reuse_input ? " _in_" : " _out_" ) + std::to_string (reuse_field));
271+
272+ code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << " ;\n " ;
273+ } else {
274+ code << " __shared__ CeedScalar s_G" << var_suffix << " [" << Q_name << " *" << Q_name << " ];\n " ;
275+ code << " LoadMatrix<" << Q_name << " , " << Q_name << " >(data, G." << option_name << " [" << i << " ], s_G" << var_suffix << " );\n " ;
276+ }
253277 } else {
254278 bool has_collo_grad = basis_data->d_collo_grad_1d ;
255279
256280 if (is_input) data->G .inputs [i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d ;
257281 else data->G .outputs [i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d ;
258282 if (has_collo_grad) {
259- code << " __shared__ CeedScalar s_G" << var_suffix << " [" << Q_name << " *" << Q_name << " ];\n " ;
260- code << " LoadMatrix<" << Q_name << " , " << Q_name << " >(data, G." << option_name << " [" << i << " ], s_G" << var_suffix << " );\n " ;
283+ if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
284+ std::string reuse_var = " s_G" + ((reuse_input ? " _in_" : " _out_" ) + std::to_string (reuse_field));
285+
286+ code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << " ;\n " ;
287+ } else {
288+ code << " __shared__ CeedScalar s_G" << var_suffix << " [" << Q_name << " *" << Q_name << " ];\n " ;
289+ code << " LoadMatrix<" << Q_name << " , " << Q_name << " >(data, G." << option_name << " [" << i << " ], s_G" << var_suffix << " );\n " ;
290+ }
261291 } else {
262- code << " __shared__ CeedScalar s_G" << var_suffix << " [" << P_name << " *" << Q_name << (is_tensor ? " " : " *dim" ) << " ];\n " ;
263- code << " LoadMatrix<" << P_name << " , " << Q_name << (is_tensor ? " " : " *dim" ) << " >(data, G." << option_name << " [" << i << " ], s_G"
264- << var_suffix << " );\n " ;
292+ if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
293+ std::string reuse_var = " s_G" + ((reuse_input ? " _in_" : " _out_" ) + std::to_string (reuse_field));
294+
295+ code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << " ;\n " ;
296+ } else {
297+ code << " __shared__ CeedScalar s_G" << var_suffix << " [" << P_name << " *" << Q_name << (is_tensor ? " " : " *dim" ) << " ];\n " ;
298+ code << " LoadMatrix<" << P_name << " , " << Q_name << (is_tensor ? " " : " *dim" ) << " >(data, G." << option_name << " [" << i << " ], s_G"
299+ << var_suffix << " );\n " ;
300+ }
265301 }
266302 }
267303 break ;
@@ -1151,16 +1187,116 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu
11511187 code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n " ;
11521188 code << " data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1 ) ? " " : " *T_1D" ) << " ;\n " ;
11531189
1190+ // -- Determine input mat reuse
1191+ CeedInt input_matrix_reuse[CEED_FIELD_MAX][3 ]; // field, is_input, eval_mode
1192+
1193+ for (CeedInt i = 0 ; i < num_input_fields; i++) {
1194+ input_matrix_reuse[i][0 ] = -1 ;
1195+ }
1196+ for (CeedInt i = 0 ; i < num_input_fields; i++) {
1197+ CeedEvalMode eval_mode_i;
1198+ CeedBasis basis_i;
1199+
1200+ CeedCallBackend (CeedQFunctionFieldGetEvalMode (qf_input_fields[i], &eval_mode_i));
1201+ if (eval_mode_i == CEED_EVAL_WEIGHT) continue ;
1202+ CeedCallBackend (CeedOperatorFieldGetBasis (op_input_fields[i], &basis_i));
1203+ for (CeedInt j = 0 ; (input_matrix_reuse[i][0 ] == -1 ) && (j < i); j++) {
1204+ CeedEvalMode eval_mode_j;
1205+ CeedBasis basis_j;
1206+
1207+ CeedCallBackend (CeedQFunctionFieldGetEvalMode (qf_input_fields[j], &eval_mode_j));
1208+ if (eval_mode_j == CEED_EVAL_WEIGHT) continue ;
1209+ CeedCallBackend (CeedOperatorFieldGetBasis (op_input_fields[j], &basis_j));
1210+ if (basis_i == basis_j) {
1211+ if (is_tensor) {
1212+ input_matrix_reuse[i][0 ] = j;
1213+ input_matrix_reuse[i][1 ] = true ;
1214+ input_matrix_reuse[i][2 ] = eval_mode_j;
1215+ } else {
1216+ // For non-tensor can only re-use with the same eval mode
1217+ if (eval_mode_i == eval_mode_j) {
1218+ input_matrix_reuse[i][0 ] = j;
1219+ input_matrix_reuse[i][1 ] = true ;
1220+ input_matrix_reuse[i][2 ] = eval_mode_j;
1221+ }
1222+ }
1223+ }
1224+ CeedCallBackend (CeedBasisDestroy (&basis_j));
1225+ }
1226+ CeedCallBackend (CeedBasisDestroy (&basis_i));
1227+ }
1228+
1229+ // -- Determine output mat reuse
1230+ CeedInt output_matrix_reuse[CEED_FIELD_MAX][3 ]; // field, is_input, eval_mode
1231+
1232+ for (CeedInt i = 0 ; i < num_output_fields; i++) {
1233+ output_matrix_reuse[i][0 ] = -1 ;
1234+ }
1235+ for (CeedInt i = 0 ; i < num_output_fields; i++) {
1236+ CeedEvalMode eval_mode_i;
1237+ CeedBasis basis_i;
1238+
1239+ CeedCallBackend (CeedQFunctionFieldGetEvalMode (qf_output_fields[i], &eval_mode_i));
1240+ CeedCallBackend (CeedOperatorFieldGetBasis (op_output_fields[i], &basis_i));
1241+ for (CeedInt j = 0 ; (output_matrix_reuse[i][0 ] == -1 ) && (j < num_input_fields); j++) {
1242+ CeedEvalMode eval_mode_j;
1243+ CeedBasis basis_j;
1244+
1245+ CeedCallBackend (CeedQFunctionFieldGetEvalMode (qf_input_fields[j], &eval_mode_j));
1246+ if (eval_mode_j == CEED_EVAL_WEIGHT) continue ;
1247+ CeedCallBackend (CeedOperatorFieldGetBasis (op_input_fields[j], &basis_j));
1248+ if (basis_i == basis_j) {
1249+ if (is_tensor) {
1250+ output_matrix_reuse[i][0 ] = j;
1251+ output_matrix_reuse[i][1 ] = true ;
1252+ output_matrix_reuse[i][2 ] = eval_mode_j;
1253+ } else {
1254+ // For non-tensor can only re-use with the same eval mode
1255+ if (eval_mode_i == eval_mode_j) {
1256+ output_matrix_reuse[i][0 ] = j;
1257+ output_matrix_reuse[i][1 ] = true ;
1258+ output_matrix_reuse[i][2 ] = eval_mode_j;
1259+ }
1260+ }
1261+ }
1262+ CeedCallBackend (CeedBasisDestroy (&basis_j));
1263+ }
1264+ for (CeedInt j = 0 ; (output_matrix_reuse[i][0 ] == -1 ) && (j < i); j++) {
1265+ CeedEvalMode eval_mode_j;
1266+ CeedBasis basis_j;
1267+
1268+ CeedCallBackend (CeedQFunctionFieldGetEvalMode (qf_output_fields[j], &eval_mode_j));
1269+ if (eval_mode_j == CEED_EVAL_WEIGHT) continue ;
1270+ CeedCallBackend (CeedOperatorFieldGetBasis (op_output_fields[j], &basis_j));
1271+ if (basis_i == basis_j) {
1272+ if (is_tensor) {
1273+ output_matrix_reuse[i][0 ] = j;
1274+ output_matrix_reuse[i][1 ] = false ;
1275+ output_matrix_reuse[i][2 ] = eval_mode_j;
1276+ } else {
1277+ // For non-tensor can only re-use with the same eval mode
1278+ if (eval_mode_i == eval_mode_j) {
1279+ output_matrix_reuse[i][0 ] = j;
1280+ output_matrix_reuse[i][1 ] = false ;
1281+ output_matrix_reuse[i][2 ] = eval_mode_j;
1282+ }
1283+ }
1284+ }
1285+ CeedCallBackend (CeedBasisDestroy (&basis_j));
1286+ }
1287+ CeedCallBackend (CeedBasisDestroy (&basis_i));
1288+ }
1289+
11541290 // Initialize constants, and matrices B and G
11551291 code << " \n // Input field constants and basis data\n " ;
11561292 for (CeedInt i = 0 ; i < num_input_fields; i++) {
1157- CeedCallBackend (CeedOperatorBuildKernelFieldData_Hip_gen (code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true , is_tensor ,
1158- is_at_points, use_3d_slices));
1293+ CeedCallBackend (CeedOperatorBuildKernelFieldData_Hip_gen (code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d, true ,
1294+ is_tensor, is_at_points, use_3d_slices));
11591295 }
11601296 code << " \n // Output field constants and basis data\n " ;
11611297 for (CeedInt i = 0 ; i < num_output_fields; i++) {
1162- CeedCallBackend (CeedOperatorBuildKernelFieldData_Hip_gen (code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false , is_tensor ,
1163- is_at_points, use_3d_slices));
1298+ CeedCallBackend (CeedOperatorBuildKernelFieldData_Hip_gen (code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d ,
1299+ false , is_tensor, is_at_points, use_3d_slices));
11641300 }
11651301
11661302 // Loop over all elements
0 commit comments