@@ -154,8 +154,8 @@ static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fiel
154154// Setup fields
155155// ------------------------------------------------------------------------------
156156static int CeedOperatorBuildKernelFieldData_Hip_gen (std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
157- CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input , bool is_tensor , bool is_at_points ,
158- bool use_3d_slices) {
157+ CeedQFunctionField qf_field, CeedInt field_reuse[ 3 ], CeedInt Q_1d , bool is_input , bool is_tensor ,
158+ bool is_at_points, bool use_3d_slices) {
159159 std::string var_suffix = (is_input ? " _in_" : " _out_" ) + std::to_string (i);
160160 std::string P_name = (is_tensor ? " P_1d" : " P" ) + var_suffix, Q_name = is_tensor ? " Q_1d" : " Q" ;
161161 std::string option_name = (is_input ? " inputs" : " outputs" );
@@ -165,6 +165,12 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
165165 CeedBasis_Hip_shared *basis_data;
166166 CeedBasis basis;
167167
168+ // Field reuse info
169+ bool use_previous_field = field_reuse[0 ] != -1 ;
170+ bool reuse_input = field_reuse[1 ];
171+ CeedInt reuse_field = field_reuse[0 ];
172+ CeedEvalMode reuse_mode = (CeedEvalMode)field_reuse[2 ];
173+
168174 code << " // -- " << (is_input ? " Input" : " Output" ) << " field " << i << " \n " ;
169175
170176 // Get field data
@@ -215,8 +221,14 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
215221 if (is_input) data->B .inputs [i] = basis_data->d_interp_1d ;
216222 else data->B .outputs [i] = basis_data->d_interp_1d ;
217223 }
218- code << " __shared__ CeedScalar s_B" << var_suffix << " [" << P_name << " *" << Q_name << " ];\n " ;
219- code << " LoadMatrix<" << P_name << " , " << Q_name << " >(data, B." << option_name << " [" << i << " ], s_B" << var_suffix << " );\n " ;
224+ if (use_previous_field) {
225+ std::string reuse_var = " s_B" + ((reuse_input ? " _in_" : " _out_" ) + std::to_string (reuse_field));
226+
227+ code << " CeedScalar *s_B" << var_suffix << " = " << reuse_var << " ;\n " ;
228+ } else {
229+ code << " __shared__ CeedScalar s_B" << var_suffix << " [" << P_name << " *" << Q_name << " ];\n " ;
230+ code << " LoadMatrix<" << P_name << " , " << Q_name << " >(data, B." << option_name << " [" << i << " ], s_B" << var_suffix << " );\n " ;
231+ }
220232 break ;
221233 case CEED_EVAL_GRAD:
222234 if (is_at_points) {
@@ -241,27 +253,51 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
241253 else data->B .outputs [i] = basis_data->d_interp_1d ;
242254 }
243255 if (is_tensor) {
244- code << " __shared__ CeedScalar s_B" << var_suffix << " [" << P_name << " *" << Q_name << " ];\n " ;
245- code << " LoadMatrix<" << P_name << " , " << Q_name << " >(data, B." << option_name << " [" << i << " ], s_B" << var_suffix << " );\n " ;
256+ if (use_previous_field) {
257+ std::string reuse_var = " s_B" + ((reuse_input ? " _in_" : " _out_" ) + std::to_string (reuse_field));
258+
259+ code << " CeedScalar *s_B" << var_suffix << " = " << reuse_var << " ;\n " ;
260+ } else {
261+ code << " __shared__ CeedScalar s_B" << var_suffix << " [" << P_name << " *" << Q_name << " ];\n " ;
262+ code << " LoadMatrix<" << P_name << " , " << Q_name << " >(data, B." << option_name << " [" << i << " ], s_B" << var_suffix << " );\n " ;
263+ }
246264 }
247265 if (is_at_points) break ; // No G mat for AtPoints
248266 if (use_3d_slices) {
249267 if (is_input) data->G .inputs [i] = basis_data->d_collo_grad_1d ;
250268 else data->G .outputs [i] = basis_data->d_collo_grad_1d ;
251- code << " __shared__ CeedScalar s_G" << var_suffix << " [" << Q_name << " *" << Q_name << " ];\n " ;
252- code << " LoadMatrix<" << Q_name << " , " << Q_name << " >(data, G." << option_name << " [" << i << " ], s_G" << var_suffix << " );\n " ;
269+ if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
270+ std::string reuse_var = " s_G" + ((reuse_input ? " _in_" : " _out_" ) + std::to_string (reuse_field));
271+
272+ code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << " ;\n " ;
273+ } else {
274+ code << " __shared__ CeedScalar s_G" << var_suffix << " [" << Q_name << " *" << Q_name << " ];\n " ;
275+ code << " LoadMatrix<" << Q_name << " , " << Q_name << " >(data, G." << option_name << " [" << i << " ], s_G" << var_suffix << " );\n " ;
276+ }
253277 } else {
254278 bool has_collo_grad = basis_data->d_collo_grad_1d ;
255279
256280 if (is_input) data->G .inputs [i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d ;
257281 else data->G .outputs [i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d ;
258282 if (has_collo_grad) {
259- code << " __shared__ CeedScalar s_G" << var_suffix << " [" << Q_name << " *" << Q_name << " ];\n " ;
260- code << " LoadMatrix<" << Q_name << " , " << Q_name << " >(data, G." << option_name << " [" << i << " ], s_G" << var_suffix << " );\n " ;
283+ if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
284+ std::string reuse_var = " s_G" + ((reuse_input ? " _in_" : " _out_" ) + std::to_string (reuse_field));
285+
286+ code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << " ;\n " ;
287+ } else {
288+ code << " __shared__ CeedScalar s_G" << var_suffix << " [" << Q_name << " *" << Q_name << " ];\n " ;
289+ code << " LoadMatrix<" << Q_name << " , " << Q_name << " >(data, G." << option_name << " [" << i << " ], s_G" << var_suffix << " );\n " ;
290+ }
261291 } else {
262- code << " __shared__ CeedScalar s_G" << var_suffix << " [" << P_name << " *" << Q_name << (is_tensor ? " " : " *dim" ) << " ];\n " ;
263- code << " LoadMatrix<" << P_name << " , " << Q_name << (is_tensor ? " " : " *dim" ) << " >(data, G." << option_name << " [" << i << " ], s_G"
264- << var_suffix << " );\n " ;
292+ if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
293+ std::string reuse_var = " s_G" + ((reuse_input ? " _in_" : " _out_" ) + std::to_string (reuse_field));
294+
295+ code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << " ;\n " ;
296+ } else {
297+ code << " __shared__ CeedScalar s_G" << var_suffix << " [" << P_name << " *" << Q_name << (is_tensor ? " " : " *dim" ) << " ];\n " ;
298+ code << " LoadMatrix<" << P_name << " , " << Q_name << (is_tensor ? " " : " *dim" ) << " >(data, G." << option_name << " [" << i << " ], s_G"
299+ << var_suffix << " );\n " ;
300+ }
265301 }
266302 }
267303 break ;
@@ -1092,16 +1128,116 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
10921128 code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n " ;
10931129 code << " data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1 ) ? " " : " *T_1D" ) << " ;\n " ;
10941130
1131+ // -- Determine input mat reuse
1132+ CeedInt input_matrix_reuse[CEED_FIELD_MAX][3 ]; // field, is_input, eval_mode
1133+
1134+ for (CeedInt i = 0 ; i < num_input_fields; i++) {
1135+ input_matrix_reuse[i][0 ] = -1 ;
1136+ }
1137+ for (CeedInt i = 0 ; i < num_input_fields; i++) {
1138+ CeedEvalMode eval_mode_i;
1139+ CeedBasis basis_i;
1140+
1141+ CeedCallBackend (CeedQFunctionFieldGetEvalMode (qf_input_fields[i], &eval_mode_i));
1142+ if (eval_mode_i == CEED_EVAL_WEIGHT) continue ;
1143+ CeedCallBackend (CeedOperatorFieldGetBasis (op_input_fields[i], &basis_i));
1144+ for (CeedInt j = 0 ; (input_matrix_reuse[i][0 ] == -1 ) && (j < i); j++) {
1145+ CeedEvalMode eval_mode_j;
1146+ CeedBasis basis_j;
1147+
1148+ CeedCallBackend (CeedQFunctionFieldGetEvalMode (qf_input_fields[j], &eval_mode_j));
1149+ if (eval_mode_j == CEED_EVAL_WEIGHT) continue ;
1150+ CeedCallBackend (CeedOperatorFieldGetBasis (op_input_fields[j], &basis_j));
1151+ if (basis_i == basis_j) {
1152+ if (is_tensor) {
1153+ input_matrix_reuse[i][0 ] = j;
1154+ input_matrix_reuse[i][1 ] = true ;
1155+ input_matrix_reuse[i][2 ] = eval_mode_j;
1156+ } else {
1157+ // For non-tensor can only re-use with the same eval mode
1158+ if (eval_mode_i == eval_mode_j) {
1159+ input_matrix_reuse[i][0 ] = j;
1160+ input_matrix_reuse[i][1 ] = true ;
1161+ input_matrix_reuse[i][2 ] = eval_mode_j;
1162+ }
1163+ }
1164+ }
1165+ CeedCallBackend (CeedBasisDestroy (&basis_j));
1166+ }
1167+ CeedCallBackend (CeedBasisDestroy (&basis_i));
1168+ }
1169+
1170+ // -- Determine output mat reuse
1171+ CeedInt output_matrix_reuse[CEED_FIELD_MAX][3 ]; // field, is_input, eval_mode
1172+
1173+ for (CeedInt i = 0 ; i < num_output_fields; i++) {
1174+ output_matrix_reuse[i][0 ] = -1 ;
1175+ }
1176+ for (CeedInt i = 0 ; i < num_output_fields; i++) {
1177+ CeedEvalMode eval_mode_i;
1178+ CeedBasis basis_i;
1179+
1180+ CeedCallBackend (CeedQFunctionFieldGetEvalMode (qf_output_fields[i], &eval_mode_i));
1181+ CeedCallBackend (CeedOperatorFieldGetBasis (op_output_fields[i], &basis_i));
1182+ for (CeedInt j = 0 ; (output_matrix_reuse[i][0 ] == -1 ) && (j < num_input_fields); j++) {
1183+ CeedEvalMode eval_mode_j;
1184+ CeedBasis basis_j;
1185+
1186+ CeedCallBackend (CeedQFunctionFieldGetEvalMode (qf_input_fields[j], &eval_mode_j));
1187+ if (eval_mode_j == CEED_EVAL_WEIGHT) continue ;
1188+ CeedCallBackend (CeedOperatorFieldGetBasis (op_input_fields[j], &basis_j));
1189+ if (basis_i == basis_j) {
1190+ if (is_tensor) {
1191+ output_matrix_reuse[i][0 ] = j;
1192+ output_matrix_reuse[i][1 ] = true ;
1193+ output_matrix_reuse[i][2 ] = eval_mode_j;
1194+ } else {
1195+ // For non-tensor can only re-use with the same eval mode
1196+ if (eval_mode_i == eval_mode_j) {
1197+ output_matrix_reuse[i][0 ] = j;
1198+ output_matrix_reuse[i][1 ] = true ;
1199+ output_matrix_reuse[i][2 ] = eval_mode_j;
1200+ }
1201+ }
1202+ }
1203+ CeedCallBackend (CeedBasisDestroy (&basis_j));
1204+ }
1205+ for (CeedInt j = 0 ; (output_matrix_reuse[i][0 ] == -1 ) && (j < i); j++) {
1206+ CeedEvalMode eval_mode_j;
1207+ CeedBasis basis_j;
1208+
1209+ CeedCallBackend (CeedQFunctionFieldGetEvalMode (qf_output_fields[j], &eval_mode_j));
1210+ if (eval_mode_j == CEED_EVAL_WEIGHT) continue ;
1211+ CeedCallBackend (CeedOperatorFieldGetBasis (op_output_fields[j], &basis_j));
1212+ if (basis_i == basis_j) {
1213+ if (is_tensor) {
1214+ output_matrix_reuse[i][0 ] = j;
1215+ output_matrix_reuse[i][1 ] = false ;
1216+ output_matrix_reuse[i][2 ] = eval_mode_j;
1217+ } else {
1218+ // For non-tensor can only re-use with the same eval mode
1219+ if (eval_mode_i == eval_mode_j) {
1220+ output_matrix_reuse[i][0 ] = j;
1221+ output_matrix_reuse[i][1 ] = false ;
1222+ output_matrix_reuse[i][2 ] = eval_mode_j;
1223+ }
1224+ }
1225+ }
1226+ CeedCallBackend (CeedBasisDestroy (&basis_j));
1227+ }
1228+ CeedCallBackend (CeedBasisDestroy (&basis_i));
1229+ }
1230+
10951231 // Initialize constants, and matrices B and G
10961232 code << " \n // Input field constants and basis data\n " ;
10971233 for (CeedInt i = 0 ; i < num_input_fields; i++) {
1098- CeedCallBackend (CeedOperatorBuildKernelFieldData_Hip_gen (code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true , is_tensor ,
1099- is_at_points, use_3d_slices));
1234+ CeedCallBackend (CeedOperatorBuildKernelFieldData_Hip_gen (code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d, true ,
1235+ is_tensor, is_at_points, use_3d_slices));
11001236 }
11011237 code << " \n // Output field constants and basis data\n " ;
11021238 for (CeedInt i = 0 ; i < num_output_fields; i++) {
1103- CeedCallBackend (CeedOperatorBuildKernelFieldData_Hip_gen (code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false , is_tensor ,
1104- is_at_points, use_3d_slices));
1239+ CeedCallBackend (CeedOperatorBuildKernelFieldData_Hip_gen (code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d ,
1240+ false , is_tensor, is_at_points, use_3d_slices));
11051241 }
11061242
11071243 // Loop over all elements
0 commit comments