@@ -180,11 +180,19 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
180180static int CeedOperatorBuildKernelFieldData_Cuda_gen (std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i,
181181 CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse,
182182 CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
183- bool use_3d_slices) {
184- bool is_tensor = true ;
183+ bool use_3d_slices, bool skip_active_load ) {
184+ bool is_tensor = true , is_active = true ;
185185 CeedBasis basis;
186+
186187 CeedCallBackend (CeedOperatorFieldGetBasis (op_field, &basis));
187188 if (basis != CEED_BASIS_NONE) CeedCallBackend (CeedBasisIsTensor (basis, &is_tensor));
189+ {
190+ CeedVector vec;
191+
192+ CeedCallBackend (CeedOperatorFieldGetVector (op_field, &vec));
193+ is_active = vec == CEED_VECTOR_ACTIVE;
194+ CeedCallBackend (CeedVectorDestroy (&vec));
195+ }
188196
189197 const char *field_name;
190198 std::string var_suffix = (is_input ? " _in_" : " _out_" ) + std::to_string (i);
@@ -256,15 +264,15 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
256264 if (is_input) data->B .inputs [i] = basis_data->d_interp_1d ;
257265 else data->B .outputs [i] = basis_data->d_interp_1d ;
258266 }
259- if (use_previous_field) {
267+ if (use_previous_field && !skip_active_load ) {
260268 std::string reuse_var = " s_B" + ((field_reuse.is_input ? " _in_" : " _out_" ) + std::to_string (field_reuse.index ));
261269
262270 code << tab << " CeedScalar *s_B" << var_suffix << " = " << reuse_var << " ;\n " ;
263271 } else {
264272 bool is_collocated = false ;
265273
266274 CeedCallBackend (CeedBasisIsCollocated (basis, &is_collocated));
267- if (is_collocated && !is_at_points) {
275+ if ((is_active && skip_active_load) || ( is_collocated && !is_at_points) ) {
268276 code << tab << " CeedScalar *s_B" << var_suffix << " = NULL;\n " ;
269277 } else {
270278 code << tab << " __shared__ CeedScalar s_B" << var_suffix << " [" << P_name << " *" << Q_name << " ];\n " ;
@@ -295,15 +303,15 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
295303 else data->B .outputs [i] = basis_data->d_interp_1d ;
296304 }
297305 if (is_tensor) {
298- if (use_previous_field) {
306+ if (use_previous_field && !skip_active_load ) {
299307 std::string reuse_var = " s_B" + ((field_reuse.is_input ? " _in_" : " _out_" ) + std::to_string (field_reuse.index ));
300308
301309 code << tab << " CeedScalar *s_B" << var_suffix << " = " << reuse_var << " ;\n " ;
302310 } else {
303311 bool is_collocated = false ;
304312
305313 CeedCallBackend (CeedBasisIsCollocated (basis, &is_collocated));
306- if (is_collocated && !is_at_points) {
314+ if ((is_active && skip_active_load) || ( is_collocated && !is_at_points) ) {
307315 code << tab << " CeedScalar *s_B" << var_suffix << " = NULL;\n " ;
308316 } else {
309317 code << tab << " __shared__ CeedScalar s_B" << var_suffix << " [" << P_name << " *" << Q_name << " ];\n " ;
@@ -315,10 +323,12 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
315323 if (use_3d_slices) {
316324 if (is_input) data->G .inputs [i] = basis_data->d_collo_grad_1d ;
317325 else data->G .outputs [i] = basis_data->d_collo_grad_1d ;
318- if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
326+ if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load ) {
319327 std::string reuse_var = " s_G" + ((field_reuse.is_input ? " _in_" : " _out_" ) + std::to_string (field_reuse.index ));
320328
321329 code << tab << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << " ;\n " ;
330+ } else if (is_active && skip_active_load) {
331+ code << tab << " CeedScalar *s_G" << var_suffix << " = NULL;\n " ;
322332 } else {
323333 code << tab << " __shared__ CeedScalar s_G" << var_suffix << " [" << Q_name << " *" << Q_name << " ];\n " ;
324334 code << tab << " LoadMatrix<" << Q_name << " , " << Q_name << " >(data, G." << option_name << " [" << i << " ], s_G" << var_suffix << " );\n " ;
@@ -329,19 +339,23 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
329339 if (is_input) data->G .inputs [i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d ;
330340 else data->G .outputs [i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d ;
331341 if (has_collo_grad) {
332- if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
342+ if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load ) {
333343 std::string reuse_var = " s_G" + ((field_reuse.is_input ? " _in_" : " _out_" ) + std::to_string (field_reuse.index ));
334344
335345 code << tab << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << " ;\n " ;
346+ } else if (is_active && skip_active_load) {
347+ code << tab << " CeedScalar *s_G" << var_suffix << " = NULL;\n " ;
336348 } else {
337349 code << tab << " __shared__ CeedScalar s_G" << var_suffix << " [" << Q_name << " *" << Q_name << " ];\n " ;
338350 code << tab << " LoadMatrix<" << Q_name << " , " << Q_name << " >(data, G." << option_name << " [" << i << " ], s_G" << var_suffix << " );\n " ;
339351 }
340352 } else {
341- if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
353+ if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load ) {
342354 std::string reuse_var = " s_G" + ((field_reuse.is_input ? " _in_" : " _out_" ) + std::to_string (field_reuse.index ));
343355
344356 code << tab << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << " ;\n " ;
357+ } else if (is_active && skip_active_load) {
358+ code << tab << " CeedScalar *s_G" << var_suffix << " = NULL;\n " ;
345359 } else {
346360 code << tab << " __shared__ CeedScalar s_G" << var_suffix << " [" << P_name << " *" << Q_name << (is_tensor ? " " : " *dim" )
347361 << (is_tensor ? " " : var_suffix) << " ];\n " ;
@@ -1453,12 +1467,12 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
14531467 code << " \n " << tab << " // Input field constants and basis data\n " ;
14541468 for (CeedInt i = 0 ; i < num_input_fields; i++) {
14551469 CeedCallBackend (CeedOperatorBuildKernelFieldData_Cuda_gen (code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1456- max_dim, Q, Q_1d, true , is_all_tensor, is_at_points, use_3d_slices));
1470+ max_dim, Q, Q_1d, true , is_all_tensor, is_at_points, use_3d_slices, false ));
14571471 }
14581472 code << " \n " << tab << " // Output field constants and basis data\n " ;
14591473 for (CeedInt i = 0 ; i < num_output_fields; i++) {
14601474 CeedCallBackend (CeedOperatorBuildKernelFieldData_Cuda_gen (code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1461- max_dim, Q, Q_1d, false , is_all_tensor, is_at_points, use_3d_slices));
1475+ max_dim, Q, Q_1d, false , is_all_tensor, is_at_points, use_3d_slices, false ));
14621476 }
14631477
14641478 // Loop over all elements
@@ -1819,12 +1833,12 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo
18191833 code << " \n " << tab << " // Input field constants and basis data\n " ;
18201834 for (CeedInt i = 0 ; i < num_input_fields; i++) {
18211835 CeedCallBackend (CeedOperatorBuildKernelFieldData_Cuda_gen (code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1822- max_dim, Q, Q_1d, true , is_all_tensor, is_at_points, use_3d_slices));
1836+ max_dim, Q, Q_1d, true , is_all_tensor, is_at_points, use_3d_slices, false ));
18231837 }
18241838 code << " \n " << tab << " // Output field constants and basis data\n " ;
18251839 for (CeedInt i = 0 ; i < num_output_fields; i++) {
18261840 CeedCallBackend (CeedOperatorBuildKernelFieldData_Cuda_gen (code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1827- max_dim, Q, Q_1d, false , is_all_tensor, is_at_points, use_3d_slices));
1841+ max_dim, Q, Q_1d, false , is_all_tensor, is_at_points, use_3d_slices, false ));
18281842 }
18291843
18301844 // Loop over all elements
@@ -2385,12 +2399,12 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera
23852399 code << " \n " << tab << " // Input field constants and basis data\n " ;
23862400 for (CeedInt i = 0 ; i < num_input_fields; i++) {
23872401 CeedCallBackend (CeedOperatorBuildKernelFieldData_Cuda_gen (code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
2388- max_dim, Q, Q_1d, true , is_all_tensor, is_at_points, use_3d_slices));
2402+ max_dim, Q, Q_1d, true , is_all_tensor, is_at_points, use_3d_slices, true ));
23892403 }
23902404 code << " \n " << tab << " // Output field constants and basis data\n " ;
23912405 for (CeedInt i = 0 ; i < num_output_fields; i++) {
23922406 CeedCallBackend (CeedOperatorBuildKernelFieldData_Cuda_gen (code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
2393- max_dim, Q, Q_1d, false , is_all_tensor, is_at_points, use_3d_slices));
2407+ max_dim, Q, Q_1d, false , is_all_tensor, is_at_points, use_3d_slices, true ));
23942408 }
23952409
23962410 // Loop over all elements
0 commit comments