@@ -261,8 +261,15 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
 
         code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
       } else {
-        code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
-        code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+        bool is_collocated = false;
+
+        CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+        if (is_collocated && !is_at_points) {
+          code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
+        } else {
+          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+        }
       }
       break;
     case CEED_EVAL_GRAD:
@@ -293,8 +300,15 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
 
           code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
         } else {
-          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
-          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+          bool is_collocated = false;
+
+          CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
+          if (is_collocated && !is_at_points) {
+            code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
+          } else {
+            code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
+            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+          }
         }
       }
       if (is_at_points) break;  // No G mat for AtPoints
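For orientation, a minimal sketch of what the two branches above emit into the generated kernel source. The _in_0 suffix, the P_1d_in_0/Q_1d names, and B.inputs[0] are illustrative assumptions, not text taken from this diff:

    // Collocated basis (nodes presumably coincide with the quadrature points), not AtPoints:
    // no interpolation matrix is staged, the pointer is simply left NULL.
    CeedScalar *s_B_in_0 = NULL;

    // Any other basis: stage the interpolation matrix in shared memory as before.
    __shared__ CeedScalar s_B_in_0[P_1d_in_0*Q_1d];
    LoadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);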
@@ -492,10 +506,11 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
 static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i,
                                                  CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d,
                                                  bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
-  bool      is_tensor = true;
+  bool      is_tensor = true, is_collocated = true;
   CeedBasis basis;
   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
+  CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
 
   std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
@@ -534,9 +549,9 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
         code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
              << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
       } else {
-        std::string function_name = is_tensor
-                                        ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
-                                        : "InterpNonTensor";
+        std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                                                 std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+                                              : "InterpNonTensor";
         std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
         code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
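To make the new name selection concrete, here is a small standalone sketch that mirrors the interpolation branch above; the helper name InterpName and the sample calls are illustrative assumptions, not taken from the patched file:

    #include <string>

    // Mirrors the ternary above: a "CollocatedNodes" tag is spliced into the tensor
    // kernel name when the basis is collocated, otherwise the previous names are kept.
    static std::string InterpName(int dim, bool is_tensor, bool is_collocated, bool is_all_tensor) {
      return is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
                          std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                       : "InterpNonTensor";
    }

    // InterpName(3, true, true, true)   -> "InterpTensorCollocatedNodes3d"
    // InterpName(3, true, false, true)  -> "InterpTensor3d"
    // InterpName(2, true, false, false) -> "InterpTensor2dFlattened"
    // InterpName(3, false, false, true) -> "InterpNonTensor"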
@@ -552,15 +567,18 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
         code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
              << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
       } else if (use_3d_slices) {
-        std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
+        std::string function_name =
+            (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d";
 
         code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
         code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
              << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
       } else if (is_tensor) {
-        bool        is_collocated = dim == 3 && Q_1d >= P_1d;
-        std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
-                                    (is_all_tensor ? "" : "Flattened");
+        bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
+        std::string function_name =
+            (dim == 1 ? "Grad"
+                      : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : ""))) +
+                            std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"));
         std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
         code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
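The gradient branch composes names the same way, with the pre-existing collocated-gradient optimization kept as a separate tag; a hedged sketch mirroring the logic above (GradName and the sample calls are illustrative only):

    #include <string>

    // is_collocated: the basis nodes presumably coincide with the quadrature points (new path).
    // is_collocated_grad: the older 3D collocated-gradient case (Q_1d >= P_1d).
    static std::string GradName(int dim, bool is_collocated, bool is_collocated_grad, bool is_all_tensor) {
      return (dim == 1 ? "Grad"
                       : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : ""))) +
                             std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"));
    }

    // GradName(3, true, false, true)   -> "GradTensorCollocatedNodes3d"
    // GradName(3, false, true, true)   -> "GradTensorCollocated3d"
    // GradName(2, false, false, false) -> "GradTensor2dFlattened"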
@@ -611,7 +629,8 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
              << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
       } else {
         std::string function_name =
-            is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
+            is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                         std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                       : "InterpTransposeNonTensor";
         std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
@@ -627,14 +646,17 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
         code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
              << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
       } else if (use_3d_slices) {
-        std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
+        std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
+                                    std::to_string(dim) + "d";
 
         code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
              << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
       } else if (is_tensor) {
-        bool        is_collocated = dim == 3 && Q_1d >= P_1d;
-        std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
-                                    std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
+        bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
+        std::string function_name =
+            (dim == 1 ? "GradTranspose"
+                      : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
+            std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
         std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
         code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
@@ -870,7 +892,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
           tab.push();
-          code << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
+          code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
           tab.pop();
           code << tab << "}\n";
           break;
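For context, the missing tab meant the body of the emitted loop lost its indentation in the generated source; with the fix, the output reads roughly as below (the _in_0 suffix and the Q_1d name are assumed for illustration):

    CeedScalar r_s_in_0[num_comp_in_0];
    for (CeedInt j = 0; j < num_comp_in_0; j++) {
      r_s_in_0[j] = r_q_in_0[q + j*Q_1d];
    }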