Commit 8e1cf66

cuda - fix 2D flattening

1 parent 336ccc0

4 files changed: +69 additions, -44 deletions

backends/cuda-gen/ceed-cuda-gen-operator-build.cpp

Lines changed: 23 additions & 17 deletions
@@ -177,7 +177,7 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 // Setup fields
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
-      CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt Q_1d, bool is_input,
+      CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt Q, CeedInt Q_1d, bool is_input,
       bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
   bool is_tensor = true;
   CeedBasis basis;
@@ -220,7 +220,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
   CeedInt P = 0;
 
   CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
-  code << " const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P) << ";\n";
+  code << " const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
   }
   code << " const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
   if (eval_mode != CEED_EVAL_WEIGHT) {
@@ -528,10 +528,11 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
   std::string function_name = is_tensor
      ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
      : "InterpNonTensor";
+  std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
   code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
-  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
-      << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+      << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
   }
   break;
   case CEED_EVAL_GRAD:
@@ -551,17 +552,18 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
   bool is_collocated = dim == 3 && Q_1d >= P_1d;
   std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
      (is_all_tensor ? "" : "Flattened");
+  std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
   code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" << (dim >= 3 ? Q_name : "1")
      << "];\n";
-  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
-      << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
+  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+      << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
   } else {
   std::string function_name = "GradNonTensor";
 
   code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
-  code << " " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name << ", "
-      << (P_1d > Q_1d ? P_name : Q_name) << ">(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
+  code << " " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+      << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
   }
   break;
   case CEED_EVAL_WEIGHT: {
@@ -602,9 +604,10 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
   std::string function_name =
      is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                : "InterpTransposeNonTensor";
+  std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
-      << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+      << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
   }
   break;
   case CEED_EVAL_GRAD:
@@ -623,14 +626,15 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
   bool is_collocated = dim == 3 && Q_1d >= P_1d;
   std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
      std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
+  std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
-      << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+      << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
   } else {
   std::string function_name = "GradTransposeNonTensor";
 
-  code << " " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name << ", "
-      << (P_1d > Q_1d ? P_name : Q_name) << ">(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+  code << " " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+      << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
   }
   break;
   // LCOV_EXCL_START
@@ -1160,6 +1164,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   if (is_at_points) Q_1d = max_num_points;
   else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
   }
+  if (Q == 0) Q = Q_1d;
+  data->Q = Q;
   data->Q_1d = Q_1d;
 
   // Check for restriction only identity operator
@@ -1389,13 +1395,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   // Initialize constants, and matrices B and G
   code << "\n // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-  CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d,
+  CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q, Q_1d,
      true, is_all_tensor, is_at_points, use_3d_slices));
   }
   code << "\n // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-  CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d,
-     false, is_all_tensor, is_at_points, use_3d_slices));
+  CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q,
+     Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
   }
 
   // Loop over all elements
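
Note: the build-side changes thread both the total quadrature point count Q and the 1D count Q_1d into the field-data generator (so a CEED_BASIS_NONE field is sized by total points), and route the fourth template argument of every basis call through a single op_t_1d_name string. A standalone restatement of that selection rule as a hypothetical helper, not part of this commit; the parameter names below are placeholders, not generated libCEED identifiers:

#include <iostream>
#include <string>

// Mirrors the op_t_1d_name choice above: non-tensor basis kernels and operators whose
// bases are all tensor-product keep the fixed OP_T_1D bound, while the "Flattened"
// tensor kernels of a mixed operator use the field's larger of P_1d/Q_1d.
static std::string SelectOpT1d(bool is_all_tensor, bool is_tensor, int P_1d, int Q_1d,
                               const std::string &P_name, const std::string &Q_name) {
  if (is_all_tensor || !is_tensor) return "OP_T_1D";
  return (P_1d > Q_1d) ? P_name : Q_name;
}

int main() {
  // Example: a 2D tensor field with P_1d = 3, Q_1d = 4 inside an operator that also
  // has a non-tensor field; the flattened kernel is instantiated with the Q-side name.
  std::cout << SelectOpT1d(false, true, 3, 4, "P_1d_f", "Q_1d_f") << "\n";
  return 0;
}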

backends/cuda-gen/ceed-cuda-gen-operator.c

Lines changed: 5 additions & 7 deletions
@@ -197,16 +197,14 @@ static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, c
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
 
   // Apply operator
-  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
-  const CeedInt dim = data->dim;
-  const CeedInt Q_1d = data->Q_1d;
-  const CeedInt P_1d = data->max_P_1d;
-  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
-  int max_threads_per_block, min_grid_size, grid;
+  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
+  int max_threads_per_block, min_grid_size, grid;
 
   CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
+  const CeedInt thread_1d = CeedIntMax(is_tensor ? data->Q_1d : data->Q, data->max_P_1d);
+
   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
-  int block[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};
+  int block[3] = {thread_1d, ((!is_tensor || data->dim == 1) ? 1 : thread_1d), -1};
 
   if (is_tensor) {
   CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
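
Note: on the launch side the thread count now distinguishes the two cases. When is_tensor (from CeedOperatorHasTensorBases) is true, the per-dimension count stays max(Q_1d, max_P_1d); otherwise the flattened code paths run on a block that is one-dimensional in x, so it must cover the full point count Q. A minimal sketch of that arithmetic with assumed values, not taken from the commit:

#include <algorithm>
#include <cstdio>

int main() {
  // Assumed example: a 2D operator mixing a tensor basis (P_1d = 3, Q_1d = 4)
  // with a non-tensor basis on the same Q = 16 quadrature points.
  const int  dim = 2, max_P_1d = 3, Q_1d = 4, Q = 16;
  const bool is_tensor = false;  // not all bases are tensor-product

  const int thread_1d = std::max(is_tensor ? Q_1d : Q, max_P_1d);
  const int block_x   = thread_1d;
  const int block_y   = (!is_tensor || dim == 1) ? 1 : thread_1d;

  // Flattened case: a 16 x 1 block reaches every quadrature point from a single
  // 1D thread index, instead of the 4 x 4 block the all-tensor path would use.
  std::printf("block = %d x %d\n", block_x, block_y);
  return 0;
}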

backends/cuda-gen/ceed-cuda-gen.h

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@
 typedef struct {
   bool use_fallback;
   CeedInt dim;
-  CeedInt Q_1d;
+  CeedInt Q, Q_1d;
   CeedInt max_P_1d;
   CUmodule module;
   CUfunction op;

include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h

Lines changed: 40 additions & 19 deletions
@@ -184,6 +184,29 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const int
   __syncthreads();
 }
 
+//------------------------------------------------------------------------------
+// 2D pack/unpack quadrature values
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QPack2D(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[(data.t_id_x % Q_1D) + (data.t_id_x / Q_1D) * T_1D] : 0.0;
+    __syncthreads();
+  }
+}
+
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QUnpack2D(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[(data.t_id_x % Q_1D) + (data.t_id_x / Q_1D) * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
+    __syncthreads();
+  }
+}
+
 //------------------------------------------------------------------------------
 // 2D interpolate to quadrature points
 //------------------------------------------------------------------------------
@@ -204,11 +227,11 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_
 }
 
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
       CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
+  QUnpack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
+  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, r_V);
+  QPack2D<NUM_COMP, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -231,11 +254,11 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed
 }
 
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
      CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
+  QUnpack2D<NUM_COMP, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
+  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, r_V);
+  QPack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -260,11 +283,11 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r
 }
 
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
      CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
+  QUnpack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
+  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, c_G, r_V);
+  QPack2D<NUM_COMP * 2, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -289,11 +312,11 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc
 }
 
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
+  QUnpack2D<NUM_COMP * 2, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
+  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, c_G, r_V);
+  QPack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -312,9 +335,7 @@ inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *_
 
 template <int P_1D, int Q_1D>
 inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, q_weight_1d, w);
+  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % Q_1D, data.t_id_x / Q_1D, q_weight_1d, w);
 }
 
 //------------------------------------------------------------------------------
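
Note: the flattened wrappers now share one convention. The tensor contraction cores keep addressing the value a thread owns at (data.t_id_x % T_1D, data.t_id_x / T_1D), and QPack2D / QUnpack2D move those values through data.slice so that, outside the core, the first Q_1D*Q_1D (or P_1D*P_1D on the transpose side) threads hold them densely at (t % Q_1D, t / Q_1D). A host-side model of that round trip, with assumed sizes and no dependence on the CUDA headers (illustrative only, not libCEED code):

#include <cassert>
#include <vector>

int main() {
  constexpr int Q_1D = 4, T_1D = 5;  // assumed sizes, e.g. T_1D = max(P_1d, Q_1d)

  // One value per "thread" t, stored at the strided position used by the tensor cores.
  std::vector<double> strided(T_1D * T_1D, 0.0), slice(T_1D * T_1D, 0.0);
  for (int t = 0; t < T_1D * T_1D; t++) {
    const int x = t % T_1D, y = t / T_1D;
    if (x < Q_1D && y < Q_1D) strided[t] = 10.0 * y + x;  // value for point (x, y)
  }

  // QPack2D analogue: scatter by (t % T_1D, t / T_1D), gather densely by (t % Q_1D, t / Q_1D).
  std::vector<double> packed(Q_1D * Q_1D, 0.0);
  for (int t = 0; t < T_1D * T_1D; t++) {
    const int x = t % T_1D, y = t / T_1D;
    if (x < Q_1D && y < Q_1D) slice[x + y * T_1D] = strided[t];
  }
  for (int t = 0; t < Q_1D * Q_1D; t++) packed[t] = slice[(t % Q_1D) + (t / Q_1D) * T_1D];

  // After packing, the value for point (x, y) sits with thread x + y * Q_1D.
  for (int y = 0; y < Q_1D; y++) {
    for (int x = 0; x < Q_1D; x++) assert(packed[x + y * Q_1D] == 10.0 * y + x);
  }
  return 0;
}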
