Commit 4b6745b

Merge pull request #1762 from CEED/jeremy/gen-mixed

Mixed Tensor/NonTensor for Gen

2 parents 20a16a5 + 8b89f79

21 files changed: +1679 -436 lines

backends/cuda-gen/ceed-cuda-gen-operator-build.cpp

Lines changed: 265 additions & 169 deletions
Large diffs are not rendered by default.

backends/cuda-gen/ceed-cuda-gen-operator.c

Lines changed: 4 additions & 8 deletions
@@ -197,22 +197,18 @@ static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, c
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));

   // Apply operator
-  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
-  const CeedInt dim       = data->dim;
-  const CeedInt Q_1d      = data->Q_1d;
-  const CeedInt P_1d      = data->max_P_1d;
-  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
-  int           max_threads_per_block, min_grid_size, grid;
+  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
+  int   max_threads_per_block, min_grid_size, grid;

   CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
-  int block[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};
+  int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};

   if (is_tensor) {
     CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, is_at_points ? 1 : max_threads_per_block,
                                        cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
   } else {
-    CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
+    CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1));

     grid     = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
     block[2] = elems_per_block;
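For context, the launch bounds that were previously recomputed on every operator apply (notably thread_1d = max(Q_1d, max_P_1d)) are now cached on the backend data struct when the kernel is built, so the apply path only reads them. A minimal C++ sketch of that pattern, with simplified stand-in names rather than the actual libCEED code (the real logic lives in ceed-cuda-gen-operator-build.cpp):

// Sketch: compute the launch bound once at kernel-build time and cache it,
// instead of recomputing it on every apply. Stand-in struct, not libCEED.
#include <algorithm>
#include <cstdio>

struct OperatorDataSketch {
  int dim       = 0;
  int Q_1d      = 0;
  int max_P_1d  = 0;
  int thread_1d = 0;  // new cached field: max(Q_1d, max_P_1d)
};

// Runs once when the kernel is built.
void BuildSketch(OperatorDataSketch &data, int dim, int Q_1d, int max_P_1d) {
  data.dim       = dim;
  data.Q_1d      = Q_1d;
  data.max_P_1d  = max_P_1d;
  data.thread_1d = std::max(Q_1d, max_P_1d);
}

// Runs on every apply; only reads the cached value.
void ApplySketch(const OperatorDataSketch &data, bool is_tensor) {
  int block_x = data.thread_1d;
  int block_y = (!is_tensor || data.dim == 1) ? 1 : data.thread_1d;
  std::printf("block = (%d, %d, ...)\n", block_x, block_y);
}

int main() {
  OperatorDataSketch data;
  BuildSketch(data, /*dim=*/3, /*Q_1d=*/4, /*max_P_1d=*/3);
  ApplySketch(data, /*is_tensor=*/true);  // prints "block = (4, 4, ...)"
}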

backends/cuda-gen/ceed-cuda-gen.h

Lines changed: 2 additions & 1 deletion
@@ -14,8 +14,9 @@
 typedef struct {
   bool           use_fallback;
   CeedInt        dim;
-  CeedInt        Q_1d;
+  CeedInt        Q, Q_1d;
   CeedInt        max_P_1d;
+  CeedInt        thread_1d;
   CUmodule       module;
   CUfunction     op;
   FieldsInt_Cuda indices;

backends/hip-gen/ceed-hip-gen-operator-build.cpp

Lines changed: 261 additions & 167 deletions
Large diffs are not rendered by default.

backends/hip-gen/ceed-hip-gen-operator.c

Lines changed: 10 additions & 14 deletions
@@ -131,39 +131,35 @@ static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream,
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));

   // Apply operator
-  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
-  const CeedInt dim       = data->dim;
-  const CeedInt Q_1d      = data->Q_1d;
-  const CeedInt P_1d      = data->max_P_1d;
-  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
+  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};

   CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
-  CeedInt block_sizes[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};
+  CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};

   if (is_tensor) {
-    CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, P_1d, Q_1d, block_sizes));
+    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
     if (is_at_points) block_sizes[2] = 1;
   } else {
-    CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
+    CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64;

     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
     block_sizes[2]  = elems_per_block;
   }
-  if (dim == 1 || !is_tensor) {
+  if (data->dim == 1 || !is_tensor) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar);
+    CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);

     CeedCallBackend(
         CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
-  } else if (dim == 2) {
+  } else if (data->dim == 2) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
+    CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

     CeedCallBackend(
         CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
-  } else if (dim == 3) {
+  } else if (data->dim == 3) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
+    CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

     CeedCallBackend(
         CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
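The launch sizing above uses ceiling division for the grid and sizes shared memory as one scratch slab per element: thread_1d scalars per element in 1D (and the non-tensor path), thread_1d^2 per element in 2D and 3D. A self-contained C++ sketch of those two formulas, with illustrative names rather than the libCEED API:

// Sketch of the launch sizing in the hunk above. Illustrative only.
#include <cstddef>
#include <cstdio>

// Blocks needed to cover num_elem elements, elems_per_block per block.
int GridSizeSketch(int num_elem, int elems_per_block) {
  return num_elem / elems_per_block + (num_elem % elems_per_block > 0);
}

// Shared bytes per block: thread_1d scalars per element in 1D, and
// thread_1d^2 scalars per element in 2D and 3D.
std::size_t SharedBytesSketch(int dim, int thread_1d, int elems_per_block, std::size_t scalar_bytes) {
  std::size_t slab = (dim == 1) ? static_cast<std::size_t>(thread_1d)
                                : static_cast<std::size_t>(thread_1d) * thread_1d;
  return static_cast<std::size_t>(elems_per_block) * slab * scalar_bytes;
}

int main() {
  // 1000 elements, 8 per block -> 125 blocks; 2D with thread_1d = 8.
  int         grid  = GridSizeSketch(1000, 8);
  std::size_t bytes = SharedBytesSketch(2, 8, 8, sizeof(double));
  std::printf("grid = %d, sharedMem = %zu bytes\n", grid, bytes);  // 125, 4096
}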

backends/hip-gen/ceed-hip-gen.h

Lines changed: 2 additions & 1 deletion
@@ -14,8 +14,9 @@
 typedef struct {
   bool          use_fallback;
   CeedInt       dim;
-  CeedInt       Q_1d;
+  CeedInt       Q, Q_1d;
   CeedInt       max_P_1d;
+  CeedInt       thread_1d;
   hipModule_t   module;
   hipFunction_t op;
   FieldsInt_Hip indices;

doc/sphinx/source/releasenotes.md

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ On this page we provide a summary of the main API changes, new features and exam
 - Allow user to set additional compiler options for CUDA and HIP JiT.
   Specifically, directories set with `CeedAddJitSourceRoot(ceed, "foo/bar")` will be used to set `-Ifoo/bar` and defines set with `CeedAddJitDefine(ceed, "foo=bar")` will be used to set `-Dfoo=bar`.
 - Added non-tensor basis support to code generation backends `/gpu/cuda/gen` and `/gpu/hip/gen`.
+- Added support to code generation backends `/gpu/cuda/gen` and `/gpu/hip/gen` for operators with both tensor and non-tensor bases.

 ### Examples

include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h

Lines changed: 1 addition & 1 deletion
@@ -92,7 +92,7 @@ inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const CeedS
 //------------------------------------------------------------------------------
 // Quadrature weights
 //------------------------------------------------------------------------------
-template <int Q>
+template <int P, int Q>
 inline __device__ void WeightNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) {
   *w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0;
 }

include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h

Lines changed: 1 addition & 1 deletion
@@ -194,7 +194,7 @@ extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__re
   CeedScalar r_W[1];

   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
-    WeightNonTensor<BASIS_Q>(data, q_weight, r_W);
+    WeightNonTensor<BASIS_P, BASIS_Q>(data, q_weight, r_W);
     WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
   }
 }
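WeightNonTensor reads only Q; carrying P as well appears intended to give every non-tensor basis template the same <P, Q> signature so the generated code can instantiate them all uniformly. A reduced, self-contained CUDA illustration of that convention (hypothetical names and values, not the libCEED kernel):

// Reduced illustration of a unified <P, Q> signature: the weight kernel
// only reads Q but carries P so every basis template is called the same way.
#include <cstdio>

template <int P, int Q>
__global__ void WeightSketch(const double *q_weight, double *w) {
  const int i = threadIdx.x;
  if (i < Q) w[i] = q_weight[i];  // P is unused here, by design
}

int main() {
  constexpr int Q = 4;
  const double  h_qw[Q] = {0.17, 0.33, 0.33, 0.17};
  double       *d_qw = nullptr, *d_w = nullptr;
  cudaMalloc(&d_qw, Q * sizeof(double));
  cudaMalloc(&d_w, Q * sizeof(double));
  cudaMemcpy(d_qw, h_qw, Q * sizeof(double), cudaMemcpyHostToDevice);
  WeightSketch<3, Q><<<1, Q>>>(d_qw, d_w);  // P = 3 carried but unused
  double h_w[Q];
  cudaMemcpy(h_w, d_w, Q * sizeof(double), cudaMemcpyDeviceToHost);
  for (int i = 0; i < Q; i++) std::printf("w[%d] = %g\n", i, h_w[i]);
  cudaFree(d_qw);
  cudaFree(d_w);
}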

include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h

Lines changed: 12 additions & 12 deletions
@@ -40,7 +40,7 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
 //------------------------------------------------------------------------------
 // 1D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                         CeedScalar *__restrict__ r_V) {
   CeedScalar chebyshev_x[Q_1D];
@@ -61,7 +61,7 @@ inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p,
 //------------------------------------------------------------------------------
 // 1D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                  CeedScalar *__restrict__ r_C) {
   CeedScalar chebyshev_x[Q_1D];
@@ -86,7 +86,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const Ce
 //------------------------------------------------------------------------------
 // 1D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                       CeedScalar *__restrict__ r_V) {
   CeedScalar chebyshev_x[Q_1D];
@@ -107,7 +107,7 @@ inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, co
 //------------------------------------------------------------------------------
 // 1D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                CeedScalar *__restrict__ r_C) {
   CeedScalar chebyshev_x[Q_1D];
@@ -136,7 +136,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // 2D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                         CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
@@ -166,7 +166,7 @@ inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p,
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                  CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -204,7 +204,7 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
 //------------------------------------------------------------------------------
 // 2D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                       CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
@@ -238,7 +238,7 @@ inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, co
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -284,7 +284,7 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // 3D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                         CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
@@ -319,7 +319,7 @@ inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p,
 //------------------------------------------------------------------------------
 // 3D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                  CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -362,7 +362,7 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
 //------------------------------------------------------------------------------
 // 3D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                       CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
@@ -402,7 +402,7 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co
 //------------------------------------------------------------------------------
 // 3D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
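Every at-points template now carries P_1D alongside Q_1D, again keeping the template signatures uniform across the basis kernels the gen backend emits. A hypothetical instantiation site might look like the following; the include path and constant values are assumptions, and only the template and type names come from this header:

// Hypothetical instantiation: generated code pins both basis sizes as
// compile-time template arguments. Values and include path are assumed.
#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>

constexpr int NUM_COMP   = 1;
constexpr int NUM_POINTS = 16;
constexpr int P_1D       = 3;
constexpr int Q_1D       = 4;

inline __device__ void ApplyInterpAtPointsSketch(SharedData_Cuda &data, const CeedInt p, const CeedScalar *r_C, const CeedScalar *r_X,
                                                 CeedScalar *r_V) {
  InterpAtPoints3d<NUM_COMP, NUM_POINTS, P_1D, Q_1D>(data, p, r_C, r_X, r_V);
}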
