@@ -184,6 +184,29 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const int
184184 __syncthreads ();
185185}
186186
187+ // ------------------------------------------------------------------------------
188+ // 2D pack/unpack quadrature values
189+ // ------------------------------------------------------------------------------
// Pack 2D values: gather per-thread values laid out on the (t_id_x, t_id_y)
// thread tile into a flat, contiguous ordering indexed by data.t_id_x,
// one component at a time, staging through data.slice (shared memory).
// Threads outside the packed range receive 0.0.
// NOTE(review): the template extent is named Q_1D but callers also instantiate
// it with P_1D — it is really a generic 1D tile extent.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QPack2D(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Stage this component into shared memory at the 2D thread layout
    const bool in_tile = (t_id_x < Q_1D) && (t_id_y < Q_1D);

    if (in_tile) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
    __syncthreads();  // all writes must land before the flattened reads below
    if (data.t_id_x < Q_1D * Q_1D) {
      U[comp] = data.slice[(data.t_id_x % Q_1D) + (data.t_id_x / Q_1D) * T_1D];
    } else {
      U[comp] = 0.0;
    }
    __syncthreads();  // slice is reused by the next component
  }
}
199+
// Unpack 2D values: inverse of QPack2D. Scatter contiguous values (indexed by
// data.t_id_x) back onto the (t_id_x, t_id_y) thread tile, one component at a
// time, staging through data.slice (shared memory). Threads outside the tile
// receive 0.0.
// NOTE(review): the template extent is named Q_1D but callers also instantiate
// it with P_1D — it is really a generic 1D tile extent.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void QUnpack2D(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Stage the flattened component into shared memory
    if (data.t_id_x < Q_1D * Q_1D) data.slice[(data.t_id_x % Q_1D) + (data.t_id_x / Q_1D) * T_1D] = U[comp];
    __syncthreads();  // all writes must land before the 2D-layout reads below
    if (t_id_x < Q_1D && t_id_y < Q_1D) {
      U[comp] = data.slice[t_id_x + t_id_y * T_1D];
    } else {
      U[comp] = 0.0;
    }
    __syncthreads();  // slice is reused by the next component
  }
}
209+
187210// ------------------------------------------------------------------------------
188211// 2D interpolate to quadrature points
189212// ------------------------------------------------------------------------------
@@ -204,11 +227,11 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_
204227}
205228
// 2D tensor interpolation for a flattened (1D) thread block: unpack r_U from
// the flattened ordering, run the 2D interpolation core on the recovered
// (tx, ty) coordinates, then pack r_V back into the flattened ordering.
// Note: r_U is used as scratch and is overwritten by the unpack step.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                               CeedScalar *__restrict__ r_V) {
  // Recover 2D thread coordinates from the flattened thread index
  const int tx = data.t_id_x % T_1D;
  const int ty = data.t_id_x / T_1D;

  QUnpack2D<NUM_COMP, P_1D, T_1D>(data, tx, ty, r_U);
  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, tx, ty, r_U, c_B, r_V);
  QPack2D<NUM_COMP, Q_1D, T_1D>(data, tx, ty, r_V);
}
213236
214237// ------------------------------------------------------------------------------
@@ -231,11 +254,11 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed
231254}
232255
// 2D transpose tensor interpolation for a flattened (1D) thread block: unpack
// r_U (Q_1D extent) from the flattened ordering, run the transpose
// interpolation core, then pack r_V (P_1D extent) back into the flattened
// ordering. Note: r_U is used as scratch and is overwritten by the unpack step.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                        CeedScalar *__restrict__ r_V) {
  // Recover 2D thread coordinates from the flattened thread index
  const int tx = data.t_id_x % T_1D;
  const int ty = data.t_id_x / T_1D;

  QUnpack2D<NUM_COMP, Q_1D, T_1D>(data, tx, ty, r_U);
  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, tx, ty, r_U, c_B, r_V);
  QPack2D<NUM_COMP, P_1D, T_1D>(data, tx, ty, r_V);
}
240263
241264// ------------------------------------------------------------------------------
@@ -260,11 +283,11 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r
260283}
261284
// 2D tensor gradient for a flattened (1D) thread block: unpack r_U (P_1D
// extent), run the gradient core, then pack r_V. The output pack uses
// NUM_COMP * 2 values — presumably two derivative directions per component;
// confirm against GradTensor2d_Core's output layout.
// Note: r_U is used as scratch and is overwritten by the unpack step.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
                                             CeedScalar *__restrict__ r_V) {
  // Recover 2D thread coordinates from the flattened thread index
  const int tx = data.t_id_x % T_1D;
  const int ty = data.t_id_x / T_1D;

  QUnpack2D<NUM_COMP, P_1D, T_1D>(data, tx, ty, r_U);
  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, tx, ty, r_U, c_B, c_G, r_V);
  QPack2D<NUM_COMP * 2, Q_1D, T_1D>(data, tx, ty, r_V);
}
269292
270293// ------------------------------------------------------------------------------
@@ -289,11 +312,11 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc
289312}
290313
// 2D transpose tensor gradient for a flattened (1D) thread block: unpack r_U
// (NUM_COMP * 2 values at Q_1D extent — mirrors GradTensor2dFlattened's packed
// output), run the transpose gradient core, then pack r_V (P_1D extent).
// Note: r_U is used as scratch and is overwritten by the unpack step.
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                                      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Recover 2D thread coordinates from the flattened thread index
  const int tx = data.t_id_x % T_1D;
  const int ty = data.t_id_x / T_1D;

  QUnpack2D<NUM_COMP * 2, Q_1D, T_1D>(data, tx, ty, r_U);
  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, tx, ty, r_U, c_B, c_G, r_V);
  QPack2D<NUM_COMP, P_1D, T_1D>(data, tx, ty, r_V);
}
298321
299322// ------------------------------------------------------------------------------
@@ -312,9 +335,7 @@ inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *_
312335
// 2D quadrature weights for a flattened (1D) thread block: recover 2D thread
// coordinates over the Q_1D x Q_1D quadrature tile and delegate to the core.
// NOTE(review): P_1D is kept only for interface compatibility; it is unused here.
template <int P_1D, int Q_1D>
inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
  // Weights live on the quadrature grid, so flatten/unflatten with Q_1D
  const int tx = data.t_id_x % Q_1D;
  const int ty = data.t_id_x / Q_1D;

  WeightTensor2d_Core<Q_1D>(data, tx, ty, q_weight_1d, w);
}
319340
320341// ------------------------------------------------------------------------------
0 commit comments