@@ -180,21 +180,21 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
180180 __syncthreads ();
181181 // Contract y direction
182182 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1 ], chebyshev_x);
183+ const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0 ;
184+
183185 for (CeedInt i = 0 ; i < Q_1D; i++) {
184- buffer[i] = chebyshev_x[i] * r_U[comp] ;
186+ buffer[i] = chebyshev_x[i] * r_u ;
185187 }
186188 // Contract x direction
187189 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0 ], chebyshev_x);
188- if (p < NUM_POINTS) {
189- for (CeedInt i = 0 ; i < Q_1D; i++) {
190- // Note: shifting to avoid atomic adds
191- const CeedInt ii = (i + data.t_id_x ) % Q_1D;
190+ for (CeedInt i = 0 ; i < Q_1D; i++) {
191+ // Note: shifting to avoid atomic adds
192+ const CeedInt ii = (i + data.t_id_y ) % Q_1D;
192193
193- for (CeedInt j = 0 ; j < Q_1D; j++) {
194- const CeedInt jj = (j + data.t_id_y ) % Q_1D;
194+ for (CeedInt j = 0 ; j < Q_1D; j++) {
195+ const CeedInt jj = (j + data.t_id_x ) % Q_1D;
195196
196- atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
197- }
197+ atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
198198 }
199199 }
200200 // Pull from shared to register
@@ -255,22 +255,22 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
255255 // Contract y direction
256256 if (dim == 1 ) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1 ], chebyshev_x);
257257 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1 ], chebyshev_x);
258+ const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0 ;
259+
258260 for (CeedInt i = 0 ; i < Q_1D; i++) {
259- buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] ;
261+ buffer[i] = chebyshev_x[i] * r_u ;
260262 }
261263 // Contract x direction
262264 if (dim == 0 ) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0 ], chebyshev_x);
263265 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0 ], chebyshev_x);
264- if (p < NUM_POINTS) {
265- for (CeedInt i = 0 ; i < Q_1D; i++) {
266- // Note: shifting to avoid atomic adds
267- const CeedInt ii = (i + data.t_id_x ) % Q_1D;
266+ for (CeedInt i = 0 ; i < Q_1D; i++) {
267+ // Note: shifting to avoid atomic adds
268+ const CeedInt ii = (i + data.t_id_y ) % Q_1D;
268269
269- for (CeedInt j = 0 ; j < Q_1D; j++) {
270- const CeedInt jj = (j + data.t_id_y ) % Q_1D;
270+ for (CeedInt j = 0 ; j < Q_1D; j++) {
271+ const CeedInt jj = (j + data.t_id_x ) % Q_1D;
271272
272- atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
273- }
273+ atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
274274 }
275275 }
276276 }
@@ -341,21 +341,21 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
341341 __syncthreads ();
342342 // Contract y and z direction
343343 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1 ], chebyshev_x);
344+ const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0 ;
345+
344346 for (CeedInt i = 0 ; i < Q_1D; i++) {
345- buffer[i] = chebyshev_x[i] * r_U[comp] * z;
347+ buffer[i] = chebyshev_x[i] * r_u * z;
346348 }
347349 // Contract x direction
348350 ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0 ], chebyshev_x);
349- if (p < NUM_POINTS) {
350- for (CeedInt i = 0 ; i < Q_1D; i++) {
351- // Note: shifting to avoid atomic adds
352- const CeedInt ii = (i + data.t_id_x ) % Q_1D;
351+ for (CeedInt i = 0 ; i < Q_1D; i++) {
352+ // Note: shifting to avoid atomic adds
353+ const CeedInt ii = (i + data.t_id_y ) % Q_1D;
353354
354- for (CeedInt j = 0 ; j < Q_1D; j++) {
355- const CeedInt jj = (j + data.t_id_y ) % Q_1D;
355+ for (CeedInt j = 0 ; j < Q_1D; j++) {
356+ const CeedInt jj = (j + data.t_id_x ) % Q_1D;
356357
357- atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
358- }
358+ atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
359359 }
360360 }
361361 // Pull from shared to register
@@ -438,24 +438,23 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
438438 // Contract y and z direction
439439 if (dim == 1 ) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1 ], chebyshev_x);
440440 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1 ], chebyshev_x);
441- const CeedScalar zz = dim == 2 ? dz : z;
441+ const CeedScalar zz = dim == 2 ? dz : z;
442+ const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0 ;
442443
443444 for (CeedInt i = 0 ; i < Q_1D; i++) {
444- buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * zz;
445+ buffer[i] = chebyshev_x[i] * r_u * zz;
445446 }
446447 // Contract x direction
447448 if (dim == 0 ) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0 ], chebyshev_x);
448449 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0 ], chebyshev_x);
449- if (p < NUM_POINTS) {
450- for (CeedInt i = 0 ; i < Q_1D; i++) {
451- // Note: shifting to avoid atomic adds
452- const CeedInt ii = (i + data.t_id_x ) % Q_1D;
450+ for (CeedInt i = 0 ; i < Q_1D; i++) {
451+ // Note: shifting to avoid atomic adds
452+ const CeedInt ii = (i + data.t_id_y ) % Q_1D;
453453
454- for (CeedInt j = 0 ; j < Q_1D; j++) {
455- const CeedInt jj = (j + data.t_id_y ) % Q_1D;
454+ for (CeedInt j = 0 ; j < Q_1D; j++) {
455+ const CeedInt jj = (j + data.t_id_x ) % Q_1D;
456456
457- atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
458- }
457+ atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
459458 }
460459 }
461460 }
0 commit comments