@@ -194,7 +194,7 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
194194 for (CeedInt j = 0 ; j < Q_1D; j++) {
195195 const CeedInt jj = (j + data.t_id_x ) % Q_1D;
196196
197- atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
197+ if (data. t_id_x < Q_1D && data. t_id_y < Q_1D) atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
198198 }
199199 }
200200 // Pull from shared to register
@@ -270,7 +270,7 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
270270 for (CeedInt j = 0 ; j < Q_1D; j++) {
271271 const CeedInt jj = (j + data.t_id_x ) % Q_1D;
272272
273- atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
273+ if (data. t_id_x < Q_1D && data. t_id_y < Q_1D) atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
274274 }
275275 }
276276 }
@@ -355,7 +355,7 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
355355 for (CeedInt j = 0 ; j < Q_1D; j++) {
356356 const CeedInt jj = (j + data.t_id_x ) % Q_1D;
357357
358- atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
358+ if (data. t_id_x < Q_1D && data. t_id_y < Q_1D) atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
359359 }
360360 }
361361 // Pull from shared to register
@@ -439,11 +439,12 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
439439 if (dim == 1 ) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1 ], chebyshev_x);
440440 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1 ], chebyshev_x);
441441 const CeedScalar zz = dim == 2 ? dz : z;
442- const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0 ;
442+ const CeedScalar r_u = ( p < NUM_POINTS) ? r_U[comp + dim * NUM_COMP] : 0.0 ;
443443
444444 for (CeedInt i = 0 ; i < Q_1D; i++) {
445445 buffer[i] = chebyshev_x[i] * r_u * zz;
446446 }
447+
447448 // Contract x direction
448449 if (dim == 0 ) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0 ], chebyshev_x);
449450 else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0 ], chebyshev_x);
@@ -454,7 +455,7 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
454455 for (CeedInt j = 0 ; j < Q_1D; j++) {
455456 const CeedInt jj = (j + data.t_id_x ) % Q_1D;
456457
457- atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
458+ if (data. t_id_x < Q_1D && data. t_id_y < Q_1D) atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
458459 }
459460 }
460461 }
0 commit comments