@@ -74,7 +74,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const Ce
7474 // Contract x direction
7575 if (p < NUM_POINTS) {
7676 for (CeedInt i = 0 ; i < Q_1D; i++) {
77- atomicAdd (&data.slice [comp * Q_1D + (i + data.t_id_x ) % Q_1D], chebyshev_x[(i + data.t_id_x ) % Q_1D] * r_U[comp]);
77+ atomicAdd_block (&data.slice [comp * Q_1D + (i + data.t_id_x ) % Q_1D], chebyshev_x[(i + data.t_id_x ) % Q_1D] * r_U[comp]);
7878 }
7979 }
8080 // Pull from shared to register
@@ -121,7 +121,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed
121121 // Contract x direction
122122 if (p < NUM_POINTS) {
123123 for (CeedInt i = 0 ; i < Q_1D; i++) {
124- atomicAdd (&data.slice [comp * Q_1D + (i + data.t_id_x ) % Q_1D], chebyshev_x[(i + data.t_id_x ) % Q_1D] * r_U[comp]);
124+ atomicAdd_block (&data.slice [comp * Q_1D + (i + data.t_id_x ) % Q_1D], chebyshev_x[(i + data.t_id_x ) % Q_1D] * r_U[comp]);
125125 }
126126 }
127127 // Pull from shared to register
@@ -193,7 +193,7 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
193193 for (CeedInt j = 0 ; j < Q_1D; j++) {
194194 const CeedInt jj = (j + data.t_id_y ) % Q_1D;
195195
196- atomicAdd (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
196+ atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
197197 }
198198 }
199199 }
@@ -269,7 +269,7 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
269269 for (CeedInt j = 0 ; j < Q_1D; j++) {
270270 const CeedInt jj = (j + data.t_id_y ) % Q_1D;
271271
272- atomicAdd (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
272+ atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
273273 }
274274 }
275275 }
@@ -354,7 +354,7 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
354354 for (CeedInt j = 0 ; j < Q_1D; j++) {
355355 const CeedInt jj = (j + data.t_id_y ) % Q_1D;
356356
357- atomicAdd (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
357+ atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
358358 }
359359 }
360360 }
@@ -454,7 +454,7 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
454454 for (CeedInt j = 0 ; j < Q_1D; j++) {
455455 const CeedInt jj = (j + data.t_id_y ) % Q_1D;
456456
457- atomicAdd (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
457+ atomicAdd_block (&data.slice [jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
458458 }
459459 }
460460 }
0 commit comments