Skip to content

Commit 7f1120a

Browse files
committed
atpoints - use atomicAdd_block
1 parent 64efbe9 commit 7f1120a

File tree

2 files changed

+12
-12
lines changed

2 files changed

+12
-12
lines changed

include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const Ce
7474
// Contract x direction
7575
if (p < NUM_POINTS) {
7676
for (CeedInt i = 0; i < Q_1D; i++) {
77-
atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
77+
atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
7878
}
7979
}
8080
// Pull from shared to register
@@ -121,7 +121,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed
121121
// Contract x direction
122122
if (p < NUM_POINTS) {
123123
for (CeedInt i = 0; i < Q_1D; i++) {
124-
atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
124+
atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
125125
}
126126
}
127127
// Pull from shared to register
@@ -193,7 +193,7 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
193193
for (CeedInt j = 0; j < Q_1D; j++) {
194194
const CeedInt jj = (j + data.t_id_y) % Q_1D;
195195

196-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
196+
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
197197
}
198198
}
199199
}
@@ -269,7 +269,7 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
269269
for (CeedInt j = 0; j < Q_1D; j++) {
270270
const CeedInt jj = (j + data.t_id_y) % Q_1D;
271271

272-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
272+
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
273273
}
274274
}
275275
}
@@ -354,7 +354,7 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
354354
for (CeedInt j = 0; j < Q_1D; j++) {
355355
const CeedInt jj = (j + data.t_id_y) % Q_1D;
356356

357-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
357+
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
358358
}
359359
}
360360
}
@@ -454,7 +454,7 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
454454
for (CeedInt j = 0; j < Q_1D; j++) {
455455
const CeedInt jj = (j + data.t_id_y) % Q_1D;
456456

457-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
457+
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
458458
}
459459
}
460460
}

include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const Cee
7575
// Contract x direction
7676
if (p < NUM_POINTS) {
7777
for (CeedInt i = 0; i < Q_1D; i++) {
78-
atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
78+
atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
7979
}
8080
}
8181
// Pull from shared to register
@@ -122,7 +122,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedI
122122
// Contract x direction
123123
if (p < NUM_POINTS) {
124124
for (CeedInt i = 0; i < Q_1D; i++) {
125-
atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
125+
atomicAdd_block(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
126126
}
127127
}
128128
// Pull from shared to register
@@ -194,7 +194,7 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const Cee
194194
for (CeedInt j = 0; j < Q_1D; j++) {
195195
const CeedInt jj = (j + data.t_id_y) % Q_1D;
196196

197-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
197+
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
198198
}
199199
}
200200
}
@@ -270,7 +270,7 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedI
270270
for (CeedInt j = 0; j < Q_1D; j++) {
271271
const CeedInt jj = (j + data.t_id_y) % Q_1D;
272272

273-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
273+
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
274274
}
275275
}
276276
}
@@ -355,7 +355,7 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const Cee
355355
for (CeedInt j = 0; j < Q_1D; j++) {
356356
const CeedInt jj = (j + data.t_id_y) % Q_1D;
357357

358-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
358+
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
359359
}
360360
}
361361
}
@@ -455,7 +455,7 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
455455
for (CeedInt j = 0; j < Q_1D; j++) {
456456
const CeedInt jj = (j + data.t_id_y) % Q_1D;
457457

458-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
458+
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
459459
}
460460
}
461461
}

0 commit comments

Comments
 (0)