Skip to content

Commit 83153ff

Browse files
authored
Merge pull request #1723 from CEED/jeremy/at-points-shifts
Fix AtPoints transpose shift
2 parents 1a63be7 + a24d84e commit 83153ff

File tree

2 files changed

+20
-20
lines changed

2 files changed

+20
-20
lines changed

include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const Ce
7474
// Contract x direction
7575
if (p < NUM_POINTS) {
7676
for (CeedInt i = 0; i < Q_1D; i++) {
77-
atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
77+
atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
7878
}
7979
}
8080
// Pull from shared to register
@@ -120,7 +120,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed
120120
// Contract x direction
121121
if (p < NUM_POINTS) {
122122
for (CeedInt i = 0; i < Q_1D; i++) {
123-
atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
123+
atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
124124
}
125125
}
126126
// Pull from shared to register
@@ -186,10 +186,10 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
186186
if (p < NUM_POINTS) {
187187
for (CeedInt i = 0; i < Q_1D; i++) {
188188
// Note: shifting to avoid atomic adds
189-
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
189+
const CeedInt ii = (i + data.t_id_x) % Q_1D;
190190

191191
for (CeedInt j = 0; j < Q_1D; j++) {
192-
const CeedInt jj = (j + p) % Q_1D;
192+
const CeedInt jj = (j + data.t_id_y) % Q_1D;
193193

194194
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
195195
}
@@ -261,10 +261,10 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
261261
if (p < NUM_POINTS) {
262262
for (CeedInt i = 0; i < Q_1D; i++) {
263263
// Note: shifting to avoid atomic adds
264-
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
264+
const CeedInt ii = (i + data.t_id_x) % Q_1D;
265265

266266
for (CeedInt j = 0; j < Q_1D; j++) {
267-
const CeedInt jj = (j + p) % Q_1D;
267+
const CeedInt jj = (j + data.t_id_y) % Q_1D;
268268

269269
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
270270
}
@@ -343,10 +343,10 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
343343
if (p < NUM_POINTS) {
344344
for (CeedInt i = 0; i < Q_1D; i++) {
345345
// Note: shifting to avoid atomic adds
346-
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
346+
const CeedInt ii = (i + data.t_id_x) % Q_1D;
347347

348348
for (CeedInt j = 0; j < Q_1D; j++) {
349-
const CeedInt jj = ((j + p) % Q_1D);
349+
const CeedInt jj = (j + data.t_id_y) % Q_1D;
350350

351351
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
352352
}
@@ -430,10 +430,10 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
430430
if (p < NUM_POINTS) {
431431
for (CeedInt i = 0; i < Q_1D; i++) {
432432
// Note: shifting to avoid atomic adds
433-
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
433+
const CeedInt ii = (i + data.t_id_x) % Q_1D;
434434

435435
for (CeedInt j = 0; j < Q_1D; j++) {
436-
const CeedInt jj = ((j + p) % Q_1D);
436+
const CeedInt jj = (j + data.t_id_y) % Q_1D;
437437

438438
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
439439
}

include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const Cee
7474
// Contract x direction
7575
if (p < NUM_POINTS) {
7676
for (CeedInt i = 0; i < Q_1D; i++) {
77-
atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
77+
atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
7878
}
7979
}
8080
// Pull from shared to register
@@ -120,7 +120,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedI
120120
// Contract x direction
121121
if (p < NUM_POINTS) {
122122
for (CeedInt i = 0; i < Q_1D; i++) {
123-
atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
123+
atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
124124
}
125125
}
126126
// Pull from shared to register
@@ -186,10 +186,10 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const Cee
186186
if (p < NUM_POINTS) {
187187
for (CeedInt i = 0; i < Q_1D; i++) {
188188
// Note: shifting to avoid atomic adds
189-
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
189+
const CeedInt ii = (i + data.t_id_x) % Q_1D;
190190

191191
for (CeedInt j = 0; j < Q_1D; j++) {
192-
const CeedInt jj = (j + p) % Q_1D;
192+
const CeedInt jj = (j + data.t_id_y) % Q_1D;
193193

194194
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
195195
}
@@ -261,10 +261,10 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedI
261261
if (p < NUM_POINTS) {
262262
for (CeedInt i = 0; i < Q_1D; i++) {
263263
// Note: shifting to avoid atomic adds
264-
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
264+
const CeedInt ii = (i + data.t_id_x) % Q_1D;
265265

266266
for (CeedInt j = 0; j < Q_1D; j++) {
267-
const CeedInt jj = (j + p) % Q_1D;
267+
const CeedInt jj = (j + data.t_id_y) % Q_1D;
268268

269269
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
270270
}
@@ -343,10 +343,10 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const Cee
343343
if (p < NUM_POINTS) {
344344
for (CeedInt i = 0; i < Q_1D; i++) {
345345
// Note: shifting to avoid atomic adds
346-
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
346+
const CeedInt ii = (i + data.t_id_x) % Q_1D;
347347

348348
for (CeedInt j = 0; j < Q_1D; j++) {
349-
const CeedInt jj = ((j + p) % Q_1D);
349+
const CeedInt jj = (j + data.t_id_y) % Q_1D;
350350

351351
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
352352
}
@@ -430,10 +430,10 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
430430
if (p < NUM_POINTS) {
431431
for (CeedInt i = 0; i < Q_1D; i++) {
432432
// Note: shifting to avoid atomic adds
433-
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
433+
const CeedInt ii = (i + data.t_id_x) % Q_1D;
434434

435435
for (CeedInt j = 0; j < Q_1D; j++) {
436-
const CeedInt jj = ((j + p) % Q_1D);
436+
const CeedInt jj = (j + data.t_id_y) % Q_1D;
437437

438438
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
439439
}

0 commit comments

Comments
 (0)