Skip to content

Commit 66c2c38

Browse files
authored
Merge pull request #1866 from CEED/jeremy/fix-mpm
gpu - fix gen AtPoints transpose
2 parents df10768 + 49337e2 commit 66c2c38

File tree

2 files changed

+10
-9
lines changed

2 files changed

+10
-9
lines changed

include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
194194
for (CeedInt j = 0; j < Q_1D; j++) {
195195
const CeedInt jj = (j + data.t_id_x) % Q_1D;
196196

197-
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
197+
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
198198
}
199199
}
200200
// Pull from shared to register
@@ -270,7 +270,7 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
270270
for (CeedInt j = 0; j < Q_1D; j++) {
271271
const CeedInt jj = (j + data.t_id_x) % Q_1D;
272272

273-
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
273+
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
274274
}
275275
}
276276
}
@@ -355,7 +355,7 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
355355
for (CeedInt j = 0; j < Q_1D; j++) {
356356
const CeedInt jj = (j + data.t_id_x) % Q_1D;
357357

358-
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
358+
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
359359
}
360360
}
361361
// Pull from shared to register
@@ -439,11 +439,12 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
439439
if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
440440
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
441441
const CeedScalar zz = dim == 2 ? dz : z;
442-
const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
442+
const CeedScalar r_u = (p < NUM_POINTS) ? r_U[comp + dim * NUM_COMP] : 0.0;
443443

444444
for (CeedInt i = 0; i < Q_1D; i++) {
445445
buffer[i] = chebyshev_x[i] * r_u * zz;
446446
}
447+
447448
// Contract x direction
448449
if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
449450
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -454,7 +455,7 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
454455
for (CeedInt j = 0; j < Q_1D; j++) {
455456
const CeedInt jj = (j + data.t_id_x) % Q_1D;
456457

457-
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
458+
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
458459
}
459460
}
460461
}

include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const Cee
195195
for (CeedInt j = 0; j < Q_1D; j++) {
196196
const CeedInt jj = (j + data.t_id_x) % Q_1D;
197197

198-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
198+
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
199199
}
200200
}
201201
// Pull from shared to register
@@ -271,7 +271,7 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedI
271271
for (CeedInt j = 0; j < Q_1D; j++) {
272272
const CeedInt jj = (j + data.t_id_x) % Q_1D;
273273

274-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
274+
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
275275
}
276276
}
277277
}
@@ -356,7 +356,7 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const Cee
356356
for (CeedInt j = 0; j < Q_1D; j++) {
357357
const CeedInt jj = (j + data.t_id_x) % Q_1D;
358358

359-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
359+
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
360360
}
361361
}
362362
// Pull from shared to register
@@ -455,7 +455,7 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
455455
for (CeedInt j = 0; j < Q_1D; j++) {
456456
const CeedInt jj = (j + data.t_id_x) % Q_1D;
457457

458-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
458+
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
459459
}
460460
}
461461
}

0 commit comments

Comments
 (0)