Skip to content

Commit 2f00a50

Browse files
authored
Merge pull request #1863 from CEED/jeremy/at-points-tune
AtPoints Tuning
2 parents 58ed935 + c6cb50f commit 2f00a50

File tree

2 files changed

+72
-74
lines changed

2 files changed

+72
-74
lines changed

include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h

Lines changed: 36 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -180,21 +180,21 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
180180
__syncthreads();
181181
// Contract y direction
182182
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
183+
const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
184+
183185
for (CeedInt i = 0; i < Q_1D; i++) {
184-
buffer[i] = chebyshev_x[i] * r_U[comp];
186+
buffer[i] = chebyshev_x[i] * r_u;
185187
}
186188
// Contract x direction
187189
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
188-
if (p < NUM_POINTS) {
189-
for (CeedInt i = 0; i < Q_1D; i++) {
190-
// Note: shifting to avoid atomic adds
191-
const CeedInt ii = (i + data.t_id_x) % Q_1D;
190+
for (CeedInt i = 0; i < Q_1D; i++) {
191+
// Note: shifting to avoid atomic adds
192+
const CeedInt ii = (i + data.t_id_y) % Q_1D;
192193

193-
for (CeedInt j = 0; j < Q_1D; j++) {
194-
const CeedInt jj = (j + data.t_id_y) % Q_1D;
194+
for (CeedInt j = 0; j < Q_1D; j++) {
195+
const CeedInt jj = (j + data.t_id_x) % Q_1D;
195196

196-
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
197-
}
197+
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
198198
}
199199
}
200200
// Pull from shared to register
@@ -255,22 +255,22 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
255255
// Contract y direction
256256
if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
257257
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
258+
const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
259+
258260
for (CeedInt i = 0; i < Q_1D; i++) {
259-
buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
261+
buffer[i] = chebyshev_x[i] * r_u;
260262
}
261263
// Contract x direction
262264
if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
263265
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
264-
if (p < NUM_POINTS) {
265-
for (CeedInt i = 0; i < Q_1D; i++) {
266-
// Note: shifting to avoid atomic adds
267-
const CeedInt ii = (i + data.t_id_x) % Q_1D;
266+
for (CeedInt i = 0; i < Q_1D; i++) {
267+
// Note: shifting to avoid atomic adds
268+
const CeedInt ii = (i + data.t_id_y) % Q_1D;
268269

269-
for (CeedInt j = 0; j < Q_1D; j++) {
270-
const CeedInt jj = (j + data.t_id_y) % Q_1D;
270+
for (CeedInt j = 0; j < Q_1D; j++) {
271+
const CeedInt jj = (j + data.t_id_x) % Q_1D;
271272

272-
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
273-
}
273+
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
274274
}
275275
}
276276
}
@@ -341,21 +341,21 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
341341
__syncthreads();
342342
// Contract y and z direction
343343
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
344+
const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
345+
344346
for (CeedInt i = 0; i < Q_1D; i++) {
345-
buffer[i] = chebyshev_x[i] * r_U[comp] * z;
347+
buffer[i] = chebyshev_x[i] * r_u * z;
346348
}
347349
// Contract x direction
348350
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
349-
if (p < NUM_POINTS) {
350-
for (CeedInt i = 0; i < Q_1D; i++) {
351-
// Note: shifting to avoid atomic adds
352-
const CeedInt ii = (i + data.t_id_x) % Q_1D;
351+
for (CeedInt i = 0; i < Q_1D; i++) {
352+
// Note: shifting to avoid atomic adds
353+
const CeedInt ii = (i + data.t_id_y) % Q_1D;
353354

354-
for (CeedInt j = 0; j < Q_1D; j++) {
355-
const CeedInt jj = (j + data.t_id_y) % Q_1D;
355+
for (CeedInt j = 0; j < Q_1D; j++) {
356+
const CeedInt jj = (j + data.t_id_x) % Q_1D;
356357

357-
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
358-
}
358+
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
359359
}
360360
}
361361
// Pull from shared to register
@@ -438,24 +438,23 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
438438
// Contract y and z direction
439439
if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
440440
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
441-
const CeedScalar zz = dim == 2 ? dz : z;
441+
const CeedScalar zz = dim == 2 ? dz : z;
442+
const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
442443

443444
for (CeedInt i = 0; i < Q_1D; i++) {
444-
buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * zz;
445+
buffer[i] = chebyshev_x[i] * r_u * zz;
445446
}
446447
// Contract x direction
447448
if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
448449
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
449-
if (p < NUM_POINTS) {
450-
for (CeedInt i = 0; i < Q_1D; i++) {
451-
// Note: shifting to avoid atomic adds
452-
const CeedInt ii = (i + data.t_id_x) % Q_1D;
450+
for (CeedInt i = 0; i < Q_1D; i++) {
451+
// Note: shifting to avoid atomic adds
452+
const CeedInt ii = (i + data.t_id_y) % Q_1D;
453453

454-
for (CeedInt j = 0; j < Q_1D; j++) {
455-
const CeedInt jj = (j + data.t_id_y) % Q_1D;
454+
for (CeedInt j = 0; j < Q_1D; j++) {
455+
const CeedInt jj = (j + data.t_id_x) % Q_1D;
456456

457-
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
458-
}
457+
atomicAdd_block(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
459458
}
460459
}
461460
}

include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h

Lines changed: 36 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -181,21 +181,21 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const Cee
181181
__syncthreads();
182182
// Contract y direction
183183
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
184+
const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
185+
184186
for (CeedInt i = 0; i < Q_1D; i++) {
185-
buffer[i] = chebyshev_x[i] * r_U[comp];
187+
buffer[i] = chebyshev_x[i] * r_u;
186188
}
187189
// Contract x direction
188190
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
189-
if (p < NUM_POINTS) {
190-
for (CeedInt i = 0; i < Q_1D; i++) {
191-
// Note: shifting to avoid atomic adds
192-
const CeedInt ii = (i + data.t_id_x) % Q_1D;
191+
for (CeedInt i = 0; i < Q_1D; i++) {
192+
// Note: shifting to avoid atomic adds
193+
const CeedInt ii = (i + data.t_id_y) % Q_1D;
193194

194-
for (CeedInt j = 0; j < Q_1D; j++) {
195-
const CeedInt jj = (j + data.t_id_y) % Q_1D;
195+
for (CeedInt j = 0; j < Q_1D; j++) {
196+
const CeedInt jj = (j + data.t_id_x) % Q_1D;
196197

197-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
198-
}
198+
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
199199
}
200200
}
201201
// Pull from shared to register
@@ -256,22 +256,22 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedI
256256
// Contract y direction
257257
if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
258258
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
259+
const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
260+
259261
for (CeedInt i = 0; i < Q_1D; i++) {
260-
buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
262+
buffer[i] = chebyshev_x[i] * r_u;
261263
}
262264
// Contract x direction
263265
if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
264266
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
265-
if (p < NUM_POINTS) {
266-
for (CeedInt i = 0; i < Q_1D; i++) {
267-
// Note: shifting to avoid atomic adds
268-
const CeedInt ii = (i + data.t_id_x) % Q_1D;
267+
for (CeedInt i = 0; i < Q_1D; i++) {
268+
// Note: shifting to avoid atomic adds
269+
const CeedInt ii = (i + data.t_id_y) % Q_1D;
269270

270-
for (CeedInt j = 0; j < Q_1D; j++) {
271-
const CeedInt jj = (j + data.t_id_y) % Q_1D;
271+
for (CeedInt j = 0; j < Q_1D; j++) {
272+
const CeedInt jj = (j + data.t_id_x) % Q_1D;
272273

273-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
274-
}
274+
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
275275
}
276276
}
277277
}
@@ -342,21 +342,21 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const Cee
342342
__syncthreads();
343343
// Contract y and z direction
344344
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
345+
const CeedScalar r_u = p < NUM_POINTS ? r_U[comp] : 0.0;
346+
345347
for (CeedInt i = 0; i < Q_1D; i++) {
346-
buffer[i] = chebyshev_x[i] * r_U[comp] * z;
348+
buffer[i] = chebyshev_x[i] * r_u * z;
347349
}
348350
// Contract x direction
349351
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
350-
if (p < NUM_POINTS) {
351-
for (CeedInt i = 0; i < Q_1D; i++) {
352-
// Note: shifting to avoid atomic adds
353-
const CeedInt ii = (i + data.t_id_x) % Q_1D;
352+
for (CeedInt i = 0; i < Q_1D; i++) {
353+
// Note: shifting to avoid atomic adds
354+
const CeedInt ii = (i + data.t_id_y) % Q_1D;
354355

355-
for (CeedInt j = 0; j < Q_1D; j++) {
356-
const CeedInt jj = (j + data.t_id_y) % Q_1D;
356+
for (CeedInt j = 0; j < Q_1D; j++) {
357+
const CeedInt jj = (j + data.t_id_x) % Q_1D;
357358

358-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
359-
}
359+
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
360360
}
361361
}
362362
// Pull from shared to register
@@ -439,24 +439,23 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
439439
// Contract y and z direction
440440
if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
441441
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
442-
const CeedScalar zz = dim == 2 ? dz : z;
442+
const CeedScalar zz = dim == 2 ? dz : z;
443+
const CeedScalar r_u = p < NUM_POINTS ? r_U[comp + dim * NUM_COMP] : 0.0;
443444

444445
for (CeedInt i = 0; i < Q_1D; i++) {
445-
buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * zz;
446+
buffer[i] = chebyshev_x[i] * r_u * zz;
446447
}
447448
// Contract x direction
448449
if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
449450
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
450-
if (p < NUM_POINTS) {
451-
for (CeedInt i = 0; i < Q_1D; i++) {
452-
// Note: shifting to avoid atomic adds
453-
const CeedInt ii = (i + data.t_id_x) % Q_1D;
451+
for (CeedInt i = 0; i < Q_1D; i++) {
452+
// Note: shifting to avoid atomic adds
453+
const CeedInt ii = (i + data.t_id_y) % Q_1D;
454454

455-
for (CeedInt j = 0; j < Q_1D; j++) {
456-
const CeedInt jj = (j + data.t_id_y) % Q_1D;
455+
for (CeedInt j = 0; j < Q_1D; j++) {
456+
const CeedInt jj = (j + data.t_id_x) % Q_1D;
457457

458-
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
459-
}
458+
atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
460459
}
461460
}
462461
}

0 commit comments

Comments
 (0)