Skip to content

Commit 64efbe9

Browse files
committed
gpu - minor reduction in AtPoints grad FLOPs
1 parent 65d1306 commit 64efbe9

File tree

3 files changed

+42
-42
lines changed

3 files changed

+42
-42
lines changed

include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -376,22 +376,20 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co
376376
CeedScalar buffer[Q_1D];
377377
CeedScalar chebyshev_x[Q_1D];
378378

379-
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
380-
// Get z contraction value
381-
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
382-
CeedScalar z = chebyshev_x[k];
379+
// Get z contraction values
380+
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
381+
const CeedScalar z = chebyshev_x[k];
382+
383+
ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
384+
const CeedScalar dz = chebyshev_x[k];
383385

386+
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
384387
// Load coefficients
385388
__syncthreads();
386389
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
387390
__syncthreads();
388391
// Gradient directions
389392
for (CeedInt dim = 0; dim < 3; dim++) {
390-
// Update z value for final pass
391-
if (dim == 2) {
392-
ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
393-
z = chebyshev_x[k];
394-
}
395393
// Contract x direction
396394
if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
397395
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -404,8 +402,10 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co
404402
// Contract y and z direction
405403
if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
406404
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
405+
const CeedScalar zz = dim == 2 ? dz : z;
406+
407407
for (CeedInt i = 0; i < Q_1D; i++) {
408-
r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
408+
r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
409409
}
410410
}
411411
}
@@ -422,26 +422,26 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
422422
CeedScalar buffer[Q_1D];
423423
CeedScalar chebyshev_x[Q_1D];
424424

425-
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
426-
// Get z contraction value
427-
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
428-
CeedScalar z = chebyshev_x[k];
425+
// Get z contraction values
426+
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
427+
const CeedScalar z = chebyshev_x[k];
428+
429+
ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
430+
const CeedScalar dz = chebyshev_x[k];
429431

432+
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
430433
// Clear shared memory
431434
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
432435
__syncthreads();
433436
// Gradient directions
434437
for (CeedInt dim = 0; dim < 3; dim++) {
435-
// Update z value for final pass
436-
if (dim == 2) {
437-
ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
438-
z = chebyshev_x[k];
439-
}
440438
// Contract y and z direction
441439
if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
442440
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
441+
const CeedScalar zz = dim == 2 ? dz : z;
442+
443443
for (CeedInt i = 0; i < Q_1D; i++) {
444-
buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
444+
buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * zz;
445445
}
446446
// Contract x direction
447447
if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);

include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -377,22 +377,20 @@ inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, con
377377
CeedScalar buffer[Q_1D];
378378
CeedScalar chebyshev_x[Q_1D];
379379

380-
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
381-
// Get z contraction value
382-
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
383-
CeedScalar z = chebyshev_x[k];
380+
// Get z contraction values
381+
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
382+
const CeedScalar z = chebyshev_x[k];
383+
384+
ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
385+
const CeedScalar dz = chebyshev_x[k];
384386

387+
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
385388
// Load coefficients
386389
__syncthreads();
387390
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
388391
__syncthreads();
389392
// Gradient directions
390393
for (CeedInt dim = 0; dim < 3; dim++) {
391-
// Update z value for final pass
392-
if (dim == 2) {
393-
ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
394-
z = chebyshev_x[k];
395-
}
396394
// Contract x direction
397395
if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
398396
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -405,8 +403,10 @@ inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, con
405403
// Contract y and z direction
406404
if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
407405
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
406+
const CeedScalar zz = dim == 2 ? dz : z;
407+
408408
for (CeedInt i = 0; i < Q_1D; i++) {
409-
r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
409+
r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
410410
}
411411
}
412412
}
@@ -423,26 +423,26 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
423423
CeedScalar buffer[Q_1D];
424424
CeedScalar chebyshev_x[Q_1D];
425425

426-
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
427-
// Get z contraction value
428-
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
429-
CeedScalar z = chebyshev_x[k];
426+
// Get z contraction values
427+
ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
428+
const CeedScalar z = chebyshev_x[k];
429+
430+
ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
431+
const CeedScalar dz = chebyshev_x[k];
430432

433+
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
431434
// Clear shared memory
432435
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
433436
__syncthreads();
434437
// Gradient directions
435438
for (CeedInt dim = 0; dim < 3; dim++) {
436-
// Update z value for final pass
437-
if (dim == 2) {
438-
ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
439-
z = chebyshev_x[k];
440-
}
441439
// Contract y and z direction
442440
if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
443441
else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
442+
const CeedScalar zz = dim == 2 ? dz : z;
443+
444444
for (CeedInt i = 0; i < Q_1D; i++) {
445-
buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
445+
buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * zz;
446446
}
447447
// Contract x direction
448448
if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);

interface/ceed-basis.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -939,9 +939,9 @@ int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEva
939939
*flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0));
940940
if (dim == 3 && is_gpu) {
941941
CeedInt inner_flops =
942-
dim * (2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 : 3) * Q_1d + d_chebyshev_flops) + (2 * dim - 1) * chebyshev_flops;
942+
dim * (2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 : 3) * Q_1d) + (dim - 1) * (2 * chebyshev_flops + d_chebyshev_flops);
943943

944-
*flops += num_points * Q_1d * num_comp * (inner_flops + (t_mode == CEED_TRANSPOSE ? 1 : 0));
944+
*flops += num_points * Q_1d * (chebyshev_flops + d_chebyshev_flops) * num_comp * (inner_flops + (t_mode == CEED_TRANSPOSE ? 1 : 0));
945945
} else {
946946
*flops += num_points * (is_gpu ? num_comp : 1) * dim * (d_chebyshev_flops + (dim - 1) * chebyshev_flops);
947947
}

0 commit comments

Comments
 (0)