gpu - minor reduction in AtPoints grad FLOPs

jeremylt · jeremylt · commit 64efbe91070c · 2025-07-11T11:33:13.000-06:00
diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h
@@ -376,22 +376,20 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
-    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-      // Get z contraction value
-      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-      CeedScalar z = chebyshev_x[k];
+    // Get z contraction values
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar dz = chebyshev_x[k];
 
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // Load coefficients
       __syncthreads();
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
       __syncthreads();
       // Gradient directions
       for (CeedInt dim = 0; dim < 3; dim++) {
-        // Update z value for final pass
-        if (dim == 2) {
-          ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
-          z = chebyshev_x[k];
-        }
         // Contract x direction
         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -404,8 +402,10 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co
         // Contract y and z direction
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        const CeedScalar zz = dim == 2 ? dz : z;
+
         for (CeedInt i = 0; i < Q_1D; i++) {
-          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
+          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
         }
       }
     }
@@ -422,26 +422,26 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
-    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-      // Get z contraction value
-      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-      CeedScalar z = chebyshev_x[k];
+    // Get z contraction values
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar dz = chebyshev_x[k];
 
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // Clear shared memory
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
       __syncthreads();
       // Gradient directions
       for (CeedInt dim = 0; dim < 3; dim++) {
-        // Update z value for final pass
-        if (dim == 2) {
-          ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
-          z = chebyshev_x[k];
-        }
         // Contract y and z direction
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        const CeedScalar zz = dim == 2 ? dz : z;
+
         for (CeedInt i = 0; i < Q_1D; i++) {
-          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
+          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * zz;
         }
         // Contract x direction
         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h
@@ -377,22 +377,20 @@ inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, con
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
-    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-      // Get z contraction value
-      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-      CeedScalar z = chebyshev_x[k];
+    // Get z contraction values
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar dz = chebyshev_x[k];
 
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // Load coefficients
       __syncthreads();
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
       __syncthreads();
       // Gradient directions
       for (CeedInt dim = 0; dim < 3; dim++) {
-        // Update z value for final pass
-        if (dim == 2) {
-          ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
-          z = chebyshev_x[k];
-        }
         // Contract x direction
         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
@@ -405,8 +403,10 @@ inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, con
         // Contract y and z direction
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        const CeedScalar zz = dim == 2 ? dz : z;
+
         for (CeedInt i = 0; i < Q_1D; i++) {
-          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
+          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * zz;
         }
       }
     }
@@ -423,26 +423,26 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
     CeedScalar buffer[Q_1D];
     CeedScalar chebyshev_x[Q_1D];
 
-    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
-      // Get z contraction value
-      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
-      CeedScalar z = chebyshev_x[k];
+    // Get z contraction values
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar z = chebyshev_x[k];
+
+    ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+    const CeedScalar dz = chebyshev_x[k];
 
+    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
       // Clear shared memory
       if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
       __syncthreads();
       // Gradient directions
       for (CeedInt dim = 0; dim < 3; dim++) {
-        // Update z value for final pass
-        if (dim == 2) {
-          ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
-          z = chebyshev_x[k];
-        }
         // Contract y and z direction
         if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
         else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        const CeedScalar zz = dim == 2 ? dz : z;
+
         for (CeedInt i = 0; i < Q_1D; i++) {
-          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
+          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * zz;
         }
         // Contract x direction
         if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
@@ -939,9 +939,10 @@ int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEva
           *flops = tensor_flops + num_points * num_comp * (point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0));
           if (dim == 3 && is_gpu) {
             CeedInt inner_flops =
-                dim * (2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 : 3) * Q_1d + d_chebyshev_flops) + (2 * dim - 1) * chebyshev_flops;
+                dim * (2 * Q_1d * Q_1d + (t_mode == CEED_TRANSPOSE ? 2 : 3) * Q_1d) + (dim - 1) * (2 * chebyshev_flops + d_chebyshev_flops);
 
-            *flops += num_points * Q_1d * num_comp * (inner_flops + (t_mode == CEED_TRANSPOSE ? 1 : 0));
+            // z-direction Chebyshev value/derivative are evaluated once per slice, outside the component loop: add that cost, do not scale by it
+            *flops += num_points * Q_1d * (chebyshev_flops + d_chebyshev_flops + num_comp * (inner_flops + (t_mode == CEED_TRANSPOSE ? 1 : 0)));
           } else {
             *flops += num_points * (is_gpu ? num_comp : 1) * dim * (d_chebyshev_flops + (dim - 1) * chebyshev_flops);
           }