gen - use field names for clarity #1772

jeremylt · 2025-03-06T23:32:11Z

It's silly that we didn't think of this sooner.

#define T_1D 6
#include <ceed/jit-source/cuda/cuda-jit.h>

// Tensor basis source
#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>

// CodeGen operator source
#include <ceed/jit-source/cuda/cuda-gen-templates.h>


#undef CEED_Q_VLA
#define CEED_Q_VLA 6

// User QFunction source
#include "/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume.h"


// -----------------------------------------------------------------------------
// Operator Kernel
// 
// d_[in,out]_i:   CeedVector device array
// r_[in,out]_e_i: Element vector register
// r_[in,out]_q_i: Quadrature space vector register
// r_[in,out]_c_i: AtPoints Chebyshev coefficients register
// r_[in,out]_s_i: Quadrature space slice vector register
// 
// s_B_[in,out]_i: Interpolation matrix, shared memory
// s_G_[in,out]_i: Gradient matrix, shared memory
//
// Fields handled by this kernel:
//   input 0  "u":     standard (indexed) restriction, interpolated to quadrature points
//   input 1  "qdata": strided restriction, used directly at quadrature points
//   output 0 "v":     interpolation transpose, then standard (indexed) restriction
//
// Work distribution: each threadIdx.z layer of a block handles one element per
// loop iteration, starting at blockIdx.x*blockDim.z + threadIdx.z and advancing
// by gridDim.x*blockDim.z until num_elem is reached.
//
// Shared memory: the dynamic shared array `slice` is offset by
// threadIdx.z*T_1D*T_1D for each layer, so the launch must supply dynamic shared
// memory covering every layer's offset.
// NOTE(review): the per-layer extent actually used is set by the basis
// templates, which are not visible here - confirm against the launch code.
//
// Aliasing: the device arrays are declared __restrict__, i.e. the kernel
// promises that the input and output arrays do not overlap.
// NOTE(review): confirm that callers never apply the operator with the same
// vector as both input and output.
// -----------------------------------------------------------------------------
extern "C" __global__ void CeedKernelCudaGenOperator_apply_mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda points) {
  // Device arrays for each field; __restrict__ lets the compiler assume the
  // read-only inputs are never modified through the output pointer
  const CeedScalar *__restrict__ d_in_0 = fields.inputs[0];
  const CeedScalar *__restrict__ d_in_1 = fields.inputs[1];
  CeedScalar *__restrict__ d_out_0 = fields.outputs[0];
  const CeedInt dim = 3;
  const CeedInt Q_1d = 6;
  extern __shared__ CeedScalar slice[];
  // Per-thread indexing data handed to every restriction/basis template
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
  // Each threadIdx.z layer gets its own T_1D*T_1D offset into the dynamic shared array
  data.slice = slice + data.t_id_z*T_1D*T_1D;

  // Input field constants and basis data
  // -- Input field 0: u
  const CeedInt P_1d_in_0 = 5;
  const CeedInt num_comp_in_0 = 1;
  // EvalMode: interpolation
  __shared__ CeedScalar s_B_in_0[P_1d_in_0*Q_1d];
  LoadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);
  // -- Input field 1: qdata
  const CeedInt P_1d_in_1 = 6;
  const CeedInt num_comp_in_1 = 1;
  // EvalMode: none

  // Output field constants and basis data
  // -- Output field 0: v
  const CeedInt P_1d_out_0 = 5;
  const CeedInt num_comp_out_0 = 1;
  // EvalMode: interpolation
  // The output field reuses the interpolation matrix already loaded for input field 0
  CeedScalar *s_B_out_0 = s_B_in_0;

  // Element loop
  // Barrier so the shared interpolation matrix is fully loaded before any thread reads it
  __syncthreads();
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
    // Scratch restriction buffer space
    // Shared by the input 0 element vector and, later, the output 0 element vector
    CeedScalar r_e_scratch[216];

    // -- Input field restrictions and basis actions
    // ---- Input field 0: u
    CeedScalar *r_e_in_0 = r_e_scratch;
    const CeedInt l_size_in_0 = 274625;
    // CompStride: 274625
    ReadLVecStandard3d<num_comp_in_0, 274625, P_1d_in_0>(data, l_size_in_0, elem, indices.inputs[0], d_in_0, r_e_in_0);
    // EvalMode: interpolation
    CeedScalar r_q_in_0[num_comp_in_0*Q_1d];
    InterpTensor3d<num_comp_in_0, P_1d_in_0, Q_1d>(data, r_e_in_0, s_B_in_0, r_q_in_0);
    // ---- Input field 1: qdata
    CeedScalar r_e_in_1[num_comp_in_1*P_1d_in_1];
    // Strides: {1, 884736, 216}
    ReadLVecStrided3d<num_comp_in_1, P_1d_in_1, 1, 884736, 216>(data, elem, d_in_1, r_e_in_1);
    // EvalMode: none
    // No basis action: the element values are used directly as quadrature values
    CeedScalar *r_q_in_1 = r_e_in_1;

    // -- Output field setup
    // ---- Output field 0: v
    CeedScalar r_q_out_0[num_comp_out_0*Q_1d];

    // Note: Using full elements
    {
      // -- Input fields
      // ---- Input field 0: u
      CeedScalar *r_s_in_0 = r_q_in_0;
      // ---- Input field 1: qdata
      CeedScalar *r_s_in_1 = r_q_in_1;
      // -- Output fields
      // ---- Output field 0: v
      CeedScalar *r_s_out_0 = r_q_out_0;

      // -- QFunction inputs and outputs
      // ---- Inputs
      CeedScalar *inputs[2];
      // ------ Input field 0: u
      inputs[0] = r_s_in_0;
      // ------ Input field 1: qdata
      inputs[1] = r_s_in_1;
      // ---- Outputs
      CeedScalar *outputs[1];
      // ------ Output field 0: v
      outputs[0] = r_s_out_0;

      // -- Apply QFunction
      apply_mass(ctx, Q_1d, inputs, outputs);
    }

    // -- Output field basis action and restrictions
    // ---- Output field 0: v
    // EvalMode: interpolation
    // Reuses the scratch buffer; the input 0 element vector is no longer needed here
    CeedScalar *r_e_out_0 = r_e_scratch;
    InterpTransposeTensor3d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_q_out_0, s_B_out_0, r_e_out_0);
    const CeedInt l_size_out_0 = 274625;
    // CompStride: 274625
    WriteLVecStandard3d<num_comp_out_0, 274625, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
  }
}
// -----------------------------------------------------------------------------

jedbrown

Nice. Scanning the generated code, can we add more __restrict? (I don't know if it'll make a perf difference.)

jeremylt added minor GPU 1-In Review labels Mar 6, 2025

jeremylt self-assigned this Mar 6, 2025

gen - use field names for clarity

59fa3f9

jeremylt force-pushed the jeremy/gen-field-names branch from 8c77781 to 59fa3f9 Compare March 6, 2025 23:41

jedbrown approved these changes Mar 7, 2025

View reviewed changes

gen - restrict input/output array pointers

826538b

jeremylt merged commit 7577ddf into main Mar 7, 2025
29 checks passed

jeremylt deleted the jeremy/gen-field-names branch March 7, 2025 17:02

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

gen - use field names for clarity #1772

gen - use field names for clarity #1772

Uh oh!

jeremylt commented Mar 6, 2025

Uh oh!

jedbrown left a comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

gen - use field names for clarity #1772

gen - use field names for clarity #1772

Uh oh!

Conversation

jeremylt commented Mar 6, 2025

Uh oh!

jedbrown left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants