Skip to content

Conversation

@jeremylt
Copy link
Member

@jeremylt jeremylt commented Mar 6, 2025

Its silly that we didn't think of this sooner.

#define T_1D 6
#include <ceed/jit-source/cuda/cuda-jit.h>

// Tensor basis source
#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>

// CodeGen operator source
#include <ceed/jit-source/cuda/cuda-gen-templates.h>


#undef CEED_Q_VLA
#define CEED_Q_VLA 6

// User QFunction source
#include "/home/jeremy/Dev/libCEED/examples/ceed/ex1-volume.h"


// -----------------------------------------------------------------------------
// Operator Kernel
// 
// d_[in,out]_i:   CeedVector device array
// r_[in,out]_e_i: Element vector register
// r_[in,out]_q_i: Quadrature space vector register
// r_[in,out]_c_i: AtPoints Chebyshev coefficients register
// r_[in,out]_s_i: Quadrature space slice vector register
// 
// s_B_[in,out]_i: Interpolation matrix, shared memory
// s_G_[in,out]_i: Gradient matrix, shared memory
// -----------------------------------------------------------------------------
extern "C" __global__ void CeedKernelCudaGenOperator_apply_mass(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda points) {
  const CeedScalar *d_in_0 = fields.inputs[0];
  const CeedScalar *d_in_1 = fields.inputs[1];
  CeedScalar *d_out_0 = fields.outputs[0];
  const CeedInt dim = 3;
  const CeedInt Q_1d = 6;
  extern __shared__ CeedScalar slice[];
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
  data.slice = slice + data.t_id_z*T_1D*T_1D;

  // Input field constants and basis data
  // -- Input field 0: u
  const CeedInt P_1d_in_0 = 5;
  const CeedInt num_comp_in_0 = 1;
  // EvalMode: interpolation
  __shared__ CeedScalar s_B_in_0[P_1d_in_0*Q_1d];
  LoadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);
  // -- Input field 1: qdata
  const CeedInt P_1d_in_1 = 6;
  const CeedInt num_comp_in_1 = 1;
  // EvalMode: none

  // Output field constants and basis data
  // -- Output field 0: v
  const CeedInt P_1d_out_0 = 5;
  const CeedInt num_comp_out_0 = 1;
  // EvalMode: interpolation
  CeedScalar *s_B_out_0 = s_B_in_0;

  // Element loop
  __syncthreads();
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
    // Scratch restriction buffer space
    CeedScalar r_e_scratch[216];

    // -- Input field restrictions and basis actions
    // ---- Input field 0: u
    CeedScalar *r_e_in_0 = r_e_scratch;
    const CeedInt l_size_in_0 = 274625;
    // CompStride: 274625
    ReadLVecStandard3d<num_comp_in_0, 274625, P_1d_in_0>(data, l_size_in_0, elem, indices.inputs[0], d_in_0, r_e_in_0);
    // EvalMode: interpolation
    CeedScalar r_q_in_0[num_comp_in_0*Q_1d];
    InterpTensor3d<num_comp_in_0, P_1d_in_0, Q_1d>(data, r_e_in_0, s_B_in_0, r_q_in_0);
    // ---- Input field 1: qdata
    CeedScalar r_e_in_1[num_comp_in_1*P_1d_in_1];
    // Strides: {1, 884736, 216}
    ReadLVecStrided3d<num_comp_in_1, P_1d_in_1, 1, 884736, 216>(data, elem, d_in_1, r_e_in_1);
    // EvalMode: none
    CeedScalar *r_q_in_1 = r_e_in_1;

    // -- Output field setup
    // ---- Output field 0: v
    CeedScalar r_q_out_0[num_comp_out_0*Q_1d];

    // Note: Using full elements
    {
      // -- Input fields
      // ---- Input field 0: u
      CeedScalar *r_s_in_0 = r_q_in_0;
      // ---- Input field 1: qdata
      CeedScalar *r_s_in_1 = r_q_in_1;
      // -- Output fields
      // ---- Output field 0: v
      CeedScalar *r_s_out_0 = r_q_out_0;

      // -- QFunction inputs and outputs
      // ---- Inputs
      CeedScalar *inputs[2];
      // ------ Input field 0: u
      inputs[0] = r_s_in_0;
      // ------ Input field 1: qdata
      inputs[1] = r_s_in_1;
      // ---- Outputs
      CeedScalar *outputs[1];
      // ------ Output field 0: v
      outputs[0] = r_s_out_0;

      // -- Apply QFunction
      apply_mass(ctx, Q_1d, inputs, outputs);
    }

    // -- Output field basis action and restrictions
    // ---- Output field 0: v
    // EvalMode: interpolation
    CeedScalar *r_e_out_0 = r_e_scratch;
    InterpTransposeTensor3d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_q_out_0, s_B_out_0, r_e_out_0);
    const CeedInt l_size_out_0 = 274625;
    // CompStride: 274625
    WriteLVecStandard3d<num_comp_out_0, 274625, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
  }
}
// -----------------------------------------------------------------------------

@jeremylt jeremylt force-pushed the jeremy/gen-field-names branch from 8c77781 to 59fa3f9 Compare March 6, 2025 23:41
Copy link
Member

@jedbrown jedbrown left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice. Scanning the generated code, can we add more __restrict? (I don't know if it'll make a perf difference.)

@jeremylt jeremylt merged commit 7577ddf into main Mar 7, 2025
29 checks passed
@jeremylt jeremylt deleted the jeremy/gen-field-names branch March 7, 2025 17:02
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants