Commit 4b6745b

Merge pull request #1762 from CEED/jeremy/gen-mixed

Mixed Tensor/NonTensor for Gen

2 parents 20a16a5 + 8b89f79

21 files changed: +1679 -436 lines

backends/cuda-gen/ceed-cuda-gen-operator-build.cpp

Lines changed: 265 additions & 169 deletions
Large diffs are not rendered by default.

backends/cuda-gen/ceed-cuda-gen-operator.c

Lines changed: 4 additions & 8 deletions
@@ -197,22 +197,18 @@ static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, c
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));

   // Apply operator
-  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
-  const CeedInt dim       = data->dim;
-  const CeedInt Q_1d      = data->Q_1d;
-  const CeedInt P_1d      = data->max_P_1d;
-  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
-  int           max_threads_per_block, min_grid_size, grid;
+  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
+  int   max_threads_per_block, min_grid_size, grid;

   CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
-  int block[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};
+  int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};

   if (is_tensor) {
     CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, is_at_points ? 1 : max_threads_per_block,
                                        cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
   } else {
-    CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));
+    CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1));

     grid     = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
     block[2] = elems_per_block;
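For context, the launch bounds that were previously recomputed on every operator apply (notably thread_1d = max(Q_1d, max_P_1d)) are now cached on the backend data struct when the kernel is built, so the apply path only reads them. A minimal C++ sketch of that pattern, with simplified stand-in names rather than the actual libCEED code (the real logic lives in ceed-cuda-gen-operator-build.cpp):

// Sketch: compute the launch bound once at kernel-build time and cache it,
// instead of recomputing it on every apply. Stand-in struct, not libCEED.
#include <algorithm>
#include <cstdio>

struct OperatorDataSketch {
  int dim       = 0;
  int Q_1d      = 0;
  int max_P_1d  = 0;
  int thread_1d = 0;  // new cached field: max(Q_1d, max_P_1d)
};

// Runs once when the kernel is built.
void BuildSketch(OperatorDataSketch &data, int dim, int Q_1d, int max_P_1d) {
  data.dim       = dim;
  data.Q_1d      = Q_1d;
  data.max_P_1d  = max_P_1d;
  data.thread_1d = std::max(Q_1d, max_P_1d);
}

// Runs on every apply; only reads the cached value.
void ApplySketch(const OperatorDataSketch &data, bool is_tensor) {
  int block_x = data.thread_1d;
  int block_y = (!is_tensor || data.dim == 1) ? 1 : data.thread_1d;
  std::printf("block = (%d, %d, ...)\n", block_x, block_y);
}

int main() {
  OperatorDataSketch data;
  BuildSketch(data, /*dim=*/3, /*Q_1d=*/4, /*max_P_1d=*/3);
  ApplySketch(data, /*is_tensor=*/true);  // prints "block = (4, 4, ...)"
}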

backends/cuda-gen/ceed-cuda-gen.h

Lines changed: 2 additions & 1 deletion
@@ -14,8 +14,9 @@
 typedef struct {
   bool           use_fallback;
   CeedInt        dim;
-  CeedInt        Q_1d;
+  CeedInt        Q, Q_1d;
   CeedInt        max_P_1d;
+  CeedInt        thread_1d;
   CUmodule       module;
   CUfunction     op;
   FieldsInt_Cuda indices;

backends/hip-gen/ceed-hip-gen-operator-build.cpp

Lines changed: 261 additions & 167 deletions
Large diffs are not rendered by default.

backends/hip-gen/ceed-hip-gen-operator.c

Lines changed: 10 additions & 14 deletions
@@ -131,39 +131,35 @@ static int CeedOperatorApplyAddCore_Hip_gen(CeedOperator op, hipStream_t stream,
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));

   // Apply operator
-  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
-  const CeedInt dim       = data->dim;
-  const CeedInt Q_1d      = data->Q_1d;
-  const CeedInt P_1d      = data->max_P_1d;
-  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
+  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};

   CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
-  CeedInt block_sizes[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};
+  CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1};

   if (is_tensor) {
-    CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, P_1d, Q_1d, block_sizes));
+    CeedCallBackend(BlockGridCalculate_Hip_gen(data->dim, num_elem, data->max_P_1d, data->Q_1d, block_sizes));
     if (is_at_points) block_sizes[2] = 1;
   } else {
-    CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
+    CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64;

     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
     block_sizes[2]  = elems_per_block;
   }
-  if (dim == 1 || !is_tensor) {
+  if (data->dim == 1 || !is_tensor) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar);
+    CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar);

     CeedCallBackend(
         CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
-  } else if (dim == 2) {
+  } else if (data->dim == 2) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
+    CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

     CeedCallBackend(
         CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
-  } else if (dim == 3) {
+  } else if (data->dim == 3) {
     CeedInt grid      = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0);
-    CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar);
+    CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar);

     CeedCallBackend(
         CeedTryRunKernelDimShared_Hip(ceed, data->op, stream, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, is_run_good, opargs));
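The launch sizing above uses ceiling division for the grid and sizes shared memory as one scratch slab per element: thread_1d scalars per element in 1D (and the non-tensor path), thread_1d^2 per element in 2D and 3D. A self-contained C++ sketch of those two formulas, with illustrative names rather than the libCEED API:

// Sketch of the launch sizing in the hunk above. Illustrative only.
#include <cstddef>
#include <cstdio>

// Blocks needed to cover num_elem elements, elems_per_block per block.
int GridSizeSketch(int num_elem, int elems_per_block) {
  return num_elem / elems_per_block + (num_elem % elems_per_block > 0);
}

// Shared bytes per block: thread_1d scalars per element in 1D, and
// thread_1d^2 scalars per element in 2D and 3D.
std::size_t SharedBytesSketch(int dim, int thread_1d, int elems_per_block, std::size_t scalar_bytes) {
  std::size_t slab = (dim == 1) ? static_cast<std::size_t>(thread_1d)
                                : static_cast<std::size_t>(thread_1d) * thread_1d;
  return static_cast<std::size_t>(elems_per_block) * slab * scalar_bytes;
}

int main() {
  // 1000 elements, 8 per block -> 125 blocks; 2D with thread_1d = 8.
  int         grid  = GridSizeSketch(1000, 8);
  std::size_t bytes = SharedBytesSketch(2, 8, 8, sizeof(double));
  std::printf("grid = %d, sharedMem = %zu bytes\n", grid, bytes);  // 125, 4096
}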

backends/hip-gen/ceed-hip-gen.h

Lines changed: 2 additions & 1 deletion
@@ -14,8 +14,9 @@
 typedef struct {
   bool          use_fallback;
   CeedInt       dim;
-  CeedInt       Q_1d;
+  CeedInt       Q, Q_1d;
   CeedInt       max_P_1d;
+  CeedInt       thread_1d;
   hipModule_t   module;
   hipFunction_t op;
   FieldsInt_Hip indices;

doc/sphinx/source/releasenotes.md

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ On this page we provide a summary of the main API changes, new features and exam
 - Allow user to set additional compiler options for CUDA and HIP JiT.
   Specifically, directories set with `CeedAddJitSourceRoot(ceed, "foo/bar")` will be used to set `-Ifoo/bar` and defines set with `CeedAddJitDefine(ceed, "foo=bar")` will be used to set `-Dfoo=bar`.
 - Added non-tensor basis support to code generation backends `/gpu/cuda/gen` and `/gpu/hip/gen`.
+- Added support to code generation backends `/gpu/cuda/gen` and `/gpu/hip/gen` for operators with both tensor and non-tensor bases.

 ### Examples

include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h

Lines changed: 1 addition & 1 deletion
@@ -92,7 +92,7 @@ inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const CeedS
 //------------------------------------------------------------------------------
 // Quadrature weights
 //------------------------------------------------------------------------------
-template <int Q>
+template <int P, int Q>
 inline __device__ void WeightNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) {
   *w = (data.t_id_x < Q) ? q_weight[data.t_id_x] : 0.0;
 }

include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h

Lines changed: 1 addition & 1 deletion
@@ -194,7 +194,7 @@ extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__re
   CeedScalar r_W[1];

   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
-    WeightNonTensor<BASIS_Q>(data, q_weight, r_W);
+    WeightNonTensor<BASIS_P, BASIS_Q>(data, q_weight, r_W);
     WriteElementStrided1d<1, BASIS_Q>(data, elem, 1, BASIS_Q * num_elem, BASIS_Q, r_W, d_W);
   }
 }
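WeightNonTensor reads only Q; carrying P as well appears intended to give every non-tensor basis template the same <P, Q> signature so the generated code can instantiate them all uniformly. A reduced, self-contained CUDA illustration of that convention (hypothetical names and values, not the libCEED kernel):

// Reduced illustration of a unified <P, Q> signature: the weight kernel
// only reads Q but carries P so every basis template is called the same way.
#include <cstdio>

template <int P, int Q>
__global__ void WeightSketch(const double *q_weight, double *w) {
  const int i = threadIdx.x;
  if (i < Q) w[i] = q_weight[i];  // P is unused here, by design
}

int main() {
  constexpr int Q = 4;
  const double  h_qw[Q] = {0.17, 0.33, 0.33, 0.17};
  double       *d_qw = nullptr, *d_w = nullptr;
  cudaMalloc(&d_qw, Q * sizeof(double));
  cudaMalloc(&d_w, Q * sizeof(double));
  cudaMemcpy(d_qw, h_qw, Q * sizeof(double), cudaMemcpyHostToDevice);
  WeightSketch<3, Q><<<1, Q>>>(d_qw, d_w);  // P = 3 carried but unused
  double h_w[Q];
  cudaMemcpy(h_w, d_w, Q * sizeof(double), cudaMemcpyDeviceToHost);
  for (int i = 0; i < Q; i++) std::printf("w[%d] = %g\n", i, h_w[i]);
  cudaFree(d_qw);
  cudaFree(d_w);
}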

include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h

Lines changed: 12 additions & 12 deletions
@@ -40,7 +40,7 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar
 //------------------------------------------------------------------------------
 // 1D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                         CeedScalar *__restrict__ r_V) {
   CeedScalar chebyshev_x[Q_1D];
@@ -61,7 +61,7 @@ inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p,
 //------------------------------------------------------------------------------
 // 1D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                  CeedScalar *__restrict__ r_C) {
   CeedScalar chebyshev_x[Q_1D];
@@ -86,7 +86,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const Ce
 //------------------------------------------------------------------------------
 // 1D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                       CeedScalar *__restrict__ r_V) {
   CeedScalar chebyshev_x[Q_1D];
@@ -107,7 +107,7 @@ inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, co
 //------------------------------------------------------------------------------
 // 1D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                CeedScalar *__restrict__ r_C) {
   CeedScalar chebyshev_x[Q_1D];
@@ -136,7 +136,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // 2D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                         CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
@@ -166,7 +166,7 @@ inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p,
 //------------------------------------------------------------------------------
 // 2D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                  CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -204,7 +204,7 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
 //------------------------------------------------------------------------------
 // 2D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                       CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
@@ -238,7 +238,7 @@ inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, co
 //------------------------------------------------------------------------------
 // 2D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -284,7 +284,7 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
 //------------------------------------------------------------------------------
 // 3D interpolate to points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                         CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
@@ -319,7 +319,7 @@ inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p,
 //------------------------------------------------------------------------------
 // 3D interpolate transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                  CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
@@ -362,7 +362,7 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
 //------------------------------------------------------------------------------
 // 3D derivatives at points
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X,
                                       CeedScalar *__restrict__ r_V) {
   for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
@@ -402,7 +402,7 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co
 //------------------------------------------------------------------------------
 // 3D derivatives transpose
 //------------------------------------------------------------------------------
-template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+template <int NUM_COMP, int NUM_POINTS, int P_1D, int Q_1D>
 inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X,
                                                CeedScalar *__restrict__ r_C) {
   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
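Every at-points template now carries P_1D alongside Q_1D, again keeping the template signatures uniform across the basis kernels the gen backend emits. A hypothetical instantiation site might look like the following; the include path and constant values are assumptions, and only the template and type names come from this header:

// Hypothetical instantiation: generated code pins both basis sizes as
// compile-time template arguments. Values and include path are assumed.
#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>

constexpr int NUM_COMP   = 1;
constexpr int NUM_POINTS = 16;
constexpr int P_1D       = 3;
constexpr int Q_1D       = 4;

inline __device__ void ApplyInterpAtPointsSketch(SharedData_Cuda &data, const CeedInt p, const CeedScalar *r_C, const CeedScalar *r_X,
                                                 CeedScalar *r_V) {
  InterpAtPoints3d<NUM_COMP, NUM_POINTS, P_1D, Q_1D>(data, p, r_C, r_X, r_V);
}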
