Commit 8e1cf66

cuda - fix 2D flattening

1 parent 336ccc0

4 files changed: +69 additions, -44 deletions

backends/cuda-gen/ceed-cuda-gen-operator-build.cpp

Lines changed: 23 additions & 17 deletions
@@ -177,7 +177,7 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 // Setup fields
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
-      CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt Q_1d, bool is_input,
+      CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt Q, CeedInt Q_1d, bool is_input,
       bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
   bool is_tensor = true;
   CeedBasis basis;
@@ -220,7 +220,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
   CeedInt P = 0;
 
   CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
-  code << " const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P) << ";\n";
+  code << " const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
   }
   code << " const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
   if (eval_mode != CEED_EVAL_WEIGHT) {
@@ -528,10 +528,11 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
   std::string function_name = is_tensor
      ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
      : "InterpNonTensor";
+  std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
   code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
-  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
-      << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
+  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+      << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
   }
   break;
   case CEED_EVAL_GRAD:
@@ -551,17 +552,18 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
   bool is_collocated = dim == 3 && Q_1d >= P_1d;
   std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
      (is_all_tensor ? "" : "Flattened");
+  std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
   code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" << (dim >= 3 ? Q_name : "1")
      << "];\n";
-  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
-      << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
+  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
+      << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
   } else {
   std::string function_name = "GradNonTensor";
 
   code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
-  code << " " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name << ", "
-      << (P_1d > Q_1d ? P_name : Q_name) << ">(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
+  code << " " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+      << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
   }
   break;
   case CEED_EVAL_WEIGHT: {
@@ -602,9 +604,10 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
   std::string function_name =
      is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                : "InterpTransposeNonTensor";
+  std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
-      << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
+  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+      << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
   }
   break;
   case CEED_EVAL_GRAD:
@@ -623,14 +626,15 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
   bool is_collocated = dim == 3 && Q_1d >= P_1d;
   std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
      std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
+  std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
 
-  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
-      << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+  code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
+      << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
   } else {
   std::string function_name = "GradTransposeNonTensor";
 
-  code << " " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name << ", "
-      << (P_1d > Q_1d ? P_name : Q_name) << ">(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
+  code << " " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
+      << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
   }
   break;
   // LCOV_EXCL_START
@@ -1160,6 +1164,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   if (is_at_points) Q_1d = max_num_points;
   else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
   }
+  if (Q == 0) Q = Q_1d;
+  data->Q = Q;
   data->Q_1d = Q_1d;
 
   // Check for restriction only identity operator
@@ -1389,13 +1395,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b
   // Initialize constants, and matrices B and G
   code << "\n // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-  CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d,
+  CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q, Q_1d,
      true, is_all_tensor, is_at_points, use_3d_slices));
   }
   code << "\n // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-  CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d,
-     false, is_all_tensor, is_at_points, use_3d_slices));
+  CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q,
+     Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
   }
 
   // Loop over all elements
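
Note: the build-side changes thread both the total quadrature point count Q and the 1D count Q_1d into the field-data generator (so a CEED_BASIS_NONE field is sized by total points), and route the fourth template argument of every basis call through a single op_t_1d_name string. A standalone restatement of that selection rule as a hypothetical helper, not part of this commit; the parameter names below are placeholders, not generated libCEED identifiers:

#include <iostream>
#include <string>

// Mirrors the op_t_1d_name choice above: non-tensor basis kernels and operators whose
// bases are all tensor-product keep the fixed OP_T_1D bound, while the "Flattened"
// tensor kernels of a mixed operator use the field's larger of P_1d/Q_1d.
static std::string SelectOpT1d(bool is_all_tensor, bool is_tensor, int P_1d, int Q_1d,
                               const std::string &P_name, const std::string &Q_name) {
  if (is_all_tensor || !is_tensor) return "OP_T_1D";
  return (P_1d > Q_1d) ? P_name : Q_name;
}

int main() {
  // Example: a 2D tensor field with P_1d = 3, Q_1d = 4 inside an operator that also
  // has a non-tensor field; the flattened kernel is instantiated with the Q-side name.
  std::cout << SelectOpT1d(false, true, 3, 4, "P_1d_f", "Q_1d_f") << "\n";
  return 0;
}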

backends/cuda-gen/ceed-cuda-gen-operator.c

Lines changed: 5 additions & 7 deletions
@@ -197,16 +197,14 @@ static int CeedOperatorApplyAddCore_Cuda_gen(CeedOperator op, CUstream stream, c
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
 
   // Apply operator
-  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
-  const CeedInt dim = data->dim;
-  const CeedInt Q_1d = data->Q_1d;
-  const CeedInt P_1d = data->max_P_1d;
-  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
-  int max_threads_per_block, min_grid_size, grid;
+  void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
+  int max_threads_per_block, min_grid_size, grid;
 
   CeedCallBackend(CeedOperatorHasTensorBases(op, &is_tensor));
+  const CeedInt thread_1d = CeedIntMax(is_tensor ? data->Q_1d : data->Q, data->max_P_1d);
+
   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
-  int block[3] = {thread_1d, ((!is_tensor || dim == 1) ? 1 : thread_1d), -1};
+  int block[3] = {thread_1d, ((!is_tensor || data->dim == 1) ? 1 : thread_1d), -1};
 
   if (is_tensor) {
   CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
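
Note: on the launch side the thread count now distinguishes the two cases. When is_tensor (from CeedOperatorHasTensorBases) is true, the per-dimension count stays max(Q_1d, max_P_1d); otherwise the flattened code paths run on a block that is one-dimensional in x, so it must cover the full point count Q. A minimal sketch of that arithmetic with assumed values, not taken from the commit:

#include <algorithm>
#include <cstdio>

int main() {
  // Assumed example: a 2D operator mixing a tensor basis (P_1d = 3, Q_1d = 4)
  // with a non-tensor basis on the same Q = 16 quadrature points.
  const int  dim = 2, max_P_1d = 3, Q_1d = 4, Q = 16;
  const bool is_tensor = false;  // not all bases are tensor-product

  const int thread_1d = std::max(is_tensor ? Q_1d : Q, max_P_1d);
  const int block_x   = thread_1d;
  const int block_y   = (!is_tensor || dim == 1) ? 1 : thread_1d;

  // Flattened case: a 16 x 1 block reaches every quadrature point from a single
  // 1D thread index, instead of the 4 x 4 block the all-tensor path would use.
  std::printf("block = %d x %d\n", block_x, block_y);
  return 0;
}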

backends/cuda-gen/ceed-cuda-gen.h

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@
 typedef struct {
   bool use_fallback;
   CeedInt dim;
-  CeedInt Q_1d;
+  CeedInt Q, Q_1d;
   CeedInt max_P_1d;
   CUmodule module;
   CUfunction op;

include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h

Lines changed: 40 additions & 19 deletions
@@ -184,6 +184,29 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const int
   __syncthreads();
 }
 
+//------------------------------------------------------------------------------
+// 2D pack/unpack quadrature values
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QPack2D(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[(data.t_id_x % Q_1D) + (data.t_id_x / Q_1D) * T_1D] : 0.0;
+    __syncthreads();
+  }
+}
+
+template <int NUM_COMP, int Q_1D, int T_1D>
+inline __device__ void QUnpack2D(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    if (data.t_id_x < (Q_1D * Q_1D)) data.slice[(data.t_id_x % Q_1D) + (data.t_id_x / Q_1D) * T_1D] = U[comp];
+    __syncthreads();
+    U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
+    __syncthreads();
+  }
+}
+
 //------------------------------------------------------------------------------
 // 2D interpolate to quadrature points
 //------------------------------------------------------------------------------
@@ -204,11 +227,11 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_
 }
 
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
       CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
+  QUnpack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
+  InterpTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, r_V);
+  QPack2D<NUM_COMP, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -231,11 +254,11 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed
 }
 
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
      CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, r_V);
+  QUnpack2D<NUM_COMP, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
+  InterpTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, r_V);
+  QPack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -260,11 +283,11 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r
 }
 
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
+inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
      CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
+  QUnpack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
+  GradTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, c_G, r_V);
+  QPack2D<NUM_COMP * 2, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -289,11 +312,11 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc
 }
 
 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
-inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
+inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
      const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, r_U, c_B, c_G, r_V);
+  QUnpack2D<NUM_COMP * 2, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U);
+  GradTransposeTensor2d_Core<NUM_COMP, P_1D, Q_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_U, c_B, c_G, r_V);
+  QPack2D<NUM_COMP, P_1D, T_1D>(data, data.t_id_x % T_1D, data.t_id_x / T_1D, r_V);
 }
 
 //------------------------------------------------------------------------------
@@ -312,9 +335,7 @@ inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *_
 
 template <int P_1D, int Q_1D>
 inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
-  const int max_1d = P_1D < Q_1D ? P_1D : Q_1D;
-
-  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % max_1d, data.t_id_x / max_1d, q_weight_1d, w);
+  WeightTensor2d_Core<Q_1D>(data, data.t_id_x % Q_1D, data.t_id_x / Q_1D, q_weight_1d, w);
 }
 
 //------------------------------------------------------------------------------
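
Note: the flattened wrappers now share one convention. The tensor contraction cores keep addressing the value a thread owns at (data.t_id_x % T_1D, data.t_id_x / T_1D), and QPack2D / QUnpack2D move those values through data.slice so that, outside the core, the first Q_1D*Q_1D (or P_1D*P_1D on the transpose side) threads hold them densely at (t % Q_1D, t / Q_1D). A host-side model of that round trip, with assumed sizes and no dependence on the CUDA headers (illustrative only, not libCEED code):

#include <cassert>
#include <vector>

int main() {
  constexpr int Q_1D = 4, T_1D = 5;  // assumed sizes, e.g. T_1D = max(P_1d, Q_1d)

  // One value per "thread" t, stored at the strided position used by the tensor cores.
  std::vector<double> strided(T_1D * T_1D, 0.0), slice(T_1D * T_1D, 0.0);
  for (int t = 0; t < T_1D * T_1D; t++) {
    const int x = t % T_1D, y = t / T_1D;
    if (x < Q_1D && y < Q_1D) strided[t] = 10.0 * y + x;  // value for point (x, y)
  }

  // QPack2D analogue: scatter by (t % T_1D, t / T_1D), gather densely by (t % Q_1D, t / Q_1D).
  std::vector<double> packed(Q_1D * Q_1D, 0.0);
  for (int t = 0; t < T_1D * T_1D; t++) {
    const int x = t % T_1D, y = t / T_1D;
    if (x < Q_1D && y < Q_1D) slice[x + y * T_1D] = strided[t];
  }
  for (int t = 0; t < Q_1D * Q_1D; t++) packed[t] = slice[(t % Q_1D) + (t / Q_1D) * T_1D];

  // After packing, the value for point (x, y) sits with thread x + y * Q_1D.
  for (int y = 0; y < Q_1D; y++) {
    for (int x = 0; x < Q_1D; x++) assert(packed[x + y * Q_1D] == 10.0 * y + x);
  }
  return 0;
}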
