diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index 481c358466..c4630abfe1 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -1285,7 +1285,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; code << tab << "// -----------------------------------------------------------------------------\n"; code << tab << "extern \"C\" __global__ void " << operator_name - << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarBase *W, Points_Cuda " "points) {\n"; tab.push(); @@ -1295,11 +1295,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT - code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + code << tab << "const CeedScalarBase *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; } } for (CeedInt i = 0; i < num_output_fields; i++) { - code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + code << tab << "CeedScalarBase *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; } code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; @@ -1572,11 +1572,20 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b // Compile { - bool is_compile_good = false; - const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? 
Q_1d : Q, data->max_P_1d); + CeedScalarType precision; + + // Check for mixed precision + CeedCallBackend(CeedOperatorGetPrecision(op, &precision)); data->thread_1d = T_1d; - CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d)); + if (precision != CEED_SCALAR_TYPE) { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", T_1d, "CEED_JIT_PRECISION", + CeedScalarTypes[precision])); + } else { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d)); + } if (is_compile_good) { *is_good_build = true; CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op)); @@ -1689,8 +1698,8 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; code << tab << "// -----------------------------------------------------------------------------\n"; code << tab << "extern \"C\" __global__ void " << operator_name - << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " - "points, CeedScalar *__restrict__ values_array) {\n"; + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarBase *W, Points_Cuda " + "points, CeedScalarBase *__restrict__ values_array) {\n"; tab.push(); // Scratch buffers @@ -1699,11 +1708,11 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT - code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + code << tab << "const CeedScalarBase *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; } } for 
(CeedInt i = 0; i < num_output_fields; i++) { - code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + code << tab << "CeedScalarBase *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; } code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; @@ -2043,12 +2052,22 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo // Compile { - bool is_compile_good = false; - const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + CeedScalarType precision; + + // Check for mixed precision + CeedCallBackend(CeedOperatorGetPrecision(op, &precision)); data->thread_1d = T_1d; - CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, - is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d)); + if (precision != CEED_SCALAR_TYPE) { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, + is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", T_1d, + "CEED_JIT_PRECISION", CeedScalarTypes[precision])); + } else { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, + is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d)); + } if (is_compile_good) { *is_good_build = true; CeedCallBackend(CeedGetKernel_Cuda(ceed, is_full ? 
data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(), @@ -2221,8 +2240,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; code << tab << "// -----------------------------------------------------------------------------\n"; code << tab << "extern \"C\" __global__ void " << operator_name - << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " - "points, CeedScalar *__restrict__ values_array) {\n"; + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarBase *W, Points_Cuda " + "points, CeedScalarBase *__restrict__ values_array) {\n"; tab.push(); // Scratch buffers @@ -2231,11 +2250,11 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT - code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + code << tab << "const CeedScalarBase *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; } } for (CeedInt i = 0; i < num_output_fields; i++) { - code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + code << tab << "CeedScalarBase *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; } code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; @@ -2485,8 +2504,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { - code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" 
<< f << "*" - << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; + code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" + << "dim_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; } else { code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; } @@ -2623,11 +2642,20 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera // Compile { - bool is_compile_good = false; - const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + CeedScalarType precision; + + // Check for mixed precision + CeedCallBackend(CeedOperatorGetPrecision(op, &precision)); data->thread_1d = T_1d; - CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d)); + if (precision != CEED_SCALAR_TYPE) { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", T_1d, + "CEED_JIT_PRECISION", CeedScalarTypes[precision])); + } else { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d)); + } if (is_compile_good) { *is_good_build = true; CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction)); diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c index e826f0aab1..5f74560022 100644 --- a/backends/cuda-gen/ceed-cuda-gen.c +++ b/backends/cuda-gen/ceed-cuda-gen.c @@ -29,6 +29,7 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) { CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Cuda(ceed, resource)); + 
CeedCallBackend(CeedSetSupportsMixedPrecision(ceed, true)); CeedCallBackend(CeedInit("/gpu/cuda/shared", &ceed_shared)); CeedCallBackend(CeedSetDelegate(ceed, ceed_shared)); diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp index dc430b81fa..37acf263e5 100644 --- a/backends/cuda/ceed-cuda-compile.cpp +++ b/backends/cuda/ceed-cuda-compile.cpp @@ -52,12 +52,20 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_ // Get kernel specific options, such as kernel constants if (num_defines > 0) { char *name; - int val; for (int i = 0; i < num_defines; i++) { name = va_arg(args, char *); - val = va_arg(args, int); - code << "#define " << name << " " << val << "\n"; + if (!strcmp(name, "CEED_JIT_PRECISION")) { + char *val; + + val = va_arg(args, char *); + code << "#define " << name << " " << val << "\n"; + } else { + int val; + + val = va_arg(args, int); + code << "#define " << name << " " << val << "\n"; + } } } diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp index 9e29d16cb4..3f17b64cfb 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp +++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp @@ -2481,8 +2481,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { - code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*" - << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; + code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" + << "dim_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? 
"Q_1d" : "1") << "] = {0.};\n"; } else { code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; } diff --git a/include/ceed-impl.h b/include/ceed-impl.h index 1b6875a6ca..b6e482df77 100644 --- a/include/ceed-impl.h +++ b/include/ceed-impl.h @@ -126,6 +126,7 @@ struct Ceed_private { void *data; bool is_debug; bool is_deterministic; + bool supports_mixed_precision; char err_msg[CEED_MAX_RESOURCE_LEN]; FOffset *f_offsets; CeedWorkVectors work_vectors; @@ -378,6 +379,7 @@ struct CeedOperator_private { bool is_composite; bool is_at_points; bool has_restriction; + CeedScalarType precision; CeedQFunctionAssemblyData qf_assembled; CeedOperatorAssemblyData op_assembled; CeedOperator *sub_operators; diff --git a/include/ceed/backend.h b/include/ceed/backend.h index c48e0d4666..b5bfad46ff 100644 --- a/include/ceed/backend.h +++ b/include/ceed/backend.h @@ -249,6 +249,7 @@ CEED_EXTERN int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_ CEED_EXTERN int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed); CEED_EXTERN int CeedSetOperatorFallbackCeed(Ceed ceed, Ceed fallback_ceed); CEED_EXTERN int CeedSetDeterministic(Ceed ceed, bool is_deterministic); +CEED_INTERN int CeedSetSupportsMixedPrecision(Ceed ceed, bool supports_mixed_precision); CEED_EXTERN int CeedSetBackendFunctionImpl(Ceed ceed, const char *type, void *object, const char *func_name, void (*f)(void)); CEED_EXTERN int CeedGetData(Ceed ceed, void *data); CEED_EXTERN int CeedSetData(Ceed ceed, void *data); diff --git a/include/ceed/ceed-f32.h b/include/ceed/ceed-f32.h index 0bce734257..e11a4eff1f 100644 --- a/include/ceed/ceed-f32.h +++ b/include/ceed/ceed-f32.h @@ -10,11 +10,26 @@ /// Include this header in ceed.h to use float instead of double. #pragma once +#ifndef CEED_RUNNING_JIT_PASS +#include <float.h> +#endif + #define CEED_SCALAR_IS_FP32 /// Set base scalar type to FP32. 
(See CeedScalarType enum in ceed.h for all options.) #define CEED_SCALAR_TYPE CEED_SCALAR_FP32 -typedef float CeedScalar; +#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_PRECISION) && (CEED_JIT_PRECISION != CEED_SCALAR_TYPE) +#if CEED_JIT_PRECISION == CEED_SCALAR_FP64 +typedef double CeedScalar; +typedef float CeedScalarBase; + +/// Machine epsilon +static const CeedScalar CEED_EPSILON = DBL_EPSILON; +#endif // CEED_JIT_PRECISION +#else +typedef float CeedScalar; +typedef CeedScalar CeedScalarBase; /// Machine epsilon -#define CEED_EPSILON 6e-08 +static const CeedScalar CEED_EPSILON = FLT_EPSILON; +#endif diff --git a/include/ceed/ceed-f64.h b/include/ceed/ceed-f64.h index b74d867c18..e72ad0f617 100644 --- a/include/ceed/ceed-f64.h +++ b/include/ceed/ceed-f64.h @@ -10,11 +10,26 @@ /// This is the default header included in ceed.h. #pragma once +#ifndef CEED_RUNNING_JIT_PASS +#include <float.h> +#endif + #define CEED_SCALAR_IS_FP64 /// Set base scalar type to FP64. (See CeedScalarType enum in ceed.h for all options.) 
#define CEED_SCALAR_TYPE CEED_SCALAR_FP64 -typedef double CeedScalar; +#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_PRECISION) && (CEED_JIT_PRECISION != CEED_SCALAR_TYPE) +#if CEED_JIT_PRECISION == CEED_SCALAR_FP32 +typedef float CeedScalar; +typedef double CeedScalarBase; + +/// Machine epsilon +static const CeedScalar CEED_EPSILON = FLT_EPSILON; +#endif // CEED_JIT_PRECISION +#else +typedef double CeedScalar; +typedef CeedScalar CeedScalarBase; /// Machine epsilon -#define CEED_EPSILON 1e-16 +static const CeedScalar CEED_EPSILON = DBL_EPSILON; +#endif // CEED_RUNNING_JIT_PASS && CEED_JIT_MIXED_PRECISION diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h index ce6284b910..fc40362e16 100644 --- a/include/ceed/ceed.h +++ b/include/ceed/ceed.h @@ -106,6 +106,7 @@ CEED_EXTERN int CeedSetStream(Ceed ceed, void *handle); CEED_EXTERN int CeedReferenceCopy(Ceed ceed, Ceed *ceed_copy); CEED_EXTERN int CeedGetResource(Ceed ceed, const char **resource); CEED_EXTERN int CeedIsDeterministic(Ceed ceed, bool *is_deterministic); +CEED_EXTERN int CeedGetSupportsMixedPrecision(Ceed ceed, bool *supports_mixed_precision); CEED_EXTERN int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root); CEED_EXTERN int CeedAddJitDefine(Ceed ceed, const char *jit_define); CEED_EXTERN int CeedView(Ceed ceed, FILE *stream); @@ -177,6 +178,7 @@ CEED_EXTERN const char *const CeedEvalModes[]; CEED_EXTERN const char *const CeedQuadModes[]; CEED_EXTERN const char *const CeedElemTopologies[]; CEED_EXTERN const char *const CeedContextFieldTypes[]; +CEED_EXTERN const char *const CeedScalarTypes[]; CEED_EXTERN int CeedGetPreferredMemType(Ceed ceed, CeedMemType *type); @@ -426,6 +428,8 @@ CEED_EXTERN int CeedOperatorCheckReady(CeedOperator op); CEED_EXTERN int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, CeedSize *output_size); CEED_EXTERN int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data); CEED_EXTERN int 
CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, bool needs_data_update); +CEED_EXTERN int CeedOperatorSetPrecision(CeedOperator op, CeedScalarType precision); +CEED_EXTERN int CeedOperatorGetPrecision(CeedOperator op, CeedScalarType *precision); CEED_EXTERN int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request); CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request); diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h index f4dccf54ea..d78e83eeab 100644 --- a/include/ceed/jit-source/cuda/cuda-gen-templates.h +++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h @@ -12,8 +12,8 @@ //------------------------------------------------------------------------------ // Load matrices for basis actions //------------------------------------------------------------------------------ -template -inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { +template +inline __device__ void LoadMatrix(SharedData_Cuda &data, const ScalarIn *__restrict__ d_B, ScalarOut *B) { for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; } @@ -24,9 +24,9 @@ inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__res //------------------------------------------------------------------------------ // L-vector -> single point //------------------------------------------------------------------------------ -template +template inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, - const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) { const 
CeedInt ind = indices[p + elem * NUM_PTS]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -37,9 +37,9 @@ inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, cons //------------------------------------------------------------------------------ // Single point -> L-vector //------------------------------------------------------------------------------ -template +template inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, - const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) { + const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_u, ScalarOut *d_u) { if (p < points_in_elem) { const CeedInt ind = indices[p + elem * NUM_PTS]; @@ -56,8 +56,8 @@ inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, con //------------------------------------------------------------------------------ // Set E-vector value //------------------------------------------------------------------------------ -template -inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { +template +inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const ScalarIn value, ScalarOut *__restrict__ r_v) { const CeedInt target_comp = n / P_1D; const CeedInt target_node = n % P_1D; @@ -69,9 +69,9 @@ inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const Cee //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -template +template inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + const 
ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = indices[node + elem * P_1D]; @@ -83,9 +83,8 @@ inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt n //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, - CeedScalar *__restrict__ r_u) { +template +inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; @@ -97,9 +96,9 @@ inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt el //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = indices[node + elem * P_1D]; @@ -108,10 +107,10 @@ inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt } } -template +template inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, - const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, - CeedScalar *__restrict__ d_v) { + 
const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_v, + ScalarOut *__restrict__ d_v) { const CeedInt target_comp = n / P_1D; const CeedInt target_node = n % P_1D; @@ -125,9 +124,9 @@ inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const C //------------------------------------------------------------------------------ // E-vector -> L-vector, full assembly //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { const CeedInt in_comp = in / P_1D; const CeedInt in_node = in % P_1D; const CeedInt e_vec_size = P_1D * NUM_COMP; @@ -144,9 +143,9 @@ inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const //------------------------------------------------------------------------------ // E-vector -> L-vector, Qfunction assembly //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, - const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const CeedInt output_offset, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < Q_1D) { const CeedInt ind = data.t_id_x + elem * Q_1D; @@ -159,9 +158,8 @@ inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, con //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void WriteLVecStrided1d(SharedData_Cuda 
&data, const CeedInt elem, const CeedScalar *__restrict__ r_v, - CeedScalar *__restrict__ d_v) { +template +inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; @@ -177,8 +175,8 @@ inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt e //------------------------------------------------------------------------------ // Set E-vector value //------------------------------------------------------------------------------ -template -inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { +template +inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const ScalarIn value, ScalarOut *__restrict__ r_v) { const CeedInt target_comp = n / (P_1D * P_1D); const CeedInt target_node_x = n % P_1D; const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; @@ -191,9 +189,9 @@ inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const Cee //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -template +template inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = indices[node + elem * P_1D * P_1D]; @@ -205,9 +203,8 @@ inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt n 
//------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, - CeedScalar *__restrict__ r_u) { +template +inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; @@ -219,9 +216,9 @@ inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt el //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = indices[node + elem * P_1D * P_1D]; @@ -230,10 +227,10 @@ inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt } } -template +template inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, - const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, - CeedScalar *__restrict__ d_v) { + const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_v, + ScalarOut *__restrict__ d_v) { const CeedInt target_comp = n / (P_1D * P_1D); const CeedInt 
target_node_x = n % P_1D; const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; @@ -249,9 +246,9 @@ inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const C //------------------------------------------------------------------------------ // E-vector -> L-vector, full assembly //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { const CeedInt elem_size = P_1D * P_1D; const CeedInt in_comp = in / elem_size; const CeedInt in_node_x = in % P_1D; @@ -273,9 +270,9 @@ inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const //------------------------------------------------------------------------------ // E-vector -> L-vector, Qfunction assembly //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, - const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const CeedInt output_offset, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D; @@ -288,9 +285,8 @@ inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, con //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const 
CeedScalar *__restrict__ r_v, - CeedScalar *__restrict__ d_v) { +template +inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; @@ -306,8 +302,8 @@ inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt e //------------------------------------------------------------------------------ // Set E-vector value //------------------------------------------------------------------------------ -template -inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { +template +inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const ScalarIn value, ScalarOut *__restrict__ r_v) { const CeedInt target_comp = n / (P_1D * P_1D * P_1D); const CeedInt target_node_x = n % P_1D; const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; @@ -321,9 +317,9 @@ inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const Cee //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -template +template inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; @@ -337,9 +333,8 @@ inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const 
CeedInt n //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, - CeedScalar *__restrict__ r_u) { +template +inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; @@ -353,10 +348,9 @@ inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt el //------------------------------------------------------------------------------ // E-vector -> Q-vector, offests provided //------------------------------------------------------------------------------ -template +template inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, - const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, - CeedScalar *__restrict__ r_u) { + const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) { if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D]; @@ -368,9 +362,9 @@ inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // E-vector -> Q-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, - 
CeedScalar *__restrict__ r_u) { +template +inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const ScalarIn *__restrict__ d_u, + ScalarOut *__restrict__ r_u) { if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; @@ -382,9 +376,9 @@ inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedI //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; @@ -395,10 +389,10 @@ inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt } } -template +template inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, - const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, - CeedScalar *__restrict__ d_v) { + const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_v, + ScalarOut *__restrict__ d_v) { const CeedInt target_comp = n / (P_1D * P_1D * P_1D); const CeedInt target_node_x = n % P_1D; const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; @@ -415,9 +409,9 @@ inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const C //------------------------------------------------------------------------------ // E-vector -> 
L-vector, full assembly //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { const CeedInt elem_size = P_1D * P_1D * P_1D; const CeedInt in_comp = in / elem_size; const CeedInt in_node_x = in % P_1D; @@ -442,9 +436,9 @@ inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const //------------------------------------------------------------------------------ // E-vector -> L-vector, Qfunction assembly //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, - const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const CeedInt output_offset, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt z = 0; z < Q_1D; z++) { const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D; @@ -459,9 +453,8 @@ inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, con //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, - CeedScalar *__restrict__ d_v) { +template +inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ 
d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; @@ -475,9 +468,9 @@ inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt e //------------------------------------------------------------------------------ // 3D collocated derivatives computation //------------------------------------------------------------------------------ -template -inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G, + ScalarOut *__restrict__ r_V) { if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { __syncthreads(); @@ -505,9 +498,9 @@ inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, //------------------------------------------------------------------------------ // 3D collocated derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G, + ScalarOut *__restrict__ r_V) { if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { __syncthreads(); diff --git a/include/ceed/jit-source/cuda/cuda-jit.h b/include/ceed/jit-source/cuda/cuda-jit.h index baa15a1e85..22a0e903b2 100644 --- a/include/ceed/jit-source/cuda/cuda-jit.h +++ b/include/ceed/jit-source/cuda/cuda-jit.h @@ -13,4 +13,11 @@ #define 
CeedPragmaSIMD #define CEED_Q_VLA 1 +#ifndef DBL_EPSILON +#define DBL_EPSILON 2.22044604925031308084726333618164062e-16 +#endif +#ifndef FLT_EPSILON +#define FLT_EPSILON 1.19209289550781250000000000000000000e-7F +#endif + #include "cuda-types.h" diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h index e5b31970ff..04a7718b90 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h @@ -12,8 +12,8 @@ //------------------------------------------------------------------------------ // 1D tensor contraction //------------------------------------------------------------------------------ -template -inline __device__ void Contract1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void Contract1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { data.slice[data.t_id_x] = *U; __syncthreads(); *V = 0.0; @@ -28,8 +28,8 @@ inline __device__ void Contract1d(SharedData_Cuda &data, const CeedScalar *U, co //------------------------------------------------------------------------------ // 1D transpose tensor contraction //------------------------------------------------------------------------------ -template -inline __device__ void ContractTranspose1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTranspose1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { data.slice[data.t_id_x] = *U; __syncthreads(); if (data.t_id_x < P_1D) { @@ -43,9 +43,8 @@ inline __device__ void ContractTranspose1d(SharedData_Cuda &data, const CeedScal //------------------------------------------------------------------------------ // Interpolate to quadrature points 
//------------------------------------------------------------------------------ -template -inline __device__ void InterpNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { Contract1d(data, &r_U[comp], c_B, &r_V[comp]); } @@ -54,9 +53,9 @@ inline __device__ void InterpNonTensor(SharedData_Cuda &data, const CeedScalar * //------------------------------------------------------------------------------ // Interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { r_V[comp] = 0.0; ContractTranspose1d(data, &r_U[comp], c_B, &r_V[comp]); @@ -66,8 +65,8 @@ inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const Cee //------------------------------------------------------------------------------ // Derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G, ScalarOut *__restrict__ r_V) { for (CeedInt dim = 0; dim < DIM; dim++) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { Contract1d(data, 
&r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]); @@ -78,9 +77,9 @@ inline __device__ void GradNonTensor(SharedData_Cuda &data, const CeedScalar *__ //------------------------------------------------------------------------------ // Derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_V[comp] = 0.0; for (CeedInt dim = 0; dim < DIM; dim++) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -92,7 +91,7 @@ inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const CeedS //------------------------------------------------------------------------------ // Quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) { +template +inline __device__ void WeightNonTensor(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight, ScalarOut *w) { *w = (data.t_id_x < Q) ? 
q_weight[data.t_id_x] : 0.0; } diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h index ec7102ea2c..dbe7ef3ae0 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h @@ -15,7 +15,8 @@ //------------------------------------------------------------------------------ // Interp kernels //------------------------------------------------------------------------------ -extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -41,8 +42,8 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, } } -extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -68,8 +69,8 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca } } -extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -98,7 +99,8 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const 
Ceed //------------------------------------------------------------------------------ // Grad kernels //------------------------------------------------------------------------------ -extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalarBase *c_G, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -124,8 +126,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, c } } -extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalarBase *c_G, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -151,8 +153,8 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala } } -extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalarBase *c_G, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -181,7 +183,7 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc //------------------------------------------------------------------------------ // Weight kernel //------------------------------------------------------------------------------ -extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) { +extern "C" __global__ void 
Weight(const CeedInt num_elem, const CeedScalarBase *__restrict__ q_weight, CeedScalarBase *__restrict__ d_W) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h index cb62c4f80b..74cbeb6809 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h @@ -12,8 +12,8 @@ //------------------------------------------------------------------------------ // Load matrices for basis actions //------------------------------------------------------------------------------ -template -inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { +template +inline __device__ void LoadMatrix(SharedData_Cuda &data, const ScalarIn *__restrict__ d_B, ScalarOut *B) { for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; } @@ -24,9 +24,9 @@ inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__res //------------------------------------------------------------------------------ // E-vector -> single element //------------------------------------------------------------------------------ -template +template inline __device__ void ReadElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const CeedInt strides_elem, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * strides_node + elem * strides_elem; @@ -40,9 +40,9 @@ inline __device__ void ReadElementStrided1d(SharedData_Cuda &data, const CeedInt //------------------------------------------------------------------------------ // 
Single element -> E-vector //------------------------------------------------------------------------------ -template +template inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * strides_node + elem * strides_elem; @@ -53,9 +53,9 @@ inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedIn } } -template +template inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * strides_node + elem * strides_elem; @@ -73,9 +73,9 @@ inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt //------------------------------------------------------------------------------ // E-vector -> single element //------------------------------------------------------------------------------ -template +template inline __device__ void ReadElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const CeedInt strides_elem, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * strides_node + elem * strides_elem; @@ -89,9 +89,9 @@ inline __device__ void ReadElementStrided2d(SharedData_Cuda &data, const CeedInt 
//------------------------------------------------------------------------------ // Single element -> E-vector //------------------------------------------------------------------------------ -template +template inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * strides_node + elem * strides_elem; @@ -102,9 +102,9 @@ inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedIn } } -template +template inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * strides_node + elem * strides_elem; @@ -122,9 +122,9 @@ inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt //------------------------------------------------------------------------------ // E-vector -> single element //------------------------------------------------------------------------------ -template +template inline __device__ void ReadElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const CeedInt strides_elem, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + 
z * P_1D * P_1D; @@ -140,9 +140,9 @@ inline __device__ void ReadElementStrided3d(SharedData_Cuda &data, const CeedInt //------------------------------------------------------------------------------ // Single element -> E-vector //------------------------------------------------------------------------------ -template +template inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; @@ -155,9 +155,9 @@ inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedIn } } -template +template inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; @@ -177,10 +177,10 @@ inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt //------------------------------------------------------------------------------ // E-vector -> single point //------------------------------------------------------------------------------ -template +template inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, - const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const ScalarIn *__restrict__ d_u, ScalarOut *r_u) { const CeedInt ind = (p % 
NUM_PTS) * strides_point + elem * strides_elem; if (p < points_in_elem) { @@ -197,10 +197,10 @@ inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, cons //------------------------------------------------------------------------------ // Single point -> E-vector //------------------------------------------------------------------------------ -template +template inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, - const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v, - CeedScalar *d_v) { + const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const ScalarIn *r_v, + ScalarOut *d_v) { if (p < points_in_elem) { const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h index 0da9163716..2018ef429e 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h @@ -40,9 +40,9 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar //------------------------------------------------------------------------------ // 1D interpolate to points //------------------------------------------------------------------------------ -template -inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { CeedScalar chebyshev_x[Q_1D]; for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; @@ -61,9 +61,9 @@ inline 
__device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, //------------------------------------------------------------------------------ // 1D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { CeedScalar chebyshev_x[Q_1D]; ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); @@ -86,9 +86,9 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const Ce //------------------------------------------------------------------------------ // 1D derivatives at points //------------------------------------------------------------------------------ -template -inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { CeedScalar chebyshev_x[Q_1D]; ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); @@ -108,9 +108,9 @@ inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, co //------------------------------------------------------------------------------ // 1D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void 
GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { CeedScalar chebyshev_x[Q_1D]; ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); @@ -137,9 +137,9 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 2D interpolate to points //------------------------------------------------------------------------------ -template -inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { CeedScalar buffer[Q_1D]; @@ -168,9 +168,9 @@ inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, //------------------------------------------------------------------------------ // 2D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { CeedScalar buffer[Q_1D]; CeedScalar chebyshev_x[Q_1D]; @@ -206,9 +206,9 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce //------------------------------------------------------------------------------ // 2D derivatives at 
points //------------------------------------------------------------------------------ -template -inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { CeedScalar buffer[Q_1D]; @@ -241,9 +241,9 @@ inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, co //------------------------------------------------------------------------------ // 2D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { CeedScalar buffer[Q_1D]; CeedScalar chebyshev_x[Q_1D]; @@ -287,9 +287,9 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D interpolate to points //------------------------------------------------------------------------------ -template -inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut 
*__restrict__ r_V) { for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; @@ -324,9 +324,9 @@ inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, //------------------------------------------------------------------------------ // 3D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; CeedScalar chebyshev_x[Q_1D]; @@ -368,9 +368,9 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce //------------------------------------------------------------------------------ // 3D derivatives at points //------------------------------------------------------------------------------ -template -inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; @@ -415,9 +415,9 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void 
GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; CeedScalar chebyshev_x[Q_1D]; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h index dcb1763e38..2dae4f7eab 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h @@ -20,8 +20,9 @@ //------------------------------------------------------------------------------ // Interp //------------------------------------------------------------------------------ -extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, - const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalarBase *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, + const CeedScalarBase *__restrict__ d_X, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -75,9 +76,9 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal } } -extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const 
CeedScalarBase *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarBase *__restrict__ d_X, + const CeedScalarBase *__restrict__ d_U, CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -145,9 +146,9 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const } } -extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalarBase *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarBase *__restrict__ d_X, + const CeedScalarBase *__restrict__ d_U, CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -207,8 +208,9 @@ extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, co //------------------------------------------------------------------------------ // Grad //------------------------------------------------------------------------------ -extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, - const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalarBase *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, + const CeedScalarBase *__restrict__ d_X, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -262,9 +264,9 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar } } -extern "C" __global__ void GradTransposeAtPoints(const CeedInt 
num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalarBase *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarBase *__restrict__ d_X, + const CeedScalarBase *__restrict__ d_U, CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -333,9 +335,9 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C } } -extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalarBase *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarBase *__restrict__ d_X, + const CeedScalarBase *__restrict__ d_U, CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h index 4f76825d50..3429fcc76f 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h @@ -16,9 +16,9 @@ //------------------------------------------------------------------------------ // 2D tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, - CeedScalar *V) { 
+template +inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, const ScalarIn2 *B, + ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -33,9 +33,9 @@ inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 2D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, - CeedScalar *V) { +template +inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, const ScalarIn2 *B, + ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -50,9 +50,9 @@ inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 2D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -67,9 +67,9 @@ inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 2D transpose tensor contract x //------------------------------------------------------------------------------ -template 
-inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -84,9 +84,9 @@ inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 2D transpose tensor contract and add x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -100,8 +100,8 @@ inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 2D pack/unpack quadrature values //------------------------------------------------------------------------------ -template -inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { +template +inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, ScalarOut *U) { const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -112,8 +112,8 @@ inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const in } } -template -inline __device__ void 
QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { +template +inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, ScalarOut *U) { const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -127,9 +127,9 @@ inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const //------------------------------------------------------------------------------ // 2D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -146,9 +146,9 @@ inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar //------------------------------------------------------------------------------ // 2D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -164,9 +164,9 @@ inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, C //------------------------------------------------------------------------------ // 2D derivatives at quadrature 
points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -185,9 +185,9 @@ inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar * //------------------------------------------------------------------------------ // 2D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -205,8 +205,8 @@ inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, Cee //------------------------------------------------------------------------------ // 2D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D; *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? 
q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0; @@ -219,9 +219,9 @@ inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -236,9 +236,9 @@ inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 3D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -253,9 +253,9 @@ inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 3D tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int 
t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -270,9 +270,9 @@ inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 3D tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -287,9 +287,9 @@ inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 3D transpose tensor contract z //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, - const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -303,9 +303,9 @@ inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 3D 
transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -320,9 +320,9 @@ inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, - const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -336,9 +336,9 @@ inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 3D transpose tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); 
data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -353,9 +353,9 @@ inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 3D transpose tensor contract add x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, - const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -369,8 +369,8 @@ inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 3D pack/unpack quadrature values //------------------------------------------------------------------------------ -template -inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { +template +inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, ScalarOut *U) { const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D); for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -381,8 +381,8 @@ inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const in } } -template -inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { +template +inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, ScalarOut *U) { const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / 
(Q_1D * Q_1D); for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -396,9 +396,9 @@ inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const //------------------------------------------------------------------------------ // 3D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -416,9 +416,9 @@ inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar //------------------------------------------------------------------------------ // 3D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -435,9 +435,9 @@ inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, C //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void 
GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -461,9 +461,9 @@ inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar * //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -486,9 +486,9 @@ inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, Cee //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut 
*__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -509,9 +509,9 @@ inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, Ce //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + const ScalarIn3 *c_G, ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -531,8 +531,8 @@ inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda //------------------------------------------------------------------------------ // 3D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D); *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? 
q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h index f4f701505a..b453d66596 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h @@ -16,8 +16,8 @@ //------------------------------------------------------------------------------ // 1D tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x] = *U; __syncthreads(); @@ -32,8 +32,8 @@ inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 1D transpose tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x] = *U; __syncthreads(); @@ -48,8 +48,8 @@ inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 1D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar 
*__restrict__ r_V) { +template +inline __device__ void Interp1d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX1d(data, &r_U[comp], c_B, &r_V[comp]); } @@ -58,9 +58,9 @@ inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restr //------------------------------------------------------------------------------ // 1D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeX1d(data, &r_U[comp], c_B, &r_V[comp]); } @@ -69,9 +69,9 @@ inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar //------------------------------------------------------------------------------ // 1D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void Grad1d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX1d(data, &r_U[comp], c_G, &r_V[comp]); } @@ -80,9 +80,9 @@ inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restric //------------------------------------------------------------------------------ // 1D derivatives transpose 
//------------------------------------------------------------------------------ -template -inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTranspose1d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeX1d(data, &r_U[comp], c_G, &r_V[comp]); } @@ -91,8 +91,8 @@ inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar * //------------------------------------------------------------------------------ // 1D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void Weight1d(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { *w = (data.t_id_x < Q_1D) ? 
q_weight_1d[data.t_id_x] : 0.0; } @@ -103,8 +103,8 @@ inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restr //------------------------------------------------------------------------------ // 2D tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -119,8 +119,8 @@ inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 2D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractY2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -135,8 +135,8 @@ inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 2D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -151,8 +151,8 @@ inline __device__ void 
ContractTransposeY2d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 2D transpose tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -167,8 +167,8 @@ inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 2D transpose tensor contract and add x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -182,9 +182,8 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 2D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX2d(data, 
&r_U[comp], c_B, r_t); @@ -195,9 +194,9 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_ //------------------------------------------------------------------------------ // 2D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeY2d(data, &r_U[comp], c_B, r_t); @@ -208,9 +207,9 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 2D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX2d(data, &r_U[comp], c_G, r_t); @@ -223,9 +222,9 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r //------------------------------------------------------------------------------ // 2D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const 
CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeY2d(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); @@ -238,8 +237,8 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc //------------------------------------------------------------------------------ // 2D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor2d(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; } @@ -250,8 +249,8 @@ inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *_ //------------------------------------------------------------------------------ // 3D tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { r_B[i] = B[i + data.t_id_x * P_1D]; @@ -273,8 +272,8 @@ inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 3D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void 
ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractY3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { r_B[i] = B[i + data.t_id_y * P_1D]; @@ -296,8 +295,8 @@ inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 3D tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractZ3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { for (CeedInt k = 0; k < Q_1D; k++) { V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { @@ -311,8 +310,8 @@ inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 3D transpose tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { for (CeedInt k = 0; k < P_1D; k++) { V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { @@ -326,8 +325,8 @@ inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const 
CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_y + i * P_1D]; @@ -349,8 +348,8 @@ inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_y + i * P_1D]; @@ -371,8 +370,8 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D transpose tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_x + i * P_1D]; @@ -394,8 +393,8 @@ inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 3D transpose tensor contract add x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, 
const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_x + i * P_1D]; @@ -416,9 +415,8 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -431,9 +429,9 @@ inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *_ //------------------------------------------------------------------------------ // 3D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -446,9 +444,9 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D derivatives at quadrature points 
//------------------------------------------------------------------------------ -template -inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -467,9 +465,9 @@ inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__r //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -488,9 +486,9 @@ inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedSc //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut 
*__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -506,9 +504,9 @@ inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedS //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + const ScalarIn3 *c_G, ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -524,8 +522,8 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, co //------------------------------------------------------------------------------ // 3D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor3d(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); const CeedScalar pw = quad ? 
q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; for (CeedInt q = 0; q < Q_1D; q++) { diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h index 1252c8197d..602d4b2f91 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h @@ -15,7 +15,8 @@ //------------------------------------------------------------------------------ // Interp kernel by dim //------------------------------------------------------------------------------ -extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -53,8 +54,8 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, } } -extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -92,8 +93,8 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca } } -extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -134,8 +135,8 @@ extern "C" 
__global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed //------------------------------------------------------------------------------ // Grad kernel by dim //------------------------------------------------------------------------------ -extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *c_G, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -177,8 +178,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, c } } -extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *c_G, + const CeedScalarBase *__restrict__ d_U, CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -220,8 +221,8 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala } } -extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *c_G, + const CeedScalarBase *__restrict__ d_U, CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -266,7 +267,7 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc //------------------------------------------------------------------------------ // Weight kernels by dim 
//------------------------------------------------------------------------------ -extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) { +extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalarBase *__restrict__ q_weight_1d, CeedScalarBase *__restrict__ d_W) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; diff --git a/include/ceed/jit-source/cuda/cuda-types.h b/include/ceed/jit-source/cuda/cuda-types.h index 9acb0064a3..0149923bba 100644 --- a/include/ceed/jit-source/cuda/cuda-types.h +++ b/include/ceed/jit-source/cuda/cuda-types.h @@ -14,8 +14,8 @@ #define CEED_CUDA_NUMBER_FIELDS 16 typedef struct { - const CeedScalar *inputs[CEED_CUDA_NUMBER_FIELDS]; - CeedScalar *outputs[CEED_CUDA_NUMBER_FIELDS]; + const CeedScalarBase *inputs[CEED_CUDA_NUMBER_FIELDS]; + CeedScalarBase *outputs[CEED_CUDA_NUMBER_FIELDS]; } Fields_Cuda; typedef struct { @@ -24,10 +24,10 @@ typedef struct { } FieldsInt_Cuda; typedef struct { - CeedInt num_elem; - const CeedInt *num_per_elem; - const CeedInt *indices; - const CeedScalar *coords; + CeedInt num_elem; + const CeedInt *num_per_elem; + const CeedInt *indices; + const CeedScalarBase *coords; } Points_Cuda; typedef struct { diff --git a/include/ceed/jit-source/hip/hip-jit.h b/include/ceed/jit-source/hip/hip-jit.h index 70a00416e4..03040e6908 100644 --- a/include/ceed/jit-source/hip/hip-jit.h +++ b/include/ceed/jit-source/hip/hip-jit.h @@ -13,4 +13,11 @@ #define CeedPragmaSIMD #define CEED_Q_VLA 1 +#ifndef DBL_EPSILON +#define DBL_EPSILON 2.22044604925031308084726333618164062e-16 +#endif +#ifndef FLT_EPSILON +#define FLT_EPSILON 1.19209289550781250000000000000000000e-7F +#endif + #include "hip-types.h" diff --git a/include/ceed/jit-source/sycl/sycl-jit.h b/include/ceed/jit-source/sycl/sycl-jit.h index 1a2971f4df..b42aa5304f 100644 --- a/include/ceed/jit-source/sycl/sycl-jit.h +++ b/include/ceed/jit-source/sycl/sycl-jit.h @@ -13,5 
+13,12 @@ #define CeedPragmaSIMD #define CEED_Q_VLA 1 +#ifndef DBL_EPSILON +#define DBL_EPSILON 2.22044604925031308084726333618164062e-16 +#endif +#ifndef FLT_EPSILON +#define FLT_EPSILON 1.19209289550781250000000000000000000e-7F +#endif + // Need quotes for recursive header inclusion #include "sycl-types.h" diff --git a/include/ceed/types.h b/include/ceed/types.h index 2739390d8e..fb51f94a1d 100644 --- a/include/ceed/types.h +++ b/include/ceed/types.h @@ -118,10 +118,12 @@ typedef signed char CeedInt8; /// @ingroup Ceed typedef enum { /// Single precision - CEED_SCALAR_FP32, + CEED_SCALAR_FP32 = 0, /// Double precision - CEED_SCALAR_FP64 + CEED_SCALAR_FP64 = 1 } CeedScalarType; +#define CEED_SCALAR_FP32 0 +#define CEED_SCALAR_FP64 1 /// Base scalar type for the library to use: change which header is included to change the precision. #include "ceed-f64.h" // IWYU pragma: export diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c index 6dfbd55d61..e8369b44ca 100644 --- a/interface/ceed-basis.c +++ b/interface/ceed-basis.c @@ -1329,7 +1329,7 @@ int CeedSymmetricSchurDecomposition(Ceed ceed, CeedScalar *mat, CeedScalar *lamb // Reduce sub and super diagonal CeedInt p = 0, q = 0, itr = 0, max_itr = n * n * n * n; - CeedScalar tol = CEED_EPSILON; + CeedScalar tol = 10 * CEED_EPSILON; while (itr < max_itr) { // Update p, q, size of reduced portions of diagonal diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c index 039f8522db..bf18552441 100644 --- a/interface/ceed-operator.c +++ b/interface/ceed-operator.c @@ -634,6 +634,49 @@ int CeedOperatorIsSetupDone(CeedOperator op, bool *is_setup_done) { return CEED_ERROR_SUCCESS; } +/** + @brief Set the floating point precision for `CeedOperator` application + + @param[in] op `CeedOperator` + @param[in] scalar_type `CeedScalarType` to use for operator application + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedOperatorSetPrecision(CeedOperator op, CeedScalarType 
scalar_type) { + bool is_immutable, is_composite, supports_mixed_precision; + Ceed ceed; + + CeedCall(CeedOperatorGetCeed(op, &ceed)); + CeedCall(CeedOperatorIsImmutable(op, &is_immutable)); + CeedCheck(!is_immutable, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetPrecision must be called before operator is finalized"); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); + CeedCheck(!is_composite, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetPrecision should be set on single operators"); + CeedCall(CeedGetSupportsMixedPrecision(ceed, &supports_mixed_precision)); + CeedCheck(scalar_type == CEED_SCALAR_TYPE || supports_mixed_precision, ceed, CEED_ERROR_UNSUPPORTED, + "Backend does not implement mixed precision operators"); + + op->precision = scalar_type; + CeedCall(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get the floating point precision used for `CeedOperator` application + + @param[in] op `CeedOperator` + @param[out] precision Variable to store operator precision + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedOperatorGetPrecision(CeedOperator op, CeedScalarType *precision) { + *precision = op->precision; + return CEED_ERROR_SUCCESS; +} + /** @brief Get the `CeedQFunction` associated with a `CeedOperator` @@ -768,6 +811,7 @@ int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunc (*op)->ref_count = 1; (*op)->input_size = -1; (*op)->output_size = -1; + (*op)->precision = CEED_SCALAR_TYPE; CeedCall(CeedQFunctionReferenceCopy(qf, &(*op)->qf)); if (dqf && dqf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqf, &(*op)->dqf)); if (dqfT && dqfT != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqfT, &(*op)->dqfT)); @@ -812,6 +856,7 @@ int CeedOperatorCreateAtPoints(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, C (*op)->is_at_points = true; (*op)->input_size = -1; (*op)->output_size = -1; + (*op)->precision = CEED_SCALAR_TYPE; 
CeedCall(CeedQFunctionReferenceCopy(qf, &(*op)->qf)); if (dqf && dqf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqf, &(*op)->dqf)); if (dqfT && dqfT != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqfT, &(*op)->dqfT)); diff --git a/interface/ceed-types.c b/interface/ceed-types.c index e975793307..3186489b58 100644 --- a/interface/ceed-types.c +++ b/interface/ceed-types.c @@ -64,3 +64,8 @@ const char *const CeedFESpaces[] = { [CEED_FE_SPACE_HDIV] = "H(div) space", [CEED_FE_SPACE_HCURL] = "H(curl) space", }; + +const char *const CeedScalarTypes[] = { + [CEED_SCALAR_FP32] = "CEED_SCALAR_FP32", + [CEED_SCALAR_FP64] = "CEED_SCALAR_FP64", +}; diff --git a/interface/ceed.c b/interface/ceed.c index b4bf09883a..9e48b24e00 100644 --- a/interface/ceed.c +++ b/interface/ceed.c @@ -678,6 +678,21 @@ int CeedSetDeterministic(Ceed ceed, bool is_deterministic) { return CEED_ERROR_SUCCESS; } +/** + @brief Flag `Ceed` context as being able to create mixed precision operators + + @param[in] ceed `Ceed` to flag as supporting mixed precision + @param[in] supports_mixed_precision Mixed precision status to set + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedSetSupportsMixedPrecision(Ceed ceed, bool supports_mixed_precision) { + ceed->supports_mixed_precision = supports_mixed_precision; + return CEED_ERROR_SUCCESS; +} + /** @brief Set a backend function. 
@@ -1387,6 +1402,21 @@ int CeedIsDeterministic(Ceed ceed, bool *is_deterministic) { return CEED_ERROR_SUCCESS; } +/** + @brief Get mixed precision support status of `Ceed` context + + @param[in] ceed `Ceed` context + @param[out] supports_mixed_precision Variable to store mixed precision support status + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedGetSupportsMixedPrecision(Ceed ceed, bool *supports_mixed_precision) { + *supports_mixed_precision = ceed->supports_mixed_precision; + return CEED_ERROR_SUCCESS; +} + /** @brief Set additional JiT source root for `Ceed` context diff --git a/tests/t360-basis.c b/tests/t360-basis.c index f953157e1c..5e8a3fbe2b 100644 --- a/tests/t360-basis.c +++ b/tests/t360-basis.c @@ -40,7 +40,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < p_dim; i++) area += v_array[i]; - if (fabs(area - 2.0 * CeedIntPow(2, dim)) > 5E-6) printf("Incorrect area computed %f != %f\n", area, 2.0 * CeedIntPow(2, dim)); + if (fabs(area - 2.0 * CeedIntPow(2, dim)) > 1E-5) printf("Incorrect area computed %f != %f\n", area, 2.0 * CeedIntPow(2, dim)); CeedVectorRestoreArrayRead(v, &v_array); } diff --git a/tests/t502-operator.c b/tests/t502-operator.c index b9e517396b..b1756b3035 100644 --- a/tests/t502-operator.c +++ b/tests/t502-operator.c @@ -1,12 +1,17 @@ /// @file /// Test creation, action, and destruction for mass matrix operator with multiple components /// \test Test creation, action, and destruction for mass matrix operator with multiple components -#include "t502-operator.h" + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 #include #include #include #include +#include + +#include "t502-operator.h" int main(int argc, char **argv) { Ceed ceed; @@ -18,8 +23,21 @@ int main(int argc, char **argv) { CeedInt num_elem = 15, p = 5, q = 8; CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; CeedInt ind_x[num_elem * 2], ind_u[num_elem * 
p]; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } CeedVectorCreate(ceed, num_nodes_x, &x); { @@ -69,11 +87,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); @@ -100,8 +120,8 @@ int main(int argc, char **argv) { sum_2 += v_array[2 * i + 1]; } CeedVectorRestoreArrayRead(v, &v_array); - if (fabs(sum_1 - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum_1); - if (fabs(sum_2 - 2.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 2.0\n", sum_2); + if (fabs(sum_1 - 1.) > 1000. * epsilon) printf("Computed Area: %f != True Area: 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. 
* epsilon) printf("Computed Area: %f != True Area: 2.0\n", sum_2); } CeedVectorDestroy(&x); diff --git a/tests/t503-operator.c b/tests/t503-operator.c index 7b6caae59b..c9977e708f 100644 --- a/tests/t503-operator.c +++ b/tests/t503-operator.c @@ -1,10 +1,15 @@ /// @file /// Test creation, action, and destruction for mass matrix operator with passive inputs and outputs /// \test Test creation, action, and destruction for mass matrix operator with passive inputs and outputs + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t500-operator.h" @@ -18,8 +23,21 @@ int main(int argc, char **argv) { CeedInt num_elem = 15, p = 5, q = 8; CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; CeedInt ind_x[num_elem * 2], ind_u[num_elem * p]; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Vectors CeedVectorCreate(ceed, num_nodes_x, &x); @@ -70,11 +88,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, x); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, u); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, v); + CeedOperatorSetPrecision(op_mass, precision); // Note - It is atypical 
to use only passive fields; this test is intended // as a test for all passive input modes rather than as an example. @@ -90,7 +110,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_nodes_u; i++) sum += v_array[i]; CeedVectorRestoreArrayRead(v, &v_array); - if (fabs(sum - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); + if (fabs(sum - 1.) > 1000. * epsilon) printf("Computed Area: %f != True Area: 1.0\n", sum); } CeedVectorDestroy(&x); diff --git a/tests/t505-operator.c b/tests/t505-operator.c index 1cda4550d4..154ec6561a 100644 --- a/tests/t505-operator.c +++ b/tests/t505-operator.c @@ -1,10 +1,15 @@ /// @file /// Test CeedOperatorApplyAdd /// \test Test CeedOperatorApplyAdd + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t500-operator.h" @@ -18,8 +23,21 @@ int main(int argc, char **argv) { CeedInt num_elem = 15, p = 5, q = 8; CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; CeedInt ind_x[num_elem * 2], ind_u[num_elem * p]; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } CeedVectorCreate(ceed, num_nodes_x, &x); { @@ -68,11 +86,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorCreate(ceed, qf_mass, 
CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); @@ -89,7 +109,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_nodes_u; i++) sum += v_array[i]; CeedVectorRestoreArrayRead(v, &v_array); - if (fabs(sum - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); + if (fabs(sum - 1.) > 1000. * epsilon) printf("Computed Area: %f != True Area: 1.0\n", sum); } // Apply with V = 1 @@ -104,7 +124,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_nodes_u; i++) sum += v_array[i]; CeedVectorRestoreArrayRead(v, &v_array); - if (fabs(sum - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); + if (fabs(sum - 1.) > 1000. 
* epsilon) printf("Computed Area: %f != True Area: 1.0\n", sum); } CeedVectorDestroy(&x); diff --git a/tests/t506-operator.c b/tests/t506-operator.c index a17b6cbb0a..8b2975717a 100644 --- a/tests/t506-operator.c +++ b/tests/t506-operator.c @@ -1,10 +1,15 @@ /// @file /// Test creation reuse of the same QFunction for multiple operators /// \test Test creation reuse of the same QFunction for multiple operators + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t502-operator.h" @@ -18,8 +23,21 @@ int main(int argc, char **argv) { CeedInt num_elem = 15, p = 5, q = 8, scale = 3, num_comp = 2; CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; CeedInt ind_x[num_elem * 2], ind_u[num_elem * p]; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } CeedVectorCreate(ceed, num_nodes_x, &x); { @@ -75,22 +93,26 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_small, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_small, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_small, "x", elem_restriction_x, basis_x_small, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_small, "rho", elem_restriction_q_data_small, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup_small, precision); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_small); CeedOperatorSetField(op_mass_small, "rho", elem_restriction_q_data_small, CEED_BASIS_NONE, q_data_small); CeedOperatorSetField(op_mass_small, "u", elem_restriction_u, basis_u_small, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_small, "v", elem_restriction_u, 
basis_u_small, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass_small, precision); // 'Large' operators CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_large); CeedOperatorSetField(op_setup_large, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_large, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_large, "x", elem_restriction_x, basis_x_large, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_large, "rho", elem_restriction_q_data_large, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup_large, precision); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_large); CeedOperatorSetField(op_mass_large, "rho", elem_restriction_q_data_large, CEED_BASIS_NONE, q_data_large); CeedOperatorSetField(op_mass_large, "u", elem_restriction_u, basis_u_large, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_large, "v", elem_restriction_u, basis_u_large, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass_large, precision); // Setup CeedOperatorApply(op_setup_small, x, q_data_small, CEED_REQUEST_IMMEDIATE); @@ -121,8 +143,8 @@ int main(int argc, char **argv) { sum_2 += v_array[num_comp * i + 1]; } CeedVectorRestoreArrayRead(v, &v_array); - if (fabs(sum_1 - 1.) > 1000. * CEED_EPSILON) printf("Small Problem, Component 1: Computed Area %f != True Area 1.0\n", sum_1); - if (fabs(sum_2 - 2.) > 1000. * CEED_EPSILON) printf("Small Problem, Component 2: Computed Area %f != True Area 2.0\n", sum_2); + if (fabs(sum_1 - 1.) > 1000. * epsilon) printf("Small Problem, Component 1: Computed Area %f != True Area 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. * epsilon) printf("Small Problem, Component 2: Computed Area %f != True Area 2.0\n", sum_2); } // 'Large' operator @@ -140,8 +162,8 @@ int main(int argc, char **argv) { } CeedVectorRestoreArrayRead(v, &v_array); - if (fabs(sum_1 - 1.) > 1000. 
* CEED_EPSILON) printf("Large Problem, Component 1: Computed Area %f != True Area 1.0\n", sum_1); - if (fabs(sum_2 - 2.) > 1000. * CEED_EPSILON) printf("Large Problem, Component 2: Computed Area %f != True Area 2.0\n", sum_2); + if (fabs(sum_1 - 1.) > 1000. * epsilon) printf("Large Problem, Component 1: Computed Area %f != True Area 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. * epsilon) printf("Large Problem, Component 2: Computed Area %f != True Area 2.0\n", sum_2); } CeedVectorDestroy(&x); diff --git a/tests/t510-operator.c b/tests/t510-operator.c index 059ad9c447..9d8a60eb10 100644 --- a/tests/t510-operator.c +++ b/tests/t510-operator.c @@ -1,12 +1,17 @@ /// @file /// Test creation, action, and destruction for mass matrix operator /// \test Test creation, action, and destruction for mass matrix operator + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include "t510-operator.h" #include #include #include #include +#include #include "t320-basis.h" @@ -24,8 +29,21 @@ int main(int argc, char **argv) { CeedInt ind_x[num_elem * p]; CeedScalar q_ref[dim * q], q_weight[q]; CeedScalar interp[p * q], grad[dim * p * q]; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } CeedVectorCreate(ceed, dim * num_dofs, &x); { @@ -90,11 +108,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorCreate(ceed, qf_mass, 
CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); @@ -107,7 +127,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_dofs; i++) { - if (fabs(v_array[i]) > 1e-14) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); + if (fabs(v_array[i]) > 200 * epsilon) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); } CeedVectorRestoreArrayRead(v, &v_array); } diff --git a/tests/t520-operator.c b/tests/t520-operator.c index 31fceb9c97..2679c23e3d 100644 --- a/tests/t520-operator.c +++ b/tests/t520-operator.c @@ -1,10 +1,15 @@ /// @file /// Test creation, action, and destruction for composite mass matrix operator /// \test Test creation, action, and destruction for composite mass matrix operator + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t320-basis.h" #include "t510-operator.h" @@ -22,19 +27,32 @@ int main(int argc, char **argv) { Ceed ceed; CeedElemRestriction elem_restriction_x_tet, elem_restriction_u_tet, elem_restriction_q_data_tet, elem_restriction_x_hex, elem_restriction_u_hex, elem_restriction_q_data_hex; - CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; - CeedQFunction qf_setup_tet, qf_mass_tet, qf_setup_hex, qf_mass_hex; - CeedOperator op_setup_tet, op_mass_tet, op_setup_hex, op_mass_hex, op_setup, op_mass; - CeedVector q_data_tet, q_data_hex, x, u, v; - CeedInt num_elem_tet = 6, p_tet = 6, q_tet = 4, num_elem_hex = 6, p_hex = 3, q_hex = 4, dim = 2; - CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; - CeedInt 
row, col, offset; - CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * q_tet, num_qpts_hex = num_elem_hex * q_hex * q_hex; - CeedInt ind_x_tet[num_elem_tet * p_tet], ind_x_hex[num_elem_hex * p_hex * p_hex]; - CeedScalar q_ref[dim * q_tet], q_weight[q_tet]; - CeedScalar interp[p_tet * q_tet], grad[dim * p_tet * q_tet]; + CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; + CeedQFunction qf_setup_tet, qf_mass_tet, qf_setup_hex, qf_mass_hex; + CeedOperator op_setup_tet, op_mass_tet, op_setup_hex, op_mass_hex, op_setup, op_mass; + CeedVector q_data_tet, q_data_hex, x, u, v; + CeedInt num_elem_tet = 6, p_tet = 6, q_tet = 4, num_elem_hex = 6, p_hex = 3, q_hex = 4, dim = 2; + CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; + CeedInt row, col, offset; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * q_tet, num_qpts_hex = num_elem_hex * q_hex * q_hex; + CeedInt ind_x_tet[num_elem_tet * p_tet], ind_x_hex[num_elem_hex * p_hex * p_hex]; + CeedScalar q_ref[dim * q_tet], q_weight[q_tet]; + CeedScalar interp[p_tet * q_tet], grad[dim * p_tet * q_tet]; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Vectors CeedVectorCreate(ceed, dim * num_dofs, &x); @@ -106,12 +124,14 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_tet, "_weight", CEED_ELEMRESTRICTION_NONE, basis_x_tet, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_tet, "dx", elem_restriction_x_tet, basis_x_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); + CeedOperatorSetPrecision(op_setup_tet, precision); // ---- Mass Tet 
CeedOperatorCreate(ceed, qf_mass_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_tet); CeedOperatorSetField(op_mass_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); CeedOperatorSetField(op_mass_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetName(op_mass_tet, "mass tet"); + CeedOperatorSetPrecision(op_mass_tet, precision); // Set up Hex Elements // -- Restrictions @@ -150,12 +170,14 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_hex, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_hex, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_hex, "dx", elem_restriction_x_hex, basis_x_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); + CeedOperatorSetPrecision(op_setup_hex, precision); CeedOperatorCreate(ceed, qf_mass_hex, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_hex); CeedOperatorSetField(op_mass_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); CeedOperatorSetField(op_mass_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetName(op_mass_hex, "mass hex"); + CeedOperatorSetPrecision(op_mass_hex, precision); // Set up Composite Operators // -- Create @@ -193,7 +215,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_dofs; i++) { - if (fabs(v_array[i]) > 1e-14) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); + if (fabs(v_array[i]) > 200 * epsilon) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); } CeedVectorRestoreArrayRead(v, &v_array); } diff --git a/tests/t522-operator.c b/tests/t522-operator.c index b2e1da90ac..3c57223a1a 100644 --- a/tests/t522-operator.c +++ 
b/tests/t522-operator.c @@ -1,12 +1,17 @@ /// @file /// Test creation, action, and destruction for diffusion matrix operator /// \test Test creation, action, and destruction for diffusion matrix operator + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include "t522-operator.h" #include #include #include #include +#include #include "t320-basis.h" @@ -23,19 +28,32 @@ int main(int argc, char **argv) { Ceed ceed; CeedElemRestriction elem_restriction_x_tet, elem_restriction_u_tet, elem_restriction_q_data_tet, elem_restriction_x_hex, elem_restriction_u_hex, elem_restriction_q_data_hex; - CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; - CeedQFunction qf_setup_tet, qf_diff_tet, qf_setup_hex, qf_diff_hex; - CeedOperator op_setup_tet, op_diff_tet, op_setup_hex, op_diff_hex, op_setup, op_diff; - CeedVector q_data_tet, q_data_hex, x, u, v; - CeedInt num_elem_tet = 6, p_tet = 6, q_tet = 4, num_elem_hex = 6, p_hex = 3, q_hex = 4, dim = 2; - CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; - CeedInt row, col, offset; - CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * q_tet, num_qpts_hex = num_elem_hex * q_hex * q_hex; - CeedInt ind_x_tet[num_elem_tet * p_tet], ind_x_hex[num_elem_hex * p_hex * p_hex]; - CeedScalar q_ref[dim * q_tet], q_weight[q_tet]; - CeedScalar interp[p_tet * q_tet], grad[dim * p_tet * q_tet]; + CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; + CeedQFunction qf_setup_tet, qf_diff_tet, qf_setup_hex, qf_diff_hex; + CeedOperator op_setup_tet, op_diff_tet, op_setup_hex, op_diff_hex, op_setup, op_diff; + CeedVector q_data_tet, q_data_hex, x, u, v; + CeedInt num_elem_tet = 6, p_tet = 6, q_tet = 4, num_elem_hex = 6, p_hex = 3, q_hex = 4, dim = 2; + CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; + CeedInt row, col, offset; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * q_tet, num_qpts_hex = num_elem_hex * q_hex * q_hex; 
+ CeedInt ind_x_tet[num_elem_tet * p_tet], ind_x_hex[num_elem_hex * p_hex * p_hex]; + CeedScalar q_ref[dim * q_tet], q_weight[q_tet]; + CeedScalar interp[p_tet * q_tet], grad[dim * p_tet * q_tet]; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Vectors CeedVectorCreate(ceed, dim * num_dofs, &x); @@ -108,11 +126,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_tet, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_tet, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_tet, "dx", elem_restriction_x_tet, basis_x_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); + CeedOperatorSetPrecision(op_setup_tet, precision); // ---- Diff Tet CeedOperatorCreate(ceed, qf_diff_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff_tet); CeedOperatorSetField(op_diff_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); CeedOperatorSetField(op_diff_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_diff_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_diff_tet, precision); // Hex Elements // -- Restrictions @@ -152,11 +172,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_hex, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_hex, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_hex, "dx", elem_restriction_x_hex, basis_x_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); + CeedOperatorSetPrecision(op_setup_hex, precision); CeedOperatorCreate(ceed, qf_diff_hex, 
CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff_hex); CeedOperatorSetField(op_diff_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); CeedOperatorSetField(op_diff_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_diff_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_diff_hex, precision); // Composite Operators CeedCompositeOperatorCreate(ceed, &op_setup); @@ -180,7 +202,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_dofs; i++) { - if (fabs(v_array[i]) > 100. * CEED_EPSILON) printf("Computed: %f != True: 0.0\n", v_array[i]); + if (fabs(v_array[i]) > 100. * epsilon) printf("Computed: %f != True: 0.0\n", v_array[i]); } CeedVectorRestoreArrayRead(v, &v_array); } diff --git a/tests/t591-operator.c b/tests/t591-operator.c index f587ddd2cc..922bd95ecb 100644 --- a/tests/t591-operator.c +++ b/tests/t591-operator.c @@ -1,12 +1,17 @@ /// @file /// Test creation, action, and destruction for mass matrix operator at points /// \test Test creation, action, and destruction for mass matrix operator at points + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include "t591-operator.h" #include #include #include #include +#include int main(int argc, char **argv) { Ceed ceed; @@ -18,8 +23,21 @@ int main(int argc, char **argv) { CeedQFunction qf_setup, qf_mass; CeedOperator op_setup, op_mass; bool is_at_points; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Point reference coordinates CeedVectorCreate(ceed, dim * num_points, &x_points); @@ -139,6 +157,7 @@ int 
main(int argc, char **argv) { CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -153,6 +172,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); @@ -171,7 +191,7 @@ int main(int argc, char **argv) { for (CeedInt i = 0; i < num_nodes; i++) sum += v_array[i]; CeedVectorRestoreArrayRead(v, &v_array); // Summing 9 reference elements - if (fabs(sum - 1.0 * num_elem) > CEED_EPSILON * 5e3) printf("Incorrect area computed, %f != %f\n", sum, 1.0 * num_elem); + if (fabs(sum - 1.0 * num_elem) > epsilon * 5e3) printf("Incorrect area computed, %f != %f\n", sum, 1.0 * num_elem); } CeedVectorDestroy(&x_points); diff --git a/tests/t592-operator.c b/tests/t592-operator.c index 1650e0fa89..c45f92cd08 100644 --- a/tests/t592-operator.c +++ b/tests/t592-operator.c @@ -1,10 +1,15 @@ /// @file /// Test assembly of mass matrix operator QFunction at points /// \test Test assembly of mass matrix operator QFunction at points + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t591-operator.h" @@ -18,8 +23,21 @@ int main(int argc, char **argv) { CeedQFunction qf_setup, qf_mass; 
CeedOperator op_setup, op_mass; bool is_at_points; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Point reference coordinates CeedVectorCreate(ceed, dim * num_points, &x_points); @@ -139,6 +157,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -153,6 +172,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); @@ -195,7 +215,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_nodes; i++) area += v_array[i]; - if (fabs(area - 1.0 * num_elem) > CEED_EPSILON * 5e3) printf("Error: True operator computed area = %f != 1.0\n", area); + if (fabs(area - 1.0 * num_elem) > epsilon * 5e3) printf("Error: True operator computed area = %f != 1.0\n", area); CeedVectorRestoreArrayRead(v, &v_array); } @@ -219,7 +239,7 @@ int 
main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_nodes; i++) area += v_array[i]; CeedVectorRestoreArrayRead(v, &v_array); - if (fabs(area - 1.0 * num_elem) > CEED_EPSILON * 5e3) printf("Error: Linearized operator computed area = %f != 1.0\n", area); + if (fabs(area - 1.0 * num_elem) > epsilon * 5e3) printf("Error: Linearized operator computed area = %f != 1.0\n", area); } // Cleanup diff --git a/tests/t593-operator.c b/tests/t593-operator.c index 2e0710c7fc..1e45fe7c47 100644 --- a/tests/t593-operator.c +++ b/tests/t593-operator.c @@ -1,10 +1,15 @@ /// @file /// Test 1D mass matrix operator at points with heterogeneous points per element /// \test Test 1D mass matrix operator at points with heterogeneous points per element + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t500-operator.h" @@ -20,8 +25,21 @@ int main(int argc, char **argv) { CeedQFunction qf_setup, qf_mass; CeedOperator op_setup, op_mass; bool is_at_points; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Mesh coordinates for (CeedInt i = 0; i < num_nodes_x; i++) x_array_mesh[i] = (CeedScalar)i / (num_nodes_x - 1); @@ -93,6 +111,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorAtPointsSetPoints(op_setup, 
elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -107,6 +126,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); @@ -125,7 +145,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_nodes_u; i++) { - if (fabs(v_array[i]) > 1e-14) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); + if (fabs(v_array[i]) > 200 * epsilon) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); } CeedVectorRestoreArrayRead(v, &v_array); } diff --git a/tests/t594-operator.c b/tests/t594-operator.c index 49405e37a4..bc08cff2f1 100644 --- a/tests/t594-operator.c +++ b/tests/t594-operator.c @@ -1,10 +1,15 @@ /// @file /// Test diagonal assembly of mass matrix operator at points /// \test Test diagonal assembly of mass matrix operator at points + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t500-operator.h" @@ -20,8 +25,21 @@ int main(int argc, char **argv) { CeedQFunction qf_setup, qf_mass; CeedOperator op_setup, op_mass; bool is_at_points; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Mesh coordinates for (CeedInt i = 0; i < num_nodes_x; i++) 
x_array_mesh[i] = (CeedScalar)i / (num_nodes_x - 1); @@ -93,6 +111,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -107,6 +126,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); @@ -147,7 +167,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array); for (CeedInt i = 0; i < num_nodes_u; i++) { - if (fabs(assembled_array[i] - assembled_true[i]) > 100. * CEED_EPSILON) { + if (fabs(assembled_array[i] - assembled_true[i]) > 100. 
* epsilon) { // LCOV_EXCL_START printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, assembled_array[i], assembled_true[i]); // LCOV_EXCL_STOP diff --git a/tests/t596-operator.c b/tests/t596-operator.c index 81ca865ebd..6880abb4e1 100644 --- a/tests/t596-operator.c +++ b/tests/t596-operator.c @@ -1,17 +1,35 @@ /// @file /// Test full assembly of mass matrix operator /// \test Test full assembly of mass matrix operator AtPoints + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t596-operator.h" int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } for (CeedInt num_comp = 1; num_comp <= 3; num_comp++) { CeedElemRestriction elem_restriction_x, elem_restriction_x_points, elem_restriction_u, elem_restriction_q_data; @@ -111,12 +129,14 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, 
CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); // Apply Setup Operator @@ -168,7 +188,7 @@ int main(int argc, char **argv) { // Check output for (CeedInt i = 0; i < num_comp * num_dofs; i++) { for (CeedInt j = 0; j < num_comp * num_dofs; j++) { - if (fabs(assembled_values[i * num_dofs * num_comp + j] - assembled_true[i * num_dofs * num_comp + j]) > 100. * CEED_EPSILON) { + if (fabs(assembled_values[i * num_dofs * num_comp + j] - assembled_true[i * num_dofs * num_comp + j]) > 100. * epsilon) { // LCOV_EXCL_START printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled_values[i * num_dofs * num_comp + j], assembled_true[i * num_dofs * num_comp + j]); diff --git a/tests/t597-operator.c b/tests/t597-operator.c index 25d6b3cf3f..ede3f8de50 100644 --- a/tests/t597-operator.c +++ b/tests/t597-operator.c @@ -1,17 +1,35 @@ /// @file /// Test full assembly of Poisson operator AtPoints /// \test Test full assembly of Poisson operator AtPoints + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t597-operator.h" int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } for (CeedInt num_comp = 1; num_comp <= 3; num_comp++) { CeedElemRestriction elem_restriction_x, elem_restriction_x_points, elem_restriction_u, elem_restriction_q_data; @@ -99,6 +117,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, 
"weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "q data", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); // Apply Setup Operator @@ -123,6 +142,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_diff, "du", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_diff, "q data", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_diff, precision); CeedOperatorAtPointsSetPoints(op_diff, elem_restriction_x_points, x_points); // Fully assemble operator @@ -169,7 +189,7 @@ int main(int argc, char **argv) { // Check output for (CeedInt i = 0; i < num_comp * num_dofs; i++) { for (CeedInt j = 0; j < num_comp * num_dofs; j++) { - if (fabs(assembled_values[i * num_comp * num_dofs + j] - assembled_true[i * num_comp * num_dofs + j]) > 100. * CEED_EPSILON) { + if (fabs(assembled_values[i * num_comp * num_dofs + j] - assembled_true[i * num_comp * num_dofs + j]) > 100. 
* epsilon) { // LCOV_EXCL_START printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled_values[i * num_comp * num_dofs + j], assembled_true[i * num_comp * num_dofs + j]); diff --git a/tests/t598-operator.c b/tests/t598-operator.c index 55c7560fbb..e917530cfb 100644 --- a/tests/t598-operator.c +++ b/tests/t598-operator.c @@ -1,12 +1,17 @@ /// @file /// Test creation, action, and destruction for mass matrix operator AtPoints /// \test Test creation, action, and destruction for mass matrix operator AtPoints + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include "t591-operator.h" #include #include #include #include +#include int main(int argc, char **argv) { Ceed ceed; @@ -19,8 +24,21 @@ int main(int argc, char **argv) { CeedBasis basis_x, basis_u_coarse, basis_u_fine; CeedQFunction qf_setup, qf_mass; CeedOperator op_setup, op_mass_coarse, op_mass_fine, op_prolong, op_restrict; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Point reference coordinates CeedVectorCreate(ceed, dim * num_points, &x_points); @@ -173,6 +191,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -187,6 +206,7 @@ int main(int argc, char **argv) { 
CeedOperatorSetField(op_mass_fine, "u", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_fine, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass_fine, "v", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass_fine, precision); CeedOperatorAtPointsSetPoints(op_mass_fine, elem_restriction_x_points, x_points); CeedVectorCreate(ceed, num_nodes_fine, &u_fine); @@ -213,7 +233,7 @@ int main(int argc, char **argv) { sum += v_array[i]; } CeedVectorRestoreArrayRead(v_coarse, &v_array); - if (fabs(sum - num_elem) > 1000. * CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); + if (fabs(sum - num_elem) > 1000. * epsilon) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); } // Prolong coarse u @@ -233,7 +253,7 @@ int main(int argc, char **argv) { } CeedVectorRestoreArrayRead(v_fine, &v_array); - if (fabs(sum - num_elem) > 1000. * CEED_EPSILON) printf("Computed Area Fine Grid: %f != True Area: 2.0\n", sum); + if (fabs(sum - num_elem) > 1000. * epsilon) printf("Computed Area Fine Grid: %f != True Area: 2.0\n", sum); } // Restrict state to coarse grid CeedOperatorApply(op_restrict, v_fine, v_coarse, CEED_REQUEST_IMMEDIATE); @@ -248,7 +268,7 @@ int main(int argc, char **argv) { sum += v_array[i]; } CeedVectorRestoreArrayRead(v_coarse, &v_array); - if (fabs(sum - num_elem) > 1000. * CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); + if (fabs(sum - num_elem) > 1000. * epsilon) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); } CeedVectorDestroy(&x_points);