diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index 481c358466..c4630abfe1 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -1285,7 +1285,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; code << tab << "// -----------------------------------------------------------------------------\n"; code << tab << "extern \"C\" __global__ void " << operator_name - << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarBase *W, Points_Cuda " "points) {\n"; tab.push(); @@ -1295,11 +1295,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT - code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + code << tab << "const CeedScalarBase *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; } } for (CeedInt i = 0; i < num_output_fields; i++) { - code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + code << tab << "CeedScalarBase *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; } code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; @@ -1572,11 +1572,20 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b // Compile { - bool is_compile_good = false; - const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? 
Q_1d : Q, data->max_P_1d); + CeedScalarType precision; + + // Check for mixed precision + CeedCallBackend(CeedOperatorGetPrecision(op, &precision)); data->thread_1d = T_1d; - CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d)); + if (precision != CEED_SCALAR_TYPE) { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", T_1d, "CEED_JIT_PRECISION", + CeedScalarTypes[precision])); + } else { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d)); + } if (is_compile_good) { *is_good_build = true; CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op)); @@ -1689,8 +1698,8 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; code << tab << "// -----------------------------------------------------------------------------\n"; code << tab << "extern \"C\" __global__ void " << operator_name - << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " - "points, CeedScalar *__restrict__ values_array) {\n"; + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarBase *W, Points_Cuda " + "points, CeedScalarBase *__restrict__ values_array) {\n"; tab.push(); // Scratch buffers @@ -1699,11 +1708,11 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT - code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + code << tab << "const CeedScalarBase *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; } } for 
(CeedInt i = 0; i < num_output_fields; i++) { - code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + code << tab << "CeedScalarBase *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; } code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; @@ -2043,12 +2052,22 @@ static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, boo // Compile { - bool is_compile_good = false; - const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + CeedScalarType precision; + + // Check for mixed precision + CeedCallBackend(CeedOperatorGetPrecision(op, &precision)); data->thread_1d = T_1d; - CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, - is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d)); + if (precision != CEED_SCALAR_TYPE) { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, + is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", T_1d, + "CEED_JIT_PRECISION", CeedScalarTypes[precision])); + } else { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, + is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d)); + } if (is_compile_good) { *is_good_build = true; CeedCallBackend(CeedGetKernel_Cuda(ceed, is_full ? 
data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(), @@ -2221,8 +2240,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; code << tab << "// -----------------------------------------------------------------------------\n"; code << tab << "extern \"C\" __global__ void " << operator_name - << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " - "points, CeedScalar *__restrict__ values_array) {\n"; + << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalarBase *W, Points_Cuda " + "points, CeedScalarBase *__restrict__ values_array) {\n"; tab.push(); // Scratch buffers @@ -2231,11 +2250,11 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT - code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; + code << tab << "const CeedScalarBase *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; } } for (CeedInt i = 0; i < num_output_fields; i++) { - code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; + code << tab << "CeedScalarBase *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; } code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; @@ -2485,8 +2504,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { - code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" 
<< f << "*" - << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; + code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" + << "dim_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; } else { code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; } @@ -2623,11 +2642,20 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOpera // Compile { - bool is_compile_good = false; - const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + bool is_compile_good = false; + const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); + CeedScalarType precision; + + // Check for mixed precision + CeedCallBackend(CeedOperatorGetPrecision(op, &precision)); data->thread_1d = T_1d; - CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d)); + if (precision != CEED_SCALAR_TYPE) { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", T_1d, + "CEED_JIT_PRECISION", CeedScalarTypes[precision])); + } else { + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d)); + } if (is_compile_good) { *is_good_build = true; CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction)); diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c index e826f0aab1..5f74560022 100644 --- a/backends/cuda-gen/ceed-cuda-gen.c +++ b/backends/cuda-gen/ceed-cuda-gen.c @@ -29,6 +29,7 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) { CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Cuda(ceed, resource)); + 
CeedCallBackend(CeedSetSupportsMixedPrecision(ceed, true)); CeedCallBackend(CeedInit("/gpu/cuda/shared", &ceed_shared)); CeedCallBackend(CeedSetDelegate(ceed, ceed_shared)); diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp index dc430b81fa..37acf263e5 100644 --- a/backends/cuda/ceed-cuda-compile.cpp +++ b/backends/cuda/ceed-cuda-compile.cpp @@ -52,12 +52,20 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_ // Get kernel specific options, such as kernel constants if (num_defines > 0) { char *name; - int val; for (int i = 0; i < num_defines; i++) { name = va_arg(args, char *); - val = va_arg(args, int); - code << "#define " << name << " " << val << "\n"; + if (!strcmp(name, "CEED_JIT_PRECISION")) { + char *val; + + val = va_arg(args, char *); + code << "#define " << name << " " << val << "\n"; + } else { + int val; + + val = va_arg(args, int); + code << "#define " << name << " " << val << "\n"; + } } } diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp index 9e29d16cb4..3f17b64cfb 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp +++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp @@ -2481,8 +2481,8 @@ extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperat CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { - code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*" - << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; + code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" + << "dim_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? 
"Q_1d" : "1") << "] = {0.};\n"; } else { code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; } diff --git a/include/ceed-impl.h b/include/ceed-impl.h index 1b6875a6ca..b6e482df77 100644 --- a/include/ceed-impl.h +++ b/include/ceed-impl.h @@ -126,6 +126,7 @@ struct Ceed_private { void *data; bool is_debug; bool is_deterministic; + bool supports_mixed_precision; char err_msg[CEED_MAX_RESOURCE_LEN]; FOffset *f_offsets; CeedWorkVectors work_vectors; @@ -378,6 +379,7 @@ struct CeedOperator_private { bool is_composite; bool is_at_points; bool has_restriction; + CeedScalarType precision; CeedQFunctionAssemblyData qf_assembled; CeedOperatorAssemblyData op_assembled; CeedOperator *sub_operators; diff --git a/include/ceed/backend.h b/include/ceed/backend.h index c48e0d4666..b5bfad46ff 100644 --- a/include/ceed/backend.h +++ b/include/ceed/backend.h @@ -249,6 +249,7 @@ CEED_EXTERN int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_ CEED_EXTERN int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed); CEED_EXTERN int CeedSetOperatorFallbackCeed(Ceed ceed, Ceed fallback_ceed); CEED_EXTERN int CeedSetDeterministic(Ceed ceed, bool is_deterministic); +CEED_INTERN int CeedSetSupportsMixedPrecision(Ceed ceed, bool supports_mixed_precision); CEED_EXTERN int CeedSetBackendFunctionImpl(Ceed ceed, const char *type, void *object, const char *func_name, void (*f)(void)); CEED_EXTERN int CeedGetData(Ceed ceed, void *data); CEED_EXTERN int CeedSetData(Ceed ceed, void *data); diff --git a/include/ceed/ceed-f32.h b/include/ceed/ceed-f32.h index 0bce734257..e11a4eff1f 100644 --- a/include/ceed/ceed-f32.h +++ b/include/ceed/ceed-f32.h @@ -10,11 +10,26 @@ /// Include this header in ceed.h to use float instead of double. #pragma once +#ifndef CEED_RUNNING_JIT_PASS +#include <float.h> +#endif + #define CEED_SCALAR_IS_FP32 /// Set base scalar type to FP32. 
(See CeedScalarType enum in ceed.h for all options.) #define CEED_SCALAR_TYPE CEED_SCALAR_FP32 -typedef float CeedScalar; +#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_PRECISION) && (CEED_JIT_PRECISION != CEED_SCALAR_TYPE) +#if CEED_JIT_PRECISION == CEED_SCALAR_FP64 +typedef double CeedScalar; +typedef float CeedScalarBase; + +/// Machine epsilon +static const CeedScalar CEED_EPSILON = DBL_EPSILON; +#endif // CEED_JIT_PRECISION +#else +typedef float CeedScalar; +typedef CeedScalar CeedScalarBase; /// Machine epsilon -#define CEED_EPSILON 6e-08 +static const CeedScalar CEED_EPSILON = FLT_EPSILON; +#endif diff --git a/include/ceed/ceed-f64.h b/include/ceed/ceed-f64.h index b74d867c18..e72ad0f617 100644 --- a/include/ceed/ceed-f64.h +++ b/include/ceed/ceed-f64.h @@ -10,11 +10,26 @@ /// This is the default header included in ceed.h. #pragma once +#ifndef CEED_RUNNING_JIT_PASS +#include <float.h> +#endif + #define CEED_SCALAR_IS_FP64 /// Set base scalar type to FP64. (See CeedScalarType enum in ceed.h for all options.) 
#define CEED_SCALAR_TYPE CEED_SCALAR_FP64 -typedef double CeedScalar; +#if defined(CEED_RUNNING_JIT_PASS) && defined(CEED_JIT_PRECISION) && (CEED_JIT_PRECISION != CEED_SCALAR_TYPE) +#if CEED_JIT_PRECISION == CEED_SCALAR_FP32 +typedef float CeedScalar; +typedef double CeedScalarBase; + +/// Machine epsilon +static const CeedScalar CEED_EPSILON = FLT_EPSILON; +#endif // CEED_JIT_PRECISION +#else +typedef double CeedScalar; +typedef CeedScalar CeedScalarBase; /// Machine epsilon -#define CEED_EPSILON 1e-16 +static const CeedScalar CEED_EPSILON = DBL_EPSILON; +#endif // CEED_RUNNING_JIT_PASS && CEED_JIT_MIXED_PRECISION diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h index ce6284b910..fc40362e16 100644 --- a/include/ceed/ceed.h +++ b/include/ceed/ceed.h @@ -106,6 +106,7 @@ CEED_EXTERN int CeedSetStream(Ceed ceed, void *handle); CEED_EXTERN int CeedReferenceCopy(Ceed ceed, Ceed *ceed_copy); CEED_EXTERN int CeedGetResource(Ceed ceed, const char **resource); CEED_EXTERN int CeedIsDeterministic(Ceed ceed, bool *is_deterministic); +CEED_EXTERN int CeedGetSupportsMixedPrecision(Ceed ceed, bool *supports_mixed_precision); CEED_EXTERN int CeedAddJitSourceRoot(Ceed ceed, const char *jit_source_root); CEED_EXTERN int CeedAddJitDefine(Ceed ceed, const char *jit_define); CEED_EXTERN int CeedView(Ceed ceed, FILE *stream); @@ -177,6 +178,7 @@ CEED_EXTERN const char *const CeedEvalModes[]; CEED_EXTERN const char *const CeedQuadModes[]; CEED_EXTERN const char *const CeedElemTopologies[]; CEED_EXTERN const char *const CeedContextFieldTypes[]; +CEED_EXTERN const char *const CeedScalarTypes[]; CEED_EXTERN int CeedGetPreferredMemType(Ceed ceed, CeedMemType *type); @@ -426,6 +428,8 @@ CEED_EXTERN int CeedOperatorCheckReady(CeedOperator op); CEED_EXTERN int CeedOperatorGetActiveVectorLengths(CeedOperator op, CeedSize *input_size, CeedSize *output_size); CEED_EXTERN int CeedOperatorSetQFunctionAssemblyReuse(CeedOperator op, bool reuse_assembly_data); CEED_EXTERN int 
CeedOperatorSetQFunctionAssemblyDataUpdateNeeded(CeedOperator op, bool needs_data_update); +CEED_EXTERN int CeedOperatorSetPrecision(CeedOperator op, CeedScalarType precision); +CEED_EXTERN int CeedOperatorGetPrecision(CeedOperator op, CeedScalarType *precision); CEED_EXTERN int CeedOperatorLinearAssembleQFunction(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request); CEED_EXTERN int CeedOperatorLinearAssembleQFunctionBuildOrUpdate(CeedOperator op, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request); diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h index f4dccf54ea..d78e83eeab 100644 --- a/include/ceed/jit-source/cuda/cuda-gen-templates.h +++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h @@ -12,8 +12,8 @@ //------------------------------------------------------------------------------ // Load matrices for basis actions //------------------------------------------------------------------------------ -template -inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { +template +inline __device__ void LoadMatrix(SharedData_Cuda &data, const ScalarIn *__restrict__ d_B, ScalarOut *B) { for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; } @@ -24,9 +24,9 @@ inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__res //------------------------------------------------------------------------------ // L-vector -> single point //------------------------------------------------------------------------------ -template +template inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, - const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) { const 
CeedInt ind = indices[p + elem * NUM_PTS]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -37,9 +37,9 @@ inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, cons //------------------------------------------------------------------------------ // Single point -> L-vector //------------------------------------------------------------------------------ -template +template inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, - const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) { + const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_u, ScalarOut *d_u) { if (p < points_in_elem) { const CeedInt ind = indices[p + elem * NUM_PTS]; @@ -56,8 +56,8 @@ inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, con //------------------------------------------------------------------------------ // Set E-vector value //------------------------------------------------------------------------------ -template -inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { +template +inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const ScalarIn value, ScalarOut *__restrict__ r_v) { const CeedInt target_comp = n / P_1D; const CeedInt target_node = n % P_1D; @@ -69,9 +69,9 @@ inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const Cee //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -template +template inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + const 
ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = indices[node + elem * P_1D]; @@ -83,9 +83,8 @@ inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt n //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, - CeedScalar *__restrict__ r_u) { +template +inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; @@ -97,9 +96,9 @@ inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt el //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = indices[node + elem * P_1D]; @@ -108,10 +107,10 @@ inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt } } -template +template inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, - const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, - CeedScalar *__restrict__ d_v) { + 
const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_v, + ScalarOut *__restrict__ d_v) { const CeedInt target_comp = n / P_1D; const CeedInt target_node = n % P_1D; @@ -125,9 +124,9 @@ inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const C //------------------------------------------------------------------------------ // E-vector -> L-vector, full assembly //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { const CeedInt in_comp = in / P_1D; const CeedInt in_node = in % P_1D; const CeedInt e_vec_size = P_1D * NUM_COMP; @@ -144,9 +143,9 @@ inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const //------------------------------------------------------------------------------ // E-vector -> L-vector, Qfunction assembly //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, - const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const CeedInt output_offset, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < Q_1D) { const CeedInt ind = data.t_id_x + elem * Q_1D; @@ -159,9 +158,8 @@ inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, con //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void WriteLVecStrided1d(SharedData_Cuda 
&data, const CeedInt elem, const CeedScalar *__restrict__ r_v, - CeedScalar *__restrict__ d_v) { +template +inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; @@ -177,8 +175,8 @@ inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt e //------------------------------------------------------------------------------ // Set E-vector value //------------------------------------------------------------------------------ -template -inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { +template +inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const ScalarIn value, ScalarOut *__restrict__ r_v) { const CeedInt target_comp = n / (P_1D * P_1D); const CeedInt target_node_x = n % P_1D; const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; @@ -191,9 +189,9 @@ inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const Cee //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -template +template inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = indices[node + elem * P_1D * P_1D]; @@ -205,9 +203,8 @@ inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt n 
//------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, - CeedScalar *__restrict__ r_u) { +template +inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; @@ -219,9 +216,9 @@ inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt el //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = indices[node + elem * P_1D * P_1D]; @@ -230,10 +227,10 @@ inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt } } -template +template inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, - const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, - CeedScalar *__restrict__ d_v) { + const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_v, + ScalarOut *__restrict__ d_v) { const CeedInt target_comp = n / (P_1D * P_1D); const CeedInt 
target_node_x = n % P_1D; const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; @@ -249,9 +246,9 @@ inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const C //------------------------------------------------------------------------------ // E-vector -> L-vector, full assembly //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { const CeedInt elem_size = P_1D * P_1D; const CeedInt in_comp = in / elem_size; const CeedInt in_node_x = in % P_1D; @@ -273,9 +270,9 @@ inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const //------------------------------------------------------------------------------ // E-vector -> L-vector, Qfunction assembly //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, - const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const CeedInt output_offset, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D; @@ -288,9 +285,8 @@ inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, con //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const 
CeedScalar *__restrict__ r_v, - CeedScalar *__restrict__ d_v) { +template +inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; @@ -306,8 +302,8 @@ inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt e //------------------------------------------------------------------------------ // Set E-vector value //------------------------------------------------------------------------------ -template -inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) { +template +inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const ScalarIn value, ScalarOut *__restrict__ r_v) { const CeedInt target_comp = n / (P_1D * P_1D * P_1D); const CeedInt target_node_x = n % P_1D; const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; @@ -321,9 +317,9 @@ inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const Cee //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -template +template inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { + const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; @@ -337,9 +333,8 @@ inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const 
CeedInt n //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, - CeedScalar *__restrict__ r_u) { +template +inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; @@ -353,10 +348,9 @@ inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt el //------------------------------------------------------------------------------ // E-vector -> Q-vector, offests provided //------------------------------------------------------------------------------ -template +template inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, - const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, - CeedScalar *__restrict__ r_u) { + const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ d_u, ScalarOut *__restrict__ r_u) { if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D]; @@ -368,9 +362,9 @@ inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // E-vector -> Q-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, - 
CeedScalar *__restrict__ r_u) { +template +inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const ScalarIn *__restrict__ d_u, + ScalarOut *__restrict__ r_u) { if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; @@ -382,9 +376,9 @@ inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedI //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; @@ -395,10 +389,10 @@ inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt } } -template +template inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n, - const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v, - CeedScalar *__restrict__ d_v) { + const CeedInt *__restrict__ indices, const ScalarIn *__restrict__ r_v, + ScalarOut *__restrict__ d_v) { const CeedInt target_comp = n / (P_1D * P_1D * P_1D); const CeedInt target_node_x = n % P_1D; const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D; @@ -415,9 +409,9 @@ inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const C //------------------------------------------------------------------------------ // E-vector -> 
L-vector, full assembly //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in, - const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { const CeedInt elem_size = P_1D * P_1D * P_1D; const CeedInt in_comp = in / elem_size; const CeedInt in_node_x = in % P_1D; @@ -442,9 +436,9 @@ inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const //------------------------------------------------------------------------------ // E-vector -> L-vector, Qfunction assembly //------------------------------------------------------------------------------ -template +template inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset, - const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + const CeedInt output_offset, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ d_v) { if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt z = 0; z < Q_1D; z++) { const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D; @@ -459,9 +453,8 @@ inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, con //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template -inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, - CeedScalar *__restrict__ d_v) { +template +inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const ScalarIn *__restrict__ r_v, ScalarOut *__restrict__ 
d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; @@ -475,9 +468,9 @@ inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt e //------------------------------------------------------------------------------ // 3D collocated derivatives computation //------------------------------------------------------------------------------ -template -inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G, + ScalarOut *__restrict__ r_V) { if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { __syncthreads(); @@ -505,9 +498,9 @@ inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, //------------------------------------------------------------------------------ // 3D collocated derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G, + ScalarOut *__restrict__ r_V) { if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { __syncthreads(); diff --git a/include/ceed/jit-source/cuda/cuda-jit.h b/include/ceed/jit-source/cuda/cuda-jit.h index baa15a1e85..22a0e903b2 100644 --- a/include/ceed/jit-source/cuda/cuda-jit.h +++ b/include/ceed/jit-source/cuda/cuda-jit.h @@ -13,4 +13,11 @@ #define 
CeedPragmaSIMD #define CEED_Q_VLA 1 +#ifndef DBL_EPSILON +#define DBL_EPSILON 2.22044604925031308084726333618164062e-16 +#endif +#ifndef FLT_EPSILON +#define FLT_EPSILON 1.19209289550781250000000000000000000e-7F +#endif + #include "cuda-types.h" diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h index e5b31970ff..04a7718b90 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h @@ -12,8 +12,8 @@ //------------------------------------------------------------------------------ // 1D tensor contraction //------------------------------------------------------------------------------ -template -inline __device__ void Contract1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void Contract1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { data.slice[data.t_id_x] = *U; __syncthreads(); *V = 0.0; @@ -28,8 +28,8 @@ inline __device__ void Contract1d(SharedData_Cuda &data, const CeedScalar *U, co //------------------------------------------------------------------------------ // 1D transpose tensor contraction //------------------------------------------------------------------------------ -template -inline __device__ void ContractTranspose1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTranspose1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { data.slice[data.t_id_x] = *U; __syncthreads(); if (data.t_id_x < P_1D) { @@ -43,9 +43,8 @@ inline __device__ void ContractTranspose1d(SharedData_Cuda &data, const CeedScal //------------------------------------------------------------------------------ // Interpolate to quadrature points 
//------------------------------------------------------------------------------ -template -inline __device__ void InterpNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { Contract1d(data, &r_U[comp], c_B, &r_V[comp]); } @@ -54,9 +53,9 @@ inline __device__ void InterpNonTensor(SharedData_Cuda &data, const CeedScalar * //------------------------------------------------------------------------------ // Interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { r_V[comp] = 0.0; ContractTranspose1d(data, &r_U[comp], c_B, &r_V[comp]); @@ -66,8 +65,8 @@ inline __device__ void InterpTransposeNonTensor(SharedData_Cuda &data, const Cee //------------------------------------------------------------------------------ // Derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G, ScalarOut *__restrict__ r_V) { for (CeedInt dim = 0; dim < DIM; dim++) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { Contract1d(data, 
&r_U[comp], &c_G[dim * P * Q], &r_V[comp + dim * NUM_COMP]); @@ -78,9 +77,9 @@ inline __device__ void GradNonTensor(SharedData_Cuda &data, const CeedScalar *__ //------------------------------------------------------------------------------ // Derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_G, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_V[comp] = 0.0; for (CeedInt dim = 0; dim < DIM; dim++) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -92,7 +91,7 @@ inline __device__ void GradTransposeNonTensor(SharedData_Cuda &data, const CeedS //------------------------------------------------------------------------------ // Quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightNonTensor(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight, CeedScalar *w) { +template +inline __device__ void WeightNonTensor(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight, ScalarOut *w) { *w = (data.t_id_x < Q) ? 
q_weight[data.t_id_x] : 0.0; } diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h index ec7102ea2c..dbe7ef3ae0 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-nontensor.h @@ -15,7 +15,8 @@ //------------------------------------------------------------------------------ // Interp kernels //------------------------------------------------------------------------------ -extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -41,8 +42,8 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, } } -extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -68,8 +69,8 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca } } -extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -98,7 +99,8 @@ extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const 
Ceed //------------------------------------------------------------------------------ // Grad kernels //------------------------------------------------------------------------------ -extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalarBase *c_G, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -124,8 +126,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_G, c } } -extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalarBase *c_G, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -151,8 +153,8 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala } } -extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalarBase *c_G, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -181,7 +183,7 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc //------------------------------------------------------------------------------ // Weight kernel //------------------------------------------------------------------------------ -extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_W) { +extern "C" __global__ void 
Weight(const CeedInt num_elem, const CeedScalarBase *__restrict__ q_weight, CeedScalarBase *__restrict__ d_W) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h index cb62c4f80b..74cbeb6809 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h @@ -12,8 +12,8 @@ //------------------------------------------------------------------------------ // Load matrices for basis actions //------------------------------------------------------------------------------ -template -inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { +template +inline __device__ void LoadMatrix(SharedData_Cuda &data, const ScalarIn *__restrict__ d_B, ScalarOut *B) { for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; } @@ -24,9 +24,9 @@ inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__res //------------------------------------------------------------------------------ // E-vector -> single element //------------------------------------------------------------------------------ -template +template inline __device__ void ReadElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const CeedInt strides_elem, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * strides_node + elem * strides_elem; @@ -40,9 +40,9 @@ inline __device__ void ReadElementStrided1d(SharedData_Cuda &data, const CeedInt //------------------------------------------------------------------------------ // 
Single element -> E-vector //------------------------------------------------------------------------------ -template +template inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * strides_node + elem * strides_elem; @@ -53,9 +53,9 @@ inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedIn } } -template +template inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) { if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * strides_node + elem * strides_elem; @@ -73,9 +73,9 @@ inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt //------------------------------------------------------------------------------ // E-vector -> single element //------------------------------------------------------------------------------ -template +template inline __device__ void ReadElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const CeedInt strides_elem, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * strides_node + elem * strides_elem; @@ -89,9 +89,9 @@ inline __device__ void ReadElementStrided2d(SharedData_Cuda &data, const CeedInt 
//------------------------------------------------------------------------------ // Single element -> E-vector //------------------------------------------------------------------------------ -template +template inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * strides_node + elem * strides_elem; @@ -102,9 +102,9 @@ inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedIn } } -template +template inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * strides_node + elem * strides_elem; @@ -122,9 +122,9 @@ inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt //------------------------------------------------------------------------------ // E-vector -> single element //------------------------------------------------------------------------------ -template +template inline __device__ void ReadElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const CeedInt strides_elem, const ScalarIn *__restrict__ d_u, ScalarOut *r_u) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + 
z * P_1D * P_1D; @@ -140,9 +140,9 @@ inline __device__ void ReadElementStrided3d(SharedData_Cuda &data, const CeedInt //------------------------------------------------------------------------------ // Single element -> E-vector //------------------------------------------------------------------------------ -template +template inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; @@ -155,9 +155,9 @@ inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedIn } } -template +template inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp, - const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) { + const CeedInt strides_elem, const ScalarIn *r_v, ScalarOut *d_v) { if (data.t_id_x < P_1D && data.t_id_y < P_1D) { for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; @@ -177,10 +177,10 @@ inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt //------------------------------------------------------------------------------ // E-vector -> single point //------------------------------------------------------------------------------ -template +template inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, - const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const ScalarIn *__restrict__ d_u, ScalarOut *r_u) { const CeedInt ind = (p % 
NUM_PTS) * strides_point + elem * strides_elem; if (p < points_in_elem) { @@ -197,10 +197,10 @@ inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, cons //------------------------------------------------------------------------------ // Single point -> E-vector //------------------------------------------------------------------------------ -template +template inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, - const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v, - CeedScalar *d_v) { + const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const ScalarIn *r_v, + ScalarOut *d_v) { if (p < points_in_elem) { const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h index 0da9163716..2018ef429e 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h @@ -40,9 +40,9 @@ inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar //------------------------------------------------------------------------------ // 1D interpolate to points //------------------------------------------------------------------------------ -template -inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { CeedScalar chebyshev_x[Q_1D]; for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; @@ -61,9 +61,9 @@ inline 
__device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, //------------------------------------------------------------------------------ // 1D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { CeedScalar chebyshev_x[Q_1D]; ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); @@ -86,9 +86,9 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const Ce //------------------------------------------------------------------------------ // 1D derivatives at points //------------------------------------------------------------------------------ -template -inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { CeedScalar chebyshev_x[Q_1D]; ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); @@ -108,9 +108,9 @@ inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, co //------------------------------------------------------------------------------ // 1D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void 
GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { CeedScalar chebyshev_x[Q_1D]; ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); @@ -137,9 +137,9 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 2D interpolate to points //------------------------------------------------------------------------------ -template -inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { CeedScalar buffer[Q_1D]; @@ -168,9 +168,9 @@ inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, //------------------------------------------------------------------------------ // 2D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { CeedScalar buffer[Q_1D]; CeedScalar chebyshev_x[Q_1D]; @@ -206,9 +206,9 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce //------------------------------------------------------------------------------ // 2D derivatives at 
points //------------------------------------------------------------------------------ -template -inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { CeedScalar buffer[Q_1D]; @@ -241,9 +241,9 @@ inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, co //------------------------------------------------------------------------------ // 2D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { CeedScalar buffer[Q_1D]; CeedScalar chebyshev_x[Q_1D]; @@ -287,9 +287,9 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D interpolate to points //------------------------------------------------------------------------------ -template -inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut 
*__restrict__ r_V) { for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; @@ -324,9 +324,9 @@ inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, //------------------------------------------------------------------------------ // 3D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; CeedScalar chebyshev_x[Q_1D]; @@ -368,9 +368,9 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce //------------------------------------------------------------------------------ // 3D derivatives at points //------------------------------------------------------------------------------ -template -inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_C, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_V) { for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; @@ -415,9 +415,9 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void 
GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, - CeedScalar *__restrict__ r_C) { +template +inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *r_X, + ScalarOut *__restrict__ r_C) { for (CeedInt k = 0; k < Q_1D; k++) { CeedScalar buffer[Q_1D]; CeedScalar chebyshev_x[Q_1D]; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h index dcb1763e38..2dae4f7eab 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h @@ -20,8 +20,9 @@ //------------------------------------------------------------------------------ // Interp //------------------------------------------------------------------------------ -extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, - const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalarBase *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, + const CeedScalarBase *__restrict__ d_X, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -75,9 +76,9 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal } } -extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const 
CeedScalarBase *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarBase *__restrict__ d_X, + const CeedScalarBase *__restrict__ d_U, CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -145,9 +146,9 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const } } -extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, const CeedScalarBase *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarBase *__restrict__ d_X, + const CeedScalarBase *__restrict__ d_U, CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -207,8 +208,9 @@ extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, co //------------------------------------------------------------------------------ // Grad //------------------------------------------------------------------------------ -extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, - const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalarBase *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, + const CeedScalarBase *__restrict__ d_X, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -262,9 +264,9 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar } } -extern "C" __global__ void GradTransposeAtPoints(const CeedInt 
num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalarBase *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarBase *__restrict__ d_X, + const CeedScalarBase *__restrict__ d_U, CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -333,9 +335,9 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C } } -extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, - const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, const CeedScalarBase *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalarBase *__restrict__ d_X, + const CeedScalarBase *__restrict__ d_U, CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h index 4f76825d50..3429fcc76f 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h @@ -16,9 +16,9 @@ //------------------------------------------------------------------------------ // 2D tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, - CeedScalar *V) { 
+template +inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, const ScalarIn2 *B, + ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -33,9 +33,9 @@ inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 2D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B, - CeedScalar *V) { +template +inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, const ScalarIn2 *B, + ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -50,9 +50,9 @@ inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 2D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -67,9 +67,9 @@ inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 2D transpose tensor contract x //------------------------------------------------------------------------------ -template 
-inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -84,9 +84,9 @@ inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 2D transpose tensor contract and add x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D] = *U; __syncthreads(); @@ -100,8 +100,8 @@ inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 2D pack/unpack quadrature values //------------------------------------------------------------------------------ -template -inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { +template +inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, ScalarOut *U) { const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -112,8 +112,8 @@ inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const in } } -template -inline __device__ void 
QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, CeedScalar *U) { +template +inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, ScalarOut *U) { const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -127,9 +127,9 @@ inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const //------------------------------------------------------------------------------ // 2D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -146,9 +146,9 @@ inline __device__ void InterpTensor2dFlattened(SharedData_Cuda &data, CeedScalar //------------------------------------------------------------------------------ // 2D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -164,9 +164,9 @@ inline __device__ void InterpTransposeTensor2dFlattened(SharedData_Cuda &data, C //------------------------------------------------------------------------------ // 2D derivatives at quadrature 
points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -185,9 +185,9 @@ inline __device__ void GradTensor2dFlattened(SharedData_Cuda &data, CeedScalar * //------------------------------------------------------------------------------ // 2D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const int t_id_x = data.t_id_x % T_1D, t_id_y = data.t_id_x / T_1D; CeedScalar r_t[1]; @@ -205,8 +205,8 @@ inline __device__ void GradTransposeTensor2dFlattened(SharedData_Cuda &data, Cee //------------------------------------------------------------------------------ // 2D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { const int t_id_x = data.t_id_x % Q_1D, t_id_y = data.t_id_x / Q_1D; *w = (t_id_x < Q_1D && t_id_y < Q_1D) ? 
q_weight_1d[t_id_x] * q_weight_1d[t_id_y] : 0.0; @@ -219,9 +219,9 @@ inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -236,9 +236,9 @@ inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 3D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -253,9 +253,9 @@ inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 3D tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int 
t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -270,9 +270,9 @@ inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_i //------------------------------------------------------------------------------ // 3D tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -287,9 +287,9 @@ inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 3D transpose tensor contract z //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, - const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -303,9 +303,9 @@ inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 3D 
transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -320,9 +320,9 @@ inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, - const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -336,9 +336,9 @@ inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 3D transpose tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U, - const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const ScalarIn1 *U, + const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); 
data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -353,9 +353,9 @@ inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, cons //------------------------------------------------------------------------------ // 3D transpose tensor contract add x //------------------------------------------------------------------------------ -template +template inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, - const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { + const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U; __syncthreads(); @@ -369,8 +369,8 @@ inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, c //------------------------------------------------------------------------------ // 3D pack/unpack quadrature values //------------------------------------------------------------------------------ -template -inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { +template +inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, ScalarOut *U) { const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D); for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -381,8 +381,8 @@ inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const in } } -template -inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, CeedScalar *U) { +template +inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, ScalarOut *U) { const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / 
(Q_1D * Q_1D); for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -396,9 +396,9 @@ inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const //------------------------------------------------------------------------------ // 3D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -416,9 +416,9 @@ inline __device__ void InterpTensor3dFlattened(SharedData_Cuda &data, CeedScalar //------------------------------------------------------------------------------ // 3D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -435,9 +435,9 @@ inline __device__ void InterpTransposeTensor3dFlattened(SharedData_Cuda &data, C //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void 
GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -461,9 +461,9 @@ inline __device__ void GradTensor3dFlattened(SharedData_Cuda &data, CeedScalar * //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -486,9 +486,9 @@ inline __device__ void GradTransposeTensor3dFlattened(SharedData_Cuda &data, Cee //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut 
*__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -509,9 +509,9 @@ inline __device__ void GradTensorCollocated3dFlattened(SharedData_Cuda &data, Ce //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda &data, ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + const ScalarIn3 *c_G, ScalarOut *__restrict__ r_V) { const CeedInt t_id_x = data.t_id_x % T_1D, t_id_y = (data.t_id_x / T_1D) % T_1D, t_id_z = data.t_id_x / (T_1D * T_1D); CeedScalar r_t1[1], r_t2[1]; @@ -531,8 +531,8 @@ inline __device__ void GradTransposeTensorCollocated3dFlattened(SharedData_Cuda //------------------------------------------------------------------------------ // 3D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor3dFlattened(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { const CeedInt t_id_x = data.t_id_x % Q_1D, t_id_y = (data.t_id_x / Q_1D) % Q_1D, t_id_z = data.t_id_x / (Q_1D * Q_1D); *w = (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < Q_1D) ? 
q_weight_1d[t_id_x] * q_weight_1d[t_id_y] * q_weight_1d[t_id_z] : 0.0; diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h index f4f701505a..b453d66596 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h @@ -16,8 +16,8 @@ //------------------------------------------------------------------------------ // 1D tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x] = *U; __syncthreads(); @@ -32,8 +32,8 @@ inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 1D transpose tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x] = *U; __syncthreads(); @@ -48,8 +48,8 @@ inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 1D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar 
*__restrict__ r_V) { +template +inline __device__ void Interp1d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX1d(data, &r_U[comp], c_B, &r_V[comp]); } @@ -58,9 +58,9 @@ inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restr //------------------------------------------------------------------------------ // 1D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeX1d(data, &r_U[comp], c_B, &r_V[comp]); } @@ -69,9 +69,9 @@ inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar //------------------------------------------------------------------------------ // 1D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void Grad1d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX1d(data, &r_U[comp], c_G, &r_V[comp]); } @@ -80,9 +80,9 @@ inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restric //------------------------------------------------------------------------------ // 1D derivatives transpose 
//------------------------------------------------------------------------------ -template -inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTranspose1d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeX1d(data, &r_U[comp], c_G, &r_V[comp]); } @@ -91,8 +91,8 @@ inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar * //------------------------------------------------------------------------------ // 1D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void Weight1d(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { *w = (data.t_id_x < Q_1D) ? 
q_weight_1d[data.t_id_x] : 0.0; } @@ -103,8 +103,8 @@ inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restr //------------------------------------------------------------------------------ // 2D tensor contraction x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -119,8 +119,8 @@ inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 2D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractY2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -135,8 +135,8 @@ inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 2D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -151,8 +151,8 @@ inline __device__ void 
ContractTransposeY2d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 2D transpose tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -167,8 +167,8 @@ inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 2D transpose tensor contract and add x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { __syncthreads(); data.slice[data.t_id_x + data.t_id_y * T_1D] = *U; __syncthreads(); @@ -182,9 +182,8 @@ inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 2D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX2d(data, 
&r_U[comp], c_B, r_t); @@ -195,9 +194,9 @@ inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *_ //------------------------------------------------------------------------------ // 2D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeY2d(data, &r_U[comp], c_B, r_t); @@ -208,9 +207,9 @@ inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 2D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractX2d(data, &r_U[comp], c_G, r_t); @@ -223,9 +222,9 @@ inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__r //------------------------------------------------------------------------------ // 2D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const 
CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t[1]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ContractTransposeY2d(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t); @@ -238,8 +237,8 @@ inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedSc //------------------------------------------------------------------------------ // 2D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor2d(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; } @@ -250,8 +249,8 @@ inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *_ //------------------------------------------------------------------------------ // 3D tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractX3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { r_B[i] = B[i + data.t_id_x * P_1D]; @@ -273,8 +272,8 @@ inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 3D tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void 
ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractY3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[P_1D]; for (CeedInt i = 0; i < P_1D; i++) { r_B[i] = B[i + data.t_id_y * P_1D]; @@ -296,8 +295,8 @@ inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 3D tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractZ3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { for (CeedInt k = 0; k < Q_1D; k++) { V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { @@ -311,8 +310,8 @@ inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, c //------------------------------------------------------------------------------ // 3D transpose tensor contract z //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { for (CeedInt k = 0; k < P_1D; k++) { V[k] = 0.0; if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) { @@ -326,8 +325,8 @@ inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const 
CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_y + i * P_1D]; @@ -349,8 +348,8 @@ inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 3D transpose tensor contract y //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_y + i * P_1D]; @@ -371,8 +370,8 @@ inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D transpose tensor contract x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_x + i * P_1D]; @@ -394,8 +393,8 @@ inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedSca //------------------------------------------------------------------------------ // 3D transpose tensor contract add x //------------------------------------------------------------------------------ -template -inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, 
const CeedScalar *U, const CeedScalar *B, CeedScalar *V) { +template +inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const ScalarIn1 *U, const ScalarIn2 *B, ScalarOut *V) { CeedScalar r_B[Q_1D]; for (CeedInt i = 0; i < Q_1D; i++) { r_B[i] = B[data.t_id_x + i * P_1D]; @@ -416,9 +415,8 @@ inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D interpolate to quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -431,9 +429,9 @@ inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *_ //------------------------------------------------------------------------------ // 3D interpolate transpose //------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -446,9 +444,9 @@ inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const Ceed //------------------------------------------------------------------------------ // 3D derivatives at quadrature points 
//------------------------------------------------------------------------------ -template -inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -467,9 +465,9 @@ inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__r //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -488,9 +486,9 @@ inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedSc //------------------------------------------------------------------------------ // 3D derivatives at quadrature points //------------------------------------------------------------------------------ -template -inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G, - CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, const ScalarIn3 *c_G, + ScalarOut 
*__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -506,9 +504,9 @@ inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedS //------------------------------------------------------------------------------ // 3D derivatives transpose //------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, - const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { +template +inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const ScalarIn1 *__restrict__ r_U, const ScalarIn2 *c_B, + const ScalarIn3 *c_G, ScalarOut *__restrict__ r_V) { CeedScalar r_t1[T_1D]; CeedScalar r_t2[T_1D]; for (CeedInt comp = 0; comp < NUM_COMP; comp++) { @@ -524,8 +522,8 @@ inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, co //------------------------------------------------------------------------------ // 3D quadrature weights //------------------------------------------------------------------------------ -template -inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) { +template +inline __device__ void WeightTensor3d(SharedData_Cuda &data, const ScalarIn *__restrict__ q_weight_1d, ScalarOut *w) { const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D); const CeedScalar pw = quad ? 
q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0; for (CeedInt q = 0; q < Q_1D; q++) { diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h index 1252c8197d..602d4b2f91 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor.h @@ -15,7 +15,8 @@ //------------------------------------------------------------------------------ // Interp kernel by dim //------------------------------------------------------------------------------ -extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -53,8 +54,8 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, } } -extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -92,8 +93,8 @@ extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedSca } } -extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -134,8 +135,8 @@ extern "C" 
__global__ void InterpTransposeAdd(const CeedInt num_elem, const Ceed //------------------------------------------------------------------------------ // Grad kernel by dim //------------------------------------------------------------------------------ -extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *c_G, const CeedScalarBase *__restrict__ d_U, + CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -177,8 +178,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, c } } -extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *c_G, + const CeedScalarBase *__restrict__ d_U, CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -220,8 +221,8 @@ extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScala } } -extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { +extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalarBase *c_B, const CeedScalarBase *c_G, + const CeedScalarBase *__restrict__ d_U, CeedScalarBase *__restrict__ d_V) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; @@ -266,7 +267,7 @@ extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedSc //------------------------------------------------------------------------------ // Weight kernels by dim 
//------------------------------------------------------------------------------ -extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) { +extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalarBase *__restrict__ q_weight_1d, CeedScalarBase *__restrict__ d_W) { extern __shared__ CeedScalar slice[]; SharedData_Cuda data; diff --git a/include/ceed/jit-source/cuda/cuda-types.h b/include/ceed/jit-source/cuda/cuda-types.h index 9acb0064a3..0149923bba 100644 --- a/include/ceed/jit-source/cuda/cuda-types.h +++ b/include/ceed/jit-source/cuda/cuda-types.h @@ -14,8 +14,8 @@ #define CEED_CUDA_NUMBER_FIELDS 16 typedef struct { - const CeedScalar *inputs[CEED_CUDA_NUMBER_FIELDS]; - CeedScalar *outputs[CEED_CUDA_NUMBER_FIELDS]; + const CeedScalarBase *inputs[CEED_CUDA_NUMBER_FIELDS]; + CeedScalarBase *outputs[CEED_CUDA_NUMBER_FIELDS]; } Fields_Cuda; typedef struct { @@ -24,10 +24,10 @@ typedef struct { } FieldsInt_Cuda; typedef struct { - CeedInt num_elem; - const CeedInt *num_per_elem; - const CeedInt *indices; - const CeedScalar *coords; + CeedInt num_elem; + const CeedInt *num_per_elem; + const CeedInt *indices; + const CeedScalarBase *coords; } Points_Cuda; typedef struct { diff --git a/include/ceed/jit-source/hip/hip-jit.h b/include/ceed/jit-source/hip/hip-jit.h index 70a00416e4..03040e6908 100644 --- a/include/ceed/jit-source/hip/hip-jit.h +++ b/include/ceed/jit-source/hip/hip-jit.h @@ -13,4 +13,11 @@ #define CeedPragmaSIMD #define CEED_Q_VLA 1 +#ifndef DBL_EPSILON +#define DBL_EPSILON 2.22044604925031308084726333618164062e-16 +#endif +#ifndef FLT_EPSILON +#define FLT_EPSILON 1.19209289550781250000000000000000000e-7F +#endif + #include "hip-types.h" diff --git a/include/ceed/jit-source/sycl/sycl-jit.h b/include/ceed/jit-source/sycl/sycl-jit.h index 1a2971f4df..b42aa5304f 100644 --- a/include/ceed/jit-source/sycl/sycl-jit.h +++ b/include/ceed/jit-source/sycl/sycl-jit.h @@ -13,5 
+13,12 @@ #define CeedPragmaSIMD #define CEED_Q_VLA 1 +#ifndef DBL_EPSILON +#define DBL_EPSILON 2.22044604925031308084726333618164062e-16 +#endif +#ifndef FLT_EPSILON +#define FLT_EPSILON 1.19209289550781250000000000000000000e-7F +#endif + // Need quotes for recursive header inclusion #include "sycl-types.h" diff --git a/include/ceed/types.h b/include/ceed/types.h index 2739390d8e..fb51f94a1d 100644 --- a/include/ceed/types.h +++ b/include/ceed/types.h @@ -118,10 +118,12 @@ typedef signed char CeedInt8; /// @ingroup Ceed typedef enum { /// Single precision - CEED_SCALAR_FP32, + CEED_SCALAR_FP32 = 0, /// Double precision - CEED_SCALAR_FP64 + CEED_SCALAR_FP64 = 1 } CeedScalarType; +#define CEED_SCALAR_FP32 0 +#define CEED_SCALAR_FP64 1 /// Base scalar type for the library to use: change which header is included to change the precision. #include "ceed-f64.h" // IWYU pragma: export diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c index 6dfbd55d61..e8369b44ca 100644 --- a/interface/ceed-basis.c +++ b/interface/ceed-basis.c @@ -1329,7 +1329,7 @@ int CeedSymmetricSchurDecomposition(Ceed ceed, CeedScalar *mat, CeedScalar *lamb // Reduce sub and super diagonal CeedInt p = 0, q = 0, itr = 0, max_itr = n * n * n * n; - CeedScalar tol = CEED_EPSILON; + CeedScalar tol = 10 * CEED_EPSILON; while (itr < max_itr) { // Update p, q, size of reduced portions of diagonal diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c index 039f8522db..bf18552441 100644 --- a/interface/ceed-operator.c +++ b/interface/ceed-operator.c @@ -634,6 +634,49 @@ int CeedOperatorIsSetupDone(CeedOperator op, bool *is_setup_done) { return CEED_ERROR_SUCCESS; } +/** + @brief Set the floating point precision for `CeedOperator` application + + @param[in] op `CeedOperator` + @param[in] scalar_type `CeedScalarType` to use for operator application + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedOperatorSetPrecision(CeedOperator op, CeedScalarType 
scalar_type) { + bool is_immutable, is_composite, supports_mixed_precision; + Ceed ceed; + + CeedCall(CeedOperatorGetCeed(op, &ceed)); + CeedCall(CeedOperatorIsImmutable(op, &is_immutable)); + CeedCheck(!is_immutable, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetPrecision must be called before operator is finalized"); + CeedCall(CeedOperatorIsComposite(op, &is_composite)); + CeedCheck(!is_composite, ceed, CEED_ERROR_INCOMPATIBLE, "CeedOperatorSetPrecision should be set on single operators"); + CeedCall(CeedGetSupportsMixedPrecision(ceed, &supports_mixed_precision)); + CeedCheck(scalar_type == CEED_SCALAR_TYPE || supports_mixed_precision, ceed, CEED_ERROR_UNSUPPORTED, + "Backend does not implement mixed precision operators"); + + op->precision = scalar_type; + CeedCall(CeedDestroy(&ceed)); + return CEED_ERROR_SUCCESS; +} + +/** + @brief Get the floating point precision used for `CeedOperator` application + + @param[in] op `CeedOperator` + @param[out] precision Variable to store operator precision + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedOperatorGetPrecision(CeedOperator op, CeedScalarType *precision) { + *precision = op->precision; + return CEED_ERROR_SUCCESS; +} + /** @brief Get the `CeedQFunction` associated with a `CeedOperator` @@ -768,6 +811,7 @@ int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunc (*op)->ref_count = 1; (*op)->input_size = -1; (*op)->output_size = -1; + (*op)->precision = CEED_SCALAR_TYPE; CeedCall(CeedQFunctionReferenceCopy(qf, &(*op)->qf)); if (dqf && dqf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqf, &(*op)->dqf)); if (dqfT && dqfT != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqfT, &(*op)->dqfT)); @@ -812,6 +856,7 @@ int CeedOperatorCreateAtPoints(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, C (*op)->is_at_points = true; (*op)->input_size = -1; (*op)->output_size = -1; + (*op)->precision = CEED_SCALAR_TYPE; 
CeedCall(CeedQFunctionReferenceCopy(qf, &(*op)->qf)); if (dqf && dqf != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqf, &(*op)->dqf)); if (dqfT && dqfT != CEED_QFUNCTION_NONE) CeedCall(CeedQFunctionReferenceCopy(dqfT, &(*op)->dqfT)); diff --git a/interface/ceed-types.c b/interface/ceed-types.c index e975793307..3186489b58 100644 --- a/interface/ceed-types.c +++ b/interface/ceed-types.c @@ -64,3 +64,8 @@ const char *const CeedFESpaces[] = { [CEED_FE_SPACE_HDIV] = "H(div) space", [CEED_FE_SPACE_HCURL] = "H(curl) space", }; + +const char *const CeedScalarTypes[] = { + [CEED_SCALAR_FP32] = "CEED_SCALAR_FP32", + [CEED_SCALAR_FP64] = "CEED_SCALAR_FP64", +}; diff --git a/interface/ceed.c b/interface/ceed.c index b4bf09883a..9e48b24e00 100644 --- a/interface/ceed.c +++ b/interface/ceed.c @@ -678,6 +678,21 @@ int CeedSetDeterministic(Ceed ceed, bool is_deterministic) { return CEED_ERROR_SUCCESS; } +/** + @brief Flag `Ceed` context as being able to create mixed precision operators + + @param[in] ceed `Ceed` to flag as supporting mixed precision + @param[in] supports_mixed_precision Mixed precision status to set + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedSetSupportsMixedPrecision(Ceed ceed, bool supports_mixed_precision) { + ceed->supports_mixed_precision = supports_mixed_precision; + return CEED_ERROR_SUCCESS; +} + /** @brief Set a backend function. 
@@ -1387,6 +1402,21 @@ int CeedIsDeterministic(Ceed ceed, bool *is_deterministic) { return CEED_ERROR_SUCCESS; } +/** + @brief Get mixed precision support status of `Ceed` context + + @param[in] ceed `Ceed` context + @param[out] supports_mixed_precision Variable to store mixed precision support status + + @return An error code: 0 - success, otherwise - failure + + @ref User +**/ +int CeedGetSupportsMixedPrecision(Ceed ceed, bool *supports_mixed_precision) { + *supports_mixed_precision = ceed->supports_mixed_precision; + return CEED_ERROR_SUCCESS; +} + /** @brief Set additional JiT source root for `Ceed` context diff --git a/tests/t360-basis.c b/tests/t360-basis.c index f953157e1c..5e8a3fbe2b 100644 --- a/tests/t360-basis.c +++ b/tests/t360-basis.c @@ -40,7 +40,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < p_dim; i++) area += v_array[i]; - if (fabs(area - 2.0 * CeedIntPow(2, dim)) > 5E-6) printf("Incorrect area computed %f != %f\n", area, 2.0 * CeedIntPow(2, dim)); + if (fabs(area - 2.0 * CeedIntPow(2, dim)) > 1E-5) printf("Incorrect area computed %f != %f\n", area, 2.0 * CeedIntPow(2, dim)); CeedVectorRestoreArrayRead(v, &v_array); } diff --git a/tests/t502-operator.c b/tests/t502-operator.c index b9e517396b..b1756b3035 100644 --- a/tests/t502-operator.c +++ b/tests/t502-operator.c @@ -1,12 +1,17 @@ /// @file /// Test creation, action, and destruction for mass matrix operator with multiple components /// \test Test creation, action, and destruction for mass matrix operator with multiple components -#include "t502-operator.h" + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 #include #include #include #include +#include + +#include "t502-operator.h" int main(int argc, char **argv) { Ceed ceed; @@ -18,8 +23,21 @@ int main(int argc, char **argv) { CeedInt num_elem = 15, p = 5, q = 8; CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; CeedInt ind_x[num_elem * 2], ind_u[num_elem * 
p]; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } CeedVectorCreate(ceed, num_nodes_x, &x); { @@ -69,11 +87,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); @@ -100,8 +120,8 @@ int main(int argc, char **argv) { sum_2 += v_array[2 * i + 1]; } CeedVectorRestoreArrayRead(v, &v_array); - if (fabs(sum_1 - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum_1); - if (fabs(sum_2 - 2.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 2.0\n", sum_2); + if (fabs(sum_1 - 1.) > 1000. * epsilon) printf("Computed Area: %f != True Area: 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. 
* epsilon) printf("Computed Area: %f != True Area: 2.0\n", sum_2); } CeedVectorDestroy(&x); diff --git a/tests/t503-operator.c b/tests/t503-operator.c index 7b6caae59b..c9977e708f 100644 --- a/tests/t503-operator.c +++ b/tests/t503-operator.c @@ -1,10 +1,15 @@ /// @file /// Test creation, action, and destruction for mass matrix operator with passive inputs and outputs /// \test Test creation, action, and destruction for mass matrix operator with passive inputs and outputs + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t500-operator.h" @@ -18,8 +23,21 @@ int main(int argc, char **argv) { CeedInt num_elem = 15, p = 5, q = 8; CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; CeedInt ind_x[num_elem * 2], ind_u[num_elem * p]; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Vectors CeedVectorCreate(ceed, num_nodes_x, &x); @@ -70,11 +88,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, x); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, u); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, v); + CeedOperatorSetPrecision(op_mass, precision); // Note - It is atypical 
to use only passive fields; this test is intended // as a test for all passive input modes rather than as an example. @@ -90,7 +110,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_nodes_u; i++) sum += v_array[i]; CeedVectorRestoreArrayRead(v, &v_array); - if (fabs(sum - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); + if (fabs(sum - 1.) > 1000. * epsilon) printf("Computed Area: %f != True Area: 1.0\n", sum); } CeedVectorDestroy(&x); diff --git a/tests/t505-operator.c b/tests/t505-operator.c index 1cda4550d4..154ec6561a 100644 --- a/tests/t505-operator.c +++ b/tests/t505-operator.c @@ -1,10 +1,15 @@ /// @file /// Test CeedOperatorApplyAdd /// \test Test CeedOperatorApplyAdd + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t500-operator.h" @@ -18,8 +23,21 @@ int main(int argc, char **argv) { CeedInt num_elem = 15, p = 5, q = 8; CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; CeedInt ind_x[num_elem * 2], ind_u[num_elem * p]; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } CeedVectorCreate(ceed, num_nodes_x, &x); { @@ -68,11 +86,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorCreate(ceed, qf_mass, 
CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); @@ -89,7 +109,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_nodes_u; i++) sum += v_array[i]; CeedVectorRestoreArrayRead(v, &v_array); - if (fabs(sum - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); + if (fabs(sum - 1.) > 1000. * epsilon) printf("Computed Area: %f != True Area: 1.0\n", sum); } // Apply with V = 1 @@ -104,7 +124,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_nodes_u; i++) sum += v_array[i]; CeedVectorRestoreArrayRead(v, &v_array); - if (fabs(sum - 1.) > 1000. * CEED_EPSILON) printf("Computed Area: %f != True Area: 1.0\n", sum); + if (fabs(sum - 1.) > 1000. 
* epsilon) printf("Computed Area: %f != True Area: 1.0\n", sum); } CeedVectorDestroy(&x); diff --git a/tests/t506-operator.c b/tests/t506-operator.c index a17b6cbb0a..8b2975717a 100644 --- a/tests/t506-operator.c +++ b/tests/t506-operator.c @@ -1,10 +1,15 @@ /// @file /// Test creation reuse of the same QFunction for multiple operators /// \test Test creation reuse of the same QFunction for multiple operators + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t502-operator.h" @@ -18,8 +23,21 @@ int main(int argc, char **argv) { CeedInt num_elem = 15, p = 5, q = 8, scale = 3, num_comp = 2; CeedInt num_nodes_x = num_elem + 1, num_nodes_u = num_elem * (p - 1) + 1; CeedInt ind_x[num_elem * 2], ind_u[num_elem * p]; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } CeedVectorCreate(ceed, num_nodes_x, &x); { @@ -75,22 +93,26 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_small, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_small, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_small, "x", elem_restriction_x, basis_x_small, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_small, "rho", elem_restriction_q_data_small, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup_small, precision); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_small); CeedOperatorSetField(op_mass_small, "rho", elem_restriction_q_data_small, CEED_BASIS_NONE, q_data_small); CeedOperatorSetField(op_mass_small, "u", elem_restriction_u, basis_u_small, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_small, "v", elem_restriction_u, 
basis_u_small, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass_small, precision); // 'Large' operators CeedOperatorCreate(ceed, qf_setup, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_setup_large); CeedOperatorSetField(op_setup_large, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_large, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_large, "x", elem_restriction_x, basis_x_large, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_large, "rho", elem_restriction_q_data_large, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup_large, precision); CeedOperatorCreate(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_large); CeedOperatorSetField(op_mass_large, "rho", elem_restriction_q_data_large, CEED_BASIS_NONE, q_data_large); CeedOperatorSetField(op_mass_large, "u", elem_restriction_u, basis_u_large, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_large, "v", elem_restriction_u, basis_u_large, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass_large, precision); // Setup CeedOperatorApply(op_setup_small, x, q_data_small, CEED_REQUEST_IMMEDIATE); @@ -121,8 +143,8 @@ int main(int argc, char **argv) { sum_2 += v_array[num_comp * i + 1]; } CeedVectorRestoreArrayRead(v, &v_array); - if (fabs(sum_1 - 1.) > 1000. * CEED_EPSILON) printf("Small Problem, Component 1: Computed Area %f != True Area 1.0\n", sum_1); - if (fabs(sum_2 - 2.) > 1000. * CEED_EPSILON) printf("Small Problem, Component 2: Computed Area %f != True Area 2.0\n", sum_2); + if (fabs(sum_1 - 1.) > 1000. * epsilon) printf("Small Problem, Component 1: Computed Area %f != True Area 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. * epsilon) printf("Small Problem, Component 2: Computed Area %f != True Area 2.0\n", sum_2); } // 'Large' operator @@ -140,8 +162,8 @@ int main(int argc, char **argv) { } CeedVectorRestoreArrayRead(v, &v_array); - if (fabs(sum_1 - 1.) > 1000. 
* CEED_EPSILON) printf("Large Problem, Component 1: Computed Area %f != True Area 1.0\n", sum_1); - if (fabs(sum_2 - 2.) > 1000. * CEED_EPSILON) printf("Large Problem, Component 2: Computed Area %f != True Area 2.0\n", sum_2); + if (fabs(sum_1 - 1.) > 1000. * epsilon) printf("Large Problem, Component 1: Computed Area %f != True Area 1.0\n", sum_1); + if (fabs(sum_2 - 2.) > 1000. * epsilon) printf("Large Problem, Component 2: Computed Area %f != True Area 2.0\n", sum_2); } CeedVectorDestroy(&x); diff --git a/tests/t510-operator.c b/tests/t510-operator.c index 059ad9c447..9d8a60eb10 100644 --- a/tests/t510-operator.c +++ b/tests/t510-operator.c @@ -1,12 +1,17 @@ /// @file /// Test creation, action, and destruction for mass matrix operator /// \test Test creation, action, and destruction for mass matrix operator + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include "t510-operator.h" #include #include #include #include +#include #include "t320-basis.h" @@ -24,8 +29,21 @@ int main(int argc, char **argv) { CeedInt ind_x[num_elem * p]; CeedScalar q_ref[dim * q], q_weight[q]; CeedScalar interp[p * q], grad[dim * p * q]; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } CeedVectorCreate(ceed, dim * num_dofs, &x); { @@ -90,11 +108,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorCreate(ceed, qf_mass, 
CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorApply(op_setup, x, q_data, CEED_REQUEST_IMMEDIATE); @@ -107,7 +127,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_dofs; i++) { - if (fabs(v_array[i]) > 1e-14) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); + if (fabs(v_array[i]) > 200 * epsilon) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); } CeedVectorRestoreArrayRead(v, &v_array); } diff --git a/tests/t520-operator.c b/tests/t520-operator.c index 31fceb9c97..2679c23e3d 100644 --- a/tests/t520-operator.c +++ b/tests/t520-operator.c @@ -1,10 +1,15 @@ /// @file /// Test creation, action, and destruction for composite mass matrix operator /// \test Test creation, action, and destruction for composite mass matrix operator + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t320-basis.h" #include "t510-operator.h" @@ -22,19 +27,32 @@ int main(int argc, char **argv) { Ceed ceed; CeedElemRestriction elem_restriction_x_tet, elem_restriction_u_tet, elem_restriction_q_data_tet, elem_restriction_x_hex, elem_restriction_u_hex, elem_restriction_q_data_hex; - CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; - CeedQFunction qf_setup_tet, qf_mass_tet, qf_setup_hex, qf_mass_hex; - CeedOperator op_setup_tet, op_mass_tet, op_setup_hex, op_mass_hex, op_setup, op_mass; - CeedVector q_data_tet, q_data_hex, x, u, v; - CeedInt num_elem_tet = 6, p_tet = 6, q_tet = 4, num_elem_hex = 6, p_hex = 3, q_hex = 4, dim = 2; - CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; - CeedInt 
row, col, offset; - CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * q_tet, num_qpts_hex = num_elem_hex * q_hex * q_hex; - CeedInt ind_x_tet[num_elem_tet * p_tet], ind_x_hex[num_elem_hex * p_hex * p_hex]; - CeedScalar q_ref[dim * q_tet], q_weight[q_tet]; - CeedScalar interp[p_tet * q_tet], grad[dim * p_tet * q_tet]; + CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; + CeedQFunction qf_setup_tet, qf_mass_tet, qf_setup_hex, qf_mass_hex; + CeedOperator op_setup_tet, op_mass_tet, op_setup_hex, op_mass_hex, op_setup, op_mass; + CeedVector q_data_tet, q_data_hex, x, u, v; + CeedInt num_elem_tet = 6, p_tet = 6, q_tet = 4, num_elem_hex = 6, p_hex = 3, q_hex = 4, dim = 2; + CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; + CeedInt row, col, offset; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * q_tet, num_qpts_hex = num_elem_hex * q_hex * q_hex; + CeedInt ind_x_tet[num_elem_tet * p_tet], ind_x_hex[num_elem_hex * p_hex * p_hex]; + CeedScalar q_ref[dim * q_tet], q_weight[q_tet]; + CeedScalar interp[p_tet * q_tet], grad[dim * p_tet * q_tet]; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Vectors CeedVectorCreate(ceed, dim * num_dofs, &x); @@ -106,12 +124,14 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_tet, "_weight", CEED_ELEMRESTRICTION_NONE, basis_x_tet, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_tet, "dx", elem_restriction_x_tet, basis_x_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); + CeedOperatorSetPrecision(op_setup_tet, precision); // ---- Mass Tet 
CeedOperatorCreate(ceed, qf_mass_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_tet); CeedOperatorSetField(op_mass_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); CeedOperatorSetField(op_mass_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetName(op_mass_tet, "mass tet"); + CeedOperatorSetPrecision(op_mass_tet, precision); // Set up Hex Elements // -- Restrictions @@ -150,12 +170,14 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_hex, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_hex, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_hex, "dx", elem_restriction_x_hex, basis_x_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); + CeedOperatorSetPrecision(op_setup_hex, precision); CeedOperatorCreate(ceed, qf_mass_hex, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass_hex); CeedOperatorSetField(op_mass_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); CeedOperatorSetField(op_mass_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetName(op_mass_hex, "mass hex"); + CeedOperatorSetPrecision(op_mass_hex, precision); // Set up Composite Operators // -- Create @@ -193,7 +215,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_dofs; i++) { - if (fabs(v_array[i]) > 1e-14) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); + if (fabs(v_array[i]) > 200 * epsilon) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); } CeedVectorRestoreArrayRead(v, &v_array); } diff --git a/tests/t522-operator.c b/tests/t522-operator.c index b2e1da90ac..3c57223a1a 100644 --- a/tests/t522-operator.c +++ 
b/tests/t522-operator.c @@ -1,12 +1,17 @@ /// @file /// Test creation, action, and destruction for diffusion matrix operator /// \test Test creation, action, and destruction for diffusion matrix operator + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include "t522-operator.h" #include #include #include #include +#include #include "t320-basis.h" @@ -23,19 +28,32 @@ int main(int argc, char **argv) { Ceed ceed; CeedElemRestriction elem_restriction_x_tet, elem_restriction_u_tet, elem_restriction_q_data_tet, elem_restriction_x_hex, elem_restriction_u_hex, elem_restriction_q_data_hex; - CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; - CeedQFunction qf_setup_tet, qf_diff_tet, qf_setup_hex, qf_diff_hex; - CeedOperator op_setup_tet, op_diff_tet, op_setup_hex, op_diff_hex, op_setup, op_diff; - CeedVector q_data_tet, q_data_hex, x, u, v; - CeedInt num_elem_tet = 6, p_tet = 6, q_tet = 4, num_elem_hex = 6, p_hex = 3, q_hex = 4, dim = 2; - CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; - CeedInt row, col, offset; - CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * q_tet, num_qpts_hex = num_elem_hex * q_hex * q_hex; - CeedInt ind_x_tet[num_elem_tet * p_tet], ind_x_hex[num_elem_hex * p_hex * p_hex]; - CeedScalar q_ref[dim * q_tet], q_weight[q_tet]; - CeedScalar interp[p_tet * q_tet], grad[dim * p_tet * q_tet]; + CeedBasis basis_x_tet, basis_u_tet, basis_x_hex, basis_u_hex; + CeedQFunction qf_setup_tet, qf_diff_tet, qf_setup_hex, qf_diff_hex; + CeedOperator op_setup_tet, op_diff_tet, op_setup_hex, op_diff_hex, op_setup, op_diff; + CeedVector q_data_tet, q_data_hex, x, u, v; + CeedInt num_elem_tet = 6, p_tet = 6, q_tet = 4, num_elem_hex = 6, p_hex = 3, q_hex = 4, dim = 2; + CeedInt n_x = 3, n_y = 3, n_x_tet = 3, n_y_tet = 1, n_x_hex = 3; + CeedInt row, col, offset; + CeedInt num_dofs = (n_x * 2 + 1) * (n_y * 2 + 1), num_qpts_tet = num_elem_tet * q_tet, num_qpts_hex = num_elem_hex * q_hex * q_hex; 
+ CeedInt ind_x_tet[num_elem_tet * p_tet], ind_x_hex[num_elem_hex * p_hex * p_hex]; + CeedScalar q_ref[dim * q_tet], q_weight[q_tet]; + CeedScalar interp[p_tet * q_tet], grad[dim * p_tet * q_tet]; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Vectors CeedVectorCreate(ceed, dim * num_dofs, &x); @@ -108,11 +126,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_tet, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_tet, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_tet, "dx", elem_restriction_x_tet, basis_x_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); + CeedOperatorSetPrecision(op_setup_tet, precision); // ---- Diff Tet CeedOperatorCreate(ceed, qf_diff_tet, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff_tet); CeedOperatorSetField(op_diff_tet, "rho", elem_restriction_q_data_tet, CEED_BASIS_NONE, q_data_tet); CeedOperatorSetField(op_diff_tet, "u", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_diff_tet, "v", elem_restriction_u_tet, basis_u_tet, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_diff_tet, precision); // Hex Elements // -- Restrictions @@ -152,11 +172,13 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup_hex, "weight", CEED_ELEMRESTRICTION_NONE, basis_x_hex, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup_hex, "dx", elem_restriction_x_hex, basis_x_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); + CeedOperatorSetPrecision(op_setup_hex, precision); CeedOperatorCreate(ceed, qf_diff_hex, 
CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_diff_hex); CeedOperatorSetField(op_diff_hex, "rho", elem_restriction_q_data_hex, CEED_BASIS_NONE, q_data_hex); CeedOperatorSetField(op_diff_hex, "u", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_diff_hex, "v", elem_restriction_u_hex, basis_u_hex, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_diff_hex, precision); // Composite Operators CeedCompositeOperatorCreate(ceed, &op_setup); @@ -180,7 +202,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_dofs; i++) { - if (fabs(v_array[i]) > 100. * CEED_EPSILON) printf("Computed: %f != True: 0.0\n", v_array[i]); + if (fabs(v_array[i]) > 100. * epsilon) printf("Computed: %f != True: 0.0\n", v_array[i]); } CeedVectorRestoreArrayRead(v, &v_array); } diff --git a/tests/t591-operator.c b/tests/t591-operator.c index f587ddd2cc..922bd95ecb 100644 --- a/tests/t591-operator.c +++ b/tests/t591-operator.c @@ -1,12 +1,17 @@ /// @file /// Test creation, action, and destruction for mass matrix operator at points /// \test Test creation, action, and destruction for mass matrix operator at points + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include "t591-operator.h" #include #include #include #include +#include int main(int argc, char **argv) { Ceed ceed; @@ -18,8 +23,21 @@ int main(int argc, char **argv) { CeedQFunction qf_setup, qf_mass; CeedOperator op_setup, op_mass; bool is_at_points; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Point reference coordinates CeedVectorCreate(ceed, dim * num_points, &x_points); @@ -139,6 +157,7 @@ int 
main(int argc, char **argv) { CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -153,6 +172,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); @@ -171,7 +191,7 @@ int main(int argc, char **argv) { for (CeedInt i = 0; i < num_nodes; i++) sum += v_array[i]; CeedVectorRestoreArrayRead(v, &v_array); // Summing 9 reference elements - if (fabs(sum - 1.0 * num_elem) > CEED_EPSILON * 5e3) printf("Incorrect area computed, %f != %f\n", sum, 1.0 * num_elem); + if (fabs(sum - 1.0 * num_elem) > epsilon * 5e3) printf("Incorrect area computed, %f != %f\n", sum, 1.0 * num_elem); } CeedVectorDestroy(&x_points); diff --git a/tests/t592-operator.c b/tests/t592-operator.c index 1650e0fa89..c45f92cd08 100644 --- a/tests/t592-operator.c +++ b/tests/t592-operator.c @@ -1,10 +1,15 @@ /// @file /// Test assembly of mass matrix operator QFunction at points /// \test Test assembly of mass matrix operator QFunction at points + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t591-operator.h" @@ -18,8 +23,21 @@ int main(int argc, char **argv) { CeedQFunction qf_setup, qf_mass; 
CeedOperator op_setup, op_mass; bool is_at_points; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Point reference coordinates CeedVectorCreate(ceed, dim * num_points, &x_points); @@ -139,6 +157,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -153,6 +172,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); @@ -195,7 +215,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_nodes; i++) area += v_array[i]; - if (fabs(area - 1.0 * num_elem) > CEED_EPSILON * 5e3) printf("Error: True operator computed area = %f != 1.0\n", area); + if (fabs(area - 1.0 * num_elem) > epsilon * 5e3) printf("Error: True operator computed area = %f != 1.0\n", area); CeedVectorRestoreArrayRead(v, &v_array); } @@ -219,7 +239,7 @@ int 
main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_nodes; i++) area += v_array[i]; CeedVectorRestoreArrayRead(v, &v_array); - if (fabs(area - 1.0 * num_elem) > CEED_EPSILON * 5e3) printf("Error: Linearized operator computed area = %f != 1.0\n", area); + if (fabs(area - 1.0 * num_elem) > epsilon * 5e3) printf("Error: Linearized operator computed area = %f != 1.0\n", area); } // Cleanup diff --git a/tests/t593-operator.c b/tests/t593-operator.c index 2e0710c7fc..1e45fe7c47 100644 --- a/tests/t593-operator.c +++ b/tests/t593-operator.c @@ -1,10 +1,15 @@ /// @file /// Test 1D mass matrix operator at points with heterogeneous points per element /// \test Test 1D mass matrix operator at points with heterogeneous points per element + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t500-operator.h" @@ -20,8 +25,21 @@ int main(int argc, char **argv) { CeedQFunction qf_setup, qf_mass; CeedOperator op_setup, op_mass; bool is_at_points; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Mesh coordinates for (CeedInt i = 0; i < num_nodes_x; i++) x_array_mesh[i] = (CeedScalar)i / (num_nodes_x - 1); @@ -93,6 +111,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorAtPointsSetPoints(op_setup, 
elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -107,6 +126,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); @@ -125,7 +145,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(v, CEED_MEM_HOST, &v_array); for (CeedInt i = 0; i < num_nodes_u; i++) { - if (fabs(v_array[i]) > 1e-14) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); + if (fabs(v_array[i]) > 200 * epsilon) printf("[%" CeedInt_FMT "] v %g != 0.0\n", i, v_array[i]); } CeedVectorRestoreArrayRead(v, &v_array); } diff --git a/tests/t594-operator.c b/tests/t594-operator.c index 49405e37a4..bc08cff2f1 100644 --- a/tests/t594-operator.c +++ b/tests/t594-operator.c @@ -1,10 +1,15 @@ /// @file /// Test diagonal assembly of mass matrix operator at points /// \test Test diagonal assembly of mass matrix operator at points + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t500-operator.h" @@ -20,8 +25,21 @@ int main(int argc, char **argv) { CeedQFunction qf_setup, qf_mass; CeedOperator op_setup, op_mass; bool is_at_points; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Mesh coordinates for (CeedInt i = 0; i < num_nodes_x; i++) 
x_array_mesh[i] = (CeedScalar)i / (num_nodes_x - 1); @@ -93,6 +111,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -107,6 +126,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); CeedOperatorIsAtPoints(op_mass, &is_at_points); @@ -147,7 +167,7 @@ int main(int argc, char **argv) { CeedVectorGetArrayRead(assembled, CEED_MEM_HOST, &assembled_array); for (CeedInt i = 0; i < num_nodes_u; i++) { - if (fabs(assembled_array[i] - assembled_true[i]) > 100. * CEED_EPSILON) { + if (fabs(assembled_array[i] - assembled_true[i]) > 100. 
* epsilon) { // LCOV_EXCL_START printf("[%" CeedInt_FMT "] Error in assembly: %f != %f\n", i, assembled_array[i], assembled_true[i]); // LCOV_EXCL_STOP diff --git a/tests/t596-operator.c b/tests/t596-operator.c index 81ca865ebd..6880abb4e1 100644 --- a/tests/t596-operator.c +++ b/tests/t596-operator.c @@ -1,17 +1,35 @@ /// @file /// Test full assembly of mass matrix operator /// \test Test full assembly of mass matrix operator AtPoints + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t596-operator.h" int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } for (CeedInt num_comp = 1; num_comp <= 3; num_comp++) { CeedElemRestriction elem_restriction_x, elem_restriction_x_points, elem_restriction_u, elem_restriction_q_data; @@ -111,12 +129,14 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass); CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, 
CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass, precision); CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points); // Apply Setup Operator @@ -168,7 +188,7 @@ int main(int argc, char **argv) { // Check output for (CeedInt i = 0; i < num_comp * num_dofs; i++) { for (CeedInt j = 0; j < num_comp * num_dofs; j++) { - if (fabs(assembled_values[i * num_dofs * num_comp + j] - assembled_true[i * num_dofs * num_comp + j]) > 100. * CEED_EPSILON) { + if (fabs(assembled_values[i * num_dofs * num_comp + j] - assembled_true[i * num_dofs * num_comp + j]) > 100. * epsilon) { // LCOV_EXCL_START printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled_values[i * num_dofs * num_comp + j], assembled_true[i * num_dofs * num_comp + j]); diff --git a/tests/t597-operator.c b/tests/t597-operator.c index 25d6b3cf3f..ede3f8de50 100644 --- a/tests/t597-operator.c +++ b/tests/t597-operator.c @@ -1,17 +1,35 @@ /// @file /// Test full assembly of Poisson operator AtPoints /// \test Test full assembly of Poisson operator AtPoints + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include #include #include #include +#include #include "t597-operator.h" int main(int argc, char **argv) { - Ceed ceed; + Ceed ceed; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } for (CeedInt num_comp = 1; num_comp <= 3; num_comp++) { CeedElemRestriction elem_restriction_x, elem_restriction_x_points, elem_restriction_u, elem_restriction_q_data; @@ -99,6 +117,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "dx", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, 
"weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "q data", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); // Apply Setup Operator @@ -123,6 +142,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_diff, "du", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_diff, "q data", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_diff, "dv", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_diff, precision); CeedOperatorAtPointsSetPoints(op_diff, elem_restriction_x_points, x_points); // Fully assemble operator @@ -169,7 +189,7 @@ int main(int argc, char **argv) { // Check output for (CeedInt i = 0; i < num_comp * num_dofs; i++) { for (CeedInt j = 0; j < num_comp * num_dofs; j++) { - if (fabs(assembled_values[i * num_comp * num_dofs + j] - assembled_true[i * num_comp * num_dofs + j]) > 100. * CEED_EPSILON) { + if (fabs(assembled_values[i * num_comp * num_dofs + j] - assembled_true[i * num_comp * num_dofs + j]) > 100. 
* epsilon) { // LCOV_EXCL_START printf("[%" CeedInt_FMT ", %" CeedInt_FMT "] Error in assembly: %f != %f\n", i, j, assembled_values[i * num_comp * num_dofs + j], assembled_true[i * num_comp * num_dofs + j]); diff --git a/tests/t598-operator.c b/tests/t598-operator.c index 55c7560fbb..e917530cfb 100644 --- a/tests/t598-operator.c +++ b/tests/t598-operator.c @@ -1,12 +1,17 @@ /// @file /// Test creation, action, and destruction for mass matrix operator AtPoints /// \test Test creation, action, and destruction for mass matrix operator AtPoints + +//TESTARGS {ceed_resource} fp32 +//TESTARGS {ceed_resource} fp64 + #include "t591-operator.h" #include #include #include #include +#include int main(int argc, char **argv) { Ceed ceed; @@ -19,8 +24,21 @@ int main(int argc, char **argv) { CeedBasis basis_x, basis_u_coarse, basis_u_fine; CeedQFunction qf_setup, qf_mass; CeedOperator op_setup, op_mass_coarse, op_mass_fine, op_prolong, op_restrict; + CeedScalarType precision = CEED_SCALAR_TYPE; + CeedScalar epsilon = CEED_EPSILON; CeedInit(argv[1], &ceed); + if (argc == 3) { + if (!strcmp(argv[2], "fp32")) { + precision = CEED_SCALAR_FP32; + epsilon = FLT_EPSILON; + } else if (!strcmp(argv[2], "fp64")) { + precision = CEED_SCALAR_FP64; + } else { + printf("Unknown scalar type: %s\n", argv[2]); + exit(1); + } + } // Point reference coordinates CeedVectorCreate(ceed, dim * num_points, &x_points); @@ -173,6 +191,7 @@ int main(int argc, char **argv) { CeedOperatorSetField(op_setup, "x", elem_restriction_x, basis_x, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_setup, "weight", CEED_ELEMRESTRICTION_NONE, basis_x, CEED_VECTOR_NONE); CeedOperatorSetField(op_setup, "rho", elem_restriction_q_data, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_setup, precision); CeedOperatorAtPointsSetPoints(op_setup, elem_restriction_x_points, x_points); CeedOperatorApply(op_setup, x_elem, q_data, CEED_REQUEST_IMMEDIATE); @@ -187,6 +206,7 @@ int main(int argc, char **argv) { 
CeedOperatorSetField(op_mass_fine, "u", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE); CeedOperatorSetField(op_mass_fine, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data); CeedOperatorSetField(op_mass_fine, "v", elem_restriction_u_fine, basis_u_fine, CEED_VECTOR_ACTIVE); + CeedOperatorSetPrecision(op_mass_fine, precision); CeedOperatorAtPointsSetPoints(op_mass_fine, elem_restriction_x_points, x_points); CeedVectorCreate(ceed, num_nodes_fine, &u_fine); @@ -213,7 +233,7 @@ int main(int argc, char **argv) { sum += v_array[i]; } CeedVectorRestoreArrayRead(v_coarse, &v_array); - if (fabs(sum - num_elem) > 1000. * CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); + if (fabs(sum - num_elem) > 1000. * epsilon) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); } // Prolong coarse u @@ -233,7 +253,7 @@ int main(int argc, char **argv) { } CeedVectorRestoreArrayRead(v_fine, &v_array); - if (fabs(sum - num_elem) > 1000. * CEED_EPSILON) printf("Computed Area Fine Grid: %f != True Area: 2.0\n", sum); + if (fabs(sum - num_elem) > 1000. * epsilon) printf("Computed Area Fine Grid: %f != True Area: 2.0\n", sum); } // Restrict state to coarse grid CeedOperatorApply(op_restrict, v_fine, v_coarse, CEED_REQUEST_IMMEDIATE); @@ -248,7 +268,7 @@ int main(int argc, char **argv) { sum += v_array[i]; } CeedVectorRestoreArrayRead(v_coarse, &v_array); - if (fabs(sum - num_elem) > 1000. * CEED_EPSILON) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); + if (fabs(sum - num_elem) > 1000. * epsilon) printf("Computed Area Coarse Grid: %f != True Area: 2.0\n", sum); } CeedVectorDestroy(&x_points);