diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e93a258de..cb1a3ef606 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -239,7 +239,7 @@ option(QUDA_CTEST_SEP_DSLASH_POLICIES "Test Dslash policies separately in ctest option(QUDA_CTEST_DISABLE_BENCHMARKS "Disable benchmark test" ON) option(QUDA_FAST_COMPILE_REDUCE "enable fast compilation in blas and reduction kernels (single warp per reduction)" OFF) -option(QUDA_FAST_COMPILE_DSLASH "enable fast compilation in dslash kernels (~20% perf impact)" OFF) +option(QUDA_FAST_COMPILE_DSLASH "enable fast compilation in coarse grid dslash kernels (significant perf impact)" OFF) option(QUDA_OPENMP "enable OpenMP" OFF) set(QUDA_CXX_STANDARD diff --git a/ci/docker/Dockerfile.build b/ci/docker/Dockerfile.build index ed21db930c..f8d6aa7ab4 100644 --- a/ci/docker/Dockerfile.build +++ b/ci/docker/Dockerfile.build @@ -8,7 +8,9 @@ RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \ build-essential \ cmake \ wget \ - ninja-build && \ + ninja-build \ + git \ + ca-certificates && \ rm -rf /var/lib/apt/lists/* ARG MPICH_VERSION=3.3.2 diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 2c46c23ea9..1d63a37900 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -241,9 +241,9 @@ namespace quda constexpr int M = nSpinBlock * nColor * nVec; #pragma unroll for (int i = 0; i < M; i++) { - vec_t tmp - = vector_load(reinterpret_cast(in + parity * offset_cb), x_cb * N + chi * M + i); - memcpy(&out[i], &tmp, sizeof(vec_t)); + auto tmp + = vector_load(reinterpret_cast(in + parity * offset_cb), x_cb * N + chi * M + i); + memcpy(&out[i], &tmp, sizeof(tmp)); } } }; @@ -1010,11 +1010,14 @@ namespace quda { for (int dim = 0; dim < 4; dim++) { for (int dir = 0; dir < 2; dir++) { - ghost[2 * dim + dir] = comm_dim_partitioned(dim) ? static_cast(ghost_[2 * dim + dir]) : nullptr; - ghost_norm[2 * dim + dir] = !comm_dim_partitioned(dim) ? 
- nullptr : - reinterpret_cast(static_cast(ghost_[2 * dim + dir]) - + nParity * length_ghost * faceVolumeCB[dim] * sizeof(Float)); + if (comm_dim_partitioned(dim) && ghost_[2 * dim + dir]) { + ghost[2 * dim + dir] = static_cast(ghost_[2 * dim + dir]); + ghost_norm[2 * dim + dir] = reinterpret_cast( + static_cast(ghost_[2 * dim + dir]) + nParity * length_ghost * faceVolumeCB[dim] * sizeof(Float)); + } else { + ghost[2 * dim + dir] = nullptr; + ghost_norm[2 * dim + dir] = nullptr; + } } } } @@ -1023,7 +1026,7 @@ namespace quda { real v[length_ghost]; norm_type nrm - = isFixed::value ? vector_load(ghost_norm[2 * dim + dir], parity * faceVolumeCB[dim] + x) : 0.0; + = isFixed::value ? vector_load(ghost_norm[2 * dim + dir], parity * faceVolumeCB[dim] + x)[0] : 0.0; #pragma unroll for (int i = 0; i < M; i++) { @@ -1123,16 +1126,9 @@ namespace quda using real = typename mapper::type; using complex = complex; using AllocInt = typename AllocType::type; - using norm_type = float; + using norm_t = float; Float *field = nullptr; - //#define LEGACY_ACCESSOR_NORM // legacy code where norm pointer and offset are stored instead of computed -#ifdef LEGACY_ACCESSOR_NORM - norm_type *norm = nullptr; -#endif AllocInt offset = 0; // offset can be 32-bit or 64-bit -#ifdef LEGACY_ACCESSOR_NORM - AllocInt norm_offset = 0; -#endif int volumeCB = 0; FloatNOrder() = default; @@ -1141,14 +1137,7 @@ namespace quda FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) : GhostNOrder(a, nFace, ghost_), field(buffer ? buffer : a.data()), -#ifdef LEGACY_ACCESSOR_NORM - norm(buffer ? 
reinterpret_cast(reinterpret_cast(buffer) + a.NormOffset()) : - const_cast(reinterpret_cast(a.Norm()))), -#endif offset(a.Bytes() / (2 * sizeof(Float))), -#ifdef LEGACY_ACCESSOR_NORM - norm_offset(a.Bytes() / (2 * sizeof(norm_type))), -#endif volumeCB(a.VolumeCB()) { } @@ -1157,23 +1146,19 @@ namespace quda __device__ __host__ inline void load(complex out[length / 2], int x, int parity = 0) const { real v[length]; -#ifndef LEGACY_ACCESSOR_NORM - auto norm_offset = offset / (sizeof(Float) < sizeof(float) ? sizeof(norm_type) / sizeof(Float) : 1); - auto norm = reinterpret_cast(field + volumeCB * (2 * Nc * Ns)); -#endif - norm_type nrm = isFixed::value ? vector_load(norm, x + parity * norm_offset) : 0.0; - + auto norm_offset = (volumeCB * 2 * Nc * Ns + parity * offset) * sizeof(Float) / sizeof(norm_t); + norm_t nrm = isFixed::value ? vector_load(field, x + norm_offset)[0] : 0.0; #pragma unroll for (int i = 0; i < M; i++) { // first load from memory - auto vecTmp = vector_load(field + parity * offset, volumeCB * i + x); + auto vecTmp = vector_load(field, parity * offset, volumeCB * i + x); // now copy into output and scale copy_and_scale(v + i * N, vecTmp, nrm); } // now load any remainder if constexpr (Nrem > 0) { - auto vecTmp = vector_load(field + parity * offset + volumeCB * M * N, x); + auto vecTmp = vector_load(field, parity * offset + volumeCB * M * N, x); copy_and_scale(v + M * N, vecTmp, nrm); } @@ -1181,30 +1166,39 @@ namespace quda for (int i = 0; i < length / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]); } + __device__ __host__ inline void prefetch(int x, int parity = 0) const + { + auto norm_offset = (volumeCB * 2 * Nc * Ns + parity * offset) * sizeof(Float) / sizeof(norm_t); + if constexpr (isFixed::value) prefetch_cache_line(reinterpret_cast(field) + (x + norm_offset)); + +#pragma unroll + for (int i = 0; i < M; i++) prefetch_cache_line(field + (parity * offset + (volumeCB * i + x) * N)); + + // now load any remainder + if constexpr (Nrem > 0) 
prefetch_cache_line(field + (parity * offset + volumeCB * M * N + x * Nrem)); + } + __device__ __host__ inline void save(const complex in[length / 2], int x, int parity = 0) const { real v[length]; -#ifndef LEGACY_ACCESSOR_NORM - auto norm_offset = offset / (sizeof(Float) < sizeof(float) ? sizeof(norm_type) / sizeof(Float) : 1); - auto norm = reinterpret_cast(field + volumeCB * (2 * Nc * Ns)); -#endif + auto norm_offset = (volumeCB * 2 * Nc * Ns + parity * offset) * sizeof(Float) / sizeof(norm_t); + #pragma unroll for (int i = 0; i < length / 2; i++) { v[2 * i + 0] = in[i].real(); v[2 * i + 1] = in[i].imag(); } - norm_type scale = 0.0; - norm_type scale_inv = 0.0; + norm_t scale = 0.0; + norm_t scale_inv = 0.0; if constexpr (isFixed::value) { - norm_type max_[length / 2]; + norm_t max_[length / 2]; // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued) #pragma unroll - for (int i = 0; i < length / 2; i++) - max_[i] = fmaxf(fabsf((norm_type)v[i]), fabsf((norm_type)v[i + length / 2])); + for (int i = 0; i < length / 2; i++) max_[i] = fmaxf(fabsf((norm_t)v[i]), fabsf((norm_t)v[i + length / 2])); #pragma unroll for (int i = 0; i < length / 2; i++) scale = fmaxf(max_[i], scale); - norm[x + parity * norm_offset] = scale * fixedInvMaxValue::value; + reinterpret_cast(field)[x + norm_offset] = scale * fixedInvMaxValue::value; scale_inv = fdividef(fixedMaxValue::value, scale); } @@ -1214,14 +1208,14 @@ namespace quda // first do scalar copy converting into storage type copy_and_scale(vecTmp, v + i * N, scale_inv); // second do vectorized copy into memory - vector_store(field + parity * offset, volumeCB * i + x, vecTmp); + vector_store(field, parity * offset, volumeCB * i + x, vecTmp); } if constexpr (Nrem > 0) { array vecTmp; copy_and_scale(vecTmp, v + M * N, scale_inv); // second do vectorized copy into memory - vector_store(field + parity * offset + volumeCB * M * N, x, vecTmp); + vector_store(field, parity * offset + volumeCB * M * N, x, 
vecTmp); } } diff --git a/include/complex_quda.h b/include/complex_quda.h index 51a4fed2ca..c9ab6557d4 100644 --- a/include/complex_quda.h +++ b/include/complex_quda.h @@ -928,14 +928,14 @@ namespace quda template __host__ __device__ inline complex cmul(const complex &x, const complex &y) { complex rtn = mul2({x.real(), x.real()}, y); - return fma2({x.imag(), x.imag()}, {-y.imag(), y.real()}, rtn); + return fma2({-x.imag(), x.imag()}, {y.imag(), y.real()}, rtn); } template __host__ __device__ inline complex cmac(const complex &x, const complex &y, const complex &z) { complex w = fma2({x.real(), x.real()}, y, z); - return fma2({x.imag(), x.imag()}, {-y.imag(), y.real()}, w); + return fma2({-x.imag(), x.imag()}, {y.imag(), y.real()}, w); } template diff --git a/include/domain_decomposition.h b/include/domain_decomposition.h index 24e653ac37..8ada3ae905 100644 --- a/include/domain_decomposition.h +++ b/include/domain_decomposition.h @@ -39,8 +39,7 @@ namespace quda flags[(int)flag] = true; if ((int)flag == (int)DD::reset) { -#pragma unroll - for (auto i = 0u; i < (int)DD::size; i++) flags[i] = 0; + flags = {}; type = QUDA_DD_NO; } else if ((int)flag >= (int)DD::red_black_type) { type = QUDA_DD_RED_BLACK; diff --git a/include/dslash.h b/include/dslash.h index 8feb23d893..372790f420 100644 --- a/include/dslash.h +++ b/include/dslash.h @@ -8,6 +8,7 @@ #include #include #include +#include namespace quda { @@ -70,6 +71,18 @@ namespace quda char tile_str[16]; i32toa(tile_str, Arg::n_src_tile); strcat(aux_base, tile_str); + if constexpr (dslash_double_store()) strcat(aux_base, ",double_store"); + if constexpr (Arg::prefetch_distance > 0) { + strcat(aux_base, ",prefetch="); + i32toa(tile_str, Arg::prefetch_distance); + strcat(aux_base, tile_str); + if constexpr (dslash_prefetch_type() == PrefetchType::THREAD) + strcat(aux_base, ",prefetch=thread"); + else if constexpr (dslash_prefetch_type() == PrefetchType::BULK) + strcat(aux_base, ",prefetch=bulk"); + else if constexpr 
(dslash_prefetch_type() == PrefetchType::TENSOR) + strcat(aux_base, ",prefetch=tensor"); + } } /** @@ -130,7 +143,7 @@ namespace quda } } - inline void setParam(TuneParam &tp) + template inline void setParam(TuneParam &tp, const GaugeField &U, const GaugeField &L = {}) { // Need to reset ghost pointers prior to every call since the // ghost buffer may have been changed during policy tuning. @@ -173,6 +186,16 @@ namespace quda 0; tp.grid.x += arg.exterior_blocks; } + + if constexpr (dslash_prefetch_type() == PrefetchType::TENSOR && Arg::prefetch_distance > 0) { + Dslash::arg.U.tensor_desc = get_tensor_descriptor(U, tp.block.x); + Dslash::arg.Uback.tensor_desc = get_tensor_descriptor(U.shift(), tp.block.x); + if constexpr (improved) { + assert(!U.empty()); + Dslash::arg.L.tensor_desc = get_tensor_descriptor(L, tp.block.x); + Dslash::arg.Lback.tensor_desc = get_tensor_descriptor(L.shift(), tp.block.x); + } + } } virtual int blockStep() const override { return (arg.shmem & 64) ? 8 : 16; } @@ -219,6 +242,15 @@ namespace quda } } + virtual bool advanceBlockDim(TuneParam ¶m) const override + { + // if TMA is enabled we must keep parity separate in the block (2-d tuning) + if constexpr (dslash_prefetch_tma()) + return TunableKernel2D_base::advanceBlockDim(param); + else + return TunableKernel3D::advanceBlockDim(param); + } + virtual bool advanceTuneParam(TuneParam ¶m) const override { return advanceAux(param) || advanceSharedBytes(param) || advanceBlockDim(param) || advanceSharedCarveOut(param) @@ -268,6 +300,7 @@ namespace quda inline void launch(TuneParam &tp, const qudaStream_t &stream) { tp.set_max_shared_bytes = true; + if (dslash_prefetch_tma() && tp.block.z > 1) errorQuda("Z-dimension block size must be 1 when using TMA"); launch_device( tp, stream, dslash_functor_arg(arg, tp.block.x * tp.grid.x)); } diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index da002550ff..02d2fe2f6c 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh 
@@ -13,6 +13,7 @@ #include #include #include +#include constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::TRUE; @@ -20,13 +21,48 @@ constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::TRUE; namespace quda { + +#ifdef QUDA_DSLASH_DOUBLE_STORE + constexpr bool dslash_double_store() { return true; } +#else + constexpr bool dslash_double_store() { return false; } +#endif + + constexpr PrefetchType dslash_prefetch_type() + { +#if defined(QUDA_DSLASH_PREFETCH_TYPE_NONE) + return PrefetchType::NONE; +#elif defined(QUDA_DSLASH_PREFETCH_TYPE_THREAD) + return PrefetchType::THREAD; +#elif defined(QUDA_DSLASH_PREFETCH_TYPE_BULK) + return PrefetchType::BULK; +#elif defined(QUDA_DSLASH_PREFETCH_TYPE_TENSOR) + return PrefetchType::TENSOR; +#else +#error "Invalid or missing QUDA_DSLASH_PREFETCH_TYPE" +#endif + return PrefetchType::NONE; + } + +#if defined(NVSHMEM_COMMS) && (defined(QUDA_DSLASH_PREFETCH_TYPE_BULK) || defined(QUDA_DSLASH_PREFETCH_TYPE_TENSOR)) +#error NVSHMEM cannot be used in combination with TMA prefetching at present +#endif + + constexpr bool dslash_prefetch_tma() + { + return (dslash_prefetch_type() == PrefetchType::BULK || dslash_prefetch_type() == PrefetchType::TENSOR); + } + + static_assert(!dslash_prefetch_tma() || dslash_double_store(), + "Cannot use TMA prefetching unless QUDA_DSLASH_DOUBLE_STORE is enabled"); + /** @brief Helper function to determine if we should do halo computation @param[in] dim Dimension we are working on. If dim=-1 (default argument) then we return true if type is any halo kernel. 
*/ - template __host__ __device__ __forceinline__ bool doHalo(int dim = -1) + template __host__ __device__ __forceinline__ constexpr bool doHalo(int dim = -1) { switch (type) { case EXTERIOR_KERNEL_ALL: return true; @@ -44,7 +80,7 @@ namespace quda computation @param[in] dim Dimension we are working on */ - template __host__ __device__ __forceinline__ bool doBulk() + template __host__ __device__ __forceinline__ constexpr bool doBulk() { switch (type) { case EXTERIOR_KERNEL_ALL: @@ -109,6 +145,7 @@ namespace quda if (kernel_type == INTERIOR_KERNEL) { coord.x_cb = idx; + coord.x_cb_0 = (target::block_idx().x - arg.pack_blocks) * target::block_dim().x; if (nDim == 5) coord.X = getCoords5CB(coord, idx, arg.dc.X, arg.X0h, parity, pc_type); else @@ -158,13 +195,68 @@ namespace quda #pragma unroll for (int d = 0; d < nDim; d++) { - coord.in_boundary[1][d] = coord[d] + arg.nFace >= arg.dc.X[d]; - coord.in_boundary[0][d] = coord[d] - arg.nFace < 0; + coord.in_boundary[1][d] = -(coord[d] + arg.nFace >= arg.dc.X[d]); + coord.in_boundary[0][d] = -(coord[d] - arg.nFace < 0); } return coord; } + /** + @brief Compute the checkerboard 1-d index for the nearest + neighbor + @param[in] lattice coordinates + @param[in] mu dimension in which to add 1 + @param[in] dir direction (+1 or -1) + @param[in] arg parameter struct + @return 1-d checkboard index + */ + template + __device__ __host__ inline int getNeighborIndexCB(const Coord &x, int mu, int dir, const Arg &arg) + { + switch (nFace) { + case 1: + switch (dir) { + case +1: // positive direction + switch (mu) { + case 0: return (x.X + 1 - (x.in_boundary[1][0] & arg.X[0])) >> 1; + case 1: return (x.X + arg.X[0] - (x.in_boundary[1][1] & arg.X2X1)) >> 1; + case 2: return (x.X + arg.X2X1 - (x.in_boundary[1][2] & arg.X3X2X1)) >> 1; + case 3: return (x.X + arg.X3X2X1 - (x.in_boundary[1][3] & arg.X4X3X2X1)) >> 1; + case 4: return (x.X + arg.X4X3X2X1 - (x.in_boundary[1][4] & arg.X5X4X3X2X1)) >> 1; + } + case -1: + switch (mu) { + case 0: 
return (x.X - 1 + (x.in_boundary[0][0] & arg.X[0])) >> 1; + case 1: return (x.X - arg.X[0] + (x.in_boundary[0][1] & arg.X2X1)) >> 1; + case 2: return (x.X - arg.X2X1 + (x.in_boundary[0][2] & arg.X3X2X1)) >> 1; + case 3: return (x.X - arg.X3X2X1 + (x.in_boundary[0][3] & arg.X4X3X2X1)) >> 1; + case 4: return (x.X - arg.X4X3X2X1 + (x.in_boundary[0][4] & arg.X5X4X3X2X1)) >> 1; + } + } + case 3: + switch (dir) { + case +1: // positive direction + switch (mu) { + case 0: return (x.X + 3 - (x.in_boundary[1][0] & arg.X[0])) >> 1; + case 1: return (x.X + 3 * arg.X[0] - (x.in_boundary[1][1] & arg.X2X1)) >> 1; + case 2: return (x.X + 3 * arg.X2X1 - (x.in_boundary[1][2] & arg.X3X2X1)) >> 1; + case 3: return (x.X + 3 * arg.X3X2X1 - (x.in_boundary[1][3] & arg.X4X3X2X1)) >> 1; + case 4: return (x.X + 3 * arg.X4X3X2X1 - (x.in_boundary[1][4] & arg.X5X4X3X2X1)) >> 1; + } + case -1: + switch (mu) { + case 0: return (x.X - 3 + (x.in_boundary[0][0] & arg.X[0])) >> 1; + case 1: return (x.X - 3 * arg.X[0] + (x.in_boundary[0][1] & arg.X2X1)) >> 1; + case 2: return (x.X - 3 * arg.X2X1 + (x.in_boundary[0][2] & arg.X3X2X1)) >> 1; + case 3: return (x.X - 3 * arg.X3X2X1 + (x.in_boundary[0][3] & arg.X4X3X2X1)) >> 1; + case 4: return (x.X - 3 * arg.X4X3X2X1 + (x.in_boundary[0][4] & arg.X5X4X3X2X1)) >> 1; + } + } + } + return 0; // should never reach here + } + /** @brief Compute whether this thread should be active for updating the a given offsetDim halo. 
For non-fused halo update kernels @@ -243,7 +335,8 @@ namespace quda static constexpr int n_src_tile = n_src_tile_; // how many RHS per thread static constexpr int max_regs = 0; // by default we don't limit register count static constexpr bool spill_shared = false; // whether a given kernel should use shared memory spilling - + static constexpr int prefetch_distance = 0; // whether we are using prefetching in the dslash + static constexpr PrefetchType prefetch_type = dslash_prefetch_type(); const int parity; // only use this for single parity fields const int nParity; // number of parities we're working on const QudaReconstructType reconstruct; @@ -285,6 +378,7 @@ namespace quda int pack_blocks = 0; // total number of blocks used for packing in the dslash int exterior_dims = 0; // dimension to run in the exterior Dslash int exterior_blocks = 0; + int block_size = 0; DDArg dd_out; DDArg dd_in; @@ -655,6 +749,7 @@ namespace quda static constexpr KernelType kernel_type = kernel_type_; static constexpr int max_regs = Arg::max_regs; static constexpr bool spill_shared = Arg::spill_shared; + static constexpr bool is_dslash = true; Arg arg; dslash_functor_arg(const Arg &arg, unsigned int threads_x) : @@ -685,6 +780,14 @@ namespace quda __forceinline__ __device__ void operator()(int, int s, int parity, bool alive = true) { typename Arg::D dslash(*this); + + if constexpr (dslash_prefetch_tma()) { + // FIXME need warp uniform parity which is not composable with + // NVSHMEM since the latter requires blockDim.y and blockDim.z to + // cover the entire extent + parity = target::block_idx().z; // ensure parity is warp uniform + } + // for full fields set parity from z thread index else use arg setting if (arg.nParity == 1) parity = arg.parity; diff --git a/include/dslash_quda.h b/include/dslash_quda.h index f34a41de1a..4017baa69f 100644 --- a/include/dslash_quda.h +++ b/include/dslash_quda.h @@ -19,7 +19,7 @@ namespace quda int_fastdiv X[QUDA_MAX_DIM]; int Ls; - int volume_4d; + 
int_fastdiv volume_4d; int_fastdiv volume_4d_cb; int_fastdiv face_X[4]; @@ -35,11 +35,7 @@ namespace quda int X2X1; int X3X2X1; int X4X3X2X1; - - int X2X1mX1; - int X3X2X1mX2X1; - int X4X3X2X1mX3X2X1; - int X5X4X3X2X1mX4X3X2X1; + int X5X4X3X2X1; }; /** diff --git a/include/externals/CLI11.hpp b/include/externals/CLI11.hpp index a426c5bae4..9174a58890 100644 --- a/include/externals/CLI11.hpp +++ b/include/externals/CLI11.hpp @@ -63,6 +63,7 @@ #include #include #include +#include // Verbatim copy from CLI/Version.hpp: @@ -2485,7 +2486,7 @@ class AsNumberWithUnit : public Validator { /// "2 EiB" => 2^61 // Units up to exibyte are supported class AsSizeValue : public AsNumberWithUnit { public: - using result_t = uint64_t; + using result_t = std::uint64_t; /// If kb_is_1000 is true, /// interpret 'kb', 'k' as 1000 and 'kib', 'ki' as 1024 diff --git a/include/gauge_field.h b/include/gauge_field.h index c355bd4818..9332b5c1e8 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -1,9 +1,9 @@ #pragma once +#include #include #include #include - #include namespace quda { @@ -147,6 +147,7 @@ namespace quda { class GaugeField : public LatticeField { friend std::ostream &operator<<(std::ostream &output, const GaugeField ¶m); + friend GaugeField shift(const GaugeField &in, int shift); private: /** @@ -193,6 +194,10 @@ namespace quda { double tadpole = 0.0; double fat_link_max = 0.0; + mutable std::unique_ptr shifted + = nullptr; // shifted copy of the gauge field, used for double-store enabled dslash + bool is_shifted = false; // whether this instance is a shifted one + mutable array ghost = {}; // stores the ghost zone of the gauge field (non-native fields only) @@ -647,6 +652,20 @@ namespace quda { } } + /** + @brief Return the shifted gauge field by shift in each + dimension. Shifted field is cached for subsequent reuse. + @param[in] shift value (1 or 3 supported). If no argument + passed the shift is set to Nface. 
+ @return Reference to shifted field + */ + GaugeField &shift(int shift = -1) const; + + /** + @brief Resets the shifted field (if it exists). + */ + void shift_reset() const; + /** * @brief Print the site data * @param[in] parity Parity index @@ -669,6 +688,17 @@ namespace quda { */ void genericPrintMatrix(const GaugeField &a, int dim, int parity, unsigned int x_cb, int rank = 0); + /** + @brief Shift the gauge field by shift in each dimension and store + the resulting shifted field. This is used to move the backwards + links on to this site. The input field must be a padded field + with the ghost pre-exchanged if communications are enabled. + @param[in] in Input shifted field + @param[in] shift value (1 or 3 supported) + @return Shifted field + */ + GaugeField shift(const GaugeField &in, int shift); + /** @brief This is a debugging function, where we cast a gauge field into a spinor field so we can compute its L1 norm. diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 827dde5bbf..938d0b4ea0 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -23,6 +23,7 @@ #include #include #include +#include namespace quda { @@ -997,7 +998,7 @@ namespace quda { type) */ template + QudaStaggeredPhase = QUDA_STAGGERED_PHASE_NO, bool = false> struct Reconstruct { using real = typename mapper::type; using complex = complex; @@ -1030,14 +1031,10 @@ namespace quda { __device__ __host__ inline void Unpack(complex out[N / 2], const real in[N], int, int, real, const I *, const int *) const { - if constexpr (isFixed::value) { -#pragma unroll - for (int i = 0; i < N / 2; i++) { out[i] = scale * complex(in[2 * i + 0], in[2 * i + 1]); } - } else { #pragma unroll - for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); } - } + for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); } } + __device__ __host__ inline real getPhase(const complex[]) const { return 0; } }; @@ -1052,36 +1049,40 
@@ namespace quda { @param isLastTimeSlide if we're on the last time slice of nodes @param ghostExchange if the field is extended or not (determines indexing type) */ - template - __device__ __host__ inline T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM], - T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound, bool isFirstTimeSlice, - bool isLastTimeSlice, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_NO) - { + template + __device__ __host__ inline T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM], T tBoundary, + T scale, int firstTimeSliceBound, int lastTimeSliceBound, + bool isFirstTimeSlice, bool isLastTimeSlice, + QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_NO) + { - // MWTODO: should this return tBoundary : scale or tBoundary*scale : scale + // MWTODO: should this return tBoundary : scale or tBoundary*scale : scale - if (ghostExchange_ == QUDA_GHOST_EXCHANGE_PAD - || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED)) { - if (idx >= firstTimeSliceBound) { // halo region on the first time slice - return isFirstTimeSlice ? tBoundary : scale; - } else if (idx >= lastTimeSliceBound) { // last link on the last time slice - return isLastTimeSlice ? tBoundary : scale; - } else { - return scale; - } - } else if (ghostExchange_ == QUDA_GHOST_EXCHANGE_EXTENDED - || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED)) { - if (idx >= (R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < R[3] * X[0] * X[1] * X[2] / 2) { - // the boundary condition is on the R[3]-1 time slice - return isFirstTimeSlice ? tBoundary : scale; - } else if (idx >= (X[3] - R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < (X[3] - R[3]) * X[0] * X[1] * X[2] / 2) { - // the boundary condition lies on the X[3]-R[3]-1 time slice - return isLastTimeSlice ? 
tBoundary : scale; - } else { - return scale; - } + if (ghostExchange_ == QUDA_GHOST_EXCHANGE_PAD + || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED)) { + + if (!shifted && idx >= firstTimeSliceBound) { // halo region on the first time slice + return isFirstTimeSlice ? tBoundary : scale; + } else if (shifted && idx < firstTimeSliceBound) { // shifted link on first time slice + return isFirstTimeSlice ? tBoundary : scale; + } else if (!shifted && idx >= lastTimeSliceBound) { // last link on the last time slice + return isLastTimeSlice ? tBoundary : scale; + } else { + return scale; + } + } else if (ghostExchange_ == QUDA_GHOST_EXCHANGE_EXTENDED + || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED)) { + if (idx >= (R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < R[3] * X[0] * X[1] * X[2] / 2) { + // the boundary condition is on the R[3]-1 time slice + return isFirstTimeSlice ? tBoundary : scale; + } else if (idx >= (X[3] - R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < (X[3] - R[3]) * X[0] * X[1] * X[2] / 2) { + // the boundary condition lies on the X[3]-R[3]-1 time slice + return isLastTimeSlice ? tBoundary : scale; + } else { + return scale; } - return scale; + } + return scale; } // not actually used - here for reference @@ -1104,8 +1105,8 @@ namespace quda { @tparam ghostExchange_ optional template the ghostExchange type to avoid the run-time overhead */ - template - struct Reconstruct<18, Float, QUDA_RECONSTRUCT_12, ghostExchange_> { + template + struct Reconstruct<18, Float, QUDA_RECONSTRUCT_12, ghostExchange_, phase, shifted> { using real = typename mapper::type; using complex = complex; const real anisotropy; @@ -1119,7 +1120,7 @@ namespace quda { Reconstruct(const GaugeField &u) : anisotropy(u.Anisotropy()), tBoundary(static_cast(u.TBoundary())), - firstTimeSliceBound(u.VolumeCB()), + firstTimeSliceBound(!shifted ? 
u.VolumeCB() : u.X()[0] * u.X()[1] * u.X()[2] / 2), lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2), isFirstTimeSlice(comm_coord(3) == 0 ? true : false), isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false), @@ -1145,8 +1146,8 @@ namespace quda { const real u0 = dir < 3 ? anisotropy : - timeBoundary(idx, X, R, tBoundary, static_cast(1.0), firstTimeSliceBound, - lastTimeSliceBound, isFirstTimeSlice, isLastTimeSlice, ghostExchange); + timeBoundary(idx, X, R, tBoundary, static_cast(1.0), firstTimeSliceBound, + lastTimeSliceBound, isFirstTimeSlice, isLastTimeSlice, ghostExchange); // out[6] = u0*conj(out[1]*out[5] - out[2]*out[4]); out[6] = cmul(out[2], out[4]); @@ -1177,8 +1178,8 @@ namespace quda { @tparam ghostExchange_ optional template the ghostExchange type to avoid the run-time overhead */ - template - struct Reconstruct<18, Float, QUDA_RECONSTRUCT_10, ghostExchange_> { + template + struct Reconstruct<18, Float, QUDA_RECONSTRUCT_10, ghostExchange_, phase, shifted> { using real = typename mapper::type; using complex = complex; @@ -1225,8 +1226,8 @@ namespace quda { @tparam ghostExchange_ optional template the ghostExchange type to avoid the run-time overhead */ - template - struct Reconstruct<18, Float, QUDA_RECONSTRUCT_13, ghostExchange_, stag_phase> { + template + struct Reconstruct<18, Float, QUDA_RECONSTRUCT_13, ghostExchange_, stag_phase, shifted> { using real = typename mapper::type; using complex = complex; const Reconstruct<18, Float, QUDA_RECONSTRUCT_12, ghostExchange_> reconstruct_12; @@ -1249,25 +1250,27 @@ namespace quda { out[6] = cmul(out[2], out[4]); out[6] = cmac(out[1], out[5], -out[6]); - out[6] = scale_inv * conj(out[6]); + out[6] = conj(out[6]); out[7] = cmul(out[0], out[5]); out[7] = cmac(out[2], out[3], -out[7]); - out[7] = scale_inv * conj(out[7]); + out[7] = conj(out[7]); out[8] = cmul(out[1], out[3]); out[8] = cmac(out[0], out[4], -out[8]); - out[8] = scale_inv * conj(out[8]); + out[8] = conj(out[8]); 
if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phasing // Multiply the third row by exp(I*3*phase), since the cross product will end up in a scale factor of exp(-I*2*phase) real cos_sin[2]; sincospi(static_cast(3.0) * phase, &cos_sin[1], &cos_sin[0]); complex A(cos_sin[0], cos_sin[1]); - out[6] = cmul(A, out[6]); - out[7] = cmul(A, out[7]); - out[8] = cmul(A, out[8]); + A *= scale_inv; + out[6] = cmul(out[6], A); + out[7] = cmul(out[7], A); + out[8] = cmul(out[8], A); } else { // phase is +/- 1 so real multiply is sufficient + phase *= scale_inv; out[6] *= phase; out[7] *= phase; out[8] *= phase; @@ -1302,8 +1305,8 @@ namespace quda { @tparam ghostExchange_ optional template the ghostExchange type to avoid the run-time overhead */ - template - struct Reconstruct<18, Float, QUDA_RECONSTRUCT_8, ghostExchange_> { + template + struct Reconstruct<18, Float, QUDA_RECONSTRUCT_8, ghostExchange_, stag_phase, shifted> { using real = typename mapper::type; using complex = complex; const complex anisotropy; // imaginary value stores inverse @@ -1318,7 +1321,7 @@ namespace quda { Reconstruct(const GaugeField &u, real scale = 1.0) : anisotropy(u.Anisotropy() * scale, 1.0 / (u.Anisotropy() * scale)), tBoundary(static_cast(u.TBoundary()) * scale, 1.0 / (static_cast(u.TBoundary()) * scale)), - firstTimeSliceBound(u.VolumeCB()), + firstTimeSliceBound(!shifted ? u.VolumeCB() : u.X()[0] * u.X()[1] * u.X()[2] / 2), lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2), isFirstTimeSlice(comm_coord(3) == 0 ? true : false), isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? 
true : false), @@ -1389,29 +1392,31 @@ namespace quda { real r_inv2 = u0_inv * row_sum_inv; { complex A = cmul(conj(out[0]), out[3]); + complex u0A = u0 * A; // out[4] = -(conj(out[6])*conj(out[2]) + u0*A*out[1])*r_inv2; // U11 out[4] = cmul(conj(out[6]), conj(out[2])); - out[4] = cmac(u0 * A, out[1], out[4]); + out[4] = cmac(u0A, out[1], out[4]); out[4] = -r_inv2 * out[4]; // out[5] = (conj(out[6])*conj(out[1]) - u0*A*out[2])*r_inv2; // U12 out[5] = cmul(conj(out[6]), conj(out[1])); - out[5] = cmac(-u0 * A, out[2], out[5]); + out[5] = cmac(-u0A, out[2], out[5]); out[5] = r_inv2 * out[5]; } { complex A = cmul(conj(out[0]), out[6]); + complex u0A = u0 * A; // out[7] = (conj(out[3])*conj(out[2]) - u0*A*out[1])*r_inv2; // U21 out[7] = cmul(conj(out[3]), conj(out[2])); - out[7] = cmac(-u0 * A, out[1], out[7]); + out[7] = cmac(-u0A, out[1], out[7]); out[7] = r_inv2 * out[7]; // out[8] = -(conj(out[3])*conj(out[1]) + u0*A*out[2])*r_inv2; // U12 out[8] = cmul(conj(out[3]), conj(out[1])); - out[8] = cmac(u0 * A, out[2], out[8]); + out[8] = cmac(u0A, out[2], out[8]); out[8] = -r_inv2 * out[8]; } @@ -1433,8 +1438,8 @@ namespace quda { { complex u = dir < 3 ? 
anisotropy : - timeBoundary(idx, X, R, tBoundary, scale, firstTimeSliceBound, lastTimeSliceBound, - isFirstTimeSlice, isLastTimeSlice, ghostExchange); + timeBoundary(idx, X, R, tBoundary, scale, firstTimeSliceBound, lastTimeSliceBound, + isFirstTimeSlice, isLastTimeSlice, ghostExchange); Unpack(out, in, idx, dir, phase, X, R, scale, u); } @@ -1450,11 +1455,11 @@ namespace quda { @tparam ghostExchange_ optional template the ghostExchange type to avoid the run-time overhead */ - template - struct Reconstruct<18, Float, QUDA_RECONSTRUCT_9, ghostExchange_, stag_phase> { + template + struct Reconstruct<18, Float, QUDA_RECONSTRUCT_9, ghostExchange_, stag_phase, shifted> { using real = typename mapper::type; using complex = complex; - const Reconstruct<18, Float, QUDA_RECONSTRUCT_8, ghostExchange_> reconstruct_8; + const Reconstruct<18, Float, QUDA_RECONSTRUCT_8, ghostExchange_, stag_phase, shifted> reconstruct_8; const real scale; const real scale_inv; @@ -1551,18 +1556,19 @@ namespace quda { template + QudaGhostExchange ghostExchange_ = QUDA_GHOST_EXCHANGE_INVALID, bool use_inphase = false, bool shifted = false> struct FloatNOrder { - using Accessor = FloatNOrder; + using Accessor = FloatNOrder; using store_t = Float; static constexpr int length = length_; using real = typename mapper::type; using complex = complex; typedef typename AllocType::type AllocInt; - Reconstruct reconstruct; + Reconstruct reconstruct; static constexpr int reconLen = recon; static constexpr int hasPhase = (reconLen == 9 || reconLen == 13) ? 
1 : 0; + static constexpr bool loadPhase = hasPhase && !(static_phase() && (reconLen == 13 || use_inphase)); static constexpr int N = gauge::get_vector_order(reconLen - hasPhase); static constexpr int M = (reconLen - hasPhase) / N; static constexpr int Nrem = reconLen - hasPhase - M * N; @@ -1580,6 +1586,9 @@ namespace quda { const int geometry; const AllocInt phaseOffset; size_t bytes; + gauge::tensor_desc_t tensor_desc; + const real combined_scale; // Precomputed scale for copy_and_scale: fixedInvMaxValue * reconstruct.scale + const real phase_scale; // Precomputed scale for phase loading: fixedInvMaxValue * 2.0 (or just 2.0 for float) FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : reconstruct(u), @@ -1590,7 +1599,18 @@ namespace quda { stride(u.Stride()), geometry(u.Geometry()), phaseOffset(u.PhaseOffset() / sizeof(Float)), - bytes(u.Bytes()) + bytes(u.Bytes()), + combined_scale([&]() { + if constexpr (recon == 18) { + // QUDA_RECONSTRUCT_NO: combine fixedInvMaxValue with reconstruct.scale + return isFixed::value ? fixedInvMaxValue::value * reconstruct.scale : 1.0; + } else { + // Other reconstruction types: only need fixedInvMaxValue (reconstruct.scale doesn't exist) + return isFixed::value ? fixedInvMaxValue::value : 1.0; + } + }()), + phase_scale(isFixed::value ? 
fixedInvMaxValue::value * static_cast(2.0) : + static_cast(2.0)) { if (geometry == QUDA_COARSE_GEOMETRY) errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone"); @@ -1612,26 +1632,97 @@ namespace quda { #pragma unroll for (int i = 0; i < M; i++) { // first load from memory - auto vecTmp = vector_load(gauge + parity * offset + dir * (M * N + Nrem) * stride, i * stride + x); - // second do copy converting into register type - copy(tmp + i * N, vecTmp); + auto vecTmp = vector_load(gauge, parity * offset + dir * (M * N + Nrem) * stride, i * stride + x); + // second do copy converting into register type with combined scaling + copy_and_scale(tmp + i * N, vecTmp, combined_scale); } // now load any remainder if constexpr (Nrem > 0) { - auto vecTmp = vector_load(gauge + parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x); - copy(tmp + M * N, vecTmp); + auto vecTmp = vector_load(gauge, parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x); + copy_and_scale(tmp + M * N, vecTmp, combined_scale); } - constexpr bool load_phase = (hasPhase && !(static_phase() && (reconLen == 13 || use_inphase))); - if constexpr (load_phase) { - copy(phase, gauge[parity * offset + phaseOffset + stride * dir + x]); - phase *= static_cast(2.0); + if constexpr (loadPhase) { + if constexpr (isFixed::value) { + copy_and_scale(phase, gauge[parity * offset + phaseOffset + stride * dir + x], phase_scale); + } else { + copy(phase, gauge[parity * offset + phaseOffset + stride * dir + x]); + phase *= static_cast(2.0); + } } reconstruct.Unpack(v, tmp, x, dir, phase, X, R); } + __device__ __host__ inline void raw_load(array &v, int x, int dir, int parity) const + { +#pragma unroll + for (int i = 0; i < M; i++) { + // first load from memory + auto vecTmp = vector_load(gauge, parity * offset + dir * (M * N + Nrem) * stride, i * stride + x); + memcpy(&v[i * N], &vecTmp, sizeof(vecTmp)); + } + + // now load any remainder + if constexpr (Nrem > 
0) { + auto vecTmp = vector_load(gauge, parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x); + memcpy(&v[M * N], &vecTmp, sizeof(vecTmp)); + } + + if constexpr (loadPhase) + memcpy(&v[M * N + Nrem], &gauge[parity * offset + phaseOffset + stride * dir + x], sizeof(store_t)); + } + + template __device__ inline void prefetch(int x, int dir, int parity, int block_size = 0) const + { + if constexpr (type == PrefetchType::THREAD) { // use per-thread prefetching +#pragma unroll + for (int i = 0; i < M; i++) + prefetch_cache_line(gauge + (parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N)); + + // now load any remainder + if constexpr (Nrem > 0) + prefetch_cache_line(gauge + (parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem)); + + if constexpr (loadPhase) prefetch_cache_line(gauge + (parity * offset + phaseOffset + stride * dir + x)); + } else if constexpr (type == PrefetchType::BULK) { // bulk prefetch + if (block_size == 0) block_size = blockDim.x; + if (target::is_thread_zero()) { +#pragma unroll + for (int i = 0; i < M; i++) + prefetch_cache_bulk(gauge + (parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N), + block_size * N * sizeof(Float)); + + // now load any remainder + if constexpr (Nrem > 0) + prefetch_cache_bulk(gauge + (parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem), + block_size * Nrem * sizeof(Float)); + + if constexpr (loadPhase) + prefetch_cache_bulk(gauge + (parity * offset + phaseOffset + stride * dir + x), block_size * sizeof(Float)); + } + } else if constexpr (type == PrefetchType::TENSOR) { // n-d tensor prefetch + if (target::is_thread_zero()) { + prefetch_cache_tensor_5d(tensor_desc.N, x, x / 16, 0, dir, parity); + if constexpr (Nrem > 0) prefetch_cache_tensor_4d(tensor_desc.Nrem, x, x / 16, dir, parity); + if constexpr (loadPhase) prefetch_cache_tensor_4d(tensor_desc.phase, x, x / 16, dir, parity); + } +#if 0 // L1 prefetching is a disabled experiment + } 
else { // L1 prefetching +#pragma unroll + for (int i = 0; i < M; i++) + prefetch_L1_cache_line(gauge + (parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N)); + + // now load any remainder + if constexpr (Nrem > 0) + prefetch_L1_cache_line(gauge + (parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem)); + + if constexpr (loadPhase) prefetch_L1_cache_line(gauge + (parity * offset + phaseOffset + stride * dir + x)); +#endif + } + } + __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const { real tmp[reconLen]; @@ -1644,7 +1735,7 @@ namespace quda { #pragma unroll for (int j = 0; j < N; j++) copy(vecTmp[j], tmp[i * N + j]); // second do vectorized copy into memory - vector_store(gauge + parity * offset + dir * (M * N + Nrem) * stride, x + i * stride, vecTmp); + vector_store(gauge, parity * offset + dir * (M * N + Nrem) * stride, x + i * stride, vecTmp); } // now save any remainder @@ -1653,7 +1744,7 @@ namespace quda { #pragma unroll for (int j = 0; j < Nrem; j++) copy(vecTmp[j], tmp[M * N + j]); // second do vectorized copy into memory - vector_store(gauge + parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x, vecTmp); + vector_store(gauge, parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x, vecTmp); } if constexpr (hasPhase) { @@ -1662,6 +1753,29 @@ namespace quda { } } + __device__ __host__ inline void raw_save(const array &v, int x, int dir, int parity) const + { +#pragma unroll + for (int i = 0; i < M; i++) { + array vecTmp; + // first do copy converting into storage type + memcpy(&vecTmp, &v[i * N], sizeof(vecTmp)); + // second do vectorized copy into memory + vector_store(gauge, parity * offset + dir * (M * N + Nrem) * stride, x + i * stride, vecTmp); + } + + // now save any remainder + if constexpr (Nrem > 0) { + array vecTmp; + memcpy(&vecTmp, &v[M * N], sizeof(vecTmp)); + // second do vectorized copy into memory + vector_store(gauge, parity * offset + (dir * (M 
* N + Nrem) + M * N) * stride, x, vecTmp); + } + + if constexpr (hasPhase) + memcpy(&gauge[parity * offset + phaseOffset + dir * stride + x], &v[M * N + Nrem], sizeof(store_t)); + } + /** @brief This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operators for manipulating at @@ -1690,15 +1804,15 @@ namespace quda { // first do vectorized copy from memory into registers auto vecTmp = vector_load(ghost[dir], (i * 2 + parity) * faceVolumeCB[dir] + x); - // second do copy converting into register type - copy(tmp + i * N, vecTmp); + // second do copy converting into register type with combined scaling + copy_and_scale(tmp + i * N, vecTmp, combined_scale); } // now load any remainder if constexpr (Nrem > 0) { auto vecTmp - = vector_load(ghost[dir] + 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x); - copy(tmp + M * N, vecTmp); + = vector_load(ghost[dir], 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x); + copy_and_scale(tmp + M * N, vecTmp, combined_scale); } real phase = 0.; @@ -1707,8 +1821,13 @@ namespace quda { // if(stag_phase == QUDA_STAGGERED_PHASE_MILC ) { // phase = inphase < static_cast(0) ? 
static_cast(-0.5) : static_cast(0.5); // } else { - copy(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x]); - phase *= static_cast(2.0); + if constexpr (isFixed::value) { + copy_and_scale(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x], + phase_scale); + } else { + copy(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x]); + phase *= static_cast(2.0); + } // } } reconstruct.Unpack(v, tmp, x, dir, phase, X, R); @@ -1739,7 +1858,7 @@ namespace quda { #pragma unroll for (int j = 0; j < Nrem; j++) copy(vecTmp[j], tmp[M * N + j]); // second do vectorized copy into memory - vector_store(ghost[dir] + 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x, vecTmp); + vector_store(ghost[dir], 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x, vecTmp); } if constexpr (hasPhase) { @@ -1790,27 +1909,36 @@ namespace quda { #pragma unroll for (int i = 0; i < M; i++) { // first do vectorized copy from memory - auto vecTmp = vector_load(ghost[dim] + dir * reconLen * 2 * geometry * R[dim] * faceVolumeCB[dim], + auto vecTmp = vector_load(ghost[dim], dir * reconLen * 2 * geometry * R[dim] * faceVolumeCB[dim], ((i * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] + x); - // second do copy converting into register type - copy(tmp + i * N, vecTmp); + // second do copy converting into register type with combined scaling + copy_and_scale(tmp + i * N, vecTmp, combined_scale); } // now load any remainder if constexpr (Nrem > 0) { auto vecTmp - = vector_load(ghost[dim] + (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim], + = vector_load(ghost[dim], (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim], (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x); - copy(tmp + M * N, vecTmp); + copy_and_scale(tmp + M * N, vecTmp, combined_scale); } real phase = 0.; - if constexpr (hasPhase) - copy(phase, - 
ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim] - + (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x]); + if constexpr (hasPhase) { + if constexpr (isFixed::value) { + copy_and_scale(phase, + ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim] + + (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x], + phase_scale); + } else { + copy(phase, + ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim] + + (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x]); + phase *= static_cast(2.0); + } + } // use the extended_idx to determine the boundary condition reconstruct.Unpack(v, tmp, extended_idx, g, 2. * phase, X, R); @@ -1829,7 +1957,7 @@ namespace quda { #pragma unroll for (int j = 0; j < N; j++) copy(vecTmp[j], tmp[i * N + j]); // second do vectorized copy to memory - vector_store(ghost[dim] + dir * reconLen * 2 * geometry * R[dim] * faceVolumeCB[dim], + vector_store(ghost[dim], dir * reconLen * 2 * geometry * R[dim] * faceVolumeCB[dim], ((i * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] + x, vecTmp); } @@ -1839,7 +1967,7 @@ namespace quda { #pragma unroll for (int j = 0; j < Nrem; j++) copy(vecTmp[j], tmp[M * N + j]); // second do vectorized copy into memory - vector_store(ghost[dim] + (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim], + vector_store(ghost[dim], (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim], (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x, vecTmp); } @@ -2538,20 +2666,20 @@ namespace quda { template + bool use_inphase = false, QudaGaugeFieldOrder order = QUDA_NATIVE_GAUGE_ORDER, bool shifted = false> struct gauge_mapper { - typedef gauge::FloatNOrder type; + typedef gauge::FloatNOrder type; }; template - struct gauge_mapper { + QudaGhostExchange ghostExchange, bool use_inphase, bool shifted> + struct gauge_mapper { typedef gauge::MILCOrder type; }; template - struct 
gauge_mapper { + QudaGhostExchange ghostExchange, bool use_inphase, bool shifted> + struct gauge_mapper { typedef gauge::QDPOrder type; }; diff --git a/include/index_helper.cuh b/include/index_helper.cuh index 35ec4bd0e5..7eff25a882 100644 --- a/include/index_helper.cuh +++ b/include/index_helper.cuh @@ -234,47 +234,15 @@ namespace quda { array gx = {}; // nDim global lattice coordinates array gDim = {}; // global lattice dimensions int x_cb; // checkerboard lattice site index + int x_cb_0; // value of x_cb on first thread in block int s; // fifth dimension coord int X; // full lattice site index constexpr const int& operator[](int i) const { return x[i]; } constexpr int& operator[](int i) { return x[i]; } - array_2d in_boundary = {}; + array_2d in_boundary = {}; constexpr int size() const { return nDim; } }; - /** - @brief Compute the checkerboard 1-d index for the nearest - neighbor - @param[in] lattice coordinates - @param[in] mu dimension in which to add 1 - @param[in] dir direction (+1 or -1) - @param[in] arg parameter struct - @return 1-d checkboard index - */ - template - __device__ __host__ inline int getNeighborIndexCB(const Coord &x, int mu, int dir, const Arg &arg) - { - switch (dir) { - case +1: // positive direction - switch (mu) { - case 0: return (x.in_boundary[1][0] ? x.X - (arg.X[0] - 1) : x.X + 1) >> 1; - case 1: return (x.in_boundary[1][1] ? x.X - arg.X2X1mX1 : x.X + arg.X[0]) >> 1; - case 2: return (x.in_boundary[1][2] ? x.X - arg.X3X2X1mX2X1 : x.X + arg.X2X1) >> 1; - case 3: return (x.in_boundary[1][3] ? x.X - arg.X4X3X2X1mX3X2X1 : x.X + arg.X3X2X1) >> 1; - case 4: return (x.in_boundary[1][4] ? x.X - arg.X5X4X3X2X1mX4X3X2X1 : x.X + arg.X4X3X2X1) >> 1; - } - case -1: - switch (mu) { - case 0: return (x.in_boundary[0][0] ? x.X + (arg.X[0] - 1) : x.X - 1) >> 1; - case 1: return (x.in_boundary[0][1] ? x.X + arg.X2X1mX1 : x.X - arg.X[0]) >> 1; - case 2: return (x.in_boundary[0][2] ? 
x.X + arg.X3X2X1mX2X1 : x.X - arg.X2X1) >> 1; - case 3: return (x.in_boundary[0][3] ? x.X + arg.X4X3X2X1mX3X2X1 : x.X - arg.X3X2X1) >> 1; - case 4: return (x.in_boundary[0][4] ? x.X + arg.X5X4X3X2X1mX4X3X2X1 : x.X - arg.X4X3X2X1) >> 1; - } - } - return 0; // should never reach here - } - /** Compute the 4-d spatial index from the checkerboarded 1-d index at parity parity @@ -839,7 +807,7 @@ namespace quda { // int idx = indexFromFaceIndex<4,QUDA_4D_PC,dim,nFace,0>(ghost_idx, parity, arg); template - constexpr int indexFromFaceIndexStaggered(int dim, int face_num, int face_idx_in, int parity, int nLayers, QudaPCType, const Arg &arg) + __host__ __device__ inline int indexFromFaceIndexStaggered(int dim, int face_num, int face_idx_in, int parity, int nLayers, QudaPCType, const Arg &arg) { const auto *X = arg.dc.X; // grid dimension const auto &V4 = arg.dc.volume_4d; // 4-d volume @@ -854,7 +822,7 @@ namespace quda { int s = face_idx_in / arg.dc.face_XYZT[dim]; int face_idx = face_idx_in - s * arg.dc.face_XYZT[dim]; - int dims[3] = {}; + std::remove_const_t> dims[3] = {}; int d1 = 0; #pragma unroll 4 for (int d2 = 0; d2 < 4; d2++) { // this will evaluate at compile time @@ -898,7 +866,7 @@ namespace quda { } template - constexpr int indexFromFaceIndexStaggered(int face_idx_in, int parity, const Arg &arg) + __host__ __device__ int indexFromFaceIndexStaggered(int face_idx_in, int parity, const Arg &arg) { return indexFromFaceIndexStaggered(dim, face_num, face_idx_in, parity, nLayers, type, arg); } diff --git a/include/kernel_helper.h b/include/kernel_helper.h index 14727c327a..075295f9b6 100644 --- a/include/kernel_helper.h +++ b/include/kernel_helper.h @@ -19,11 +19,14 @@ namespace quda static constexpr bool check_bounds = check_bounds_; static constexpr int max_regs = 0; // by default we don't limit register count static constexpr bool spill_shared = false; // whether a given kernel should use shared memory spilling + static constexpr bool is_dslash = false; // whether 
the arg is for a dslash (with its nested arg struct) dim3 threads; /** number of active threads required */ + int block_size; /** product of thread block dimensions */ int comms_rank; /** per process value of comm_rank() */ int comms_rank_global; /** per process value comm_rank_global() */ int comms_coord[4]; /** array storing {comm_coord(0), ..., comm_coord(3)} */ int comms_dim[4]; /** array storing {comm_dim(0), ..., comm_dim(3)} */ + int comms_dim_partitioned[4]; /** array storing {comm_dim_partitioned(0), ..., comm_dim_partitioned(3)} */ constexpr kernel_param() = default; @@ -32,7 +35,9 @@ comms_rank(comm_rank()), comms_rank_global(comm_rank_global()), comms_coord {comm_coord(0), comm_coord(1), comm_coord(2), comm_coord(3)}, - comms_dim {comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3)} + comms_dim {comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3)}, + comms_dim_partitioned {comm_dim_partitioned(0), comm_dim_partitioned(1), comm_dim_partitioned(2), + comm_dim_partitioned(3)} { } diff --git a/include/kernels/block_orthogonalize.cuh b/include/kernels/block_orthogonalize.cuh index e3e0868c7f..3db52ae6b0 100644 --- a/include/kernels/block_orthogonalize.cuh +++ b/include/kernels/block_orthogonalize.cuh @@ -80,7 +80,7 @@ namespace quda { }; template struct BlockOrtho_Params { - static constexpr int mVec = tile_size(); + static constexpr int mVec = tile_size(); using dot_t = array, mVec>; static constexpr int block_dim = 1; using BlockReduceDot = BlockReduce; @@ -90,7 +90,7 @@ namespace quda { template struct BlockOrtho_ : BlockOrtho_Params::Ops { const Arg &arg; - static constexpr unsigned block_size = Arg::block_size; + static constexpr unsigned block_size = Arg::block_size_cxpr; static constexpr int fineSpin = Arg::fineSpin; static constexpr int spinBlock = (fineSpin == 1) ?
1 : fineSpin / Arg::coarseSpin; // size of spin block static constexpr int nColor = Arg::nColor; diff --git a/include/kernels/dslash_coarse_mma.cuh b/include/kernels/dslash_coarse_mma.cuh index 0a8d5ea9ac..0cc3ac31e5 100644 --- a/include/kernels/dslash_coarse_mma.cuh +++ b/include/kernels/dslash_coarse_mma.cuh @@ -216,7 +216,7 @@ namespace quda // Initialize barrier. All `blockDim.x` threads in block participate. init(bar, blockDim.x * blockDim.y * blockDim.z); // Make initialized barrier visible in async proxy. - cde::fence_proxy_async_shared_cta(); + cuda::ptx::fence_proxy_async(); } // Syncthreads so initialized barrier is visible to all threads. __syncthreads(); diff --git a/include/kernels/dslash_domain_wall_4d_fused_m5.cuh b/include/kernels/dslash_domain_wall_4d_fused_m5.cuh index 46e0ae876a..ea90177228 100644 --- a/include/kernels/dslash_domain_wall_4d_fused_m5.cuh +++ b/include/kernels/dslash_domain_wall_4d_fused_m5.cuh @@ -25,6 +25,7 @@ namespace quda using DomainWall4DArg::threads; using DomainWall4DArg::x; using DomainWall4DArg::xpay; + using DomainWall4DArg::block_size; using F = typename DomainWall4DArg::F; diff --git a/include/kernels/dslash_domain_wall_5d.cuh b/include/kernels/dslash_domain_wall_5d.cuh index 0cb3190293..e1f9171763 100644 --- a/include/kernels/dslash_domain_wall_5d.cuh +++ b/include/kernels/dslash_domain_wall_5d.cuh @@ -25,7 +25,7 @@ namespace quda { // remove the batch dimension from these constants, since these are used for 5-d checkerboard indexing DslashArg::dc.X[4] = in.X(4); - DslashArg::dc.X5X4X3X2X1mX4X3X2X1 = (in.X(4) - 1) * DslashArg::dc.X4X3X2X1; + DslashArg::dc.X5X4X3X2X1 = in.X(4) * DslashArg::dc.X4X3X2X1; } }; diff --git a/include/kernels/dslash_mdw_fused.cuh b/include/kernels/dslash_mdw_fused.cuh index 2b57d5b0a9..65bfeb6c76 100644 --- a/include/kernels/dslash_mdw_fused.cuh +++ b/include/kernels/dslash_mdw_fused.cuh @@ -37,7 +37,7 @@ namespace quda { static constexpr bool reload = reload_; static constexpr bool 
spin_project = true; static constexpr bool spinor_direct_load = true; // false means texture load - using F = typename colorspinor_mapper::type; // color spin field order + using F = typename colorspinor_mapper::type; // color spin field order static constexpr bool gauge_direct_load = true; // false means texture load static constexpr QudaGhostExchange ghost = QUDA_GHOST_EXCHANGE_EXTENDED; // gauge field used is an extended one using G = typename gauge_mapper::type; // gauge field order diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index ae46c6a900..efaf33c5d7 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -1,12 +1,10 @@ #pragma once -#include #include #include #include #include -#include -#include // forthe packing kernel +#include // for the packing kernel namespace quda { @@ -33,23 +31,30 @@ namespace quda static constexpr QudaGhostExchange ghost = QUDA_GHOST_EXCHANGE_PAD; static constexpr bool use_inphase = improved_ ? 
false : true; static constexpr QudaStaggeredPhase phase = phase_; - using GU = typename gauge_mapper::type; - using GL = - typename gauge_mapper::type; + template + using GU = typename gauge_mapper::type; + template + using GL = typename gauge_mapper::type; F out[MAX_MULTI_RHS]; /** output vector field */ F in[MAX_MULTI_RHS]; /** input vector field */ const Ghost halo_pack; /** accessor for writing the halo */ const Ghost halo; /** accessor for reading the halo */ F x[MAX_MULTI_RHS]; /** input vector when doing xpay */ - const GU U; /** the gauge field */ - const GL L; /** the long gauge field */ + mutable GU U; /** the gauge field */ + mutable GU Uback; /** the gauge field */ + mutable GL L; /** the long gauge field */ + mutable GL Lback; /** the long gauge field */ const real a; /** xpay scale factor */ const real tboundary; /** temporal boundary condition */ const bool is_first_time_slice; /** are we on the first (global) time slice */ const bool is_last_time_slice; /** are we on the last (global) time slice */ static constexpr bool improved = improved_; + static constexpr int prefetch_distance = QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED; + static constexpr int prefetch_distance_l1 = 0; const real dagger_scale; @@ -59,11 +64,15 @@ namespace quda DslashArg < Float, nDim, DDArg, improved ? 3 : 1, n_src_tile > (out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, spin_project, comm_override), - halo_pack(halo, improved_ ? 3 : 1), halo(halo, improved_ ? 3 : 1), U(U), L(L), a(a), tboundary(U.TBoundary()), - is_first_time_slice(comm_coord(3) == 0 ? true : false), + halo_pack(halo, improved_ ? 3 : 1), halo(halo, improved_ ? 3 : 1), U(U), + Uback(dslash_double_store() ? U.shift(1) : U), L(L), Lback(dslash_double_store() ? L.shift(3) : L), a(a), + tboundary(U.TBoundary()), is_first_time_slice(comm_coord(3) == 0 ? true : false), is_last_time_slice(comm_coord(3) == comm_dim(3) - 1 ? true : false), dagger_scale(dagger ? 
static_cast(-1.0) : static_cast(1.0)) { + if (!improved && prefetch_distance > 7) + warningQuda("dslash prefetch distance %d is greater than pipeline length for naive staggered", prefetch_distance); + for (auto i = 0u; i < out.size(); i++) { this->out[i] = out[i]; this->in[i] = in[i]; @@ -72,6 +81,69 @@ } }; + /** + @brief Prefetch the gauge field into cache. + @param[in] dim The dimension we are presently working on + @param[in] dir The direction we are presently working on (1 = forwards, 0 = backwards) + @param[in] hop The hopping term we are presently working on (0 = 1 - hop, 1 = 3 - hop) + @param[in] coord Coordinates that we are working on with hop-3 boundary conditions evaluated + @param[in] coord1 Copy of coordinates that we are working on with hop-1 boundary conditions evaluated + @param[in] parity Parity that we are working on + @param[in] arg Parameter struct + */ + template + __device__ __host__ void prefetch(int dim, int dir, int hop, const coord_t &coord, const coord_t &coord1, int parity, + const Arg &arg) + { + int step = 4 * dim + 2 * dir + hop + distance; + if (step >= (Arg::improved ? 16 : 8)) return; + + // if using a TMA prefetch we need to use block's first coordinate + auto x_cb = dslash_prefetch_tma() ? coord.x_cb_0 : coord.x_cb; + x_cb = (Arg::nDim == 5 ?
x_cb % arg.dc.volume_4d_cb : x_cb); + + if constexpr (Arg::improved) { + int dim2 = step / 4; + switch (step % 4) { + case 0: arg.U.template prefetch(x_cb, dim2, parity); break; + case 1: arg.L.template prefetch(x_cb, dim2, parity); break; + case 2: + if constexpr (dslash_double_store()) + arg.Uback.template prefetch(x_cb, dim2, parity); + else + arg.U.template prefetch(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity); + break; + case 3: + if constexpr (dslash_double_store()) + arg.Lback.template prefetch(x_cb, dim2, parity); + else + arg.L.template prefetch(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity); + break; + } + } else { + int dim2 = step / 2; + switch (step % 2) { + case 0: arg.U.template prefetch(x_cb, dim2, parity); break; + case 1: + if constexpr (dslash_double_store()) + arg.Uback.template prefetch(x_cb, dim2, parity); + else + arg.U.template prefetch(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity); + break; + } + } + } + + template + __device__ __host__ void prefetch(int dim, int dir, int hop, const coord_t &coord, const coord_t &coord1, int parity, + const Arg &arg) + { + if constexpr (Arg::prefetch_distance_l1 > 0) // L1 prefetch + prefetch<3, Arg::prefetch_distance_l1>(dim, dir, hop, coord, coord1, parity, arg); + if constexpr (Arg::prefetch_distance > 0) // L2 prefetch + prefetch(dim, dir, hop, coord, coord1, parity, arg); + }; + /** @brief Applies the off-diagonal part of the Staggered / Asqtad operator. @@ -90,104 +162,141 @@ namespace quda typedef Matrix, Arg::nColor> Link; const int their_spinor_parity = (arg.nParity == 2) ? 
1 - parity : 0; + Coord coord1 = coord; + if constexpr (Arg::improved) { // need to compute 1-hop in_boundary +#pragma unroll + for (int d = 0; d < 4; d++) { + coord1.in_boundary[1][d] = -(coord[d] + 1 >= arg.dc.X[d]); + coord1.in_boundary[0][d] = -(coord[d] - 1 < 0); + } + } + #pragma unroll for (int d = 0; d < 4; d++) { // loop over dimension // standard - forward direction if (arg.dd_in.doHopping(coord, d, +1)) { - const bool ghost = (coord[d] + 1 >= arg.dc.X[d]) && isActive(active, thread_dim, d, coord, arg); + const bool ghost = coord1.in_boundary[1][d] & isActive(active, thread_dim, d, coord, arg); + if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dc.X, d, 1); - const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); + const Link U = dslash_double_store() ? + static_cast(arg.Uback.Ghost(d, ghost_idx, 1 - parity, StaggeredPhase(coord, d, +1, arg))) : + static_cast(arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg))); + #pragma unroll for (auto s = 0; s < n_src_tile; s++) { Vector in = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.dc.ghostFaceCB[d], their_spinor_parity); out[s] = mv_add(U, in, out[s]); } - } else if (doBulk() && !ghost) { - const int fwd_idx = linkIndexP1(coord, arg.dc.X, d); - const Link U = arg.improved ? 
arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); + } + + if constexpr (doBulk()) { + if (!ghost) { + const int fwd_idx = getNeighborIndexCB<1>(coord1, d, 1, arg.dc); + const Link U = arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)); #pragma unroll - for (auto s = 0; s < n_src_tile; s++) { - Vector in = arg.in[src_idx + s](fwd_idx, their_spinor_parity); - out[s] = mv_add(U, in, out[s]); + for (auto s = 0; s < n_src_tile; s++) { + Vector in = arg.in[src_idx + s](fwd_idx, their_spinor_parity); + out[s] = mv_add(U, in, out[s]); + } } + prefetch(d, 0, 0, coord, coord1, parity, arg); // prefetch the gauge link Arg::prefetch_distance ahead } } // improved - forward direction if (arg.improved && arg.dd_in.doHopping(coord, d, +3)) { - const bool ghost = coord.in_boundary[1][d] && isActive(active, thread_dim, d, coord, arg); + const bool ghost = coord.in_boundary[1][d] & isActive(active, thread_dim, d, coord, arg); if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dc.X, d, arg.nFace); - const Link L = arg.L(d, coord.x_cb, parity); + const Link L = dslash_double_store() ? 
static_cast(arg.Lback.Ghost(d, ghost_idx, 1 - parity)) : + static_cast(arg.L(d, coord.x_cb, parity)); #pragma unroll for (auto s = 0; s < n_src_tile; s++) { const Vector in = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.dc.ghostFaceCB[d], their_spinor_parity); out[s] = mv_add(L, in, out[s]); } - } else if (doBulk() && !ghost) { - const int fwd3_idx = linkIndexP3(coord, arg.dc.X, d); - const Link L = arg.L(d, coord.x_cb, parity); + } + + if constexpr (doBulk()) { + if (!ghost) { + const int fwd3_idx = getNeighborIndexCB<3>(coord, d, 1, arg.dc); + const Link L = arg.L(d, coord.x_cb, parity); #pragma unroll - for (auto s = 0; s < n_src_tile; s++) { - const Vector in = arg.in[src_idx + s](fwd3_idx, their_spinor_parity); - out[s] = mv_add(L, in, out[s]); + for (auto s = 0; s < n_src_tile; s++) { + const Vector in = arg.in[src_idx + s](fwd3_idx, their_spinor_parity); + out[s] = mv_add(L, in, out[s]); + } } + prefetch(d, 0, 1, coord, coord1, parity, arg); // prefetch the gauge link Arg::prefetch_distance ahead } } if (arg.dd_in.doHopping(coord, d, -1)) { // Backward gather - compute back offset for spinor and gauge fetch - const bool ghost = (coord[d] - 1 < 0) && isActive(active, thread_dim, d, coord, arg); + const bool ghost = coord1.in_boundary[0][d] & isActive(active, thread_dim, d, coord, arg); if (doHalo(d) && ghost) { const int ghost_idx2 = ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 1); const int ghost_idx = arg.improved ? ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 3) : ghost_idx2; - const Link U = arg.improved ? 
arg.U.Ghost(d, ghost_idx2, 1 - parity) : - arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg)); + const Link U + = static_cast(arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg))); + #pragma unroll for (auto s = 0; s < n_src_tile; s++) { Vector in = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.dc.ghostFaceCB[d], their_spinor_parity); out[s] = mv_sub(conj(U), in, out[s]); } - } else if (doBulk() && !ghost) { - const int back_idx = linkIndexM1(coord, arg.dc.X, d); - const int gauge_idx = back_idx; - const Link U = arg.improved ? arg.U(d, gauge_idx, 1 - parity) : - arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg)); + } + + if constexpr (doBulk()) { + if (!ghost) { + const int back_idx = getNeighborIndexCB<1>(coord1, d, -1, arg.dc); + const Link U = dslash_double_store() ? + static_cast(arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg))) : + static_cast(arg.U(d, back_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg))); + #pragma unroll - for (auto s = 0; s < n_src_tile; s++) { - Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity); - out[s] = mv_sub(conj(U), in, out[s]); + for (auto s = 0; s < n_src_tile; s++) { + Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity); + out[s] = mv_sub(conj(U), in, out[s]); + } } + prefetch(d, 1, 0, coord, coord1, parity, arg); // prefetch the gauge link Arg::prefetch_distance ahead } } // improved - backward direction if (arg.improved && arg.dd_in.doHopping(coord, d, -3)) { - const bool ghost = coord.in_boundary[0][d] && isActive(active, thread_dim, d, coord, arg); + const bool ghost = coord.in_boundary[0][d] & isActive(active, thread_dim, d, coord, arg); if (doHalo(d) && ghost) { const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 1); - const Link L = arg.L.Ghost(d, ghost_idx, 1 - parity); + const Link L = static_cast(arg.L.Ghost(d, ghost_idx, 1 - parity)); #pragma unroll for (auto s = 0; s < n_src_tile; s++) { 
const Vector in = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.dc.ghostFaceCB[d], their_spinor_parity); out[s] = mv_sub(conj(L), in, out[s]); } - } else if (doBulk() && !ghost) { - const int back3_idx = linkIndexM3(coord, arg.dc.X, d); - const int gauge_idx = back3_idx; - const Link L = arg.L(d, gauge_idx, 1 - parity); + } + + if constexpr (doBulk()) { + if (!ghost) { + const int back3_idx = getNeighborIndexCB<3>(coord, d, -1, arg.dc); + const Link L = dslash_double_store() ? static_cast(arg.Lback(d, coord.x_cb, parity)) : + static_cast(arg.L(d, back3_idx, 1 - parity)); #pragma unroll - for (auto s = 0; s < n_src_tile; s++) { - const Vector in = arg.in[src_idx + s](back3_idx, their_spinor_parity); - out[s] = mv_sub(conj(L), in, out[s]); + for (auto s = 0; s < n_src_tile; s++) { + const Vector in = arg.in[src_idx + s](back3_idx, their_spinor_parity); + out[s] = mv_sub(conj(L), in, out[s]); + } } + prefetch(d, 1, 1, coord, coord1, parity, arg); // prefetch the gauge link Arg::prefetch_distance ahead } } + } // nDim } diff --git a/include/kernels/dslash_twisted_mass_preconditioned.cuh b/include/kernels/dslash_twisted_mass_preconditioned.cuh index 513a034acd..547385c75c 100644 --- a/include/kernels/dslash_twisted_mass_preconditioned.cuh +++ b/include/kernels/dslash_twisted_mass_preconditioned.cuh @@ -63,7 +63,7 @@ namespace quda if (arg.dd_in.doHopping(coord, d, +1)) { const int fwd_idx = getNeighborIndexCB(coord, d, +1, arg.dc); constexpr int proj_dir = dagger ? +1 : -1; - const bool ghost = coord.in_boundary[1][d] && isActive(active, thread_dim, d, coord, arg); + const bool ghost = coord.in_boundary[1][d] & isActive(active, thread_dim, d, coord, arg); if (doHalo(d) && ghost) { // we need to compute the face index if we are updating a face that isn't ours @@ -101,7 +101,7 @@ namespace quda const int back_idx = getNeighborIndexCB(coord, d, -1, arg.dc); const int gauge_idx = back_idx; constexpr int proj_dir = dagger ? 
-1 : +1; - const bool ghost = coord.in_boundary[0][d] && isActive(active, thread_dim, d, coord, arg); + const bool ghost = coord.in_boundary[0][d] & isActive(active, thread_dim, d, coord, arg); if (doHalo(d) && ghost) { // we need to compute the face index if we are updating a face that isn't ours diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh index 8b66ee83e6..75d5ce041c 100644 --- a/include/kernels/dslash_wilson.cuh +++ b/include/kernels/dslash_wilson.cuh @@ -28,7 +28,9 @@ namespace quda static constexpr bool distance_pc = distance_pc_; static constexpr bool gauge_direct_load = false; // false means texture load static constexpr QudaGhostExchange ghost = QUDA_GHOST_EXCHANGE_PAD; - typedef typename gauge_mapper::type G; + template + using G = typename gauge_mapper::type; typedef typename mapper::type real; @@ -37,11 +39,13 @@ namespace quda F x[MAX_MULTI_RHS]; /** input vector set when doing xpay */ Ghost halo_pack; Ghost halo; - const G U; /** the gauge field */ + mutable G U; /** the gauge field */ + mutable G Uback; /** the backwards gauge field */ const real a; /** xpay scale factor - can be -kappa or -kappa^2 */ /** parameters for distance preconditioning */ const real alpha0; const int t0; + static constexpr int prefetch_distance = QUDA_DSLASH_PREFETCH_DISTANCE_WILSON; WilsonArg(cvector_ref &out, cvector_ref &in, const ColorSpinorField &halo, const GaugeField &U, double a, cvector_ref &x, int parity, bool dagger, @@ -51,6 +55,7 @@ namespace quda halo_pack(halo), halo(halo), U(U), + Uback(dslash_double_store() ? 
U.shift(1) : U), a(a), alpha0(alpha0), t0(t0) @@ -63,6 +68,41 @@ } }; + /** + @tparam distance The distance away we are prefetching + @param[in] dim The dimension we are presently working on + @param[in] dir The direction we are presently working on (0 = forwards, 1 = backwards) + @param[in] coord Coordinates that we are working on + @param[in] parity Parity that we are working on + @param[in] arg Parameter struct + */ + template + __device__ __host__ void prefetch(int dim, int dir, const coord_t &coord, int parity, const Arg &arg) + { + if constexpr (Arg::prefetch_distance == 0) return; + + int step = 2 * dim + dir + Arg::prefetch_distance; + if (step >= 8) return; + + int dim2 = step / 2; + + // if using a bulk prefetch we need to use block's first coordinate + auto x_cb = dslash_prefetch_tma() ? coord.x_cb_0 : coord.x_cb; + x_cb = (Arg::nDim == 5 ? x_cb % arg.dc.volume_4d_cb : x_cb); + + switch (step % 2) { + case 0: arg.U.template prefetch(x_cb, dim2, parity); break; + case 1: + if constexpr (dslash_double_store()) { + arg.Uback.template prefetch(x_cb, dim2, parity); + } else { + int idx = getNeighborIndexCB(coord, dim2, -1, arg.dc); + arg.U.template prefetch(Arg::nDim == 5 ? idx % arg.dc.volume_4d_cb : idx, dim2, 1 - parity); + } + break; + } + } + /** @brief Applies the off-diagonal part of the Wilson operator @@ -102,7 +142,7 @@ const int gauge_idx = (Arg::nDim == 5 ? coord.x_cb % arg.dc.volume_4d_cb : coord.x_cb); constexpr int proj_dir = dagger ?
+1 : -1; - const bool ghost = coord.in_boundary[1][d] && isActive(active, thread_dim, d, coord, arg); + const bool ghost = coord.in_boundary[1][d] & isActive(active, thread_dim, d, coord, arg); if (doHalo(d) && ghost) { // we need to compute the face index if we are updating a face that isn't ours @@ -115,12 +155,16 @@ namespace quda their_spinor_parity); out += fwd_coeff * (U * in).reconstruct(d, proj_dir); - } else if (doBulk() && !ghost) { + } - Link U = arg.U(d, gauge_idx, gauge_parity); - Vector in = arg.in[src_idx](fwd_idx + coord.s * arg.dc.volume_4d_cb, their_spinor_parity); + if constexpr (doBulk()) { + if (!ghost) { + Link U = arg.U(d, gauge_idx, gauge_parity); + Vector in = arg.in[src_idx](fwd_idx + coord.s * arg.dc.volume_4d_cb, their_spinor_parity); + out += fwd_coeff * (U * in.project(d, proj_dir)).reconstruct(d, proj_dir); + } - out += fwd_coeff * (U * in.project(d, proj_dir)).reconstruct(d, proj_dir); + prefetch(d, 0, coord, parity, arg); // prefetch the gauge link Arg::prefetch_distance ahead } } @@ -128,10 +172,11 @@ namespace quda if (arg.dd_in.doHopping(coord, d, -1)) { const real bwd_coeff = (d < 3) ? 1.0 : bwd_coeff_3; const int back_idx = getNeighborIndexCB(coord, d, -1, arg.dc); - const int gauge_idx = (Arg::nDim == 5 ? back_idx % arg.dc.volume_4d_cb : back_idx); + int gauge_idx = dslash_double_store() ? coord.x_cb : back_idx; + if constexpr (Arg::nDim == 5) gauge_idx = gauge_idx % arg.dc.volume_4d_cb; constexpr int proj_dir = dagger ? -1 : +1; - const bool ghost = coord.in_boundary[0][d] && isActive(active, thread_dim, d, coord, arg); + const bool ghost = coord.in_boundary[0][d] & isActive(active, thread_dim, d, coord, arg); if (doHalo(d) && ghost) { // we need to compute the face index if we are updating a face that isn't ours @@ -140,17 +185,23 @@ namespace quda idx; const int gauge_ghost_idx = (Arg::nDim == 5 ? 
ghost_idx % arg.dc.ghostFaceCB[d] : ghost_idx); - Link U = arg.U.Ghost(d, gauge_ghost_idx, 1 - gauge_parity); + Link U = dslash_double_store() ? static_cast(arg.Uback(d, gauge_idx, gauge_parity)) : + static_cast(arg.U.Ghost(d, gauge_ghost_idx, 1 - gauge_parity)); HalfVector in = arg.halo.Ghost(d, 0, ghost_idx + (src_idx * arg.Ls + coord.s) * arg.dc.ghostFaceCB[d], their_spinor_parity); out += bwd_coeff * (conj(U) * in).reconstruct(d, proj_dir); - } else if (doBulk() && !ghost) { + } - Link U = arg.U(d, gauge_idx, 1 - gauge_parity); - Vector in = arg.in[src_idx](back_idx + coord.s * arg.dc.volume_4d_cb, their_spinor_parity); + if constexpr (doBulk()) { + if (!ghost) { + Link U = dslash_double_store() ? static_cast(arg.Uback(d, gauge_idx, gauge_parity)) : + static_cast(arg.U(d, gauge_idx, 1 - gauge_parity)); + Vector in = arg.in[src_idx](back_idx + coord.s * arg.dc.volume_4d_cb, their_spinor_parity); + out += bwd_coeff * (conj(U) * in.project(d, proj_dir)).reconstruct(d, proj_dir); + } - out += bwd_coeff * (conj(U) * in.project(d, proj_dir)).reconstruct(d, proj_dir); + prefetch(d, 1, coord, parity, arg); // prefetch the gauge link Arg::prefetch_distance ahead } } } // nDim diff --git a/include/kernels/extract_gauge_ghost.cuh b/include/kernels/extract_gauge_ghost.cuh index 42fef0b4ae..5ea2cdaa3f 100644 --- a/include/kernels/extract_gauge_ghost.cuh +++ b/include/kernels/extract_gauge_ghost.cuh @@ -24,7 +24,6 @@ namespace quda { int f[nDim][nDim]; bool localParity[nDim]; int faceVolumeCB[nDim]; - int comm_dim[QUDA_MAX_DIM]; const int offset; ExtractGhostArg(const GaugeField &u, Float **Ghost, int offset, uint64_t size) : kernel_param(dim3(size, 1, 1)), @@ -34,7 +33,6 @@ namespace quda { { for (int d=0; d= 2*arg.faceVolumeCB[dim]) return; @@ -128,7 +126,7 @@ namespace quda { int dim = parity_dim % Arg::nDim; // for now we never inject unless we have partitioned in that dimension - if (!arg.comm_dim[dim] && !Arg::extract) return; + if (!arg.comms_dim_partitioned[dim] && 
!Arg::extract) return; // linear index used for writing into ghost buffer if (X >= 2*arg.faceVolumeCB[dim]) return; diff --git a/include/kernels/gauge_shift.cuh b/include/kernels/gauge_shift.cuh new file mode 100644 index 0000000000..4726258242 --- /dev/null +++ b/include/kernels/gauge_shift.cuh @@ -0,0 +1,89 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace quda +{ + + template + struct GaugeShiftArg : kernel_param<> { + using real = typename mapper::type; + using Link = Matrix, nColor>; + using RawLink = array; + using Gauge = typename gauge_mapper::type; + static constexpr bool verify = verify_; + + int X[4]; // true grid dimensions + Gauge out; + const Gauge in; + int shift; + int volume_cb; + + GaugeShiftArg(GaugeField &out, const GaugeField &in, int shift) : + kernel_param(dim3(in.VolumeCB(), 2, 4)), out(out), in(in), shift(shift), volume_cb(in.VolumeCB()) + { + for (int dir = 0; dir < 4; dir++) X[dir] = in.X()[dir]; + } + }; + + template struct GaugeShift { + const Arg &arg; + constexpr GaugeShift(const Arg &arg) : arg(arg) { } + static constexpr const char *filename() { return KERNEL_FILE; } + + __device__ __host__ void operator()(int x_cb, int parity, int dir) + { + byte_array x = {}; + getCoords(x, x_cb, arg.X, parity); + + if constexpr (!Arg::verify) { + typename Arg::RawLink link; + if (x[dir] < arg.shift && arg.comms_dim_partitioned[dir]) { // on boundary so we fetch from ghost + const int ghost_idx = ghostFaceIndexStaggered<0>(x, arg.X, dir, 1); + arg.in.raw_load(link, arg.volume_cb + ghost_idx, dir, 1 - parity); + arg.out.raw_save(link, x_cb, dir, parity); + } else { // simple shift + byte_array dx = {}; + dx[dir] = dx[dir] - arg.shift; + int x_cb_back = linkIndexShift(x, dx, arg.X); + arg.in.raw_load(link, x_cb_back, dir, 1 - parity); + arg.out.raw_save(link, x_cb, dir, parity); + + if (x[dir] >= arg.X[dir] - arg.shift && arg.comms_dim_partitioned[dir]) { // write the ghost + const int ghost_idx = 
ghostFaceIndexStaggered<1>(x, arg.X, dir, arg.shift); + arg.in.raw_load(link, x_cb, dir, parity); + arg.out.raw_save(link, arg.volume_cb + ghost_idx, dir, 1 - parity); + } + } + } else { + // verify the shifting has worked + using Link = typename Arg::Link; + if (x[dir] < arg.shift && arg.comms_dim_partitioned[dir]) { + const int ghost_idx = ghostFaceIndexStaggered<0>(x, arg.X, dir, 1); + Link in = arg.in(dir, arg.volume_cb + ghost_idx, 1 - parity); + Link out = arg.out(dir, x_cb, parity); + assert(in.L1() == out.L1()); + } else { + byte_array dx = {}; + dx[dir] = dx[dir] - arg.shift; + int x_cb_back = linkIndexShift(x, dx, arg.X); + Link in = arg.in(dir, x_cb_back, 1 - parity); + Link out = arg.out(dir, x_cb, parity); + assert(in.L1() == out.L1()); + + if (x[dir] >= arg.X[dir] - arg.shift && arg.comms_dim_partitioned[dir]) { + const int ghost_idx = ghostFaceIndexStaggered<1>(x, arg.X, dir, arg.shift); + Link in = arg.in(dir, x_cb, parity); + Link out = arg.out.Ghost(dir, ghost_idx, 1 - parity); + assert(in.L1() == out.L1()); + } + } + } + } + }; + +} // namespace quda diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh index 9d66c2dee4..b1a45a5c85 100644 --- a/include/kernels/laplace.cuh +++ b/include/kernels/laplace.cuh @@ -36,7 +36,7 @@ namespace quda const Ghost halo_pack; /** accessor used for writing the halo field */ const Ghost halo; /** accessor used for reading the halo field */ F x[MAX_MULTI_RHS]; /** input vector field for xpay*/ - const G U; /** the gauge field */ + mutable G U; /** the gauge field */ const real a; /** xpay scale factor - can be -kappa or -kappa^2 */ const real b; /** used by Wuppetal smearing kernel */ int dir; /** The direction from which to omit the derivative */ @@ -86,11 +86,10 @@ namespace quda if (d != dir) { if (arg.dd_in.doHopping(coord, d, +1)) { // Forward gather - compute fwd offset for vector fetch - const bool ghost = coord.in_boundary[1][d] && isActive(active, thread_dim, d, coord, arg); + const bool 
ghost = coord.in_boundary[1][d] & isActive(active, thread_dim, d, coord, arg); if (doHalo(d) && ghost) { - // const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dc.X, d, 1); const int ghost_idx = ghostFaceIndex<1>(coord, arg.dc.X, d, arg.nFace); const Link U = arg.U(d, coord.x_cb, parity); const Vector in = arg.halo.Ghost(d, 1, ghost_idx + src_idx * arg.dc.ghostFaceCB[d], their_spinor_parity); @@ -111,11 +110,10 @@ const int back_idx = linkIndexM1(coord, arg.dc.X, d); const int gauge_idx = back_idx; - const bool ghost = coord.in_boundary[0][d] && isActive(active, thread_dim, d, coord, arg); + const bool ghost = coord.in_boundary[0][d] & isActive(active, thread_dim, d, coord, arg); if (doHalo(d) && ghost) { - // const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 1); const int ghost_idx = ghostFaceIndex<0>(coord, arg.dc.X, d, arg.nFace); const Link U = arg.U.Ghost(d, ghost_idx, 1 - parity); diff --git a/include/kernels/restrictor_mma.cuh b/include/kernels/restrictor_mma.cuh index 73f7f16b17..a1501a8ea2 100644 --- a/include/kernels/restrictor_mma.cuh +++ b/include/kernels/restrictor_mma.cuh @@ -174,11 +174,7 @@ namespace quda // block all-reduce thread_max using block_reduce_t = cub::BlockReduce; __shared__ typename block_reduce_t::TempStorage temp_storage; -#if CUDA_VERSION >= 12090 float block_max = block_reduce_t(temp_storage).Reduce(thread_max, ::cuda::maximum()); -#else - float block_max = block_reduce_t(temp_storage).Reduce(thread_max, ::cub::Max()); -#endif __shared__ float block_max_all; if (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z) == 0) { diff --git a/include/lattice_field.h b/include/lattice_field.h index da7538d680..3ad257d251 100644 --- a/include/lattice_field.h +++ b/include/lattice_field.h @@ -160,8 +160,9 @@ namespace quda { /** @brief Create the field as specified by the param @param[in] Parameter struct + @param[in] is_native_gauge Whether the field is a native gauge field */ - void
create(const LatticeFieldParam &param); + void create(const LatticeFieldParam &param, bool is_native_gauge); /** @brief Move the contents of a field to this @@ -500,7 +501,7 @@ namespace quda { @brief Constructor for creating a LatticeField from a LatticeFieldParam @param param Contains the metadata for creating the field */ - LatticeField(const LatticeFieldParam &param); + LatticeField(const LatticeFieldParam &param, bool is_native_gauge = false); /** @brief Destructor for LatticeField diff --git a/include/quda_define.h.in b/include/quda_define.h.in index 9b6c75f081..98e9177557 100644 --- a/include/quda_define.h.in +++ b/include/quda_define.h.in @@ -168,6 +168,36 @@ #define GPU_DISTANCE_PRECONDITIONING #endif +/** + * @def QUDA_DSLASH_DOUBLE_STORE + * @brief This macro sets whether to use double storage of the gauge + * field to simplify indexing in the Dslash kernels. + */ +#cmakedefine QUDA_DSLASH_DOUBLE_STORE + +/** + * @def QUDA_DSLASH_PREFETCH_TYPE + * @brief This macro sets whether to use + * the TMA for L2 prefetching: + * NONE - no prefetch + * THREAD - per thread prefetch + * BULK - TMA bulk prefetch + * TENSOR - TMA tensor descriptor prefetch + */ +#define QUDA_DSLASH_PREFETCH_TYPE_@QUDA_DSLASH_PREFETCH_TYPE@ + +/** + * @def QUDA_DSLASH_PREFETCH_DISTANCE_WILSON + * @brief This macro sets the prefetch distance for Wilson fermions + */ +#define QUDA_DSLASH_PREFETCH_DISTANCE_WILSON @QUDA_DSLASH_PREFETCH_DISTANCE_WILSON@ + +/** + * @def QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED + * @brief This macro sets the prefetch distance for staggered fermions + */ +#define QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED @QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED@ + #cmakedefine QUDA_MULTIGRID #ifdef QUDA_MULTIGRID /** diff --git a/include/quda_matrix.h b/include/quda_matrix.h index 8eb579dab3..3eef975308 100644 --- a/include/quda_matrix.h +++ b/include/quda_matrix.h @@ -103,7 +103,8 @@ namespace quda { the absolute column sums.
@return Compute L1 norm */ - __device__ __host__ inline real L1() { + __device__ __host__ inline real L1() const + { real l1 = 0; #pragma unroll for (int j=0; j __device__ inline T operator()(const T &value_, bool all, const reducer_t &r, const param_t &) { - using warp_reduce_t = cub::WarpReduce; + using warp_reduce_t = cub::WarpReduce; typename warp_reduce_t::TempStorage dummy_storage; warp_reduce_t warp_reduce(dummy_storage); T value = {}; @@ -111,7 +111,7 @@ namespace quda } if (all) { - using warp_scan_t = cub::WarpScan; + using warp_scan_t = cub::WarpScan; typename warp_scan_t::TempStorage dummy_storage; warp_scan_t warp_scan(dummy_storage); value = warp_scan.Broadcast(value, 0); diff --git a/include/targets/cuda/block_reduction_kernel.h b/include/targets/cuda/block_reduction_kernel.h index bf41cde6d3..639501c421 100644 --- a/include/targets/cuda/block_reduction_kernel.h +++ b/include/targets/cuda/block_reduction_kernel.h @@ -61,9 +61,9 @@ namespace quda @tparam block_size x-dimension block-size @param[in] arg Kernel argument */ - template struct BlockKernelArg : Arg_ { + template struct BlockKernelArg : Arg_ { using Arg = Arg_; - static constexpr unsigned int block_size = block_size_; + static constexpr unsigned int block_size_cxpr = block_size; BlockKernelArg(const Arg &arg) : Arg(arg) { } }; @@ -112,7 +112,7 @@ namespace quda */ template