From 63b7ff4c7b55a0810eb8b886ab2d42caa4ab49e9 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 19 Sep 2025 16:42:56 -0700
Subject: [PATCH 001/121] Initial support for prefetching (over fetching) added
 to load instructions for CUDA

---
 include/targets/cuda/inline_ptx.h    | 370 ++++++++++++++++++++++++---
 include/targets/cuda/load_store.h    |  61 +++--
 include/targets/generic/load_store.h |  16 +-
 3 files changed, 394 insertions(+), 53 deletions(-)

diff --git a/include/targets/cuda/inline_ptx.h b/include/targets/cuda/inline_ptx.h
index fa29eee35b..3bbccf2ab5 100644
--- a/include/targets/cuda/inline_ptx.h
+++ b/include/targets/cuda/inline_ptx.h
@@ -18,119 +18,425 @@ namespace quda {
   // If you're bored...
   // http://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-st
 
-  __device__ inline void load_streaming_double4(double4 &a, const double4 *addr)
+// Helper macro for prefetch size validation
+#define VALIDATE_PREFETCH_SIZE(prefetch_size)                                                                          \
+  static_assert(prefetch_size == 0 || prefetch_size == 64 || prefetch_size == 128 || prefetch_size == 256,             \
+                "prefetch_size must be 0, 64, 128, or 256")
+
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_streaming_double4(double4 &a, const double4 *addr)
   {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
     double x, y, z, w;
-    asm("ld.cs.global.v4.f64 {%0, %1, %2, %3}, [%4+0];" : "=d"(x), "=d"(y), "=d"(z), "=d"(w) : __PTR(addr));
+
+    if constexpr (prefetch_ == 0) {
+      // Plain streaming load, no prefetch hint
+      asm volatile("ld.global.cs.v4.f64 {%0, %1, %2, %3}, [%4];\n" : "=d"(x), "=d"(y), "=d"(z), "=d"(w) : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.cs.L2::64B.v4.f64 {%0, %1, %2, %3}, [%4];\n"
+                   : "=d"(x), "=d"(y), "=d"(z), "=d"(w)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.cs.L2::128B.v4.f64 {%0, %1, %2, %3}, [%4];\n"
+                   : "=d"(x), "=d"(y), "=d"(z), "=d"(w)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.cs.L2::256B.v4.f64 {%0, %1, %2, %3}, [%4];\n"
+                   : "=d"(x), "=d"(y), "=d"(z), "=d"(w)
+                   : "l"(addr));
+    }
+
     a.x = x;
     a.y = y;
     a.z = z;
     a.w = w;
   }
 
-  __device__ inline void load_streaming_double2(double2 &a, const double2* addr)
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_streaming_double2(double2 &a, const double2 *addr)
   {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
     double x, y;
-    asm("ld.cs.global.v2.f64 {%0, %1}, [%2+0];" : "=d"(x), "=d"(y) : __PTR(addr));
+
+    if constexpr (prefetch_ == 0) {
+      // Plain streaming load, no prefetch hint
+      asm volatile("ld.global.cs.v2.f64 {%0, %1}, [%2];\n" : "=d"(x), "=d"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.cs.L2::64B.v2.f64 {%0, %1}, [%2];\n" : "=d"(x), "=d"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.cs.L2::128B.v2.f64 {%0, %1}, [%2];\n" : "=d"(x), "=d"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.cs.L2::256B.v2.f64 {%0, %1}, [%2];\n" : "=d"(x), "=d"(y) : "l"(addr));
+    }
+
     a.x = x; a.y = y;
   }
 
-  __device__ inline void load_streaming_float8(float8 &v, const float8 *addr)
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_streaming_float8(float8 &v, const float8 *addr)
   {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
     float x, y, z, w, a, b, c, d;
-    asm("ld.cs.global.v8.f32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8+0];"
-        : "=f"(x), "=f"(y), "=f"(z), "=f"(w), "=f"(a), "=f"(b), "=f"(c), "=f"(d)
-        : __PTR(addr));
+
+    if constexpr (prefetch_ == 0) {
+      // Plain streaming load, no prefetch hint
+      asm volatile("ld.global.cs.v8.f32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w), "=f"(a), "=f"(b), "=f"(c), "=f"(d)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.cs.L2::64B.v8.f32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w), "=f"(a), "=f"(b), "=f"(c), "=f"(d)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.cs.L2::128B.v8.f32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w), "=f"(a), "=f"(b), "=f"(c), "=f"(d)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.cs.L2::256B.v8.f32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w), "=f"(a), "=f"(b), "=f"(c), "=f"(d)
+                   : "l"(addr));
+    }
+
     v = {{x, y, z, w}, {a, b, c, d}};
   }
 
-  __device__ inline void load_streaming_float4(float4 &a, const float4* addr)
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_streaming_float4(float4 &a, const float4 *addr)
   {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
     float x, y, z, w;
-    asm("ld.cs.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : __PTR(addr));
+
+    if constexpr (prefetch_ == 0) {
+      // Plain streaming load, no prefetch hint
+      asm volatile("ld.global.cs.v4.f32 {%0, %1, %2, %3}, [%4];\n" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.cs.L2::64B.v4.f32 {%0, %1, %2, %3}, [%4];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.cs.L2::128B.v4.f32 {%0, %1, %2, %3}, [%4];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.cs.L2::256B.v4.f32 {%0, %1, %2, %3}, [%4];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w)
+                   : "l"(addr));
+    }
+
     a.x = x; a.y = y; a.z = z; a.w = w;
   }
 
-  __device__ inline void load_cached_short4(short4 &a, const short4 *addr)
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_cached_short4(short4 &a, const short4 *addr)
   {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
     short x, y, z, w;
-    asm("ld.ca.global.v4.s16 {%0, %1, %2, %3}, [%4+0];" : "=h"(x), "=h"(y), "=h"(z), "=h"(w) : __PTR(addr));
+
+    if constexpr (prefetch_ == 0) {
+      // Plain cached load, no prefetch hint
+      asm volatile("ld.global.ca.v4.s16 {%0, %1, %2, %3}, [%4];\n" : "=h"(x), "=h"(y), "=h"(z), "=h"(w) : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.ca.L2::64B.v4.s16 {%0, %1, %2, %3}, [%4];\n"
+                   : "=h"(x), "=h"(y), "=h"(z), "=h"(w)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.ca.L2::128B.v4.s16 {%0, %1, %2, %3}, [%4];\n"
+                   : "=h"(x), "=h"(y), "=h"(z), "=h"(w)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.ca.L2::256B.v4.s16 {%0, %1, %2, %3}, [%4];\n"
+                   : "=h"(x), "=h"(y), "=h"(z), "=h"(w)
+                   : "l"(addr));
+    }
+
     a.x = x;
     a.y = y;
     a.z = z;
     a.w = w;
   }
 
-  __device__ inline void load_cached_short2(short2 &a, const short2 *addr)
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_cached_short2(short2 &a, const short2 *addr)
   {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
     short x, y;
-    asm("ld.ca.global.v2.s16 {%0, %1}, [%2+0];" : "=h"(x), "=h"(y) : __PTR(addr));
+
+    if constexpr (prefetch_ == 0) {
+      // Plain cached load, no prefetch hint
+      asm volatile("ld.global.ca.v2.s16 {%0, %1}, [%2];\n" : "=h"(x), "=h"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.ca.L2::64B.v2.s16 {%0, %1}, [%2];\n" : "=h"(x), "=h"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.ca.L2::128B.v2.s16 {%0, %1}, [%2];\n" : "=h"(x), "=h"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.ca.L2::256B.v2.s16 {%0, %1}, [%2];\n" : "=h"(x), "=h"(y) : "l"(addr));
+    }
+
     a.x = x;
     a.y = y;
   }
 
-  __device__ inline void load_global_short4(short4 &a, const short4 *addr)
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_global_short4(short4 &a, const short4 *addr)
   {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
     short x, y, z, w;
-    asm("ld.cg.global.v4.s16 {%0, %1, %2, %3}, [%4+0];" : "=h"(x), "=h"(y), "=h"(z), "=h"(w) : __PTR(addr));
+
+    if constexpr (prefetch_ == 0) {
+      // Plain global load, no prefetch hint
+      asm volatile("ld.global.cg.v4.s16 {%0, %1, %2, %3}, [%4];\n" : "=h"(x), "=h"(y), "=h"(z), "=h"(w) : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.cg.L2::64B.v4.s16 {%0, %1, %2, %3}, [%4];\n"
+                   : "=h"(x), "=h"(y), "=h"(z), "=h"(w)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.cg.L2::128B.v4.s16 {%0, %1, %2, %3}, [%4];\n"
+                   : "=h"(x), "=h"(y), "=h"(z), "=h"(w)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.cg.L2::256B.v4.s16 {%0, %1, %2, %3}, [%4];\n"
+                   : "=h"(x), "=h"(y), "=h"(z), "=h"(w)
+                   : "l"(addr));
+    }
+
     a.x = x;
     a.y = y;
     a.z = z;
     a.w = w;
   }
 
-  __device__ inline void load_global_short2(short2 &a, const short2 *addr)
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_global_short2(short2 &a, const short2 *addr)
   {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
     short x, y;
-    asm("ld.cg.global.v2.s16 {%0, %1}, [%2+0];" : "=h"(x), "=h"(y) : __PTR(addr));
+
+    if constexpr (prefetch_ == 0) {
+      // Plain global load, no prefetch hint
+      asm volatile("ld.global.cg.v2.s16 {%0, %1}, [%2];\n" : "=h"(x), "=h"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.cg.L2::64B.v2.s16 {%0, %1}, [%2];\n" : "=h"(x), "=h"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.cg.L2::128B.v2.s16 {%0, %1}, [%2];\n" : "=h"(x), "=h"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.cg.L2::256B.v2.s16 {%0, %1}, [%2];\n" : "=h"(x), "=h"(y) : "l"(addr));
+    }
+
     a.x = x;
     a.y = y;
   }
 
-  __device__ inline void load_global_float4(float4 &a, const float4* addr)
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_global_float4(float4 &a, const float4 *addr)
   {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
     float x, y, z, w;
-    asm("ld.cg.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : __PTR(addr));
+
+    if constexpr (prefetch_ == 0) {
+      // Plain global load, no prefetch hint
+      asm volatile("ld.global.cg.v4.f32 {%0, %1, %2, %3}, [%4];\n" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.cg.L2::64B.v4.f32 {%0, %1, %2, %3}, [%4];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.cg.L2::128B.v4.f32 {%0, %1, %2, %3}, [%4];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.cg.L2::256B.v4.f32 {%0, %1, %2, %3}, [%4];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w)
+                   : "l"(addr));
+    }
+
     a.x = x; a.y = y; a.z = z; a.w = w;
   }
 
-  __device__ inline void load_cached_float4(float4 &a, const float4* addr)
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_cached_float4(float4 &a, const float4 *addr)
   {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
     float x, y, z, w;
-    asm("ld.ca.global.v4.f32 {%0, %1, %2, %3}, [%4+0];" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : __PTR(addr));
+
+    if constexpr (prefetch_ == 0) {
+      // Plain cached load, no prefetch hint
+      asm volatile("ld.global.ca.v4.f32 {%0, %1, %2, %3}, [%4];\n" : "=f"(x), "=f"(y), "=f"(z), "=f"(w) : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.ca.L2::64B.v4.f32 {%0, %1, %2, %3}, [%4];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.ca.L2::128B.v4.f32 {%0, %1, %2, %3}, [%4];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.ca.L2::256B.v4.f32 {%0, %1, %2, %3}, [%4];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w)
+                   : "l"(addr));
+    }
+
     a.x = x; a.y = y; a.z = z; a.w = w;
   }
 
-  __device__ inline void load_cached_float8(float8 &v, const float8 *addr)
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_cached_float8(float8 &v, const float8 *addr)
   {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
     float x, y, z, w, a, b, c, d;
-    asm("ld.ca.global.v8.f32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8+0];"
-        : "=f"(x), "=f"(y), "=f"(z), "=f"(w), "=f"(a), "=f"(b), "=f"(c), "=f"(d)
-        : __PTR(addr));
+
+    if constexpr (prefetch_ == 0) {
+      // Plain cached load, no prefetch hint
+      asm volatile("ld.global.ca.v8.f32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w), "=f"(a), "=f"(b), "=f"(c), "=f"(d)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.ca.L2::64B.v8.f32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w), "=f"(a), "=f"(b), "=f"(c), "=f"(d)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.ca.L2::128B.v8.f32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w), "=f"(a), "=f"(b), "=f"(c), "=f"(d)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.ca.L2::256B.v8.f32 {%0, %1, %2, %3, %4, %5, %6, %7}, [%8];\n"
+                   : "=f"(x), "=f"(y), "=f"(z), "=f"(w), "=f"(a), "=f"(b), "=f"(c), "=f"(d)
+                   : "l"(addr));
+    }
+
     v = {{x, y, z, w}, {a, b, c, d}};
   }
 
-  __device__ inline void load_cached_float2(float2 &a, const float2* addr)
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_cached_float2(float2 &a, const float2 *addr)
   {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
     float x, y;
-    asm("ld.ca.global.v2.f32 {%0, %1}, [%2+0];" : "=f"(x), "=f"(y) : __PTR(addr));
+
+    if constexpr (prefetch_ == 0) {
+      // Plain cached load, no prefetch hint
+      asm volatile("ld.global.ca.v2.f32 {%0, %1}, [%2];\n" : "=f"(x), "=f"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.ca.L2::64B.v2.f32 {%0, %1}, [%2];\n" : "=f"(x), "=f"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.ca.L2::128B.v2.f32 {%0, %1}, [%2];\n" : "=f"(x), "=f"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.ca.L2::256B.v2.f32 {%0, %1}, [%2];\n" : "=f"(x), "=f"(y) : "l"(addr));
+    }
+
     a.x = x; a.y = y;
   }
 
-  __device__ inline void load_cached_double4(double4 &a, const double4 *addr)
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_cached_float(float &a, const float *addr)
   {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
+    float x;
+
+    if constexpr (prefetch_ == 0) {
+      // Plain cached load, no prefetch hint
+      asm volatile("ld.global.ca.f32 {%0}, [%1];\n" : "=f"(x) : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.ca.L2::64B.f32 {%0}, [%1];\n" : "=f"(x) : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.ca.L2::128B.f32 {%0}, [%1];\n" : "=f"(x) : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.ca.L2::256B.f32 {%0}, [%1];\n" : "=f"(x) : "l"(addr));
+    }
+
+    a = x;
+  }
+
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_cached_double4(double4 &a, const double4 *addr)
+  {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
     double x, y, z, w;
-    asm("ld.ca.global.v4.f64 {%0, %1, %2, %3}, [%4+0];" : "=d"(x), "=d"(y), "=d"(z), "=d"(w) : __PTR(addr));
+
+    if constexpr (prefetch_ == 0) {
+      // Plain cached load, no prefetch hint
+      asm volatile("ld.global.ca.v4.f64 {%0, %1, %2, %3}, [%4];\n" : "=d"(x), "=d"(y), "=d"(z), "=d"(w) : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.ca.L2::64B.v4.f64 {%0, %1, %2, %3}, [%4];\n"
+                   : "=d"(x), "=d"(y), "=d"(z), "=d"(w)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.ca.L2::128B.v4.f64 {%0, %1, %2, %3}, [%4];\n"
+                   : "=d"(x), "=d"(y), "=d"(z), "=d"(w)
+                   : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.ca.L2::256B.v4.f64 {%0, %1, %2, %3}, [%4];\n"
+                   : "=d"(x), "=d"(y), "=d"(z), "=d"(w)
+                   : "l"(addr));
+    }
+
     a.x = x;
     a.y = y;
     a.z = z;
     a.w = w;
   }
 
-  __device__ inline void load_cached_double2(double2 &a, const double2* addr)
+  // Valid values for prefetch_size: 0 (no prefetch), 64, 128, 256
+  // Note: prefetch hints are only emitted for SM 80+; on older architectures prefetch_size is treated as 0
+  template <size_t prefetch_size = 0> __device__ inline void load_cached_double2(double2 &a, const double2 *addr)
   {
+    VALIDATE_PREFETCH_SIZE(prefetch_size);
+    constexpr size_t prefetch_ = __COMPUTE_CAPABILITY__ < 800 ? 0 : prefetch_size;
+
     double x, y;
-    asm("ld.ca.global.v2.f64 {%0, %1}, [%2+0];" : "=d"(x), "=d"(y) : __PTR(addr));
+
+    if constexpr (prefetch_ == 0) {
+      // Plain cached load, no prefetch hint
+      asm volatile("ld.global.ca.v2.f64 {%0, %1}, [%2];\n" : "=d"(x), "=d"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 64) {
+      asm volatile("ld.global.ca.L2::64B.v2.f64 {%0, %1}, [%2];\n" : "=d"(x), "=d"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 128) {
+      asm volatile("ld.global.ca.L2::128B.v2.f64 {%0, %1}, [%2];\n" : "=d"(x), "=d"(y) : "l"(addr));
+    } else if constexpr (prefetch_ == 256) {
+      asm volatile("ld.global.ca.L2::256B.v2.f64 {%0, %1}, [%2];\n" : "=d"(x), "=d"(y) : "l"(addr));
+    }
+
     a.x = x; a.y = y;
   }
 
diff --git a/include/targets/cuda/load_store.h b/include/targets/cuda/load_store.h
index 9f2a51d0b8..29b2e50be3 100644
--- a/include/targets/cuda/load_store.h
+++ b/include/targets/cuda/load_store.h
@@ -15,53 +15,82 @@ namespace quda
   // pre-declaration of vector_load that we wish to specialize
   template <bool> struct vector_load_impl;
 
+  // pre-declaration of the prefetch type
+  template <size_t prefetch> struct prefetch_t;
+
   // CUDA specializations of the vector_load
   template <> struct vector_load_impl<true> {
-    template <typename T> __device__ inline void operator()(T &value, const void *ptr, int idx)
-    {
+    template <typename T, size_t prefetch_size>
+    __device__ inline void operator()(T &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &) {
       value = reinterpret_cast<const T *>(ptr)[idx];
     }
 
-    __device__ inline void operator()(float4 &value, const void *ptr, int idx)
+    template <size_t prefetch_size>
+    __device__ inline void operator()(float4 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
+    {
+      load_cached_float4<prefetch_size>(value, reinterpret_cast<const float4 *>(ptr) + idx);
+    }
+
+    template <size_t prefetch_size>
+    __device__ inline void operator()(float2 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
     {
-      load_cached_float4(value, reinterpret_cast<const float4 *>(ptr) + idx);
+      load_cached_float2<prefetch_size>(value, reinterpret_cast<const float2 *>(ptr) + idx);
     }
 
-    __device__ inline void operator()(float2 &value, const void *ptr, int idx)
+    template <size_t prefetch_size>
+    __device__ inline void operator()(float &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
     {
-      load_cached_float2(value, reinterpret_cast<const float2 *>(ptr) + idx);
+      load_cached_float<prefetch_size>(value, reinterpret_cast<const float *>(ptr) + idx);
     }
 
 #if __COMPUTE_CAPABILITY__ >= 1000
-    __device__ inline void operator()(double4 &value, const void *ptr, int idx)
+    template <size_t prefetch_size> // no extra typename: a non-deducible parameter would disable this overload
+    __device__ inline void operator()(double4 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
     {
-      load_cached_double4(value, reinterpret_cast<const double4 *>(ptr) + idx);
+      load_cached_double4<prefetch_size>(value, reinterpret_cast<const double4 *>(ptr) + idx);
     }
 
-    __device__ inline void operator()(float8 &value, const void *ptr, int idx)
+    template <size_t prefetch_size> // no extra typename: a non-deducible parameter would disable this overload
+    __device__ inline void operator()(float8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
     {
-      load_cached_float8(value, reinterpret_cast<const float8 *>(ptr) + idx);
+      load_cached_float8<prefetch_size>(value, reinterpret_cast<const float8 *>(ptr) + idx);
     }
 #endif
 
-    __device__ inline void operator()(double2 &value, const void *ptr, int idx)
+    template <size_t prefetch_size>
+    __device__ inline void operator()(double2 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
+    {
+      load_cached_double2<prefetch_size>(value, reinterpret_cast<const double2 *>(ptr) + idx);
+    }
+
+    template <size_t prefetch_size> // tag argument is unnamed: only its template parameter is used
+    __device__ inline void operator()(short2 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
+    {
+      load_cached_short2<prefetch_size>(value, reinterpret_cast<const short2 *>(ptr) + idx);
+    }
+
+    template <size_t prefetch_size> // tag argument is unnamed: only its template parameter is used
+    __device__ inline void operator()(short4 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
     {
-      load_cached_double2(value, reinterpret_cast<const double2 *>(ptr) + idx);
+      load_cached_short4<prefetch_size>(value, reinterpret_cast<const short4 *>(ptr) + idx);
     }
 
-    __device__ inline void operator()(short8 &value, const void *ptr, int idx)
+    template <size_t prefetch_size>
+    __device__ inline void operator()(short8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &prefetch)
     {
       float4 tmp;
-      operator()(tmp, ptr, idx);
+      operator()(tmp, ptr, idx, prefetch);
       memcpy(&value, &tmp, sizeof(float4));
     }
 
-    __device__ inline void operator()(char8 &value, const void *ptr, int idx)
+    template <size_t prefetch_size>
+    __device__ inline void operator()(char8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &prefetch)
     {
       float2 tmp;
-      operator()(tmp, ptr, idx);
+      operator()(tmp, ptr, idx, prefetch);
       memcpy(&value, &tmp, sizeof(float2));
     }
+
   };
 
   // pre-declaration of vector_store that we wish to specialize
diff --git a/include/targets/generic/load_store.h b/include/targets/generic/load_store.h
index 3239aeaefc..93b847a4db 100644
--- a/include/targets/generic/load_store.h
+++ b/include/targets/generic/load_store.h
@@ -5,28 +5,34 @@
 namespace quda
 {
 
+  template <size_t prefetch> struct prefetch_t {
+    static constexpr int size = prefetch;
+  };
+
   /**
      @brief Non-specialized load operation
   */
   template <bool is_device> struct vector_load_impl {
-    template <typename T> __device__ __host__ inline void operator()(T &value, const void *ptr, int idx)
+    template <typename T, size_t prefetch_size>
+    __device__ __host__ inline void operator()(T &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
     {
       value = reinterpret_cast<const T *>(ptr)[idx];
     }
   };
 
-  template <typename vector_t> __device__ __host__ inline vector_t vector_load(const void *ptr, int idx)
+  template <typename vector_t, size_t prefetch = 0>
+  __device__ __host__ inline vector_t vector_load_internal(const void *ptr, int idx)
   {
     vector_t value;
-    target::dispatch<vector_load_impl>(value, ptr, idx);
+    target::dispatch<vector_load_impl>(value, ptr, idx, prefetch_t<prefetch>());
     return value;
   }
 
-  template <typename scalar_t, int N>
+  template <typename scalar_t, int N, size_t prefetch = 0>
   __device__ __host__ inline array<scalar_t, N> vector_load(const void *ptr, int idx)
   {
     using vector_t = typename VectorType<scalar_t, N>::type;
-    auto value_v = vector_load<vector_t>(ptr, idx);
+    auto value_v = vector_load_internal<vector_t, prefetch>(ptr, idx);
     array<scalar_t, N> value_a;
     static_assert(sizeof(value_a) == sizeof(value_v), "array type and vector type are different sizes");
     memcpy(&value_a, &value_v, sizeof(vector_t));

From 191105b34d6e1d4774d8bec8eb34139fca673850 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 30 Sep 2025 21:24:17 -0700
Subject: [PATCH 002/121] Fix for half precision

---
 include/color_spinor_field_order.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index 46ad849079..0420d349c3 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -1023,7 +1023,7 @@ namespace quda
       {
         real v[length_ghost];
         norm_type nrm
-          = isFixed<Float>::value ? vector_load<float>(ghost_norm[2 * dim + dir], parity * faceVolumeCB[dim] + x) : 0.0;
+          = isFixed<Float>::value ? vector_load<float, 1>(ghost_norm[2 * dim + dir], parity * faceVolumeCB[dim] + x)[0] : 0.0;
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
@@ -1161,7 +1161,7 @@ namespace quda
         auto norm_offset = offset / (sizeof(Float) < sizeof(float) ? sizeof(norm_type) / sizeof(Float) : 1);
         auto norm = reinterpret_cast<float *>(field + volumeCB * (2 * Nc * Ns));
 #endif
-        norm_type nrm = isFixed<Float>::value ? vector_load<float>(norm, x + parity * norm_offset) : 0.0;
+        norm_type nrm = isFixed<Float>::value ? vector_load<float, 1>(norm, x + parity * norm_offset)[0] : 0.0;
 
 #pragma unroll
         for (int i = 0; i < M; i++) {

From 5b41229f7272b2bfb5b512d488d458395c192cfc Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 30 Sep 2025 21:24:40 -0700
Subject: [PATCH 003/121] Apply some missing OMP parallelization to host
 functions

---
 tests/utils/gauge_utils.cpp           | 2 ++
 tests/utils/host_utils.cpp            | 1 +
 tests/utils/staggered_gauge_utils.cpp | 2 ++
 tests/utils/staggered_host_utils.cpp  | 2 ++
 4 files changed, 7 insertions(+)

diff --git a/tests/utils/gauge_utils.cpp b/tests/utils/gauge_utils.cpp
index 962dfb16ad..3b2dd007fd 100644
--- a/tests/utils/gauge_utils.cpp
+++ b/tests/utils/gauge_utils.cpp
@@ -433,6 +433,7 @@ template <typename real_t> struct ApplyRandomU1Phase {
     auto gauge = reinterpret_cast<real_t *const *>(gauge_);
 
     for (int dir = 0; dir < 4; dir++) {
+#pragma omp parallel for
       for (int i = 0; i < Vh; i++) {
         for (int parity = 0; parity < 2; parity++) {
           // create a random phase
@@ -493,6 +494,7 @@ template <typename real_t> struct ConstructRandomMatrixGaugeField {
     };
 
     for (int dir = 0; dir < 4; dir++) {
+#pragma omp parallel for
       for (int i = 0; i < Vh; i++) {
         for (int parity = 0; parity < 2; parity++) {
           real_t *link = gauge[dir] + (parity * Vh + i) * gauge_site_size;
diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp
index 2a31748a7a..7b8807fcd1 100644
--- a/tests/utils/host_utils.cpp
+++ b/tests/utils/host_utils.cpp
@@ -181,6 +181,7 @@ void constructHostCloverField(void *clover, void *, QudaInvertParam &inv_param)
 template <typename real_t> struct ConstructCloverField {
   void operator()(void *res, double norm, double diag)
   {
+#pragma omp parallel for
     for (auto i = 0lu; i < static_cast<size_t>(Vh); i++) {
       for (auto parity = 0lu; parity < 2lu; parity++) {
         auto clover_matrix = reinterpret_cast<real_t *>(res) + 72 * (parity * Vh + i);
diff --git a/tests/utils/staggered_gauge_utils.cpp b/tests/utils/staggered_gauge_utils.cpp
index c95dd87036..ab2a4c1346 100644
--- a/tests/utils/staggered_gauge_utils.cpp
+++ b/tests/utils/staggered_gauge_utils.cpp
@@ -61,6 +61,7 @@ void constructFatLongGaugeField(void *const *fatlink, void *const *longlink, Gau
       constructRandomGaugeField(longlink, param, precision, dslash_type);
       // incorporate non-trivial phase into long links
       for (int dir = 0; dir < 4; ++dir) {
+#pragma omp parallel for
         for (int i = 0; i < Vh; ++i) {
           for (int parity = 0; parity < 2; parity++) {
             double phase = random_uniform_host<double>(i, parity, 0, 2 * M_PI);
@@ -93,6 +94,7 @@ void constructFatLongGaugeField(void *const *fatlink, void *const *longlink, Gau
 
       // incorporate non-trivial phase into long links
       for (int dir = 0; dir < 4; ++dir) {
+#pragma omp parallel for
         for (int i = 0; i < Vh; ++i) {
           for (int parity = 0; parity < 2; parity++) {
             double phase = random_uniform_host<double>(i, parity, 0, 2 * M_PI);
diff --git a/tests/utils/staggered_host_utils.cpp b/tests/utils/staggered_host_utils.cpp
index 1c5340661a..47884ca593 100644
--- a/tests/utils/staggered_host_utils.cpp
+++ b/tests/utils/staggered_host_utils.cpp
@@ -186,6 +186,7 @@ void computeTwoLinkCPU(void **twolink, su3_matrix **sitelinkEx)
   for (int dir = 0; dir < 4; ++dir) E[dir] = Z[dir] + 4;
   const int extended_volume = E[3] * E[2] * E[1] * E[0];
 
+#pragma omp parallel for
   for (int t = 0; t < Z[3]; ++t) {
     for (int z = 0; z < Z[2]; ++z) {
       for (int y = 0; y < Z[1]; ++y) {
@@ -698,6 +699,7 @@ void constructStaggeredTestSpinorParam(quda::ColorSpinorParam *cs_param, const Q
 // data reordering routines
 template <typename Out, typename In> void reorderQDPtoMILC(Out *milc_out, In **qdp_in, int V, int siteSize)
 {
+#pragma omp parallel for
   for (int i = 0; i < V; i++) {
     for (int dir = 0; dir < 4; dir++) {
       for (int j = 0; j < siteSize; j++) {

From a2efb44f5d9008a039f07b2a7dc7e56adc7c4d89 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 30 Sep 2025 21:56:01 -0700
Subject: [PATCH 004/121] Fix for fine-grained accessor vector loads

---
 include/color_spinor_field_order.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index 0420d349c3..008c25db8b 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -241,9 +241,9 @@ namespace quda
         constexpr int M = nSpinBlock * nColor * nVec;
 #pragma unroll
         for (int i = 0; i < M; i++) {
-          vec_t tmp
-            = vector_load<vec_t>(reinterpret_cast<const vec_t *>(in + parity * offset_cb), x_cb * N + chi * M + i);
-          memcpy(&out[i], &tmp, sizeof(vec_t));
+          auto tmp
+            = vector_load<Float, 2>(reinterpret_cast<const vec_t *>(in + parity * offset_cb), x_cb * N + chi * M + i);
+          memcpy(&out[i], &tmp, sizeof(tmp));
         }
       }
     };

From c815076f33b3dc137d7984318758bca6ea279681 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 30 Sep 2025 21:56:49 -0700
Subject: [PATCH 005/121] Add prefetching instructions for CUDA

---
 include/targets/cuda/inline_ptx.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/targets/cuda/inline_ptx.h b/include/targets/cuda/inline_ptx.h
index 3bbccf2ab5..adf92c4720 100644
--- a/include/targets/cuda/inline_ptx.h
+++ b/include/targets/cuda/inline_ptx.h
@@ -476,4 +476,13 @@ namespace quda {
     asm("st.cs.global.v2.s16 [%0+0], {%1, %2};" :: __PTR(addr), "h"(x), "h"(y));
   }
 
+  // Issue a PTX prefetch.global.L1 for the address p; nothing is returned to the caller
+  __device__ __forceinline__ void prefetch_L1(const void *p) { asm volatile("prefetch.global.L1 [%0];" ::"l"(p)); }
+  // Issue a PTX prefetch.global.L2 for the address p; nothing is returned to the caller
+  __device__ __forceinline__ void prefetch_L2(const void *p) { asm volatile("prefetch.global.L2 [%0];" ::"l"(p)); }
+  // PTX cp.async.bulk.prefetch.L2.global of `bytes` (narrowed to 32 bits) at p; NOTE(review): confirm sm_90+ only
+  __device__ __forceinline__ void prefetch_tma(const void *p, size_t bytes)
+  {
+    asm volatile("cp.async.bulk.prefetch.L2.global [%0], %1;\n" ::"l"(p), "r"(static_cast<uint32_t>(bytes)));
+  }
 } // namespace quda

From 177c18ba203354b25e99c4e60714726aea0d923e Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 1 Oct 2025 09:55:19 -0700
Subject: [PATCH 006/121] Optimization of neighbor indexing for dslash kernels:
 use bitwise instead of logic operations when computing the neighboring index;
 this is branch free and fewer operations

---
 include/dslash_helper.cuh                     | 59 ++++++++++++++++++-
 include/dslash_quda.h                         |  1 +
 include/index_helper.cuh                      | 35 +----------
 include/kernels/dslash_staggered.cuh          | 21 +++++--
 .../dslash_twisted_mass_preconditioned.cuh    |  4 +-
 include/kernels/dslash_wilson.cuh             |  4 +-
 include/kernels/laplace.cuh                   |  4 +-
 lib/color_spinor_field.cpp                    |  1 +
 8 files changed, 81 insertions(+), 48 deletions(-)

diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index 834b59425c..1714dd64d5 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -158,13 +158,68 @@ namespace quda
 
 #pragma unroll
     for (int d = 0; d < nDim; d++) {
-      coord.in_boundary[1][d] = coord[d] + arg.nFace >= arg.dc.X[d];
-      coord.in_boundary[0][d] = coord[d] - arg.nFace < 0;
+      coord.in_boundary[1][d] = -(coord[d] + arg.nFace >= arg.dc.X[d]);
+      coord.in_boundary[0][d] = -(coord[d] - arg.nFace < 0);
     }
 
     return coord;
   }
 
+  /**
+     @brief Compute the checkerboard 1-d index of the neighbor that is
+     nFace (1 or 3) hops away, wrapping around the local lattice extent
+     @param[in] x lattice coordinates; x.in_boundary must hold 0 / -1 masks for an nFace hop
+     @param[in] mu dimension in which to hop (0-4)
+     @param[in] dir direction (+1 or -1)
+     @param[in] arg parameter struct
+     @return 1-d checkerboard index (the shifted full index x.X, halved)
+   */
+  template <int nFace = 1, typename Coord, typename Arg>
+  __device__ __host__ inline int getNeighborIndexCB(const Coord &x, int mu, int dir, const Arg &arg)
+  {
+    switch (nFace) { // compile-time hop distance; only 1 and 3 are handled
+    case 1: // nearest neighbor
+      switch (dir) {
+      case +1: // positive direction: a set mask subtracts the wrapped extent
+        switch (mu) {
+        case 0: return (x.X + 1 - (x.in_boundary[1][0] & arg.X[0])) >> 1;
+        case 1: return (x.X + arg.X[0] - (x.in_boundary[1][1] & arg.X2X1)) >> 1;
+        case 2: return (x.X + arg.X2X1 - (x.in_boundary[1][2] & arg.X3X2X1)) >> 1;
+        case 3: return (x.X + arg.X3X2X1 - (x.in_boundary[1][3] & arg.X4X3X2X1)) >> 1;
+        case 4: return (x.X + arg.X4X3X2X1 - (x.in_boundary[1][4] & arg.X5X4X3X2X1)) >> 1;
+        } // an unmatched mu falls through the later cases and ends at return 0
+      case -1: // negative direction: a set mask adds the wrapped extent
+        switch (mu) {
+        case 0: return (x.X - 1 + (x.in_boundary[0][0] & arg.X[0])) >> 1;
+        case 1: return (x.X - arg.X[0] + (x.in_boundary[0][1] & arg.X2X1)) >> 1;
+        case 2: return (x.X - arg.X2X1 + (x.in_boundary[0][2] & arg.X3X2X1)) >> 1;
+        case 3: return (x.X - arg.X3X2X1 + (x.in_boundary[0][3] & arg.X4X3X2X1)) >> 1;
+        case 4: return (x.X - arg.X4X3X2X1 + (x.in_boundary[0][4] & arg.X5X4X3X2X1)) >> 1;
+        }
+      }
+    case 3: // three-hop neighbor
+      switch (dir) {
+      case +1: // positive direction: a set mask subtracts the wrapped extent
+        switch (mu) {
+        case 0: return (x.X + 3 - (x.in_boundary[1][0] & arg.X[0])) >> 1;
+        case 1: return (x.X + 3 * arg.X[0] - (x.in_boundary[1][1] & arg.X2X1)) >> 1;
+        case 2: return (x.X + 3 * arg.X2X1 - (x.in_boundary[1][2] & arg.X3X2X1)) >> 1;
+        case 3: return (x.X + 3 * arg.X3X2X1 - (x.in_boundary[1][3] & arg.X4X3X2X1)) >> 1;
+        case 4: return (x.X + 3 * arg.X4X3X2X1 - (x.in_boundary[1][4] & arg.X5X4X3X2X1)) >> 1;
+        }
+      case -1: // negative direction: a set mask adds the wrapped extent
+        switch (mu) {
+        case 0: return (x.X - 3 + (x.in_boundary[0][0] & arg.X[0])) >> 1;
+        case 1: return (x.X - 3 * arg.X[0] + (x.in_boundary[0][1] & arg.X2X1)) >> 1;
+        case 2: return (x.X - 3 * arg.X2X1 + (x.in_boundary[0][2] & arg.X3X2X1)) >> 1;
+        case 3: return (x.X - 3 * arg.X3X2X1 + (x.in_boundary[0][3] & arg.X4X3X2X1)) >> 1;
+        case 4: return (x.X - 3 * arg.X4X3X2X1 + (x.in_boundary[0][4] & arg.X5X4X3X2X1)) >> 1;
+        }
+      }
+    }
+    return 0; // should never reach here
+  }
+
   /**
      @brief Compute whether this thread should be active for updating
      the a given offsetDim halo.  For non-fused halo update kernels
diff --git a/include/dslash_quda.h b/include/dslash_quda.h
index f34a41de1a..5091e28674 100644
--- a/include/dslash_quda.h
+++ b/include/dslash_quda.h
@@ -35,6 +35,7 @@ namespace quda
     int X2X1;
     int X3X2X1;
     int X4X3X2X1;
+    int X5X4X3X2X1;
 
     int X2X1mX1;
     int X3X2X1mX2X1;
diff --git a/include/index_helper.cuh b/include/index_helper.cuh
index db58c0daed..5ea718aa8c 100644
--- a/include/index_helper.cuh
+++ b/include/index_helper.cuh
@@ -238,43 +238,10 @@ namespace quda {
     int X;       // full lattice site index
     constexpr const int& operator[](int i) const { return x[i]; }
     constexpr int& operator[](int i) { return x[i]; }
-    array_2d<bool, 2, nDim> in_boundary = {};
+    array_2d<int, 2, nDim> in_boundary = {};
     constexpr int size() const { return nDim; }
   };
 
-  /**
-     @brief Compute the checkerboard 1-d index for the nearest
-     neighbor
-     @param[in] lattice coordinates
-     @param[in] mu dimension in which to add 1
-     @param[in] dir direction (+1 or -1)
-     @param[in] arg parameter struct
-     @return 1-d checkboard index
-   */
-  template <typename Coord, typename Arg>
-  __device__ __host__ inline int getNeighborIndexCB(const Coord &x, int mu, int dir, const Arg &arg)
-  {
-    switch (dir) {
-    case +1: // positive direction
-      switch (mu) {
-      case 0: return (x.in_boundary[1][0] ? x.X - (arg.X[0] - 1) : x.X + 1) >> 1;
-      case 1: return (x.in_boundary[1][1] ? x.X - arg.X2X1mX1 : x.X + arg.X[0]) >> 1;
-      case 2: return (x.in_boundary[1][2] ? x.X - arg.X3X2X1mX2X1 : x.X + arg.X2X1) >> 1;
-      case 3: return (x.in_boundary[1][3] ? x.X - arg.X4X3X2X1mX3X2X1 : x.X + arg.X3X2X1) >> 1;
-      case 4: return (x.in_boundary[1][4] ? x.X - arg.X5X4X3X2X1mX4X3X2X1 : x.X + arg.X4X3X2X1) >> 1;
-      }
-    case -1:
-      switch (mu) {
-      case 0: return (x.in_boundary[0][0] ? x.X + (arg.X[0] - 1) : x.X - 1) >> 1;
-      case 1: return (x.in_boundary[0][1] ? x.X + arg.X2X1mX1 : x.X - arg.X[0]) >> 1;
-      case 2: return (x.in_boundary[0][2] ? x.X + arg.X3X2X1mX2X1 : x.X - arg.X2X1) >> 1;
-      case 3: return (x.in_boundary[0][3] ? x.X + arg.X4X3X2X1mX3X2X1 : x.X - arg.X3X2X1) >> 1;
-      case 4: return (x.in_boundary[0][4] ? x.X + arg.X5X4X3X2X1mX4X3X2X1 : x.X - arg.X4X3X2X1) >> 1;
-      }
-    }
-    return 0; // should never reach here
-  }
-
   /**
      Compute the 4-d spatial index from the checkerboarded 1-d index at parity parity
 
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index ae46c6a900..27bd23d62f 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -90,6 +90,15 @@ namespace quda
     typedef Matrix<complex<real>, Arg::nColor> Link;
     const int their_spinor_parity = (arg.nParity == 2) ? 1 - parity : 0;
 
+    Coord coord1 = coord;
+    if constexpr (arg.improved) { // need to compute 1-hop in_boundary
+#pragma unroll
+      for (int d = 0; d < 4; d++) {
+        coord1.in_boundary[1][d] = -(coord[d] + 1 >= arg.dc.X[d]);
+        coord1.in_boundary[0][d] = -(coord[d] - 1 < 0);
+      }
+    }
+
 #pragma unroll
     for (int d = 0; d < 4; d++) { // loop over dimension
 
@@ -105,7 +114,7 @@ namespace quda
             out[s] = mv_add(U, in, out[s]);
           }
         } else if (doBulk<kernel_type>() && !ghost) {
-          const int fwd_idx = linkIndexP1(coord, arg.dc.X, d);
+          const int fwd_idx = getNeighborIndexCB<1>(coord1, d, 1, arg.dc);
           const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg));
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
@@ -117,7 +126,7 @@ namespace quda
 
       // improved - forward direction
       if (arg.improved && arg.dd_in.doHopping(coord, d, +3)) {
-        const bool ghost = coord.in_boundary[1][d] && isActive<kernel_type>(active, thread_dim, d, coord, arg);
+        const bool ghost = coord.in_boundary[1][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dc.X, d, arg.nFace);
           const Link L = arg.L(d, coord.x_cb, parity);
@@ -128,7 +137,7 @@ namespace quda
             out[s] = mv_add(L, in, out[s]);
           }
         } else if (doBulk<kernel_type>() && !ghost) {
-          const int fwd3_idx = linkIndexP3(coord, arg.dc.X, d);
+          const int fwd3_idx = getNeighborIndexCB<3>(coord, d, 1, arg.dc);
           const Link L = arg.L(d, coord.x_cb, parity);
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
@@ -153,7 +162,7 @@ namespace quda
             out[s] = mv_sub(conj(U), in, out[s]);
           }
         } else if (doBulk<kernel_type>() && !ghost) {
-          const int back_idx = linkIndexM1(coord, arg.dc.X, d);
+          const int back_idx = getNeighborIndexCB<1>(coord1, d, -1, arg.dc);
           const int gauge_idx = back_idx;
           const Link U = arg.improved ? arg.U(d, gauge_idx, 1 - parity) :
             arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg));
@@ -167,7 +176,7 @@ namespace quda
 
       // improved - backward direction
       if (arg.improved && arg.dd_in.doHopping(coord, d, -3)) {
-        const bool ghost = coord.in_boundary[0][d] && isActive<kernel_type>(active, thread_dim, d, coord, arg);
+        const bool ghost = coord.in_boundary[0][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 1);
           const Link L = arg.L.Ghost(d, ghost_idx, 1 - parity);
@@ -178,7 +187,7 @@ namespace quda
             out[s] = mv_sub(conj(L), in, out[s]);
           }
         } else if (doBulk<kernel_type>() && !ghost) {
-          const int back3_idx = linkIndexM3(coord, arg.dc.X, d);
+          const int back3_idx = getNeighborIndexCB<3>(coord, d, -1, arg.dc);
           const int gauge_idx = back3_idx;
           const Link L = arg.L(d, gauge_idx, 1 - parity);
 #pragma unroll
diff --git a/include/kernels/dslash_twisted_mass_preconditioned.cuh b/include/kernels/dslash_twisted_mass_preconditioned.cuh
index 513a034acd..547385c75c 100644
--- a/include/kernels/dslash_twisted_mass_preconditioned.cuh
+++ b/include/kernels/dslash_twisted_mass_preconditioned.cuh
@@ -63,7 +63,7 @@ namespace quda
       if (arg.dd_in.doHopping(coord, d, +1)) {
         const int fwd_idx = getNeighborIndexCB(coord, d, +1, arg.dc);
         constexpr int proj_dir = dagger ? +1 : -1;
-        const bool ghost = coord.in_boundary[1][d] && isActive<kernel_type>(active, thread_dim, d, coord, arg);
+        const bool ghost = coord.in_boundary[1][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
 
         if (doHalo<kernel_type>(d) && ghost) {
           // we need to compute the face index if we are updating a face that isn't ours
@@ -101,7 +101,7 @@ namespace quda
         const int back_idx = getNeighborIndexCB(coord, d, -1, arg.dc);
         const int gauge_idx = back_idx;
         constexpr int proj_dir = dagger ? -1 : +1;
-        const bool ghost = coord.in_boundary[0][d] && isActive<kernel_type>(active, thread_dim, d, coord, arg);
+        const bool ghost = coord.in_boundary[0][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
 
         if (doHalo<kernel_type>(d) && ghost) {
           // we need to compute the face index if we are updating a face that isn't ours
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index 8b66ee83e6..0b1fb49fb8 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -102,7 +102,7 @@ namespace quda
         const int gauge_idx = (Arg::nDim == 5 ? coord.x_cb % arg.dc.volume_4d_cb : coord.x_cb);
         constexpr int proj_dir = dagger ? +1 : -1;
 
-        const bool ghost = coord.in_boundary[1][d] && isActive<kernel_type>(active, thread_dim, d, coord, arg);
+        const bool ghost = coord.in_boundary[1][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
 
         if (doHalo<kernel_type>(d) && ghost) {
           // we need to compute the face index if we are updating a face that isn't ours
@@ -131,7 +131,7 @@ namespace quda
         const int gauge_idx = (Arg::nDim == 5 ? back_idx % arg.dc.volume_4d_cb : back_idx);
         constexpr int proj_dir = dagger ? -1 : +1;
 
-        const bool ghost = coord.in_boundary[0][d] && isActive<kernel_type>(active, thread_dim, d, coord, arg);
+        const bool ghost = coord.in_boundary[0][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
 
         if (doHalo<kernel_type>(d) && ghost) {
           // we need to compute the face index if we are updating a face that isn't ours
diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh
index 9d66c2dee4..b45ac9774f 100644
--- a/include/kernels/laplace.cuh
+++ b/include/kernels/laplace.cuh
@@ -86,7 +86,7 @@ namespace quda
       if (d != dir) {
         if (arg.dd_in.doHopping(coord, d, +1)) {
           // Forward gather - compute fwd offset for vector fetch
-          const bool ghost = coord.in_boundary[1][d] && isActive<kernel_type>(active, thread_dim, d, coord, arg);
+          const bool ghost = coord.in_boundary[1][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
 
           if (doHalo<kernel_type>(d) && ghost) {
 
@@ -111,7 +111,7 @@ namespace quda
           const int back_idx = linkIndexM1(coord, arg.dc.X, d);
           const int gauge_idx = back_idx;
 
-          const bool ghost = coord.in_boundary[0][d] && isActive<kernel_type>(active, thread_dim, d, coord, arg);
+          const bool ghost = coord.in_boundary[0][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
 
           if (doHalo<kernel_type>(d) && ghost) {
 
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index 442ef13ab1..5288a52a97 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -384,6 +384,7 @@ namespace quda
       dc.X2X1 = X[1] * X[0];
       dc.X3X2X1 = X[2] * X[1] * X[0];
       dc.X4X3X2X1 = X[3] * X[2] * X[1] * X[0];
+      dc.X5X4X3X2X1 = X[4] * X[3] * X[2] * X[1] * X[0];
       dc.X2X1mX1 = (X[1] - 1) * X[0];
       dc.X3X2X1mX2X1 = (X[2] - 1) * X[1] * X[0];
       dc.X4X3X2X1mX3X2X1 = (X[3] - 1) * X[2] * X[1] * X[0];

From eae953d5c5725df215002d6975ff0f73e9e89be6 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 2 Oct 2025 23:38:47 -0700
Subject: [PATCH 007/121] Add support for creating a backward gauge field

---
 include/gauge_field.h           | 11 +++++++
 include/kernels/gauge_shift.cuh | 56 +++++++++++++++++++++++++++++++++
 lib/CMakeLists.txt              |  1 +
 lib/gauge_shift.cu              | 41 ++++++++++++++++++++++++
 4 files changed, 109 insertions(+)
 create mode 100644 include/kernels/gauge_shift.cuh
 create mode 100644 lib/gauge_shift.cu

diff --git a/include/gauge_field.h b/include/gauge_field.h
index c355bd4818..c830938b58 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -669,6 +669,17 @@ namespace quda {
   */
   void genericPrintMatrix(const GaugeField &a, int dim, int parity, unsigned int x_cb, int rank = 0);
 
+  /**
+     @brief Shift the gauge field by shift in each dimension and store
+     the resulting shifted field.  This is used to move the backwards
+     links on to this site.  The input field must be a padded field
+     with the ghost pre-exchanged if communications are enabled.
+     @param[out] out Output shifted field
+     @param[in] in Input shifted field
+     @param[in] shift value (1 or 3 supported)
+   */
+  void shift(GaugeField &out, const GaugeField &in, int shift);
+
   /**
      @brief This is a debugging function, where we cast a gauge field
      into a spinor field so we can compute its L1 norm.
diff --git a/include/kernels/gauge_shift.cuh b/include/kernels/gauge_shift.cuh
new file mode 100644
index 0000000000..abe369f439
--- /dev/null
+++ b/include/kernels/gauge_shift.cuh
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <gauge_field_order.h>
+#include <quda_matrix.h>
+#include <index_helper.cuh>
+#include <byte_array.h>
+#include <kernel.h>
+
+namespace quda
+{
+  // Parameters for GaugeShift; launch extent (VolumeCB, 2, 4) matches operator()(x_cb, parity, dir)
+  template <typename store_t, int nColor, QudaReconstructType recon> struct GaugeShiftArg : kernel_param<> {
+    using real = typename mapper<store_t>::type;
+    using Link = Matrix<complex<real>, nColor>;
+    using Gauge = typename gauge_mapper<store_t, recon>::type;
+
+    int X[4];       // true grid dimensions (copied from in.X())
+    Gauge out;      // accessor for the field that receives the shifted links
+    const Gauge in; // accessor for the field the links are read from
+    int shift;      // number of sites to shift by
+
+    GaugeShiftArg(GaugeField &out, const GaugeField &in, int shift) :
+      kernel_param(dim3(in.VolumeCB(), 2, 4)), out(out), in(in), shift(shift)
+    {
+      for (int dir = 0; dir < 4; dir++) X[dir] = in.X()[dir];
+    }
+  };
+  // Functor: out(dir, x_cb, parity) = link of `in` found arg.shift sites backwards in dir (opposite parity)
+  template <typename Arg> struct GaugeShift {
+    const Arg &arg;
+    constexpr GaugeShift(const Arg &arg) : arg(arg) { }
+    static constexpr const char *filename() { return KERNEL_FILE; }
+    // x_cb: checkerboard site index, parity: parity of that site, dir: link direction (0-3)
+    __device__ __host__ void operator()(int x_cb, int parity, int dir)
+    {
+      using real = typename Arg::real;
+      using Link = typename Arg::Link;
+      // NOTE(review): coordinates are held as int8_t - confirm this cannot overflow for the supported extents
+      byte_array<int8_t, 4> x = {};
+      getCoords(x, x_cb, arg.X, parity);
+
+      if (x[dir] < arg.shift && arg.comms_dim[dir]) { // on the boundary so we need to fetch from the ghost zone
+        const int ghost_idx = ghostFaceIndex<0, 4>(x, arg.X, dir, arg.shift);
+        Link U = arg.in.Ghost(dir, ghost_idx, 1 - parity);
+        arg.out(dir, x_cb, parity) = U;
+      } else { // simple shift
+        byte_array<int8_t, 4> dx = {};
+        dx[dir] = dx[dir] - arg.shift; // offset of -shift in dir, zero in the other dimensions
+        int x_cb_back = linkIndexShift(x, dx, arg.X);
+        Link U = arg.in(dir, x_cb_back, 1 - parity);
+        arg.out(dir, x_cb, parity) = U;
+      }
+    }
+  };
+
+} // namespace quda
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 65f33a6772..9a614e09f5 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -28,6 +28,7 @@ set (QUDA_OBJS
   gauge_phase.cu timer.cpp
   solver.cpp inv_bicgstab_quda.cpp inv_cg_quda.cpp inv_bicgstabl_quda.cpp
   inv_multi_cg_quda.cpp inv_eigcg_quda.cpp gauge_ape.cu
+  gauge_shift.cu
   gauge_stout.cu gauge_hyp.cu gauge_wilson_flow.cu gauge_plaq.cu gauge_plaqrect.cu
   gauge_laplace.cpp gauge_observable.cpp
   inv_cgnr.cpp inv_cgne.cpp
diff --git a/lib/gauge_shift.cu b/lib/gauge_shift.cu
new file mode 100644
index 0000000000..8339508eb3
--- /dev/null
+++ b/lib/gauge_shift.cu
@@ -0,0 +1,41 @@
+#include <gauge_field.h>
+#include <instantiate.h>
+#include <tunable_nd.h>
+#include <kernels/gauge_shift.cuh>
+
+namespace quda
+{
+  // Tunable launcher for the GaugeShift kernel; the constructor itself launches it on the default stream
+  template <typename Float, int nColor, QudaReconstructType recon> class GaugeShifter : public TunableKernel3D
+  {
+    GaugeField &out;      // field that receives the shifted links
+    const GaugeField &in; // field the links are read from
+    int shift;            // NOTE(review): only validated by the assert below, which NDEBUG builds compile out
+    unsigned int minThreads() const { return in.VolumeCB(); }
+
+  public:
+    GaugeShifter(GaugeField &out, const GaugeField &in, int shift) :
+      TunableKernel3D(in, 2, 4), out(out), in(in), shift(shift)
+    {
+      assert(shift == 1 || shift == 3);
+      apply(device::get_default_stream());
+    }
+
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      GaugeShiftArg<Float, nColor, recon> arg(out, in, shift);
+      launch<GaugeShift>(tp, stream, arg);
+    }
+    // reports the combined size in bytes of the output and input fields
+    long long bytes() const { return out.Bytes() + in.Bytes(); }
+  };
+
+  void shift(GaugeField &out, const GaugeField &in, int shift)
+  {
+    getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
+    instantiate<GaugeShifter, ReconstructGauge>(out, in, shift);
+    getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+  }
+
+} // namespace quda

From 2540a1bdc785d3aa4ef00b3962d545d1b74dde1e Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 6 Oct 2025 21:44:49 -0700
Subject: [PATCH 008/121] Some small improvements to shift(GaugeField)
 function

---
 include/gauge_field.h |  2 +-
 lib/gauge_shift.cu    | 10 +++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/gauge_field.h b/include/gauge_field.h
index c830938b58..a4f3d0b590 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -678,7 +678,7 @@ namespace quda {
      @param[in] in Input shifted field
      @param[in] shift value (1 or 3 supported)
    */
-  void shift(GaugeField &out, const GaugeField &in, int shift);
+  GaugeField shift(const GaugeField &in, int shift);
 
   /**
      @brief This is a debugging function, where we cast a gauge field
diff --git a/lib/gauge_shift.cu b/lib/gauge_shift.cu
index 8339508eb3..652ef8aa50 100644
--- a/lib/gauge_shift.cu
+++ b/lib/gauge_shift.cu
@@ -31,11 +31,19 @@ namespace quda
     long long bytes() const { return out.Bytes() + in.Bytes(); }
   };
 
-  void shift(GaugeField &out, const GaugeField &in, int shift)
+  GaugeField shift(const GaugeField &in, int shift)
   {
     getProfile().TPSTART(QUDA_PROFILE_COMPUTE);
+    if (in.GhostExchange() == QUDA_GHOST_EXCHANGE_EXTENDED)
+      errorQuda("Extended ghost exchange not supported");
+    if (in.GhostExchange() == QUDA_GHOST_EXCHANGE_NO && comm_partitioned())
+      errorQuda("comm_dim_partition() == true requires we have GhostExchange = QUDA_GHOST_EXCHANGE_PAD");
+    GaugeFieldParam param(in);
+    param.create = QUDA_NULL_FIELD_CREATE;
+    GaugeField out(param);
     instantiate<GaugeShifter, ReconstructGauge>(out, in, shift);
     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+    return out;
   }
 
 } // namespace quda

From e686437443b89e75c49508cfa152c7d07e693cfb Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 6 Oct 2025 22:06:56 -0700
Subject: [PATCH 009/121] Gauge shift should encode shift value in aux_string

---
 lib/gauge_shift.cu | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/gauge_shift.cu b/lib/gauge_shift.cu
index 652ef8aa50..6a81de0246 100644
--- a/lib/gauge_shift.cu
+++ b/lib/gauge_shift.cu
@@ -18,6 +18,10 @@ namespace quda
       TunableKernel3D(in, 2, 4), out(out), in(in), shift(shift)
     {
       assert(shift == 1 || shift == 3);
+      strcat(aux, ",shift=");
+      char shift_str[16];
+      u32toa(shift_str, shift);
+      strcat(aux, shift_str);
       apply(device::get_default_stream());
     }
 

From 676c643e73a82c293d298e6367deeed97400d879 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 6 Oct 2025 22:35:56 -0700
Subject: [PATCH 010/121] Add support for experimental double storage of gauge
 fields - disabled by default

---
 include/dslash.h                     |  6 ++++++
 include/kernels/dslash_staggered.cuh | 29 ++++++++++++++++++++++++----
 include/kernels/dslash_wilson.cuh    | 18 +++++++++++++++--
 lib/dslash_improved_staggered.hpp    | 12 ++++++++++--
 lib/dslash_staggered.hpp             |  9 +++++++--
 lib/dslash_wilson.hpp                |  9 ++++++++-
 6 files changed, 72 insertions(+), 11 deletions(-)

diff --git a/include/dslash.h b/include/dslash.h
index 8feb23d893..3e0906810d 100644
--- a/include/dslash.h
+++ b/include/dslash.h
@@ -9,6 +9,9 @@
 #include <instantiate.h>
 #include <instantiate_dslash.h>
 
+// enable experimental double store of gauge fields
+//#define QUDA_DSLASH_DOUBLE_STORE
+
 namespace quda
 {
 
@@ -70,6 +73,9 @@ namespace quda
       char tile_str[16];
       i32toa(tile_str, Arg::n_src_tile);
       strcat(aux_base, tile_str);
+#ifdef QUDA_DSLASH_DOUBLE_STORE
+      strcat(aux_base, ",double_store");
+#endif
     }
 
     /**
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 27bd23d62f..ebb55b9fff 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -43,7 +43,9 @@ namespace quda
     const Ghost halo;      /** accessor for reading the halo */
     F x[MAX_MULTI_RHS];    /** input vector when doing xpay */
     const GU U; /** the gauge field */
+    const GU Uback; /** the backwards gauge field (read when QUDA_DSLASH_DOUBLE_STORE is defined) */
     const GL L; /** the long gauge field */
+    const GL Lback; /** the backwards long gauge field (read when QUDA_DSLASH_DOUBLE_STORE is defined) */
 
     const real a; /** xpay scale factor */
     const real tboundary; /** temporal boundary condition */
@@ -54,13 +56,14 @@ namespace quda
     const real dagger_scale;
 
     StaggeredArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                 const ColorSpinorField &halo, const GaugeField &U, const GaugeField &L, double a,
-                 cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override) :
+                 const ColorSpinorField &halo, const GaugeField &U, const GaugeField &Uback, const GaugeField &L,
+                 const GaugeField &Lback, double a, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
+                 const int *comm_override) :
       DslashArg < Float,
     nDim, DDArg, improved ? 3 : 1, n_src_tile
       > (out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, spin_project, comm_override),
-    halo_pack(halo, improved_ ? 3 : 1), halo(halo, improved_ ? 3 : 1), U(U), L(L), a(a), tboundary(U.TBoundary()),
-    is_first_time_slice(comm_coord(3) == 0 ? true : false),
+    halo_pack(halo, improved_ ? 3 : 1), halo(halo, improved_ ? 3 : 1), U(U), Uback(Uback), L(L), Lback(Lback), a(a),
+    tboundary(U.TBoundary()), is_first_time_slice(comm_coord(3) == 0 ? true : false),
     is_last_time_slice(comm_coord(3) == comm_dim(3) - 1 ? true : false),
     dagger_scale(dagger ? static_cast<real>(-1.0) : static_cast<real>(1.0))
     {
@@ -154,8 +157,13 @@ namespace quda
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx2 = ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 1);
           const int ghost_idx = arg.improved ? ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 3) : ghost_idx2;
+#ifdef QUDA_DSLASH_DOUBLE_STORE
+          const Link U = arg.improved ? arg.Uback(d, coord.x_cb, parity) :
+                                        arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg));
+#else
           const Link U = arg.improved ? arg.U.Ghost(d, ghost_idx2, 1 - parity) :
             arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg));
+#endif
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
             Vector in = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.dc.ghostFaceCB[d], their_spinor_parity);
@@ -163,9 +171,14 @@ namespace quda
           }
         } else if (doBulk<kernel_type>() && !ghost) {
           const int back_idx = getNeighborIndexCB<1>(coord1, d, -1, arg.dc);
+#ifdef QUDA_DSLASH_DOUBLE_STORE
+          const Link U = arg.improved ? arg.Uback(d, coord.x_cb, parity) :
+                                        arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg));
+#else
           const int gauge_idx = back_idx;
           const Link U = arg.improved ? arg.U(d, gauge_idx, 1 - parity) :
             arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg));
+#endif
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
             Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity);
@@ -179,7 +192,11 @@ namespace quda
         const bool ghost = coord.in_boundary[0][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 1);
+#ifdef QUDA_DSLASH_DOUBLE_STORE
+          const Link L = arg.Lback(d, coord.x_cb, parity);
+#else
           const Link L = arg.L.Ghost(d, ghost_idx, 1 - parity);
+#endif
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
             const Vector in
@@ -188,8 +205,12 @@ namespace quda
           }
         } else if (doBulk<kernel_type>() && !ghost) {
           const int back3_idx = getNeighborIndexCB<3>(coord, d, -1, arg.dc);
+#ifdef QUDA_DSLASH_DOUBLE_STORE
+          const Link L = arg.Lback(d, coord.x_cb, parity);
+#else
           const int gauge_idx = back3_idx;
           const Link L = arg.L(d, gauge_idx, 1 - parity);
+#endif
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
             const Vector in = arg.in[src_idx + s](back3_idx, their_spinor_parity);
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index 0b1fb49fb8..04aa4f50fe 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -38,19 +38,21 @@ namespace quda
     Ghost halo_pack;
     Ghost halo;
     const G U;    /** the gauge field */
+    const G Uback; /** the backwards gauge field */
     const real a; /** xpay scale factor - can be -kappa or -kappa^2 */
     /** parameters for distance preconditioning */
     const real alpha0;
     const int t0;
 
     WilsonArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo,
-              const GaugeField &U, double a, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
-              const int *comm_override, double alpha0 = 0.0, int t0 = -1) :
+              const GaugeField &U, const GaugeField &Uback, double a, cvector_ref<const ColorSpinorField> &x,
+              int parity, bool dagger, const int *comm_override, double alpha0 = 0.0, int t0 = -1) :
       DslashArg<Float, nDim, DDArg>(out, in, halo, U, x, parity, dagger, a != 0.0 ? true : false, spin_project,
                                     comm_override),
       halo_pack(halo),
       halo(halo),
       U(U),
+      Uback(Uback),
       a(a),
       alpha0(alpha0),
       t0(t0)
@@ -128,7 +130,11 @@ namespace quda
       if (arg.dd_in.doHopping(coord, d, -1)) {
         const real bwd_coeff = (d < 3) ? 1.0 : bwd_coeff_3;
         const int back_idx = getNeighborIndexCB(coord, d, -1, arg.dc);
+#ifdef QUDA_DSLASH_DOUBLE_STORE
+        const int gauge_idx = (Arg::nDim == 5 ? coord.x_cb % arg.dc.volume_4d_cb : coord.x_cb);
+#else
         const int gauge_idx = (Arg::nDim == 5 ? back_idx % arg.dc.volume_4d_cb : back_idx);
+#endif
         constexpr int proj_dir = dagger ? -1 : +1;
 
         const bool ghost = coord.in_boundary[0][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
@@ -140,14 +146,22 @@ namespace quda
             idx;
 
           const int gauge_ghost_idx = (Arg::nDim == 5 ? ghost_idx % arg.dc.ghostFaceCB[d] : ghost_idx);
+#ifdef QUDA_DSLASH_DOUBLE_STORE
+          Link U = arg.Uback(d, gauge_idx, gauge_parity);
+#else
           Link U = arg.U.Ghost(d, gauge_ghost_idx, 1 - gauge_parity);
+#endif
           HalfVector in = arg.halo.Ghost(d, 0, ghost_idx + (src_idx * arg.Ls + coord.s) * arg.dc.ghostFaceCB[d],
                                          their_spinor_parity);
 
           out += bwd_coeff * (conj(U) * in).reconstruct(d, proj_dir);
         } else if (doBulk<kernel_type>() && !ghost) {
 
+#ifdef QUDA_DSLASH_DOUBLE_STORE
+          Link U = arg.Uback(d, gauge_idx, gauge_parity);
+#else
           Link U = arg.U(d, gauge_idx, 1 - gauge_parity);
+#endif
           Vector in = arg.in[src_idx](back_idx + coord.s * arg.dc.volume_4d_cb, their_spinor_parity);
 
           out += bwd_coeff * (conj(U) * in.project(d, proj_dir)).reconstruct(d, proj_dir);
diff --git a/lib/dslash_improved_staggered.hpp b/lib/dslash_improved_staggered.hpp
index 60588fd75e..07b6396ee8 100644
--- a/lib/dslash_improved_staggered.hpp
+++ b/lib/dslash_improved_staggered.hpp
@@ -153,8 +153,16 @@ namespace quda
       constexpr bool improved = true;
       constexpr QudaReconstructType recon_u = QUDA_RECONSTRUCT_NO;
       auto halo = ColorSpinorField::create_comms_batch(in, 3);
-      StaggeredArg<Float, nColor, nDim, DDArg, recon_u, recon_l, improved> arg(out, in, halo, U, L, a, x, parity,
-                                                                               dagger, comm_override);
+
+#ifdef QUDA_DSLASH_DOUBLE_STORE
+      GaugeField Uback = shift(U, 1);
+      GaugeField Lback = shift(L, 3);
+#else
+      const GaugeField &Uback = U;
+      const GaugeField &Lback = L;
+#endif
+      StaggeredArg<Float, nColor, nDim, DDArg, recon_u, recon_l, improved> arg(out, in, halo, U, Uback, L, Lback, a, x,
+                                                                               parity, dagger, comm_override);
       Staggered<decltype(arg)> staggered(arg, out, in, halo, L);
       dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, in, halo, profile);
     }
diff --git a/lib/dslash_staggered.hpp b/lib/dslash_staggered.hpp
index 51a15c9ae4..874fe731e5 100644
--- a/lib/dslash_staggered.hpp
+++ b/lib/dslash_staggered.hpp
@@ -49,12 +49,17 @@ namespace quda
       constexpr int nDim = 4;
       constexpr bool improved = false;
       auto halo = ColorSpinorField::create_comms_batch(in);
+#ifdef QUDA_DSLASH_DOUBLE_STORE
+      GaugeField Uback = shift(U, 1); // double store: backward links come from a separate field built by shift()
+#else
+      const GaugeField &Uback = U; // no double store: kernels read backward links from U, so just alias it
+#endif
 
       if (U.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC
           || (U.LinkType() == QUDA_GENERAL_LINKS && U.Reconstruct() == QUDA_RECONSTRUCT_NO)) {
         if constexpr (is_enabled<QUDA_MILC_GAUGE_ORDER>()) {
           StaggeredArg<Float, nColor, nDim, DDArg, recon_u, QUDA_RECONSTRUCT_NO, improved, QUDA_STAGGERED_PHASE_MILC> arg(
-            out, in, halo, U, U, a, x, parity, dagger, comm_override);
+            out, in, halo, U, Uback, U, Uback, a, x, parity, dagger, comm_override);
           Staggered<decltype(arg)> staggered(arg, out, in, halo);
 
           dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, in, halo, profile);
@@ -64,7 +69,7 @@ namespace quda
       } else if (U.StaggeredPhase() == QUDA_STAGGERED_PHASE_TIFR) {
         if constexpr (is_enabled<QUDA_TIFR_GAUGE_ORDER>()) {
           StaggeredArg<Float, nColor, nDim, DDArg, recon_u, QUDA_RECONSTRUCT_NO, improved, QUDA_STAGGERED_PHASE_TIFR> arg(
-            out, in, halo, U, U, a, x, parity, dagger, comm_override);
+            out, in, halo, U, Uback, U, Uback, a, x, parity, dagger, comm_override);
           Staggered<decltype(arg)> staggered(arg, out, in, halo);
 
           dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, in, halo, profile);
diff --git a/lib/dslash_wilson.hpp b/lib/dslash_wilson.hpp
index 80086142e4..c1fc823d3e 100644
--- a/lib/dslash_wilson.hpp
+++ b/lib/dslash_wilson.hpp
@@ -43,7 +43,14 @@ namespace quda
     {
       constexpr int nDim = 4;
       auto halo = ColorSpinorField::create_comms_batch(in);
-      WilsonArg<Float, nColor, nDim, DDArg, recon, distance_pc> arg(out, in, halo, U, a, x, parity, dagger,
+
+#ifdef QUDA_DSLASH_DOUBLE_STORE
+      GaugeField Uback = shift(U, 1);
+#else
+      const GaugeField &Uback = U;
+#endif
+
+      WilsonArg<Float, nColor, nDim, DDArg, recon, distance_pc> arg(out, in, halo, U, Uback, a, x, parity, dagger,
                                                                     comm_override, alpha0, t0);
       Wilson<decltype(arg)> wilson(arg, out, in, halo);
       dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, in, halo, profile);

From 9c2025b8cb5cec48ee3109c57a3380daa2dd857b Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 20 Oct 2025 16:09:01 -0700
Subject: [PATCH 011/121] Fix some issues with gauge shift: fix single-GPU
 builds and add half/quarter precision support

---
 include/kernels/gauge_shift.cuh |  2 +-
 lib/gauge_shift.cu              | 22 ++++++++++++++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/include/kernels/gauge_shift.cuh b/include/kernels/gauge_shift.cuh
index abe369f439..dced98e50c 100644
--- a/include/kernels/gauge_shift.cuh
+++ b/include/kernels/gauge_shift.cuh
@@ -39,7 +39,7 @@ namespace quda
       byte_array<int8_t, 4> x = {};
       getCoords(x, x_cb, arg.X, parity);
 
-      if (x[dir] < arg.shift && arg.comms_dim[dir]) { // on the boundary so we need to fetch from the ghost zone
+      if (x[dir] < arg.shift && arg.comms_dim[dir] > 1) { // on the boundary so we need to fetch from the ghost zone
         const int ghost_idx = ghostFaceIndex<0, 4>(x, arg.X, dir, arg.shift);
         Link U = arg.in.Ghost(dir, ghost_idx, 1 - parity);
         arg.out(dir, x_cb, parity) = U;
diff --git a/lib/gauge_shift.cu b/lib/gauge_shift.cu
index 6a81de0246..cc5997e4ee 100644
--- a/lib/gauge_shift.cu
+++ b/lib/gauge_shift.cu
@@ -6,7 +6,7 @@
 namespace quda
 {
 
-  template <typename Float, int nColor, QudaReconstructType recon> class GaugeShifter : public TunableKernel3D
+  template <typename Float, int nColor> class GaugeShifter : public TunableKernel3D
   {
     GaugeField &out;
     const GaugeField &in;
@@ -28,8 +28,22 @@ namespace quda
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      GaugeShiftArg<Float, nColor, recon> arg(out, in, shift);
-      launch<GaugeShift>(tp, stream, arg);
+      if (in.Reconstruct() == QUDA_RECONSTRUCT_NO) { // runtime reconstruct type selects the compile-time arg type
+        GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_NO> arg(out, in, shift);
+        launch<GaugeShift>(tp, stream, arg);
+      } else if (in.Reconstruct() == QUDA_RECONSTRUCT_13) {
+        GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_13> arg(out, in, shift);
+        launch<GaugeShift>(tp, stream, arg);
+      } else if (in.Reconstruct() == QUDA_RECONSTRUCT_12) {
+        GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_12> arg(out, in, shift);
+        launch<GaugeShift>(tp, stream, arg);
+      } else if (in.Reconstruct() == QUDA_RECONSTRUCT_9) {
+        GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_9> arg(out, in, shift);
+        launch<GaugeShift>(tp, stream, arg);
+      } else if (in.Reconstruct() == QUDA_RECONSTRUCT_8) {
+        GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_8> arg(out, in, shift);
+        launch<GaugeShift>(tp, stream, arg);
+      } else { errorQuda("Unsupported reconstruct type %d", in.Reconstruct()); } // else out would stay unwritten
     }
 
     long long bytes() const { return out.Bytes() + in.Bytes(); }
@@ -45,7 +59,7 @@ namespace quda
     GaugeFieldParam param(in);
     param.create = QUDA_NULL_FIELD_CREATE;
     GaugeField out(param);
-    instantiate<GaugeShifter, ReconstructGauge>(out, in, shift);
+    instantiate<GaugeShifter>(out, in, shift);
     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     return out;
   }

From 721fbd523d6a75b5326c5e028c0814a729159aaf Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 20 Oct 2025 16:29:52 -0700
Subject: [PATCH 012/121] make doBulk and doHalo constexpr

---
 include/dslash_helper.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index 1714dd64d5..0a207e376e 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -26,7 +26,7 @@ namespace quda
      @param[in] dim Dimension we are working on.  If dim=-1 (default
      argument) then we return true if type is any halo kernel.
   */
-  template <KernelType type> __host__ __device__ __forceinline__ bool doHalo(int dim = -1)
+  template <KernelType type> __host__ __device__ __forceinline__ constexpr bool doHalo(int dim = -1)
   {
     switch (type) {
     case EXTERIOR_KERNEL_ALL: return true;
@@ -44,7 +44,7 @@ namespace quda
      computation
      @param[in] dim Dimension we are working on
   */
-  template <KernelType type> __host__ __device__ __forceinline__ bool doBulk()
+  template <KernelType type> __host__ __device__ __forceinline__ constexpr bool doBulk()
   {
     switch (type) {
     case EXTERIOR_KERNEL_ALL:

From 02a4cb9cd3e65fd37fee0748f44fe272615ad07f Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 20 Oct 2025 17:30:00 -0700
Subject: [PATCH 013/121] Add target::is_thread_zero and target::is_lane_zero
 helper functions for executing single-thread regions of code.  On CUDA
 install the latest version of CCCL via CPM since we need some new features

---
 include/targets/cuda/target_device.h | 51 ++++++++++++++++++++++++++++
 include/targets/hip/target_device.h  | 10 ++++++
 lib/targets/cuda/target_cuda.cmake   |  7 ++++
 3 files changed, 68 insertions(+)

diff --git a/include/targets/cuda/target_device.h b/include/targets/cuda/target_device.h
index ee7c646172..ac3b47dfdc 100644
--- a/include/targets/cuda/target_device.h
+++ b/include/targets/cuda/target_device.h
@@ -7,6 +7,8 @@
 #include <nv/target>
 #endif
 
+#include <cuda/ptx>
+
 #if defined(__CUDACC__) || defined(_NVHPC_CUDA) || (defined(__clang__) && defined(__CUDA__))
 #define QUDA_CUDA_CC
 #endif
@@ -171,6 +173,55 @@ namespace quda
       }
     }
 
+    template <bool is_device> struct is_thread_zero_impl { // generic (non-specialized) case: always reports true
+      template <class T> bool operator()(const T &) { return true; }
+    };
+
+#ifdef QUDA_CUDA_CC
+    template <> struct is_thread_zero_impl<true> { // is_device == true specialization
+      template <class T> __device__ bool operator()(const T &)
+      { // NOTE(review): elect_sync maps to PTX elect.sync (sm_90+) - confirm behaviour when targeting older GPUs
+        unsigned int tid = thread_idx_linear<T::value>(); // T::value carries the dim argument of is_thread_zero
+        unsigned int warp_id = tid / 32;                  // 32 = hard-coded warp size
+        unsigned int uniform_warp_id = __shfl_sync(0xFFFFFFFF, warp_id, 0); // Broadcast from lane 0
+        // unsigned int uniform_warp_id = __reduce_min_sync(~0, warp_id == 0); perhaps faster on sm_100
+        return (uniform_warp_id == 0 && cuda::ptx::elect_sync(0xFFFFFFFF)); // only in warp 0, on the elected lane
+      }
+    };
+#endif
+
+    /**
+       @brief Return true only for a single thread in a thread block.
+       This function assumes all warps in the thread block are
+       converged.  Note that the single thread that is returned is not
+       necessarily thread 0 in the thread block.
+       @tparam dim The dimension of the thread block
+       @return true for a single thread in the thread block, else
+       false
+    */
+    template <int dim = 3> __device__ __host__ inline bool is_thread_zero()
+    {
+      return target::dispatch<is_thread_zero_impl>(std::integral_constant<int, dim>());
+    }
+
+    template <bool is_device> struct is_lane_zero_impl { // generic (non-specialized) case: always reports true
+      bool operator()() { return true; }
+    };
+#ifdef QUDA_CUDA_CC
+    template <> struct is_lane_zero_impl<true> { // is_device == true specialization
+      __device__ bool operator()() { return cuda::ptx::elect_sync(0xFFFFFFFF); } // full-warp member mask
+    };
+#endif
+
+    /**
+       @brief Return true only for a single lane in a warp.
+       This function assumes the warp is converged.
+       Note that the single thread that is returned is not
+       necessarily lane 0 in the warp.
+       @return true for a single lane in the warp, else false
+    */
+    __device__ __host__ inline bool is_lane_zero() { return target::dispatch<is_lane_zero_impl>(); }
+
     template <class T> constexpr bool vectorize()
     {
 #ifdef QUDA_VECTORIZE_SINGLE
diff --git a/include/targets/hip/target_device.h b/include/targets/hip/target_device.h
index 897c9bdae1..4075604cf2 100644
--- a/include/targets/hip/target_device.h
+++ b/include/targets/hip/target_device.h
@@ -135,6 +135,16 @@ namespace quda
       }
     }
 
+    template <int dim = 3> __device__ __host__ inline bool is_thread_zero()
+    {
+      return thread_idx_linear<dim>() == 0; // true only for the thread whose linear index is zero
+    }
+
+    __device__ __host__ inline bool is_lane_zero() // plain function: it has no template parameters
+    {
+      return (thread_idx_linear<3>() % 64) == 0; // 64 = hard-coded lane count; switch this to warp_size
+    }
+
   } // namespace target
 
   namespace device
diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake
index 0c5fcb46a3..db6d52ed0a 100644
--- a/lib/targets/cuda/target_cuda.cmake
+++ b/lib/targets/cuda/target_cuda.cmake
@@ -419,6 +419,13 @@ if(CUDAToolkit_FOUND)
   target_link_libraries(quda INTERFACE CUDA::cudart_static)
 endif()
 
+CPMAddPackage(
+    NAME CCCL
+    GITHUB_REPOSITORY nvidia/cccl
+    GIT_TAG main # Fetches the latest commit on the main branch. NOTE(review): moving target, consider pinning a tag
+)
+target_link_libraries(quda PRIVATE CCCL::CCCL)
+
 # nvshmem enabled parts need SEPARABLE_COMPILATION ...
 if(QUDA_NVSHMEM)
   list(APPEND QUDA_DSLASH_OBJS dslash_constant_arg.cu)

From 33b5f2f739fc477c45155770744e649f321f6ade Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 21 Oct 2025 12:48:57 -0700
Subject: [PATCH 014/121] Expose prefetching instructions

---
 include/targets/cuda/load_store.h    | 18 ++++++++++++++++++
 include/targets/generic/load_store.h | 15 +++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/include/targets/cuda/load_store.h b/include/targets/cuda/load_store.h
index 29b2e50be3..161c93cbe5 100644
--- a/include/targets/cuda/load_store.h
+++ b/include/targets/cuda/load_store.h
@@ -156,6 +156,24 @@ namespace quda
     }
   };
 
+  // pre-declaration of the prefetch_cache that we wish to specialize
+  template <bool> struct prefetch_cache_line_imp;
+
+  // CUDA specialization of the prefetch_cache that uses inline ptx
+  template <> struct prefetch_cache_line_imp<true> {
+    __device__ inline void operator()(const void *p) { prefetch_L2(p); }
+  };
+
+  // pre-declaration of the prefetch_cache_bulk that we wish to specialize
+  template <bool> struct prefetch_cache_bulk_imp;
+
+#if __COMPUTE_CAPABILITY__ >= 900
+  // CUDA specialization of the prefetch_cache_bulk that uses TMA (requires Hopper+)
+  template <> struct prefetch_cache_bulk_imp<true> {
+    __device__ inline void operator()(const void *p, size_t bytes) { prefetch_tma(p, bytes); }
+  };
+#endif
+
 } // namespace quda
 
 #include "../generic/load_store.h"
diff --git a/include/targets/generic/load_store.h b/include/targets/generic/load_store.h
index 93b847a4db..8254509e74 100644
--- a/include/targets/generic/load_store.h
+++ b/include/targets/generic/load_store.h
@@ -64,4 +64,19 @@ namespace quda
     vector_store<vector_t>(ptr, idx, value_v);
   }
 
+  template <bool is_device> struct prefetch_cache_line_imp {
+    __device__ __host__ inline void operator()(const void *) { }
+  };
+
+  __device__ __host__ inline void prefetch_cache_line(const void *p) { target::dispatch<prefetch_cache_line_imp>(p); }
+
+  template <bool is_device> struct prefetch_cache_bulk_imp {
+    __device__ __host__ inline void operator()(const void *, size_t) { }
+  };
+
+  __device__ __host__ inline void prefetch_cache_bulk(const void *p, size_t bytes)
+  {
+    target::dispatch<prefetch_cache_bulk_imp>(p, bytes);
+  }
+
 } // namespace quda

From ccf7a552a7d12ebf33cc5066f76998584946c158 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 21 Oct 2025 12:52:51 -0700
Subject: [PATCH 015/121] Add prefetching support to gauge and colorspinor
 fields

---
 include/color_spinor_field_order.h | 15 ++++++++++++++
 include/gauge_field_order.h        | 33 ++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index 008c25db8b..a227e5a829 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -1181,6 +1181,21 @@ namespace quda
         for (int i = 0; i < length / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]);
       }
 
+      __device__ __host__ inline void prefetch(int x, int parity = 0) const
+      { // issue cache-line prefetches for the addresses belonging to site x on the given parity
+#ifndef LEGACY_ACCESSOR_NORM
+        auto norm_offset = offset / (sizeof(Float) < sizeof(float) ? sizeof(norm_type) / sizeof(Float) : 1);
+        auto norm = reinterpret_cast<float *>(field + volumeCB * (2 * Nc * Ns));
+#endif
+        if constexpr (isFixed<Float>::value) prefetch_cache_line(norm + x + parity * norm_offset);
+        // one prefetch per i in [0, M), at element offset (volumeCB * i + x) * N within this parity
+#pragma unroll
+        for (int i = 0; i < M; i++) prefetch_cache_line(field + parity * offset + (volumeCB * i + x) * N);
+
+        // now prefetch any remainder (only compiled in when Nrem > 0)
+        if constexpr (Nrem > 0) prefetch_cache_line(field + parity * offset + volumeCB * M * N + x * Nrem);
+      }
+
       __device__ __host__ inline void save(const complex in[length / 2], int x, int parity = 0) const
       {
         real v[length];
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 4561f1f21f..7f8dc28ac5 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1632,6 +1632,39 @@ namespace quda {
         reconstruct.Unpack(v, tmp, x, dir, phase, X, R);
       }
 
+      __device__ inline void prefetch(int x, int dir, int parity) const
+      { // issue cache-line prefetches for the link at site x, direction dir, on the given parity
+#pragma unroll
+        for (int i = 0; i < M; i++)
+          prefetch_cache_line(gauge + parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N);
+
+        // now prefetch any remainder (only compiled in when Nrem > 0)
+        if constexpr (Nrem > 0)
+          prefetch_cache_line(gauge + parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem);
+        // the entry at phaseOffset is prefetched only when load_phase evaluates to true at compile time
+        constexpr bool load_phase = (hasPhase && !(static_phase<stag_phase>() && (reconLen == 13 || use_inphase)));
+        if constexpr (load_phase) prefetch_cache_line(gauge + parity * offset + phaseOffset + stride * dir + x);
+      }
+
+      __device__ inline void prefetch_bulk(int x, int dir, int parity, int block_size) const
+      { // bulk-prefetch a span of block_size sites starting at x; requests are issued by a single thread
+        if (target::is_thread_zero()) {
+#pragma unroll
+          for (int i = 0; i < M; i++)
+            prefetch_cache_bulk(gauge + parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N,
+                                block_size * N * sizeof(Float));
+
+          // now prefetch any remainder (only compiled in when Nrem > 0)
+          if constexpr (Nrem > 0)
+            prefetch_cache_bulk(gauge + parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem,
+                                block_size * Nrem * sizeof(Float));
+
+          constexpr bool load_phase = (hasPhase && !(static_phase<stag_phase>() && (reconLen == 13 || use_inphase)));
+          if constexpr (load_phase) // the span must start at site x, exactly as in prefetch()
+            prefetch_cache_bulk(gauge + parity * offset + phaseOffset + stride * dir + x, block_size * sizeof(Float));
+        }
+      }
+
       __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const
       {
         real tmp[reconLen];

From 0642f638bb761ed991dff43664d731c93a76fa41 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 21 Oct 2025 14:50:34 -0700
Subject: [PATCH 016/121] Add L2 gauge-field prefetching support to both Wilson
 and staggered dslash kernels.  Disabled by default (set with the
 Arg::prefetch_distance parameter), and TMA prefetch will be added in next
 push

---
 include/dslash.h                      |   5 ++
 include/dslash_helper.cuh             |   4 +
 include/index_helper.cuh              |   1 +
 include/kernel_helper.h               |   2 +
 include/kernels/dslash_staggered.cuh  | 124 +++++++++++++++++++-------
 include/kernels/dslash_wilson.cuh     |  62 +++++++++++--
 include/targets/cuda/tunable_kernel.h |   8 +-
 include/targets/hip/tunable_kernel.h  |  13 ++-
 8 files changed, 176 insertions(+), 43 deletions(-)

diff --git a/include/dslash.h b/include/dslash.h
index 3e0906810d..d34d83d42a 100644
--- a/include/dslash.h
+++ b/include/dslash.h
@@ -76,6 +76,11 @@ namespace quda
 #ifdef QUDA_DSLASH_DOUBLE_STORE
       strcat(aux_base, ",double_store");
 #endif
+      if constexpr (Arg::prefetch_distance > 0) {
+        strcat(aux_base, ",prefetch=");
+        i32toa(tile_str, Arg::prefetch_distance);
+        strcat(aux_base, tile_str);
+      }
     }
 
     /**
diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index 0a207e376e..314c27c43c 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -109,6 +109,7 @@ namespace quda
 
     if (kernel_type == INTERIOR_KERNEL) {
       coord.x_cb = idx;
+      coord.x_cb_0 = (target::block_idx().x - arg.pack_blocks) * target::block_dim().x;
       if (nDim == 5)
         coord.X = getCoords5CB(coord, idx, arg.dc.X, arg.X0h, parity, pc_type);
       else
@@ -298,6 +299,7 @@ namespace quda
     static constexpr int n_src_tile = n_src_tile_; // how many RHS per thread
     static constexpr int max_regs = 0;             // by default we don't limit register count
     static constexpr bool spill_shared = false;    // whether a given kernel should use shared memory spilling
+    static constexpr int prefetch_distance = 0;    // whether we are using prefetching in the dslash
 
     const int parity;  // only use this for single parity fields
     const int nParity; // number of parities we're working on
@@ -340,6 +342,7 @@ namespace quda
     int pack_blocks = 0;   // total number of blocks used for packing in the dslash
     int exterior_dims = 0; // dimension to run in the exterior Dslash
     int exterior_blocks = 0;
+    int block_size = 0;
 
     DDArg dd_out;
     DDArg dd_in;
@@ -707,6 +710,7 @@ namespace quda
     static constexpr KernelType kernel_type = kernel_type_;
     static constexpr int max_regs = Arg::max_regs;
     static constexpr bool spill_shared = Arg::spill_shared;
+    static constexpr bool is_dslash = true;
     Arg arg;
 
     dslash_functor_arg(const Arg &arg, unsigned int threads_x) :
diff --git a/include/index_helper.cuh b/include/index_helper.cuh
index 5ea718aa8c..c27215ce4e 100644
--- a/include/index_helper.cuh
+++ b/include/index_helper.cuh
@@ -234,6 +234,7 @@ namespace quda {
     array<int, nDim> gx = {};   // nDim global lattice coordinates
     array<int, nDim> gDim = {}; // global lattice dimensions
     int x_cb;    // checkerboard lattice site index
+    int x_cb_0;  // value of x_cb on first thread in block
     int s;       // fifth dimension coord
     int X;       // full lattice site index
     constexpr const int& operator[](int i) const { return x[i]; }
diff --git a/include/kernel_helper.h b/include/kernel_helper.h
index 14727c327a..bf8fd17d2a 100644
--- a/include/kernel_helper.h
+++ b/include/kernel_helper.h
@@ -19,7 +19,9 @@ namespace quda
     static constexpr bool check_bounds = check_bounds_;
     static constexpr int max_regs = 0;          // by default we don't limit register count
     static constexpr bool spill_shared = false; // whether a given kernel should use shared memory spilling
+    static constexpr bool is_dslash = false;    // whether the arg is for a dslash (with its nested arg struct)
     dim3 threads;          /** number of active threads required */
+    int block_size;        /** product of thread block dimensions */
     int comms_rank;        /** per process value of comm_rank() */
     int comms_rank_global; /** per process value comm_rank_global() */
     int comms_coord[4];    /** array storing {comm_coord(0), ..., comm_coord(3)} */
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index ebb55b9fff..fd383c0d3f 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -52,6 +52,7 @@ namespace quda
     const bool is_first_time_slice; /** are we on the first (global) time slice */
     const bool is_last_time_slice; /** are we on the last (global) time slice */
     static constexpr bool improved = improved_;
+    static constexpr int prefetch_distance = 0;
 
     const real dagger_scale;
 
@@ -75,6 +76,43 @@ namespace quda
     }
   };
 
+  /**
+     @brief Prefetch the gauge field into cache.
+     @param[in] dim The dimension we are presently working on
+     @param[in] dir The direction we are presently working on (0 = forwards, 1 = backwards)
+     @param[in] hop The hopping term we are presently working on (0 = 1-hop, 1 = 3-hop)
+     @param[in] coord Coordinates that we are working on with hop-3 boundary conditions evaluated
+     @param[in] coord1 Copy of coordinates that we are working on with hop-1 boundary conditions evaluated
+     @param[in] parity Parity that we are working on
+     @param[in] arg Parameter struct
+   */
+  template <class coord_t, class Arg>
+  __device__ __host__ void prefetch(int dim, int dir, int hop, const coord_t &coord, const coord_t &coord1, int parity,
+                                    const Arg &arg)
+  {
+    // Compile-time guard: the body is discarded unless prefetching is enabled, and only the improved operator
+    // prefetches.  Arg:: is used so that the condition does not name the reference parameter arg.
+    if constexpr (Arg::prefetch_distance != 0 && Arg::improved) {
+      int step = 4 * dim + 2 * dir + hop + Arg::prefetch_distance; // term to prefetch, prefetch_distance ahead
+      if (step >= 16) return;                                      // past the last of the 4 * 4 terms
+
+      // for TMA use arg.block_size and coord.x_cb_0
+      // also should have warp uniform parity
+      int dim2 = step / 4;
+      switch (step % 4) {
+      case 0: arg.U.prefetch(coord.x_cb, dim2, parity); break;
+      case 1: arg.L.prefetch(coord.x_cb, dim2, parity); break;
+#ifdef QUDA_DSLASH_DOUBLE_STORE
+      case 2: arg.Uback.prefetch(coord.x_cb, dim2, parity); break;
+      case 3: arg.Lback.prefetch(coord.x_cb, dim2, parity); break;
+#else
+      case 2: arg.U.prefetch(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity); break;
+      case 3: arg.L.prefetch(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity); break;
+#endif
+      }
+    }
+  }
+
   /**
      @brief Applies the off-diagonal part of the Staggered / Asqtad
      operator.
@@ -107,7 +145,8 @@ namespace quda
 
       // standard - forward direction
       if (arg.dd_in.doHopping(coord, d, +1)) {
-        const bool ghost = (coord[d] + 1 >= arg.dc.X[d]) && isActive<kernel_type>(active, thread_dim, d, coord, arg);
+        const bool ghost = coord1.in_boundary[1][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
+
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dc.X, d, 1);
           const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg));
@@ -116,14 +155,20 @@ namespace quda
             Vector in = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.dc.ghostFaceCB[d], their_spinor_parity);
             out[s] = mv_add(U, in, out[s]);
           }
-        } else if (doBulk<kernel_type>() && !ghost) {
-          const int fwd_idx = getNeighborIndexCB<1>(coord1, d, 1, arg.dc);
-          const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg));
+        }
+
+        if constexpr (doBulk<kernel_type>()) {
+          if (!ghost) {
+            const int fwd_idx = getNeighborIndexCB<1>(coord1, d, 1, arg.dc);
+            const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) :
+                                          arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg));
 #pragma unroll
-          for (auto s = 0; s < n_src_tile; s++) {
-            Vector in = arg.in[src_idx + s](fwd_idx, their_spinor_parity);
-            out[s] = mv_add(U, in, out[s]);
+            for (auto s = 0; s < n_src_tile; s++) {
+              Vector in = arg.in[src_idx + s](fwd_idx, their_spinor_parity);
+              out[s] = mv_add(U, in, out[s]);
+            }
           }
+          prefetch(d, 0, 0, coord, coord1, parity, arg);
         }
       }
 
@@ -139,20 +184,25 @@ namespace quda
               = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.dc.ghostFaceCB[d], their_spinor_parity);
             out[s] = mv_add(L, in, out[s]);
           }
-        } else if (doBulk<kernel_type>() && !ghost) {
-          const int fwd3_idx = getNeighborIndexCB<3>(coord, d, 1, arg.dc);
-          const Link L = arg.L(d, coord.x_cb, parity);
+        }
+
+        if constexpr (doBulk<kernel_type>()) {
+          if (!ghost) {
+            const int fwd3_idx = getNeighborIndexCB<3>(coord, d, 1, arg.dc);
+            const Link L = arg.L(d, coord.x_cb, parity);
 #pragma unroll
-          for (auto s = 0; s < n_src_tile; s++) {
-            const Vector in = arg.in[src_idx + s](fwd3_idx, their_spinor_parity);
-            out[s] = mv_add(L, in, out[s]);
+            for (auto s = 0; s < n_src_tile; s++) {
+              const Vector in = arg.in[src_idx + s](fwd3_idx, their_spinor_parity);
+              out[s] = mv_add(L, in, out[s]);
+            }
           }
+          prefetch(d, 0, 1, coord, coord1, parity, arg);
         }
       }
 
       if (arg.dd_in.doHopping(coord, d, -1)) {
         // Backward gather - compute back offset for spinor and gauge fetch
-        const bool ghost = (coord[d] - 1 < 0) && isActive<kernel_type>(active, thread_dim, d, coord, arg);
+        const bool ghost = coord1.in_boundary[0][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
 
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx2 = ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 1);
@@ -169,21 +219,26 @@ namespace quda
             Vector in = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.dc.ghostFaceCB[d], their_spinor_parity);
             out[s] = mv_sub(conj(U), in, out[s]);
           }
-        } else if (doBulk<kernel_type>() && !ghost) {
-          const int back_idx = getNeighborIndexCB<1>(coord1, d, -1, arg.dc);
+        }
+
+        if constexpr (doBulk<kernel_type>()) {
+          if (!ghost) {
+            const int back_idx = getNeighborIndexCB<1>(coord1, d, -1, arg.dc);
 #ifdef QUDA_DSLASH_DOUBLE_STORE
-          const Link U = arg.improved ? arg.Uback(d, coord.x_cb, parity) :
-                                        arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg));
+            const Link U = arg.improved ? arg.Uback(d, coord.x_cb, parity) :
+                                          arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg));
 #else
-          const int gauge_idx = back_idx;
-          const Link U = arg.improved ? arg.U(d, gauge_idx, 1 - parity) :
-            arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg));
+            const int gauge_idx = back_idx;
+            const Link U = arg.improved ? arg.U(d, gauge_idx, 1 - parity) :
+                                          arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg));
 #endif
 #pragma unroll
-          for (auto s = 0; s < n_src_tile; s++) {
-            Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity);
-            out[s] = mv_sub(conj(U), in, out[s]);
+            for (auto s = 0; s < n_src_tile; s++) {
+              Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity);
+              out[s] = mv_sub(conj(U), in, out[s]);
+            }
           }
+          prefetch(d, 1, 0, coord, coord1, parity, arg);
         }
       }
 
@@ -203,19 +258,24 @@ namespace quda
               = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.dc.ghostFaceCB[d], their_spinor_parity);
             out[s] = mv_sub(conj(L), in, out[s]);
           }
-        } else if (doBulk<kernel_type>() && !ghost) {
-          const int back3_idx = getNeighborIndexCB<3>(coord, d, -1, arg.dc);
+        }
+
+        if constexpr (doBulk<kernel_type>()) {
+          if (!ghost) {
+            const int back3_idx = getNeighborIndexCB<3>(coord, d, -1, arg.dc);
 #ifdef QUDA_DSLASH_DOUBLE_STORE
-          const Link L = arg.Lback(d, coord.x_cb, parity);
+            const Link L = arg.Lback(d, coord.x_cb, parity);
 #else
-          const int gauge_idx = back3_idx;
-          const Link L = arg.L(d, gauge_idx, 1 - parity);
+            const int gauge_idx = back3_idx;
+            const Link L = arg.L(d, gauge_idx, 1 - parity);
 #endif
 #pragma unroll
-          for (auto s = 0; s < n_src_tile; s++) {
-            const Vector in = arg.in[src_idx + s](back3_idx, their_spinor_parity);
-            out[s] = mv_sub(conj(L), in, out[s]);
+            for (auto s = 0; s < n_src_tile; s++) {
+              const Vector in = arg.in[src_idx + s](back3_idx, their_spinor_parity);
+              out[s] = mv_sub(conj(L), in, out[s]);
+            }
           }
+          prefetch(d, 1, 1, coord, coord1, parity, arg);
         }
       }
     } // nDim
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index 04aa4f50fe..3a937394d5 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -43,6 +43,7 @@ namespace quda
     /** parameters for distance preconditioning */
     const real alpha0;
     const int t0;
+    static constexpr int prefetch_distance = 0;
 
     WilsonArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo,
               const GaugeField &U, const GaugeField &Uback, double a, cvector_ref<const ColorSpinorField> &x,
@@ -65,6 +66,41 @@ namespace quda
     }
   };
 
+  /**
+     @brief Issue a prefetch for the gauge link used Arg::prefetch_distance steps after the (dim, dir) step
+     @param[in] dim The dimension we are presently working on
+     @param[in] dir The direction we are presently working on (0 = forwards, 1 = backwards)
+     @param[in] coord Coordinates that we are working on
+     @param[in] parity Parity that we are working on
+     @param[in] arg Parameter struct
+  */
+  template <class coord_t, class Arg>
+  __device__ __host__ void prefetch(int dim, int dir, const coord_t &coord, int parity, const Arg &arg)
+  {
+    if constexpr (Arg::prefetch_distance == 0) return;
+
+    int step = 2 * dim + dir + arg.prefetch_distance;
+    if (step >= 8) return; // target lies beyond the last (dim, dir) step, nothing to prefetch
+
+    // for TMA use arg.block_size
+    int dim2 = step / 2;
+    // need warp uniform variants of these and parity
+    const int x_cb = (Arg::nDim == 5 ? coord.x_cb % arg.dc.volume_4d_cb : coord.x_cb);
+
+    switch (step % 2) {
+    case 0: arg.U.prefetch(x_cb, dim2, parity); break;
+#ifdef QUDA_DSLASH_DOUBLE_STORE
+    case 1: arg.Uback.prefetch(x_cb, dim2, parity); break;
+#else
+    case 1: {
+      const int back_idx = getNeighborIndexCB(coord, dim2, -1, arg.dc);
+      const int idx1 = (Arg::nDim == 5 ? back_idx % arg.dc.volume_4d_cb : back_idx);
+      arg.U.prefetch(idx1, dim2, 1 - parity);
+    } break;
+#endif
+    }
+  }
+
   /**
      @brief Applies the off-diagonal part of the Wilson operator
 
@@ -117,12 +153,16 @@ namespace quda
                                          their_spinor_parity);
 
           out += fwd_coeff * (U * in).reconstruct(d, proj_dir);
-        } else if (doBulk<kernel_type>() && !ghost) {
+        }
 
-          Link U = arg.U(d, gauge_idx, gauge_parity);
-          Vector in = arg.in[src_idx](fwd_idx + coord.s * arg.dc.volume_4d_cb, their_spinor_parity);
+        if constexpr (doBulk<kernel_type>()) {
+          if (!ghost) {
+            Link U = arg.U(d, gauge_idx, gauge_parity);
+            Vector in = arg.in[src_idx](fwd_idx + coord.s * arg.dc.volume_4d_cb, their_spinor_parity);
+            out += fwd_coeff * (U * in.project(d, proj_dir)).reconstruct(d, proj_dir);
+          }
 
-          out += fwd_coeff * (U * in.project(d, proj_dir)).reconstruct(d, proj_dir);
+          prefetch(d, 0, coord, parity, arg);
         }
       }
 
@@ -155,16 +195,20 @@ namespace quda
                                          their_spinor_parity);
 
           out += bwd_coeff * (conj(U) * in).reconstruct(d, proj_dir);
-        } else if (doBulk<kernel_type>() && !ghost) {
+        }
 
+        if constexpr (doBulk<kernel_type>()) { // compile-time branch, as in the forward direction
+          if (!ghost) {
 #ifdef QUDA_DSLASH_DOUBLE_STORE
-          Link U = arg.Uback(d, gauge_idx, gauge_parity);
+            Link U = arg.Uback(d, gauge_idx, gauge_parity);
 #else
-          Link U = arg.U(d, gauge_idx, 1 - gauge_parity);
+            Link U = arg.U(d, gauge_idx, 1 - gauge_parity);
 #endif
-          Vector in = arg.in[src_idx](back_idx + coord.s * arg.dc.volume_4d_cb, their_spinor_parity);
+            Vector in = arg.in[src_idx](back_idx + coord.s * arg.dc.volume_4d_cb, their_spinor_parity);
+            out += bwd_coeff * (conj(U) * in.project(d, proj_dir)).reconstruct(d, proj_dir);
+          }
 
-          out += bwd_coeff * (conj(U) * in.project(d, proj_dir)).reconstruct(d, proj_dir);
+          prefetch(d, 1, coord, parity, arg);
         }
       }
     } // nDim
diff --git a/include/targets/cuda/tunable_kernel.h b/include/targets/cuda/tunable_kernel.h
index 55219f5f93..46b599254e 100644
--- a/include/targets/cuda/tunable_kernel.h
+++ b/include/targets/cuda/tunable_kernel.h
@@ -57,6 +57,8 @@ namespace quda
     launch_device(const kernel_t &kernel, const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
     {
       checkSharedBytes<Functor>(tp, arg);
+      const_cast<Arg &>(arg).block_size = tp.block.x * tp.block.y * tp.block.z;
+      if constexpr (Arg::is_dslash) const_cast<Arg &>(arg).arg.block_size = arg.block_size;
 #ifdef JITIFY
       launch_error = launch_jitify<Functor, grid_stride, Arg>(kernel.name, tp, stream, arg);
 #else
@@ -66,7 +68,7 @@ namespace quda
       return launch_error;
     }
 
-    template <typename Arg, size_t arg_size = sizeof(Arg)> void check_arg_size(Arg&)
+    template <typename Arg, size_t arg_size = sizeof(Arg)> void check_arg_size(Arg &)
     {
       static_assert(sizeof(Arg) <= device::max_constant_size(), "Parameter struct is greater than max constant size");
     }
@@ -76,6 +78,8 @@ namespace quda
     launch_device(const kernel_t &kernel, const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
     {
       checkSharedBytes<Functor>(tp, arg);
+      const_cast<Arg &>(arg).block_size = tp.block.x * tp.block.y * tp.block.z;
+      if constexpr (Arg::is_dslash) const_cast<Arg &>(arg).arg.block_size = arg.block_size;
 #ifdef JITIFY
       // note we do the copy to constant memory after the kernel has been compiled in launch_jitify
       launch_error = launch_jitify<Functor, grid_stride, Arg>(kernel.name, tp, stream, arg);
@@ -99,6 +103,8 @@ namespace quda
     void launch_cuda(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg) const
     {
       checkSharedBytes<Functor>(tp, arg);
+      const_cast<Arg &>(arg).block_size = tp.block.x * tp.block.y * tp.block.z;
+      if constexpr (Arg::is_dslash) const_cast<Arg &>(arg).arg.block_size = arg.block_size;
       constexpr bool grid_stride = false;
       const_cast<TunableKernel *>(this)->launch_device<Functor, grid_stride>(KERNEL(raw_kernel), tp, stream, arg);
     }
diff --git a/include/targets/hip/tunable_kernel.h b/include/targets/hip/tunable_kernel.h
index 5447eeb25b..bb8b08f56b 100644
--- a/include/targets/hip/tunable_kernel.h
+++ b/include/targets/hip/tunable_kernel.h
@@ -54,17 +54,26 @@ namespace quda
     launch_device(const kernel_t &kernel, const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
     {
       checkSharedBytes<Functor>(tp, arg);
+      const_cast<Arg &>(arg).block_size = tp.block.x * tp.block.y * tp.block.z;
+      if constexpr (Arg::is_dslash) const_cast<Arg &>(arg).arg.block_size = arg.block_size;
       setMaxActiveBlocks(kernel, tp);
       launch_error = qudaLaunchKernel(kernel, tp, stream, static_cast<const void *>(&arg));
       return launch_error;
     }
 
+    template <typename Arg, size_t arg_size = sizeof(Arg)> void check_arg_size(Arg &)
+    {
+      static_assert(sizeof(Arg) <= device::max_constant_size(), "Parameter struct is greater than max constant size");
+    }
+
     template <template <typename> class Functor, bool grid_stride, typename Arg>
     std::enable_if_t<!device::use_kernel_arg<Arg>(), qudaError_t>
     launch_device(const kernel_t &kernel, const TuneParam &tp, const qudaStream_t &stream, const Arg &arg)
     {
       checkSharedBytes<Functor>(tp, arg);
-      static_assert(sizeof(Arg) <= device::max_constant_size(), "Parameter struct is greater than max constant size");
+      const_cast<Arg &>(arg).block_size = tp.block.x * tp.block.y * tp.block.z;
+      if constexpr (Arg::is_dslash) const_cast<Arg &>(arg).arg.block_size = arg.block_size;
+      check_arg_size(arg);
       qudaMemcpyAsync(device::get_constant_buffer<Arg>(), &arg, sizeof(Arg), qudaMemcpyHostToDevice, stream);
       setMaxActiveBlocks(kernel, tp);
       launch_error = qudaLaunchKernel(kernel, tp, stream, static_cast<const void *>(&arg));
@@ -82,6 +91,8 @@ namespace quda
     void launch_cuda(const TuneParam &tp, const qudaStream_t &stream, const Arg &arg) const
     {
       checkSharedBytes<Functor>(tp, arg);
+      const_cast<Arg &>(arg).block_size = tp.block.x * tp.block.y * tp.block.z;
+      if constexpr (Arg::is_dslash) const_cast<Arg &>(arg).arg.block_size = arg.block_size;
       constexpr bool grid_stride = false;
       const_cast<TunableKernel *>(this)->launch_device<Functor, grid_stride>(KERNEL(raw_kernel), tp, stream, arg);
     }

From 72a001fb431a4bf09165d53d3bcdef834dbdb1c3 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 23 Oct 2025 13:23:17 -0700
Subject: [PATCH 017/121] QUDA_DSLASH_DOUBLE_STORE is now a CMake parameter

---
 CMakeLists.txt   | 5 ++++-
 include/dslash.h | 3 ---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3fa2a949c..7f8f1eca73 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -153,6 +153,9 @@ option(QUDA_DIRAC_COVDEV "build code for covariant derivative" ${QUDA_DIRAC_DEFA
 option(QUDA_DIRAC_DISTANCE_PRECONDITIONING "build code for distance preconditioned Wilson/clover Dirac operators" OFF)
 set(QUDA_DOMAIN_DECOMPOSITION "0" CACHE STRING "which domain decomposition to instantiate in QUDA (1-bit number - RedBlack)")
 
+option(QUDA_DSLASH_DOUBLE_STORE "store a forwards shifted copy of the gauge fields for simplified Dslash indexing" OFF)
+mark_as_advanced(QUDA_DSLASH_DOUBLE_STORE)
+
 option(QUDA_QIO "build QIO code for binary I/O" OFF)
 
 # Multi-GPU options
@@ -236,7 +239,7 @@ option(QUDA_CTEST_SEP_DSLASH_POLICIES "Test Dslash policies separately in ctest
 option(QUDA_CTEST_DISABLE_BENCHMARKS "Disable benchmark test" ON)
 
 option(QUDA_FAST_COMPILE_REDUCE "enable fast compilation in blas and reduction kernels (single warp per reduction)" OFF)
-option(QUDA_FAST_COMPILE_DSLASH "enable fast compilation in dslash kernels (~20% perf impact)" OFF)
+option(QUDA_FAST_COMPILE_DSLASH "enable fast compilation in coarse grid dslash kernels (significant perf impact)" OFF)
 
 option(QUDA_OPENMP "enable OpenMP" OFF)
 set(QUDA_CXX_STANDARD
diff --git a/include/dslash.h b/include/dslash.h
index d34d83d42a..a33c9a821d 100644
--- a/include/dslash.h
+++ b/include/dslash.h
@@ -9,9 +9,6 @@
 #include <instantiate.h>
 #include <instantiate_dslash.h>
 
-// enable experimental double store of gauge fields
-//#define QUDA_DSLASH_DOUBLE_STORE
-
 namespace quda
 {
 

From 02e7bc324e8643c6077008b28455ee19826e0ab3 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 23 Oct 2025 14:25:26 -0700
Subject: [PATCH 018/121] Add TMA prefetch support for Wilson and staggered
 fermions (enabled with QUDA_DSLASH_PREFETCH_BULK=ON).  Prefetch distance is
 now set via CMake (QUDA_DSLASH_PREFETCH_DISTANCE_WILSON and
 QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED)

---
 CMakeLists.txt                       | 13 ++++++++
 include/dslash_helper.cuh            | 14 ++++++++-
 include/gauge_field_order.h          | 46 +++++++++++++++-------------
 include/kernels/dslash_staggered.cuh | 24 +++++++++------
 include/kernels/dslash_wilson.cuh    | 13 +++++---
 include/quda_define.h.in             | 31 +++++++++++++++++++
 6 files changed, 104 insertions(+), 37 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f8f1eca73..6fbf8dddbd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -155,6 +155,19 @@ set(QUDA_DOMAIN_DECOMPOSITION "0" CACHE STRING "which domain decomposition to in
 
 option(QUDA_DSLASH_DOUBLE_STORE "store a forwards shifted copy of the gauge fields for simplified Dslash indexing" OFF)
 mark_as_advanced(QUDA_DSLASH_DOUBLE_STORE)
+option(QUDA_DSLASH_PREFETCH_BULK "enable bulk prefetching (Hopper+)" OFF)
+mark_as_advanced(QUDA_DSLASH_PREFETCH_BULK)
+
+set(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON "0" CACHE STRING "set prefetch distance for Wilson-like fermions")
+set(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED "0" CACHE STRING "set prefetch distance for staggered-like fermions")
+mark_as_advanced(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON)
+mark_as_advanced(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED)
+if(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON GREATER 7)
+  message(SEND_ERROR "QUDA_DSLASH_PREFETCH_DISTANCE_WILSON is greater than pipeline length")
+endif()
+if(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED GREATER 15)
+  message(SEND_ERROR "QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED is greater than pipeline length")
+endif()
 
 option(QUDA_QIO "build QIO code for binary I/O" OFF)
 
diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index 314c27c43c..cd8f2cbcb9 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -300,7 +300,14 @@ namespace quda
     static constexpr int max_regs = 0;             // by default we don't limit register count
     static constexpr bool spill_shared = false;    // whether a given kernel should use shared memory spilling
     static constexpr int prefetch_distance = 0;    // whether we are using prefetching in the dslash
-
+#ifdef QUDA_DSLASH_PREFETCH_BULK
+    static constexpr bool prefetch_bulk = true;
+#ifndef QUDA_DSLASH_DOUBLE_STORE
+    static_assert(!bulk, "Cannot use bulk prefetching unless QUDA_DSLASH_DOUBLE_STORE is enabled");
+#endif
+#else
+    static constexpr bool prefetch_bulk = false;
+#endif
     const int parity;  // only use this for single parity fields
     const int nParity; // number of parities we're working on
     const QudaReconstructType reconstruct;
@@ -743,6 +750,11 @@ namespace quda
       // for full fields set parity from z thread index else use arg setting
       if (arg.nParity == 1) parity = arg.parity;
 
+      // FIXME need warp uniform parity which is not composable with
+      // NVSHMEM since the latter requires blockDim.y and blockDim.z to
+      // cover the entire extent
+      parity = target::block_idx().z; // ensure parity is warp uniform
+
       if ((kernel_type == INTERIOR_KERNEL || kernel_type == UBER_KERNEL) &&
           target::block_idx().x < static_cast<unsigned int>(arg.pack_blocks)) {
         // first few blocks do packing kernel
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 7f8dc28ac5..aa398df31e 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1632,36 +1632,38 @@ namespace quda {
         reconstruct.Unpack(v, tmp, x, dir, phase, X, R);
       }
 
-      __device__ inline void prefetch(int x, int dir, int parity) const
+      template <bool bulk = false> __device__ inline void prefetch(int x, int dir, int parity, int block_size = 0) const
       {
-#pragma unroll
-        for (int i = 0; i < M; i++)
-          prefetch_cache_line(gauge + parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N);
-
-        // now load any remainder
-        if constexpr (Nrem > 0)
-          prefetch_cache_line(gauge + parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem);
-
-        constexpr bool load_phase = (hasPhase && !(static_phase<stag_phase>() && (reconLen == 13 || use_inphase)));
-        if constexpr (load_phase) prefetch_cache_line(gauge + parity * offset + phaseOffset + stride * dir + x);
-      }
-
-      __device__ inline void prefetch_bulk(int x, int dir, int parity, int block_size) const
-      {
-        if (target::is_thread_zero()) {
+        if constexpr (!bulk) {
+          // use per thread prefetching
 #pragma unroll
           for (int i = 0; i < M; i++)
-            prefetch_cache_bulk(gauge + parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N,
-                                block_size * N * sizeof(Float));
+            prefetch_cache_line(gauge + parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N);
 
           // now load any remainder
           if constexpr (Nrem > 0)
-            prefetch_cache_bulk(gauge + parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem,
-                                block_size * Nrem * sizeof(Float));
+            prefetch_cache_line(gauge + parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem);
 
           constexpr bool load_phase = (hasPhase && !(static_phase<stag_phase>() && (reconLen == 13 || use_inphase)));
-          if constexpr (load_phase)
-            prefetch_cache_bulk(gauge + parity * offset + phaseOffset + stride * dir, block_size * sizeof(Float));
+          if constexpr (load_phase) prefetch_cache_line(gauge + parity * offset + phaseOffset + stride * dir + x);
+        } else {
+          // bulk prefetch
+          if (block_size == 0) block_size = blockDim.x;
+          if (target::is_thread_zero()) {
+#pragma unroll
+            for (int i = 0; i < M; i++)
+              prefetch_cache_bulk(gauge + parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N,
+                                  block_size * N * sizeof(Float));
+
+            // now load any remainder
+            if constexpr (Nrem > 0)
+              prefetch_cache_bulk(gauge + parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem,
+                                  block_size * Nrem * sizeof(Float));
+
+            constexpr bool load_phase = (hasPhase && !(static_phase<stag_phase>() && (reconLen == 13 || use_inphase)));
+            if constexpr (load_phase)
+              prefetch_cache_bulk(gauge + parity * offset + phaseOffset + stride * dir, block_size * sizeof(Float));
+          }
         }
       }
 
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index fd383c0d3f..52fd2b76bf 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -52,7 +52,7 @@ namespace quda
     const bool is_first_time_slice; /** are we on the first (global) time slice */
     const bool is_last_time_slice; /** are we on the last (global) time slice */
     static constexpr bool improved = improved_;
-    static constexpr int prefetch_distance = 0;
+    static constexpr int prefetch_distance = QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED;
 
     const real dagger_scale;
 
@@ -96,18 +96,24 @@ namespace quda
       int step = 4 * dim + 2 * dir + hop + arg.prefetch_distance;
       if (step >= 16) return;
 
-      // for TMA use arg.block_size and coord.x_cb_0
-      // also should have warp uniform parity
+      // if using a bulk prefetch we need to use block's first coordinate
+      auto x_cb = arg.prefetch_bulk ? coord.x_cb_0 : coord.x_cb;
+      x_cb = (Arg::nDim == 5 ? x_cb % arg.dc.volume_4d_cb : x_cb);
+
       int dim2 = step / 4;
       switch (step % 4) {
-      case 0: arg.U.prefetch(coord.x_cb, dim2, parity); break;
-      case 1: arg.L.prefetch(coord.x_cb, dim2, parity); break;
+      case 0: arg.U.template prefetch<Arg::prefetch_bulk>(x_cb, dim2, parity); break;
+      case 1: arg.L.template prefetch<Arg::prefetch_bulk>(x_cb, dim2, parity); break;
 #ifdef QUDA_DSLASH_DOUBLE_STORE
-      case 2: arg.Uback.prefetch(coord.x_cb, dim2, parity); break;
-      case 3: arg.Lback.prefetch(coord.x_cb, dim2, parity); break;
+      case 2: arg.Uback.template prefetch<Arg::prefetch_bulk>(x_cb, dim2, parity); break;
+      case 3: arg.Lback.template prefetch<Arg::prefetch_bulk>(x_cb, dim2, parity); break;
 #else
-      case 2: arg.U.prefetch(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity); break;
-      case 3: arg.L.prefetch(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity); break;
+      case 2:
+        arg.U.template prefetch<Arg::prefetch_bulk>(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity);
+        break;
+      case 3:
+        arg.L.template prefetch<Arg::prefetch_bulk>(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity);
+        break;
 #endif
       }
     }
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index 3a937394d5..c7de6cf661 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -43,7 +43,7 @@ namespace quda
     /** parameters for distance preconditioning */
     const real alpha0;
     const int t0;
-    static constexpr int prefetch_distance = 0;
+    static constexpr int prefetch_distance = QUDA_DSLASH_PREFETCH_DISTANCE_WILSON;
 
     WilsonArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo,
               const GaugeField &U, const GaugeField &Uback, double a, cvector_ref<const ColorSpinorField> &x,
@@ -85,17 +85,20 @@ namespace quda
     // for TMA use arg.block_size
     int dim2 = step / 2;
     // need warp uniform variants of these and parity
-    const int x_cb = (Arg::nDim == 5 ? coord.x_cb % arg.dc.volume_4d_cb : coord.x_cb);
+
+    // if using a bulk prefetch we need to use block's first coordinate
+    auto x_cb = arg.prefetch_bulk ? coord.x_cb_0 : coord.x_cb;
+    x_cb = (Arg::nDim == 5 ? x_cb % arg.dc.volume_4d_cb : x_cb);
 
     switch (step % 2) {
-    case 0: arg.U.prefetch(x_cb, dim2, parity); break;
+    case 0: arg.U.template prefetch<Arg::prefetch_bulk>(x_cb, dim2, parity); break;
 #ifdef QUDA_DSLASH_DOUBLE_STORE
-    case 1: arg.Uback.prefetch(x_cb, dim2, parity); break;
+    case 1: arg.Uback.template prefetch<Arg::prefetch_bulk>(x_cb, dim2, parity); break;
 #else
     case 1: {
       const int back_idx = getNeighborIndexCB(coord, dim2, -1, arg.dc);
       const int idx1 = (Arg::nDim == 5 ? back_idx % arg.dc.volume_4d_cb : back_idx);
-      arg.U.prefetch(idx1, dim2, 1 - parity);
+      arg.U.template prefetch<Arg::prefetch_bulk>(idx1, dim2, 1 - parity);
     } break;
 #endif
     }
diff --git a/include/quda_define.h.in b/include/quda_define.h.in
index 9b6c75f081..c4469a01ed 100644
--- a/include/quda_define.h.in
+++ b/include/quda_define.h.in
@@ -168,6 +168,37 @@
 #define GPU_DISTANCE_PRECONDITIONING
 #endif
 
+/**
+ * @def QUDA_DSLASH_DOUBLE_STORE
+ * @brief This macro sets whether to use double storage of the gauge
+ * field to simplify indexing in the Dslash kernels.
+ */
+#cmakedefine QUDA_DSLASH_DOUBLE_STORE
+
+/**
+ * @def QUDA_DSLASH_PREFETCH_BULK
+ * @brief This macro sets whether to use the TMA for L2 prefetching
+ */
+#cmakedefine QUDA_DSLASH_PREFETCH_BULK
+
+/**
+ * @def QUDA_DSLASH_PREFETCH_BULK
+ * @brief This macro sets whether to use the TMA for L2 prefetching
+ */
+#cmakedefine QUDA_DSLASH_PREFETCH_BULK
+
+/**
+ * @def QUDA_DSLASH_PREFETCH_DISTANCE_WILSON
+ * @brief This macro sets the prefetch distance for Wilson fermions
+ */
+#define QUDA_DSLASH_PREFETCH_DISTANCE_WILSON @QUDA_DSLASH_PREFETCH_DISTANCE_WILSON@
+
+/**
+ * @def QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED
+ * @brief This macro sets the prefetch distance for staggered fermions
+ */
+#define QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED @QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED@
+
 #cmakedefine QUDA_MULTIGRID
 #ifdef QUDA_MULTIGRID
 /**

From 7bb5cdc4b2f82cc172986b5103fcd0b09f905932 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 23 Oct 2025 14:26:36 -0700
Subject: [PATCH 019/121] Add target::uniform helper which is used to create
 warp-uniform variable on CUDA platform

---
 include/targets/cuda/target_device.h | 21 +++++++++++++++++++++
 include/targets/hip/target_device.h  |  9 +++++++++
 2 files changed, 30 insertions(+)

diff --git a/include/targets/cuda/target_device.h b/include/targets/cuda/target_device.h
index ac3b47dfdc..fc5db38863 100644
--- a/include/targets/cuda/target_device.h
+++ b/include/targets/cuda/target_device.h
@@ -231,6 +231,27 @@ namespace quda
 #endif
     }
 
+    template <bool is_device> struct uniform_impl {
+      template <typename T> T operator()(const T &t) { return t; }
+    };
+#ifdef QUDA_CUDA_CC
+    template <> struct uniform_impl<true> {
+      template <typename T> __device__ inline T operator()(const T &t) { return __shfl_sync(0xFFFFFFFF, t, 0); }
+    };
+#endif
+
+    /**
+       @brief Return the warp uniform variant of a given operand.
+       The device uniform_impl broadcasts lane 0's value with a full-mask
+       __shfl_sync; the generic uniform_impl returns t unchanged.
+       @param[in] t The input value we want to make warp uniform
+       @return The warp uniform variant, with the same type as t
+    */
+    template <typename T> __device__ __host__ inline T uniform(const T &t)
+    {
+      return target::dispatch<uniform_impl>(t);
+    }
+
   } // namespace target
 
   namespace device
diff --git a/include/targets/hip/target_device.h b/include/targets/hip/target_device.h
index 4075604cf2..d390599bdd 100644
--- a/include/targets/hip/target_device.h
+++ b/include/targets/hip/target_device.h
@@ -145,6 +145,15 @@ namespace quda
       return (thread_idx_linear<3>() % 64) == 0; // switch this to warp_size
     }
 
+    /**
+       @brief Return the warp uniform variant of a given operand.
+       This is used to suggest to a compiler that a variable is
+       constant across the warp.  Dummy for HIP.
+       @param[in] t The input value we want to make warp uniform
+       @return The input value, unchanged and with its type preserved
+    */
+    template <typename T> constexpr T uniform(const T &t) { return t; }
+
   } // namespace target
 
   namespace device

From f42a5073eb98a6b60b5e4f4455c2c73e7763722b Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 23 Oct 2025 14:33:20 -0700
Subject: [PATCH 020/121] Fix typo in last commit

---
 include/dslash_helper.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index cd8f2cbcb9..24122d68de 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -303,7 +303,7 @@ namespace quda
 #ifdef QUDA_DSLASH_PREFETCH_BULK
     static constexpr bool prefetch_bulk = true;
 #ifndef QUDA_DSLASH_DOUBLE_STORE
-    static_assert(!bulk, "Cannot use bulk prefetching unless QUDA_DSLASH_DOUBLE_STORE is enabled");
+    static_assert(!prefetch_bulk, "Cannot use bulk prefetching unless QUDA_DSLASH_DOUBLE_STORE is enabled");
 #endif
 #else
     static constexpr bool prefetch_bulk = false;

From e2df25f277b8200087eb5e87ae83485aba5cf04a Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 27 Oct 2025 15:20:48 -0700
Subject: [PATCH 021/121] Fix bug with non-double-store staggered dslash

---
 lib/dslash_staggered.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/dslash_staggered.hpp b/lib/dslash_staggered.hpp
index 874fe731e5..b14d259c8b 100644
--- a/lib/dslash_staggered.hpp
+++ b/lib/dslash_staggered.hpp
@@ -52,7 +52,7 @@ namespace quda
 #ifdef QUDA_DSLASH_DOUBLE_STORE
       GaugeField Uback = shift(U, 1);
 #else
-      const GaugeField &Uback = shift(U, 1);
+      const GaugeField &Uback = U;
 #endif
 
       if (U.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC

From 3010aa6d917f11e836b68b795243a01de8f51fdf Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 27 Oct 2025 16:27:54 -0700
Subject: [PATCH 022/121] Fix bug with parity setting

---
 include/dslash_helper.cuh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index 24122d68de..7107884bbd 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -747,14 +747,15 @@ namespace quda
     __forceinline__ __device__ void operator()(int, int s, int parity)
     {
       typename Arg::D dslash(*this);
-      // for full fields set parity from z thread index else use arg setting
-      if (arg.nParity == 1) parity = arg.parity;
 
       // FIXME need warp uniform parity which is not composable with
       // NVSHMEM since the latter requires blockDim.y and blockDim.z to
       // cover the entire extent
       parity = target::block_idx().z; // ensure parity is warp uniform
 
+      // for full fields set parity from z thread index else use arg setting
+      if (arg.nParity == 1) parity = arg.parity;
+
       if ((kernel_type == INTERIOR_KERNEL || kernel_type == UBER_KERNEL) &&
           target::block_idx().x < static_cast<unsigned int>(arg.pack_blocks)) {
         // first few blocks do packing kernel

From acfaf5bffdf99558cf91557dd20fa82f15455bb0 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 27 Oct 2025 16:29:54 -0700
Subject: [PATCH 023/121] Fix bulk prefetch of phase

---
 include/gauge_field_order.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index aa398df31e..3959e7a910 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1662,7 +1662,7 @@ namespace quda {
 
             constexpr bool load_phase = (hasPhase && !(static_phase<stag_phase>() && (reconLen == 13 || use_inphase)));
             if constexpr (load_phase)
-              prefetch_cache_bulk(gauge + parity * offset + phaseOffset + stride * dir, block_size * sizeof(Float));
+              prefetch_cache_bulk(gauge + parity * offset + phaseOffset + stride * dir + x, block_size * sizeof(Float));
           }
         }
       }

From 67f8ce44b82e1773448cc89ee91cb132c4ffab55 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 28 Oct 2025 13:30:29 -0700
Subject: [PATCH 024/121] Add 3-d and 4-d TMA prefetch instructions

---
 include/targets/cuda/inline_ptx.h    | 16 ++++++++++++++++
 include/targets/cuda/load_store.h    | 18 ++++++++++++++++++
 include/targets/generic/load_store.h | 20 +++++++++++++++++++-
 3 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/include/targets/cuda/inline_ptx.h b/include/targets/cuda/inline_ptx.h
index adf92c4720..251ba2b18c 100644
--- a/include/targets/cuda/inline_ptx.h
+++ b/include/targets/cuda/inline_ptx.h
@@ -485,4 +485,20 @@ namespace quda {
     asm volatile("cp.async.bulk.prefetch.L2.global [%0], %1;\n" ::"l"(p), "r"(static_cast<uint32_t>(bytes)));
   }
 
+  using tensor_desc_t = CUtensorMap;
+
+  __device__ __forceinline__ void prefetch_tma_3d(const tensor_desc_t &tensor_map, int x, int y, int z)
+  {
+    asm volatile("cp.async.bulk.prefetch.tensor.3d.L2.global.tile [%0, {%1, %2, %3}];" ::"l"(&tensor_map), "r"(x),
+                 "r"(y), "r"(z)
+                 : "memory");
+  }
+
+  __device__ __forceinline__ void prefetch_tma_4d(const tensor_desc_t &tensor_map, int x, int y, int z, int t)
+  {
+    asm volatile("cp.async.bulk.prefetch.tensor.4d.L2.global.tile [%0, {%1, %2, %3, %4}];" ::"l"(&tensor_map), "r"(x),
+                 "r"(y), "r"(z), "r"(t)
+                 : "memory");
+  }
+
 } // namespace quda
diff --git a/include/targets/cuda/load_store.h b/include/targets/cuda/load_store.h
index 161c93cbe5..8a56ab6faa 100644
--- a/include/targets/cuda/load_store.h
+++ b/include/targets/cuda/load_store.h
@@ -166,12 +166,30 @@ namespace quda
 
   // pre-declaration of the prefetch_cache that we wish to specialize
   template <bool> struct prefetch_cache_bulk_imp;
+  template <bool> struct prefetch_cache_tensor_3d_imp;
+  template <bool> struct prefetch_cache_tensor_4d_imp;
 
 #if __COMPUTE_CAPABILITY__ >= 900
   // CUDA specialization of the prefetch_cache_bulk that uses TMA (requires Hopper+)
   template <> struct prefetch_cache_bulk_imp<true> {
     __device__ inline void operator()(const void *p, size_t bytes) { prefetch_tma(p, bytes); }
   };
+
+  // CUDA specialization of the prefetch_cache_tensor_3d that uses TMA (requires Hopper+)
+  template <> struct prefetch_cache_tensor_3d_imp<true> {
+    __device__ inline void operator()(const tensor_desc_t &desc, int x, int y, int z)
+    {
+      prefetch_tma_3d(desc, x, y, z);
+    }
+  };
+
+  // CUDA specialization of the prefetch_cache_tensor_4d that uses TMA (requires Hopper+)
+  template <> struct prefetch_cache_tensor_4d_imp<true> {
+    __device__ inline void operator()(const tensor_desc_t &desc, int x, int y, int z, int t)
+    {
+      prefetch_tma_4d(desc, x, y, z, t);
+    }
+  };
 #endif
 
 } // namespace quda
diff --git a/include/targets/generic/load_store.h b/include/targets/generic/load_store.h
index 8254509e74..453c097b3c 100644
--- a/include/targets/generic/load_store.h
+++ b/include/targets/generic/load_store.h
@@ -71,7 +71,7 @@ namespace quda
   __device__ __host__ inline void prefetch_cache_line(const void *p) { target::dispatch<prefetch_cache_line_imp>(p); }
 
   template <bool is_device> struct prefetch_cache_bulk_imp {
-    __device__ __host__ inline void operator()(const void *, size_t) { }
+    constexpr void operator()(const void *, size_t) { }
   };
 
   __device__ __host__ inline void prefetch_cache_bulk(const void *p, size_t bytes)
@@ -79,4 +79,22 @@ namespace quda
     target::dispatch<prefetch_cache_bulk_imp>(p, bytes);
   }
 
+  template <bool is_device> struct prefetch_cache_tensor_3d_imp {
+    constexpr void operator()(const tensor_desc_t &desc, int x, int y, int z) { }
+  };
+
+  __device__ __host__ inline void prefetch_cache_tensor_3d(const tensor_desc_t &desc, int x, int y, int z)
+  {
+    target::dispatch<prefetch_cache_tensor_3d_imp>(desc, x, y, z);
+  }
+
+  template <bool is_device> struct prefetch_cache_tensor_4d_imp {
+    constexpr void operator()(const tensor_desc_t &desc, int x, int y, int z, int t) { }
+  };
+
+  __device__ __host__ inline void prefetch_cache_tensor_4d(const tensor_desc_t &desc, int x, int y, int z, int t)
+  {
+    target::dispatch<prefetch_cache_tensor_4d_imp>(desc, x, y, z, t);
+  }
+
 } // namespace quda

From 946bed090cb2b0c31f1b9583a04fd09fb7239649 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 28 Oct 2025 13:51:36 -0700
Subject: [PATCH 025/121] first version of tensor descriptor TMA prefetch -
 almost certainly buggy

---
 CMakeLists.txt                       |  5 +-
 include/dslash_helper.cuh            |  8 +--
 include/gauge_field.h                |  9 ++++
 include/gauge_field_order.h          | 28 ++++++----
 include/kernels/dslash_staggered.cuh | 14 ++---
 include/kernels/dslash_wilson.cuh    |  8 +--
 include/quda_define.h.in             | 13 ++---
 include/targets/cuda/kernel.h        | 15 +++---
 lib/dslash_wilson.hpp                |  8 +--
 lib/gauge_field.cpp                  | 76 ++++++++++++++++++++++++++++
 10 files changed, 136 insertions(+), 48 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6fbf8dddbd..ef8664b2fb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -155,8 +155,9 @@ set(QUDA_DOMAIN_DECOMPOSITION "0" CACHE STRING "which domain decomposition to in
 
 option(QUDA_DSLASH_DOUBLE_STORE "store a forwards shifted copy of the gauge fields for simplified Dslash indexing" OFF)
 mark_as_advanced(QUDA_DSLASH_DOUBLE_STORE)
-option(QUDA_DSLASH_PREFETCH_BULK "enable bulk prefetching (Hopper+)" OFF)
-mark_as_advanced(QUDA_DSLASH_PREFETCH_BULK)
+set(QUDA_DSLASH_PREFETCH_TMA "0" CACHE STRING "enable TMA prefetching (Hopper+, 0 - disable, 1 - bulk, 2 - tensor)")
+set_property(CACHE QUDA_DSLASH_PREFETCH_TMA PROPERTY STRINGS 0 1 2)
+mark_as_advanced(QUDA_DSLASH_PREFETCH_TMA)
 
 set(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON "0" CACHE STRING "set prefetch distance for Wilson-like fermions")
 set(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED "0" CACHE STRING "set prefetch distance for staggered-like fermions")
diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index 7107884bbd..55c1925933 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -300,13 +300,9 @@ namespace quda
     static constexpr int max_regs = 0;             // by default we don't limit register count
     static constexpr bool spill_shared = false;    // whether a given kernel should use shared memory spilling
     static constexpr int prefetch_distance = 0;    // whether we are using prefetching in the dslash
-#ifdef QUDA_DSLASH_PREFETCH_BULK
-    static constexpr bool prefetch_bulk = true;
+    static constexpr int prefetch_tma = QUDA_DSLASH_PREFETCH_TMA;
 #ifndef QUDA_DSLASH_DOUBLE_STORE
-    static_assert(!prefetch_bulk, "Cannot use bulk prefetching unless QUDA_DSLASH_DOUBLE_STORE is enabled");
-#endif
-#else
-    static constexpr bool prefetch_bulk = false;
+    static_assert(!prefetch_tma, "Cannot use TMA prefetching unless QUDA_DSLASH_DOUBLE_STORE is enabled");
 #endif
     const int parity;  // only use this for single parity fields
     const int nParity; // number of parities we're working on
diff --git a/include/gauge_field.h b/include/gauge_field.h
index a4f3d0b590..3d46ece69e 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -43,6 +43,12 @@ namespace quda {
       return 0;
     }
 
+    struct tensor_desc_t {
+      alignas(64) CUtensorMap N;
+      alignas(64) CUtensorMap Nrem;
+      alignas(64) CUtensorMap phase;
+    };
+
   } // namespace gauge
 
   struct GaugeFieldParam : public LatticeFieldParam {
@@ -656,6 +662,9 @@ namespace quda {
      */
     void PrintMatrix(int dim, int parity, unsigned int x_cb, int rank = 0) const;
 
+    gauge::tensor_desc_t create_tensor_descriptor(uint32_t block_size) const;
+    gauge::tensor_desc_t &get_tensor_descriptor(uint32_t block_size) const;
+
     friend struct GaugeFieldParam;
   };
 
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 3959e7a910..3973f311f7 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1563,6 +1563,7 @@ namespace quda {
         Reconstruct<length, Float, recon, ghostExchange_, stag_phase> reconstruct;
         static constexpr int reconLen = recon;
         static constexpr int hasPhase = (reconLen == 9 || reconLen == 13) ? 1 : 0;
+        static constexpr bool loadPhase = hasPhase && !(static_phase<stag_phase>() && (reconLen == 13 || use_inphase));
         static constexpr int N = gauge::get_vector_order<Float>(reconLen - hasPhase);
         static constexpr int M = (reconLen - hasPhase) / N;
         static constexpr int Nrem = reconLen - hasPhase - M * N;
@@ -1580,6 +1581,7 @@ namespace quda {
         const int geometry;
         const AllocInt phaseOffset;
         size_t bytes;
+        gauge::tensor_desc_t tensor_desc;
 
         FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
           reconstruct(u),
@@ -1623,8 +1625,7 @@ namespace quda {
           copy(tmp + M * N, vecTmp);
         }
 
-        constexpr bool load_phase = (hasPhase && !(static_phase<stag_phase>() && (reconLen == 13 || use_inphase)));
-        if constexpr (load_phase) {
+        if constexpr (loadPhase) {
           copy(phase, gauge[parity * offset + phaseOffset + stride * dir + x]);
           phase *= static_cast<real>(2.0);
         }
@@ -1632,10 +1633,9 @@ namespace quda {
         reconstruct.Unpack(v, tmp, x, dir, phase, X, R);
       }
 
-      template <bool bulk = false> __device__ inline void prefetch(int x, int dir, int parity, int block_size = 0) const
+      template <int type> __device__ inline void prefetch(int x, int dir, int parity, int block_size = 0) const
       {
-        if constexpr (!bulk) {
-          // use per thread prefetching
+        if constexpr (type == 0) { // use per thread prefetching
 #pragma unroll
           for (int i = 0; i < M; i++)
             prefetch_cache_line(gauge + parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N);
@@ -1644,10 +1644,8 @@ namespace quda {
           if constexpr (Nrem > 0)
             prefetch_cache_line(gauge + parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem);
 
-          constexpr bool load_phase = (hasPhase && !(static_phase<stag_phase>() && (reconLen == 13 || use_inphase)));
-          if constexpr (load_phase) prefetch_cache_line(gauge + parity * offset + phaseOffset + stride * dir + x);
-        } else {
-          // bulk prefetch
+          if constexpr (loadPhase) prefetch_cache_line(gauge + parity * offset + phaseOffset + stride * dir + x);
+        } else if constexpr (type == 1) { // bulk prefetch
           if (block_size == 0) block_size = blockDim.x;
           if (target::is_thread_zero()) {
 #pragma unroll
@@ -1660,10 +1658,18 @@ namespace quda {
               prefetch_cache_bulk(gauge + parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem,
                                   block_size * Nrem * sizeof(Float));
 
-            constexpr bool load_phase = (hasPhase && !(static_phase<stag_phase>() && (reconLen == 13 || use_inphase)));
-            if constexpr (load_phase)
+            if constexpr (loadPhase)
               prefetch_cache_bulk(gauge + parity * offset + phaseOffset + stride * dir + x, block_size * sizeof(Float));
           }
+        } else {                          // tensor prefetch
+          if (target::is_thread_zero()) { // perhaps 3-d is better here?
+            // prefetch_cache_tensor_3d(tensor_desc_N, x, dir, parity);
+            // if constexpr (Nrem > 0) prefetch_cache_tensor_3d(tensor_desc_Nrem, x, dir, parity);
+            // if constexpr (loadPhase) prefetch_cache_tensor_3d(tensor_desc_phase, x, dir, parity);
+            prefetch_cache_tensor_4d(tensor_desc.N, x, 0, dir, parity);
+            if constexpr (Nrem > 0) prefetch_cache_tensor_4d(tensor_desc.Nrem, x, 0, dir, parity);
+            if constexpr (loadPhase) prefetch_cache_tensor_4d(tensor_desc.phase, x, 0, dir, parity);
+          }
         }
       }
 
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 52fd2b76bf..b47d1ad078 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -97,22 +97,22 @@ namespace quda
       if (step >= 16) return;
 
       // if using a bulk prefetch we need to use block's first coordinate
-      auto x_cb = arg.prefetch_bulk ? coord.x_cb_0 : coord.x_cb;
+      auto x_cb = arg.prefetch_tma ? coord.x_cb_0 : coord.x_cb;
       x_cb = (Arg::nDim == 5 ? x_cb % arg.dc.volume_4d_cb : x_cb);
 
       int dim2 = step / 4;
       switch (step % 4) {
-      case 0: arg.U.prefetch<Arg::prefetch_bulk>(x_cb, dim2, parity); break;
-      case 1: arg.L.prefetch<Arg::prefetch_bulk>(x_cb, dim2, parity); break;
+      case 0: arg.U.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
+      case 1: arg.L.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
 #ifdef QUDA_DSLASH_DOUBLE_STORE
-      case 2: arg.Uback.prefetch<Arg::prefetch_bulk>(x_cb, dim2, parity); break;
-      case 3: arg.Lback.prefetch<Arg::prefetch_bulk>(x_cb, dim2, parity); break;
+      case 2: arg.Uback.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
+      case 3: arg.Lback.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
 #else
       case 2:
-        arg.U.prefetch<Arg::prefetch_bulk>(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity);
+        arg.U.prefetch<Arg::prefetch_tma>(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity);
         break;
       case 3:
-        arg.L.prefetch<Arg::prefetch_bulk>(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity);
+        arg.L.prefetch<Arg::prefetch_tma>(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity);
         break;
 #endif
       }
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index c7de6cf661..49846273ce 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -87,18 +87,18 @@ namespace quda
     // need warp uniform variants of these and parity
 
     // if using a bulk prefetch we need to use block's first coordinate
-    auto x_cb = arg.prefetch_bulk ? coord.x_cb_0 : coord.x_cb;
+    auto x_cb = arg.prefetch_tma ? coord.x_cb_0 : coord.x_cb;
     x_cb = (Arg::nDim == 5 ? x_cb % arg.dc.volume_4d_cb : x_cb);
 
     switch (step % 2) {
-    case 0: arg.U.prefetch<Arg::prefetch_bulk>(x_cb, dim2, parity); break;
+    case 0: arg.U.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
 #ifdef QUDA_DSLASH_DOUBLE_STORE
-    case 1: arg.Uback.prefetch<Arg::prefetch_bulk>(x_cb, dim2, parity); break;
+    case 1: arg.Uback.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
 #else
     case 1: {
       const int back_idx = getNeighborIndexCB(coord, dim2, -1, arg.dc);
       const int idx1 = (Arg::nDim == 5 ? back_idx % arg.dc.volume_4d_cb : back_idx);
-      arg.U.prefetch<Arg::prefetch_bulk>(idx1, dim2, 1 - parity);
+      arg.U.prefetch<Arg::prefetch_tma>(idx1, dim2, 1 - parity);
     } break;
 #endif
     }
diff --git a/include/quda_define.h.in b/include/quda_define.h.in
index c4469a01ed..e55dabb14a 100644
--- a/include/quda_define.h.in
+++ b/include/quda_define.h.in
@@ -176,16 +176,11 @@
 #cmakedefine QUDA_DSLASH_DOUBLE_STORE
 
 /**
- * @def QUDA_DSLASH_PREFETCH_BULK
- * @brief This macro sets whether to use the TMA for L2 prefetching
+ * @def QUDA_DSLASH_PREFETCH_TMA
+ * @brief This macro sets whether to use the TMA for L2 prefetching:
+ * 0 - no TMA, 1 - use bulk prefetch, 2 - use tensor prefetch
  */
-#cmakedefine QUDA_DSLASH_PREFETCH_BULK
-
-/**
- * @def QUDA_DSLASH_PREFETCH_BULK
- * @brief This macro sets whether to use the TMA for L2 prefetching
- */
-#cmakedefine QUDA_DSLASH_PREFETCH_BULK
+#define QUDA_DSLASH_PREFETCH_TMA @QUDA_DSLASH_PREFETCH_TMA@
 
 /**
  * @def QUDA_DSLASH_PREFETCH_DISTANCE_WILSON
diff --git a/include/targets/cuda/kernel.h b/include/targets/cuda/kernel.h
index d1956f647c..00eab6db97 100644
--- a/include/targets/cuda/kernel.h
+++ b/include/targets/cuda/kernel.h
@@ -57,7 +57,7 @@ namespace quda
    */
   template <template <typename> class Functor, typename Arg, bool grid_stride = false>
   MAXNREG(Arg::max_regs)
-  __global__ std::enable_if_t<(device::use_kernel_arg<Arg>() && Arg::max_regs > 0), void> Kernel1D(Arg arg)
+  __global__ std::enable_if_t<(device::use_kernel_arg<Arg>() && Arg::max_regs > 0), void> Kernel1D(const GRID_CONSTANT Arg arg)
   {
     Kernel1D_impl<Functor, Arg, grid_stride>(arg);
   }
@@ -76,7 +76,8 @@ namespace quda
      @param[in] arg Kernel argument
    */
   template <template <typename> class Functor, typename Arg, bool grid_stride = false>
-  __global__ std::enable_if_t<device::use_kernel_arg<Arg>() && Arg::max_regs == 0, void> Kernel1D(Arg arg)
+  __global__ std::enable_if_t<device::use_kernel_arg<Arg>() && Arg::max_regs == 0, void>
+  Kernel1D(const GRID_CONSTANT Arg arg)
   {
     Kernel1D_impl<Functor, Arg, grid_stride>(arg);
   }
@@ -173,7 +174,7 @@ namespace quda
    */
   template <template <typename> class Functor, typename Arg, bool grid_stride = false>
   MAXNREG(Arg::max_regs)
-  __global__ std::enable_if_t<(device::use_kernel_arg<Arg>() && Arg::max_regs > 0), void> Kernel2D(Arg arg)
+  __global__ std::enable_if_t<(device::use_kernel_arg<Arg>() && Arg::max_regs > 0), void> Kernel2D(const GRID_CONSTANT Arg arg)
   {
     Kernel2D_impl<Functor, Arg, grid_stride>(arg);
   }
@@ -192,7 +193,8 @@ namespace quda
      @param[in] arg Kernel argument
    */
   template <template <typename> class Functor, typename Arg, bool grid_stride = false>
-  __global__ std::enable_if_t<device::use_kernel_arg<Arg>() && Arg::max_regs == 0, void> Kernel2D(Arg arg)
+  __global__ std::enable_if_t<device::use_kernel_arg<Arg>() && Arg::max_regs == 0, void>
+  Kernel2D(const GRID_CONSTANT Arg arg)
   {
     Kernel2D_impl<Functor, Arg, grid_stride>(arg);
   }
@@ -291,7 +293,7 @@ namespace quda
    */
   template <template <typename> class Functor, typename Arg, bool grid_stride = false>
   MAXNREG(Arg::max_regs)
-  __global__ std::enable_if_t<(device::use_kernel_arg<Arg>() && Arg::max_regs > 0), void> Kernel3D(Arg arg)
+  __global__ std::enable_if_t<(device::use_kernel_arg<Arg>() && Arg::max_regs > 0), void> Kernel3D(const GRID_CONSTANT Arg arg)
   {
     Kernel3D_impl<Functor, Arg, grid_stride>(arg);
   }
@@ -310,7 +312,8 @@ namespace quda
      @param[in] arg Kernel argument
    */
   template <template <typename> class Functor, typename Arg, bool grid_stride = false>
-  __global__ std::enable_if_t<device::use_kernel_arg<Arg>() && Arg::max_regs == 0, void> Kernel3D(Arg arg)
+  __global__ std::enable_if_t<device::use_kernel_arg<Arg>() && Arg::max_regs == 0, void>
+  Kernel3D(const GRID_CONSTANT Arg arg)
   {
     Kernel3D_impl<Functor, Arg, grid_stride>(arg);
   }
diff --git a/lib/dslash_wilson.hpp b/lib/dslash_wilson.hpp
index c1fc823d3e..47a2f343ea 100644
--- a/lib/dslash_wilson.hpp
+++ b/lib/dslash_wilson.hpp
@@ -18,11 +18,12 @@ namespace quda
   template <typename Arg> class Wilson : public Dslash<wilson, Arg>
   {
     using Dslash = Dslash<wilson, Arg>;
+    const GaugeField &U;
 
   public:
-    Wilson(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+    Wilson(Arg &arg, const GaugeField &U, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
            const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
@@ -30,6 +31,7 @@ namespace quda
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       Dslash::setParam(tp);
+      const_cast<quda::gauge::tensor_desc_t&>(Dslash::arg.U.tensor_desc) = U.get_tensor_descriptor(tp.block.x);
       Dslash::template instantiate<packShmem>(tp, stream);
     }
   };
@@ -52,7 +54,7 @@ namespace quda
 
       WilsonArg<Float, nColor, nDim, DDArg, recon, distance_pc> arg(out, in, halo, U, Uback, a, x, parity, dagger,
                                                                     comm_override, alpha0, t0);
-      Wilson<decltype(arg)> wilson(arg, out, in, halo);
+      Wilson<decltype(arg)> wilson(arg, U, out, in, halo);
       dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, in, halo, profile);
     }
   };
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index fadba50fa8..e6598125c8 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -239,6 +239,82 @@ namespace quda {
     if (param.compute_fat_link_max) fat_link_max = this->abs_max();
   }
 
+  static std::map<int, gauge::tensor_desc_t> tensor_map;
+
+  gauge::tensor_desc_t GaugeField::create_tensor_descriptor(uint32_t block_size) const
+  {
+    gauge::tensor_desc_t tensor;
+    // Encodes the three 4-d TMA tensor maps (N, Nrem, phase) of this field; each box covers block_size sites
+    auto get_tensor_data_type = [&](size_t word_size) {
+      switch (word_size) {
+      case 1: return CU_TENSOR_MAP_DATA_TYPE_UINT8;
+      case 2: return CU_TENSOR_MAP_DATA_TYPE_UINT16;
+      case 4: return CU_TENSOR_MAP_DATA_TYPE_UINT32;
+      case 8: return CU_TENSOR_MAP_DATA_TYPE_UINT64;
+      default: errorQuda("Unsupported word size %d", precision);
+      }
+      return CU_TENSOR_MAP_DATA_TYPE_UINT8;
+    };
+
+    auto hasPhase = reconstruct == 9 || reconstruct == 13;
+    uint32_t N = gauge::get_vector_order(precision, reconstruct - hasPhase);
+    uint32_t M = (reconstruct - hasPhase) / N;
+    uint32_t Nrem = reconstruct - hasPhase - M * N;
+
+    CUtensorMapDataType dtype = get_tensor_data_type(precision);
+    { // full vectors: dims are {N * site, vector (M), dir, parity}; strides are in bytes
+      uint64_t global_dim[4] = {uint64_t(stride * N), uint64_t(M), uint64_t(geometry), 2llu};
+      uint64_t global_stride[] = {precision * N * stride, precision * (N * M + Nrem) * stride, bytes / 2};
+      uint32_t box_dim[] = {block_size * N, M, 1, 1};
+      uint32_t element_stride[] = {1, 1, 1, 1};
+      auto data = this->data();
+      if (reinterpret_cast<uintptr_t>(data) % 16 != 0) errorQuda("Pointer is not 16-byte aligned");
+      auto res = cuTensorMapEncodeTiled(&tensor.N, dtype, 4, data, global_dim, global_stride, box_dim, element_stride,
+                                        CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+                                        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+      if (res != CUDA_SUCCESS) errorQuda("cuTensorMapEncodeTiled failed: %d", (int)res);
+    }
+
+    if (Nrem > 0) { // remainder vector: follows the M full vectors of each link, so dir stride is the full link
+      uint64_t global_dim[4] = {uint64_t(stride * Nrem), 1llu, uint64_t(geometry), 2llu};
+      uint64_t global_stride[] = {precision * Nrem * stride, precision * (N * M + Nrem) * stride, bytes / 2};
+      uint32_t box_dim[] = {block_size * Nrem, 1, 1, 1};
+      uint32_t element_stride[] = {1, 1, 1, 1};
+      auto data = this->data<char *>() + M * N * stride * precision;
+      if (reinterpret_cast<uintptr_t>(data) % 16 != 0) errorQuda("Pointer is not 16-byte aligned");
+      auto res = cuTensorMapEncodeTiled(&tensor.Nrem, dtype, 4, data, global_dim, global_stride, box_dim,
+                                        element_stride, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+                                        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+      if (res != CUDA_SUCCESS) errorQuda("cuTensorMapEncodeTiled failed: %d", (int)res);
+    }
+
+    if (hasPhase) { // phase: one word per site and dir, starting PhaseOffset() bytes into the field
+      uint64_t global_dim[4] = {uint64_t(stride), 1llu, uint64_t(geometry), 2llu};
+      uint64_t global_stride[] = {precision * stride, precision * stride, bytes / 2};
+      uint32_t box_dim[] = {block_size, 1, 1, 1};
+      uint32_t element_stride[] = {1, 1, 1, 1};
+      auto data = this->data<char *>() + PhaseOffset();
+      if (reinterpret_cast<uintptr_t>(data) % 16 != 0) errorQuda("Pointer is not 16-byte aligned");
+      auto res = cuTensorMapEncodeTiled(&tensor.phase, dtype, 4, data, global_dim, global_stride, box_dim,
+                                        element_stride, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+                                        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+      if (res != CUDA_SUCCESS) errorQuda("cuTensorMapEncodeTiled failed: %d", (int)res);
+    }
+
+    return tensor;
+  }
+
+  gauge::tensor_desc_t &GaugeField::get_tensor_descriptor(uint32_t block_size) const
+  {
+    // tensor_map is keyed on block size alone yet shared by every GaugeField, so a cached
+    // entry may have been encoded for another field (e.g., U when called on Uback) or for
+    // data that has since been reallocated.  Always re-encode for this field; the slot only
+    // provides storage for the returned reference, which callers must copy from at once.
+    auto &tensor = tensor_map[block_size];
+    tensor = create_tensor_descriptor(block_size);
+    return tensor;
+  }
+
   void GaugeField::move(GaugeField &&src)
   {
     init = std::exchange(src.init, {});

From d772d5fe20e36d3d868f2d74518663dd747d8ce9 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 28 Oct 2025 15:30:12 -0700
Subject: [PATCH 026/121] Fix some warnings and set Uback tensor descriptor for
 wilson dslash

---
 include/gauge_field.h                |  6 +++---
 include/targets/generic/load_store.h |  4 ++--
 lib/dslash_wilson.hpp                | 10 ++++++----
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/include/gauge_field.h b/include/gauge_field.h
index 3d46ece69e..1f5846875e 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -44,9 +44,9 @@ namespace quda {
     }
 
     struct tensor_desc_t {
-      alignas(64) CUtensorMap N;
-      alignas(64) CUtensorMap Nrem;
-      alignas(64) CUtensorMap phase;
+      CUtensorMap N;
+      CUtensorMap Nrem;
+      CUtensorMap phase;
     };
 
   } // namespace gauge
diff --git a/include/targets/generic/load_store.h b/include/targets/generic/load_store.h
index 453c097b3c..5e0c874ed8 100644
--- a/include/targets/generic/load_store.h
+++ b/include/targets/generic/load_store.h
@@ -80,7 +80,7 @@ namespace quda
   }
 
   template <bool is_device> struct prefetch_cache_tensor_3d_imp {
-    constexpr void operator()(const tensor_desc_t &desc, int x, int y, int z) { }
+    constexpr void operator()(const tensor_desc_t &, int, int, int) { }
   };
 
   __device__ __host__ inline void prefetch_cache_tensor_3d(const tensor_desc_t &desc, int x, int y, int z)
@@ -89,7 +89,7 @@ namespace quda
   }
 
   template <bool is_device> struct prefetch_cache_tensor_4d_imp {
-    constexpr void operator()(const tensor_desc_t &desc, int x, int y, int z, int t) { }
+    constexpr void operator()(const tensor_desc_t &, int, int, int, int) { }
   };
 
   __device__ __host__ inline void prefetch_cache_tensor_4d(const tensor_desc_t &desc, int x, int y, int z, int t)
diff --git a/lib/dslash_wilson.hpp b/lib/dslash_wilson.hpp
index 47a2f343ea..682bc6ae3a 100644
--- a/lib/dslash_wilson.hpp
+++ b/lib/dslash_wilson.hpp
@@ -19,11 +19,12 @@ namespace quda
   {
     using Dslash = Dslash<wilson, Arg>;
     const GaugeField &U;
+    const GaugeField &Uback;
 
   public:
-    Wilson(Arg &arg, const GaugeField &U, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-           const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo), U(U)
+    Wilson(Arg &arg, const GaugeField &U, const GaugeField &Uback, cvector_ref<ColorSpinorField> &out,
+          cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo), U(U), Uback(Uback)
     {
     }
 
@@ -32,6 +33,7 @@ namespace quda
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       Dslash::setParam(tp);
       const_cast<quda::gauge::tensor_desc_t&>(Dslash::arg.U.tensor_desc) = U.get_tensor_descriptor(tp.block.x);
+      const_cast<quda::gauge::tensor_desc_t&>(Dslash::arg.Uback.tensor_desc) = Uback.get_tensor_descriptor(tp.block.x);
       Dslash::template instantiate<packShmem>(tp, stream);
     }
   };
@@ -54,7 +56,7 @@ namespace quda
 
       WilsonArg<Float, nColor, nDim, DDArg, recon, distance_pc> arg(out, in, halo, U, Uback, a, x, parity, dagger,
                                                                     comm_override, alpha0, t0);
-      Wilson<decltype(arg)> wilson(arg, U, out, in, halo);
+      Wilson<decltype(arg)> wilson(arg, U, Uback, out, in, halo);
       dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, in, halo, profile);
     }
   };

From 60894ec68972e5c3f625a64051fdfe72ef5b0ac9 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 3 Nov 2025 07:43:54 -0800
Subject: [PATCH 027/121] Add 5-d tensor prefetch instruction to CUDA. 
 Introduce 3-operand variants of vector_load and vector_store: these allow for
 the pointer offset and the index to be computed together first in 32-bit,
 before accumulation to the pointer in 64-bit, reducing pointer arithmetic
 overheads

---
 include/targets/cuda/inline_ptx.h    | 11 +++++++++--
 include/targets/cuda/load_store.h    | 13 +++++++++++--
 include/targets/generic/load_store.h | 25 +++++++++++++++++++++++--
 3 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/include/targets/cuda/inline_ptx.h b/include/targets/cuda/inline_ptx.h
index 251ba2b18c..b5a94266ba 100644
--- a/include/targets/cuda/inline_ptx.h
+++ b/include/targets/cuda/inline_ptx.h
@@ -494,10 +494,17 @@ namespace quda {
                  : "memory");
   }
 
-  __device__ __forceinline__ void prefetch_tma_4d(const tensor_desc_t &tensor_map, int x, int y, int z, int t)
+  __device__ __forceinline__ void prefetch_tma_4d(const tensor_desc_t &tensor_map, int x, int y, int z, int w)
   {
     asm volatile("cp.async.bulk.prefetch.tensor.4d.L2.global.tile [%0, {%1, %2, %3, %4}];" ::"l"(&tensor_map), "r"(x),
-                 "r"(y), "r"(z), "r"(t)
+                 "r"(y), "r"(z), "r"(w)
+                 : "memory");
+  }
+
+  __device__ __forceinline__ void prefetch_tma_5d(const tensor_desc_t &tensor_map, int x, int y, int z, int w, int u)
+  {
+    asm volatile("cp.async.bulk.prefetch.tensor.5d.L2.global.tile [%0, {%1, %2, %3, %4, %5}];" ::"l"(&tensor_map),
+                 "r"(x), "r"(y), "r"(z), "r"(w), "r"(u)
                  : "memory");
   }
 
diff --git a/include/targets/cuda/load_store.h b/include/targets/cuda/load_store.h
index 8a56ab6faa..90c83ef59a 100644
--- a/include/targets/cuda/load_store.h
+++ b/include/targets/cuda/load_store.h
@@ -168,6 +168,7 @@ namespace quda
   template <bool> struct prefetch_cache_bulk_imp;
   template <bool> struct prefetch_cache_tensor_3d_imp;
   template <bool> struct prefetch_cache_tensor_4d_imp;
+  template <bool> struct prefetch_cache_tensor_5d_imp;
 
 #if __COMPUTE_CAPABILITY__ >= 900
   // CUDA specialization of the prefetch_cache_bulk that uses TMA (requires Hopper+)
@@ -185,9 +186,17 @@ namespace quda
 
   // CUDA specialization of the prefetch_cache_tensor_4d that uses TMA (requires Hopper+)
   template <> struct prefetch_cache_tensor_4d_imp<true> {
-    __device__ inline void operator()(const tensor_desc_t &desc, int x, int y, int z, int t)
+    __device__ inline void operator()(const tensor_desc_t &desc, int x, int y, int z, int w)
     {
-      prefetch_tma_4d(desc, x, y, z, t);
+      prefetch_tma_4d(desc, x, y, z, w);
+    }
+  };
+
+  // CUDA specialization of the prefetch_cache_tensor_5d that uses TMA (requires Hopper+)
+  template <> struct prefetch_cache_tensor_5d_imp<true> {
+    __device__ inline void operator()(const tensor_desc_t &desc, int x, int y, int z, int w, int u)
+    {
+      prefetch_tma_5d(desc, x, y, z, w, u);
     }
   };
 #endif
diff --git a/include/targets/generic/load_store.h b/include/targets/generic/load_store.h
index 5e0c874ed8..1562c9095f 100644
--- a/include/targets/generic/load_store.h
+++ b/include/targets/generic/load_store.h
@@ -39,6 +39,12 @@ namespace quda
     return value_a;
   }
 
+  template <typename scalar_t, int N, size_t prefetch = 0>
+  __device__ __host__ inline array<scalar_t, N> vector_load(const scalar_t *ptr, unsigned int offset, int idx)
+  {
+    return vector_load<scalar_t, N, prefetch>(ptr + (offset + N * idx), 0);
+  }
+
   /**
      @brief Non-specialized store operation
   */
@@ -64,6 +70,12 @@ namespace quda
     vector_store<vector_t>(ptr, idx, value_v);
   }
 
+  template <typename scalar_t, int N>
+  __device__ __host__ inline void vector_store(scalar_t *ptr, unsigned offset, int idx, const array<scalar_t, N> &value_a)
+  {
+    vector_store<scalar_t, N>(ptr + (offset + N * idx), 0, value_a);
+  }
+
   template <bool is_device> struct prefetch_cache_line_imp {
     __device__ __host__ inline void operator()(const void *) { }
   };
@@ -92,9 +104,18 @@ namespace quda
     constexpr void operator()(const tensor_desc_t &, int, int, int, int) { }
   };
 
-  __device__ __host__ inline void prefetch_cache_tensor_4d(const tensor_desc_t &desc, int x, int y, int z, int t)
+  __device__ __host__ inline void prefetch_cache_tensor_4d(const tensor_desc_t &desc, int x, int y, int z, int w)
+  {
+    target::dispatch<prefetch_cache_tensor_4d_imp>(desc, x, y, z, w);
+  }
+
+  template <bool is_device> struct prefetch_cache_tensor_5d_imp {
+    constexpr void operator()(const tensor_desc_t &, int, int, int, int, int) { }
+  };
+
+  __device__ __host__ inline void prefetch_cache_tensor_5d(const tensor_desc_t &desc, int x, int y, int z, int w, int u)
   {
-    target::dispatch<prefetch_cache_tensor_4d_imp>(desc, x, y, z, t);
+    target::dispatch<prefetch_cache_tensor_5d_imp>(desc, x, y, z, w, u);
   }
 
 } // namespace quda

From 991086918a66897f3e6e0691b9fb857917d08560 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 3 Nov 2025 07:45:52 -0800
Subject: [PATCH 028/121] colorspinor::FloatNOrder load/save functions use
 3-operand vector_load and vector_store to reduce indexing overheads

---
 include/color_spinor_field_order.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index a227e5a829..440725ae8d 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -1166,14 +1166,14 @@ namespace quda
 #pragma unroll
         for (int i = 0; i < M; i++) {
           // first load from memory
-          auto vecTmp = vector_load<Float, N>(field + parity * offset, volumeCB * i + x);
+          auto vecTmp = vector_load<Float, N>(field, parity * offset, volumeCB * i + x);
           // now copy into output and scale
           copy_and_scale(v + i * N, vecTmp, nrm);
         }
 
         // now load any remainder
         if constexpr (Nrem > 0) {
-          auto vecTmp = vector_load<Float, Nrem>(field + parity * offset + volumeCB * M * N, x);
+          auto vecTmp = vector_load<Float, Nrem>(field, parity * offset + volumeCB * M * N, x);
           copy_and_scale(v + M * N, vecTmp, nrm);
         }
 
@@ -1187,13 +1187,13 @@ namespace quda
         auto norm_offset = offset / (sizeof(Float) < sizeof(float) ? sizeof(norm_type) / sizeof(Float) : 1);
         auto norm = reinterpret_cast<float *>(field + volumeCB * (2 * Nc * Ns));
 #endif
-        if constexpr (isFixed<Float>::value) prefetch_cache_line(norm + x + parity * norm_offset);
+        if constexpr (isFixed<Float>::value) prefetch_cache_line(norm + (x + parity * norm_offset));
 
 #pragma unroll
-        for (int i = 0; i < M; i++) prefetch_cache_line(field + parity * offset + (volumeCB * i + x) * N);
+        for (int i = 0; i < M; i++) prefetch_cache_line(field + (parity * offset + (volumeCB * i + x) * N));
 
         // now load any remainder
-        if constexpr (Nrem > 0) prefetch_cache_line(field + parity * offset + volumeCB * M * N + x * Nrem);
+        if constexpr (Nrem > 0) prefetch_cache_line(field + (parity * offset + volumeCB * M * N + x * Nrem));
       }
 
       __device__ __host__ inline void save(const complex in[length / 2], int x, int parity = 0) const
@@ -1201,7 +1201,7 @@ namespace quda
         real v[length];
 #ifndef LEGACY_ACCESSOR_NORM
         auto norm_offset = offset / (sizeof(Float) < sizeof(float) ? sizeof(norm_type) / sizeof(Float) : 1);
-        auto norm = reinterpret_cast<float *>(field + volumeCB * (2 * Nc * Ns));
+        auto norm = reinterpret_cast<float *>(field + (volumeCB * 2 * Nc * Ns));
 #endif
 #pragma unroll
         for (int i = 0; i < length / 2; i++) {
@@ -1229,14 +1229,14 @@ namespace quda
           // first do scalar copy converting into storage type
           copy_and_scale<Float, real, N>(vecTmp, v + i * N, scale_inv);
           // second do vectorized copy into memory
-          vector_store(field + parity * offset, volumeCB * i + x, vecTmp);
+          vector_store(field, parity * offset, volumeCB * i + x, vecTmp);
         }
 
         if constexpr (Nrem > 0) {
           array<Float, Nrem> vecTmp;
           copy_and_scale<Float, real, Nrem>(vecTmp, v + M * N, scale_inv);
           // second do vectorized copy into memory
-          vector_store(field + parity * offset + volumeCB * M * N, x, vecTmp);
+          vector_store(field, parity * offset + volumeCB * M * N, x, vecTmp);
         }
       }
 

From b9a4d5f45810a2527cc491d722aac9862a5bdb59 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 3 Nov 2025 07:47:11 -0800
Subject: [PATCH 029/121] Continued improvements to tensor TMA prefetch variant
 and gauge::FloatNOrder uses optimized 3-operand indexing

---
 include/gauge_field_order.h | 31 +++++++++++-------------
 lib/gauge_field.cpp         | 47 ++++++++++++++++++++++++++-----------
 2 files changed, 47 insertions(+), 31 deletions(-)

diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 3973f311f7..ed23ce61dd 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1614,14 +1614,14 @@ namespace quda {
 #pragma unroll
         for (int i = 0; i < M; i++) {
           // first load from memory
-          auto vecTmp = vector_load<Float, N>(gauge + parity * offset + dir * (M * N + Nrem) * stride, i * stride + x);
+          auto vecTmp = vector_load<Float, N>(gauge, parity * offset + dir * (M * N + Nrem) * stride, i * stride + x);
           // second do copy converting into register type
           copy(tmp + i * N, vecTmp);
         }
 
         // now load any remainder
         if constexpr (Nrem > 0) {
-          auto vecTmp = vector_load<Float, Nrem>(gauge + parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x);
+          auto vecTmp = vector_load<Float, Nrem>(gauge, parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x);
           copy(tmp + M * N, vecTmp);
         }
 
@@ -1635,40 +1635,37 @@ namespace quda {
 
       template <int type> __device__ inline void prefetch(int x, int dir, int parity, int block_size = 0) const
       {
-        if constexpr (type == 0) { // use per thread prefetching
+        if constexpr (type == 0) { // use per-thread prefetching
 #pragma unroll
           for (int i = 0; i < M; i++)
-            prefetch_cache_line(gauge + parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N);
+            prefetch_cache_line(gauge + (parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N));
 
           // now load any remainder
           if constexpr (Nrem > 0)
-            prefetch_cache_line(gauge + parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem);
+            prefetch_cache_line(gauge + (parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem));
 
-          if constexpr (loadPhase) prefetch_cache_line(gauge + parity * offset + phaseOffset + stride * dir + x);
+          if constexpr (loadPhase) prefetch_cache_line(gauge + (parity * offset + phaseOffset + stride * dir + x));
         } else if constexpr (type == 1) { // bulk prefetch
           if (block_size == 0) block_size = blockDim.x;
           if (target::is_thread_zero()) {
 #pragma unroll
             for (int i = 0; i < M; i++)
-              prefetch_cache_bulk(gauge + parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N,
+              prefetch_cache_bulk(gauge + (parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N),
                                   block_size * N * sizeof(Float));
 
             // now load any remainder
             if constexpr (Nrem > 0)
-              prefetch_cache_bulk(gauge + parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem,
+              prefetch_cache_bulk(gauge + (parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem),
                                   block_size * Nrem * sizeof(Float));
 
             if constexpr (loadPhase)
-              prefetch_cache_bulk(gauge + parity * offset + phaseOffset + stride * dir + x, block_size * sizeof(Float));
+              prefetch_cache_bulk(gauge + (parity * offset + phaseOffset + stride * dir + x), block_size * sizeof(Float));
           }
-        } else {                          // tensor prefetch
-          if (target::is_thread_zero()) { // perhaps 3-d is better here?
-            // prefetch_cache_tensor_3d(tensor_desc_N, x, dir, parity);
-            // if constexpr (Nrem > 0) prefetch_cache_tensor_3d(tensor_desc_Nrem, x, dir, parity);
-            // if constexpr (loadPhase) prefetch_cache_tensor_3d(tensor_desc_phase, x, dir, parity);
-            prefetch_cache_tensor_4d(tensor_desc.N, x, 0, dir, parity);
-            if constexpr (Nrem > 0) prefetch_cache_tensor_4d(tensor_desc.Nrem, x, 0, dir, parity);
-            if constexpr (loadPhase) prefetch_cache_tensor_4d(tensor_desc.phase, x, 0, dir, parity);
+        } else { // n-d tensor prefetch
+          if (target::is_thread_zero()) {
+            prefetch_cache_tensor_5d(tensor_desc.N, x, x / 16, 0, dir, parity);
+            if constexpr (Nrem > 0) prefetch_cache_tensor_4d(tensor_desc.Nrem, x, x / 16, dir, parity);
+            if constexpr (loadPhase) prefetch_cache_tensor_4d(tensor_desc.phase, x, x / 16, dir, parity);
           }
         }
       }
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index e6598125c8..391ac93056 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -263,42 +263,61 @@ namespace quda {
 
     CUtensorMapDataType dtype = get_tensor_data_type(precision);
     {
-      uint64_t global_dim[4] = {uint64_t(stride * N), uint64_t(M), uint64_t(geometry), 2llu};
-      uint64_t global_stride[] = {precision * N * stride, precision * (N * M + Nrem) * stride, bytes / 2};
-      uint32_t box_dim[] = {block_size * N, M, 1, 1};
-      uint32_t element_stride[] = {1, 1, 1, 1};
+      if (stride % 16 != 0) errorQuda("Volume requirements not met: stride mod 16 = %lu", stride % 16);
+      uint64_t global_dim[] = {16llu * N, uint64_t(stride / 16), uint64_t(M), uint64_t(geometry), 2llu};
+      uint64_t global_stride[]
+        = {precision * 16llu * N, precision * stride * N, precision * stride * (N * M + Nrem), bytes / 2};
+      uint32_t box_dim[] = {16u * N, std::max(1u, block_size / 16), M, 1, 1};
+      uint32_t element_stride[] = {1, 1, 1, 1, 1};
       auto data = this->data();
       if (reinterpret_cast<uintptr_t>(data) % 16 != 0) errorQuda("Pointer is not 16-byte aligned");
-      auto res = cuTensorMapEncodeTiled(&tensor.N, dtype, 4, data, global_dim, global_stride, box_dim, element_stride,
+      auto res = cuTensorMapEncodeTiled(&tensor.N, dtype, 5, data, global_dim, global_stride, box_dim, element_stride,
                                         CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
                                         CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
-      if (res != CUDA_SUCCESS) errorQuda("cuTensorMapEncodeTiled failed: %d", (int)res);
+      if (res != CUDA_SUCCESS) {
+        const char *errStr = nullptr;
+        cuGetErrorString(res, &errStr);
+        errorQuda("cuTensorMapEncodeTiled failed: %s", errStr);
+      }
     }
 
     if (Nrem > 0) {
-      uint64_t global_dim[4] = {uint64_t(stride * Nrem), 1llu, uint64_t(geometry), 2llu};
-      uint64_t global_stride[] = {precision * Nrem * stride, precision * Nrem * stride, bytes / 2};
-      uint32_t box_dim[] = {block_size * Nrem, 1, 1, 1};
+      if (stride % 16 != 0) errorQuda("Volume requirements not met: stride mod 16 = %lu", stride % 16);
+      uint64_t global_dim[]
+        = {16llu * Nrem, uint64_t(stride / 16), uint64_t(geometry), 2llu}; // can remove the M dimension?
+      uint64_t global_stride[] = {precision * 16llu * Nrem, precision * stride * (N * M + Nrem), bytes / 2};
+      uint32_t box_dim[] = {16u * Nrem, std::max(1u, block_size / 16), 1, 1, 1};
       uint32_t element_stride[] = {1, 1, 1, 1};
       auto data = this->data<char *>() + M * N * stride * precision;
       if (reinterpret_cast<uintptr_t>(data) % 16 != 0) errorQuda("Pointer is not 16-byte aligned");
       auto res = cuTensorMapEncodeTiled(&tensor.Nrem, dtype, 4, data, global_dim, global_stride, box_dim,
                                         element_stride, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
                                         CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
-      if (res != CUDA_SUCCESS) errorQuda("cuTensorMapEncodeTiled failed: %d", (int)res);
+      if (res != CUDA_SUCCESS) {
+        const char *errStr = nullptr;
+        cuGetErrorString(res, &errStr);
+        errorQuda("cuTensorMapEncodeTiled failed: %s box = {%u, %u, %u, %u}", errStr, box_dim[0], box_dim[1],
+                  box_dim[2], box_dim[3]);
+      }
     }
 
     if (hasPhase) {
-      uint64_t global_dim[4] = {uint64_t(stride), 1llu, uint64_t(geometry), 2llu};
-      uint64_t global_stride[] = {precision * stride, precision * stride, bytes / 2};
-      uint32_t box_dim[] = {block_size, 1, 1, 1};
+      if (stride % 16 != 0) errorQuda("Volume requirements not met: stride mod 16 = %lu", stride % 16);
+      uint64_t global_dim[] = {16llu, uint64_t(stride / 16), uint64_t(geometry), 2llu};
+      uint64_t global_stride[] = {precision * 16llu, precision * stride, bytes / 2};
+      uint32_t box_dim[] = {16u, std::max(1u, block_size / 16u), 1, 1};
       uint32_t element_stride[] = {1, 1, 1, 1};
       auto data = this->data<char *>() + PhaseOffset();
       if (reinterpret_cast<uintptr_t>(data) % 16 != 0) errorQuda("Pointer is not 16-byte aligned");
       auto res = cuTensorMapEncodeTiled(&tensor.phase, dtype, 4, data, global_dim, global_stride, box_dim,
                                         element_stride, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
                                         CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
-      if (res != CUDA_SUCCESS) errorQuda("cuTensorMapEncodeTiled failed: %d", (int)res);
+      if (res != CUDA_SUCCESS) {
+        const char *errStr = nullptr;
+        cuGetErrorString(res, &errStr);
+        errorQuda("cuTensorMapEncodeTiled failed: %s box = {%u, %u, %u, %u}", errStr, box_dim[0], box_dim[1],
+                  box_dim[2], box_dim[3]);
+      }
     }
 
     return tensor;

From 23992e06ba73dc293a72a645e8788212648df597 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 4 Nov 2025 13:34:52 -0800
Subject: [PATCH 030/121] Guard TMA tensor descriptor creation with
 __COMPUTE_CAPABILITY__ >= 900

TMA (Tensor Memory Accelerator) is only available on Hopper (sm_90+) and
later architectures. This commit wraps the cuTensorMapEncodeTiled calls
with a compile-time guard to prevent runtime errors on Volta/Ampere GPUs.
---
 lib/gauge_field.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 391ac93056..ea1b2c049f 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -245,6 +245,7 @@ namespace quda {
   {
     gauge::tensor_desc_t tensor;
 
+#if __COMPUTE_CAPABILITY__ >= 900
     auto get_tensor_data_type = [&](size_t word_size) {
       switch (word_size) {
       case 1: return CU_TENSOR_MAP_DATA_TYPE_UINT8;
@@ -319,6 +320,7 @@ namespace quda {
                   box_dim[2], box_dim[3]);
       }
     }
+#endif // __COMPUTE_CAPABILITY__ >= 900
 
     return tensor;
   }

From f0f9afd541c6017ccbd9da0b9f7c908cf04d5070 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 4 Nov 2025 14:14:19 -0800
Subject: [PATCH 031/121] Optimization for fixed point gauge field load with
 QUDA_RECONSTRUCT_NO: combine scaling factors to reduce number of multiplies

---
 include/gauge_field_order.h | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index ed23ce61dd..f800b896fd 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1030,14 +1030,12 @@ namespace quda {
         __device__ __host__ inline void Unpack(complex out[N / 2], const real in[N], int, int, real, const I *,
                                                const int *) const
         {
-          if constexpr (isFixed<Float>::value) {
+          // For recon==18, scaling is handled in FloatNOrder::load() via copy_and_scale
+          // For other recon types, this Unpack is never called (they have their own specializations)
 #pragma unroll
-            for (int i = 0; i < N / 2; i++) { out[i] = scale * complex(in[2 * i + 0], in[2 * i + 1]); }
-          } else {
-#pragma unroll
-            for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); }
-          }
+          for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); }
         }
+
         __device__ __host__ inline real getPhase(const complex[]) const { return 0; }
     };
 
@@ -1582,6 +1580,7 @@ namespace quda {
         const AllocInt phaseOffset;
         size_t bytes;
         gauge::tensor_desc_t tensor_desc;
+        const real combined_scale; // Precomputed scale for copy_and_scale: fixedInvMaxValue * reconstruct.scale
 
         FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
           reconstruct(u),
@@ -1592,7 +1591,16 @@ namespace quda {
           stride(u.Stride()),
           geometry(u.Geometry()),
           phaseOffset(u.PhaseOffset() / sizeof(Float)),
-          bytes(u.Bytes())
+          bytes(u.Bytes()),
+          combined_scale([&]() {
+            if constexpr (recon == 18) {
+              // QUDA_RECONSTRUCT_NO: combine fixedInvMaxValue with reconstruct.scale
+              return isFixed<Float>::value ? fixedInvMaxValue<Float>::value * reconstruct.scale : 1.0;
+            } else {
+              // Other reconstruction types: only need fixedInvMaxValue (reconstruct.scale doesn't exist)
+              return isFixed<Float>::value ? fixedInvMaxValue<Float>::value : 1.0;
+            }
+          }())
         {
           if (geometry == QUDA_COARSE_GEOMETRY)
             errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone");
@@ -1615,14 +1623,14 @@ namespace quda {
         for (int i = 0; i < M; i++) {
           // first load from memory
           auto vecTmp = vector_load<Float, N>(gauge, parity * offset + dir * (M * N + Nrem) * stride, i * stride + x);
-          // second do copy converting into register type
-          copy(tmp + i * N, vecTmp);
+          // second do copy converting into register type with combined scaling
+          copy_and_scale(tmp + i * N, vecTmp, combined_scale);
         }
 
         // now load any remainder
         if constexpr (Nrem > 0) {
           auto vecTmp = vector_load<Float, Nrem>(gauge, parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x);
-          copy(tmp + M * N, vecTmp);
+          copy_and_scale(tmp + M * N, vecTmp, combined_scale);
         }
 
         if constexpr (loadPhase) {

From cfaa7051e5549f05927cbceeeac6636d3258507d Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 4 Nov 2025 17:07:24 -0800
Subject: [PATCH 032/121] Optimization of fixed-point phase rescaling

---
 include/gauge_field_order.h | 53 +++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index f800b896fd..9240e3cca6 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1030,8 +1030,6 @@ namespace quda {
         __device__ __host__ inline void Unpack(complex out[N / 2], const real in[N], int, int, real, const I *,
                                                const int *) const
         {
-          // For recon==18, scaling is handled in FloatNOrder::load() via copy_and_scale
-          // For other recon types, this Unpack is never called (they have their own specializations)
 #pragma unroll
           for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); }
         }
@@ -1581,6 +1579,7 @@ namespace quda {
         size_t bytes;
         gauge::tensor_desc_t tensor_desc;
         const real combined_scale; // Precomputed scale for copy_and_scale: fixedInvMaxValue * reconstruct.scale
+        const real phase_scale; // Precomputed scale for phase loading: fixedInvMaxValue * 2.0 (or just 2.0 for float)
 
         FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
           reconstruct(u),
@@ -1600,7 +1599,9 @@ namespace quda {
               // Other reconstruction types: only need fixedInvMaxValue (reconstruct.scale doesn't exist)
               return isFixed<Float>::value ? fixedInvMaxValue<Float>::value : 1.0;
             }
-          }())
+          }()),
+          phase_scale(isFixed<Float>::value ? fixedInvMaxValue<Float>::value * static_cast<real>(2.0) :
+                                              static_cast<real>(2.0))
         {
           if (geometry == QUDA_COARSE_GEOMETRY)
             errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone");
@@ -1634,8 +1635,12 @@ namespace quda {
         }
 
         if constexpr (loadPhase) {
-          copy(phase, gauge[parity * offset + phaseOffset + stride * dir + x]);
-          phase *= static_cast<real>(2.0);
+          if constexpr (isFixed<Float>::value) {
+            copy_and_scale(phase, gauge[parity * offset + phaseOffset + stride * dir + x], phase_scale);
+          } else {
+            copy(phase, gauge[parity * offset + phaseOffset + stride * dir + x]);
+            phase *= static_cast<real>(2.0);
+          }
         }
 
         reconstruct.Unpack(v, tmp, x, dir, phase, X, R);
@@ -1736,15 +1741,15 @@ namespace quda {
             // first do vectorized copy from memory into registers
             auto vecTmp = vector_load<Float, N>(ghost[dir], (i * 2 + parity) * faceVolumeCB[dir] + x);
 
-            // second do copy converting into register type
-            copy(tmp + i * N, vecTmp);
+            // second do copy converting into register type with combined scaling
+            copy_and_scale(tmp + i * N, vecTmp, combined_scale);
           }
 
           // now load any remainder
           if constexpr (Nrem > 0) {
             auto vecTmp
               = vector_load<Float, Nrem>(ghost[dir] + 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x);
-            copy(tmp + M * N, vecTmp);
+            copy_and_scale(tmp + M * N, vecTmp, combined_scale);
           }
 
           real phase = 0.;
@@ -1753,8 +1758,13 @@ namespace quda {
             // if(stag_phase == QUDA_STAGGERED_PHASE_MILC )  {
             //   phase = inphase < static_cast<real>(0) ? static_cast<real>(-0.5) : static_cast<real>(0.5);
             // } else {
-            copy(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x]);
-            phase *= static_cast<real>(2.0);
+            if constexpr (isFixed<Float>::value) {
+              copy_and_scale(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x],
+                             phase_scale);
+            } else {
+              copy(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x]);
+              phase *= static_cast<real>(2.0);
+            }
             // }
           }
           reconstruct.Unpack(v, tmp, x, dir, phase, X, R);
@@ -1839,8 +1849,8 @@ namespace quda {
           auto vecTmp = vector_load<Float, N>(ghost[dim] + dir * reconLen * 2 * geometry * R[dim] * faceVolumeCB[dim],
                                               ((i * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
 
-          // second do copy converting into register type
-          copy(tmp + i * N, vecTmp);
+          // second do copy converting into register type with combined scaling
+          copy_and_scale(tmp + i * N, vecTmp, combined_scale);
         }
 
         // now load any remainder
@@ -1849,14 +1859,23 @@ namespace quda {
             = vector_load<Float, Nrem>(ghost[dim] + (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim],
                                        (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
 
-          copy(tmp + M * N, vecTmp);
+          copy_and_scale(tmp + M * N, vecTmp, combined_scale);
         }
 
         real phase = 0.;
-        if constexpr (hasPhase)
-          copy(phase,
-               ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim]
-                          + (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x]);
+        if constexpr (hasPhase) {
+          if constexpr (isFixed<Float>::value) {
+            copy_and_scale(phase,
+                           ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim]
+                                      + (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x],
+                           phase_scale);
+          } else {
+            copy(phase,
+                 ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim]
+                            + (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x]);
+            phase *= static_cast<real>(2.0);
+          }
+        }
 
         // use the extended_idx to determine the boundary condition
         reconstruct.Unpack(v, tmp, extended_idx, g, 2. * phase, X, R);

From 17d349ca1e666220ed27892e1ff90f3fe37f9366 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 4 Nov 2025 23:14:51 -0800
Subject: [PATCH 033/121] Small optimization to recon-8 unpack, reduces
 reconstruct by 4 multiplications

---
 include/gauge_field_order.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 9240e3cca6..216857a63a 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1385,29 +1385,31 @@ namespace quda {
           real r_inv2 = u0_inv * row_sum_inv;
           {
             complex A = cmul(conj(out[0]), out[3]);
+            complex u0A = u0 * A;
 
             // out[4] = -(conj(out[6])*conj(out[2]) + u0*A*out[1])*r_inv2; // U11
             out[4] = cmul(conj(out[6]), conj(out[2]));
-            out[4] = cmac(u0 * A, out[1], out[4]);
+            out[4] = cmac(u0A, out[1], out[4]);
             out[4] = -r_inv2 * out[4];
 
             // out[5] = (conj(out[6])*conj(out[1]) - u0*A*out[2])*r_inv2;  // U12
             out[5] = cmul(conj(out[6]), conj(out[1]));
-            out[5] = cmac(-u0 * A, out[2], out[5]);
+            out[5] = cmac(-u0A, out[2], out[5]);
             out[5] = r_inv2 * out[5];
           }
 
           {
             complex A = cmul(conj(out[0]), out[6]);
+            complex u0A = u0 * A;
 
             // out[7] = (conj(out[3])*conj(out[2]) - u0*A*out[1])*r_inv2;  // U21
             out[7] = cmul(conj(out[3]), conj(out[2]));
-            out[7] = cmac(-u0 * A, out[1], out[7]);
+            out[7] = cmac(-u0A, out[1], out[7]);
             out[7] = r_inv2 * out[7];
 
             // out[8] = -(conj(out[3])*conj(out[1]) + u0*A*out[2])*r_inv2; // U12
             out[8] = cmul(conj(out[3]), conj(out[1]));
-            out[8] = cmac(u0 * A, out[2], out[8]);
+            out[8] = cmac(u0A, out[2], out[8]);
             out[8] = -r_inv2 * out[8];
           }
 

From a5abce8a0bbd32f30582fd7ff4641d2a39af7277 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 5 Nov 2025 08:55:33 -0800
Subject: [PATCH 034/121] Fix backward hopping ghost boundary check in
 staggered dslash

The backward hopping term was incorrectly using coord1.in_boundary[1][d]
(forward boundary) instead of coord1.in_boundary[0][d] (backward boundary).
This caused incorrect ghost zone detection for backward hops, manifesting
as failures in non-trivial MPI partitions.

Bug introduced in commit 0642f638bb when adding L2 prefetching support.
---
 include/kernels/dslash_staggered.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index b47d1ad078..d6c855a070 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -208,7 +208,7 @@ namespace quda
 
       if (arg.dd_in.doHopping(coord, d, -1)) {
         // Backward gather - compute back offset for spinor and gauge fetch
-        const bool ghost = coord1.in_boundary[1][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
+        const bool ghost = coord1.in_boundary[0][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
 
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx2 = ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 1);

From c2658842c810be97482fa24f6de38e0abecd9aa4 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 5 Nov 2025 09:33:40 -0800
Subject: [PATCH 035/121] Fix UBSAN error: avoid pointer arithmetic on null
 pointers

Replace ternary operator with if-else to prevent evaluating pointer
arithmetic when ghost pointers are null. Add additional null check
since ghost_[2*dim+dir] can be null even when comm_dim_partitioned(dim)
is true (e.g., during initialization or in certain edge cases).

This fixes the UBSAN error:
  runtime error: applying non-zero offset to null pointer
---
 include/color_spinor_field_order.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index 440725ae8d..3298a48dd7 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -1010,11 +1010,14 @@ namespace quda
       {
         for (int dim = 0; dim < 4; dim++) {
           for (int dir = 0; dir < 2; dir++) {
-            ghost[2 * dim + dir] = comm_dim_partitioned(dim) ? static_cast<Float *>(ghost_[2 * dim + dir]) : nullptr;
-            ghost_norm[2 * dim + dir] = !comm_dim_partitioned(dim) ?
-              nullptr :
-              reinterpret_cast<norm_type *>(static_cast<char *>(ghost_[2 * dim + dir])
-                                            + nParity * length_ghost * faceVolumeCB[dim] * sizeof(Float));
+            if (comm_dim_partitioned(dim) && ghost_[2 * dim + dir]) {
+              ghost[2 * dim + dir] = static_cast<Float *>(ghost_[2 * dim + dir]);
+              ghost_norm[2 * dim + dir] = reinterpret_cast<norm_type *>(
+                static_cast<char *>(ghost_[2 * dim + dir]) + nParity * length_ghost * faceVolumeCB[dim] * sizeof(Float));
+            } else {
+              ghost[2 * dim + dir] = nullptr;
+              ghost_norm[2 * dim + dir] = nullptr;
+            }
           }
         }
       }

From aee623d0683c45405459a7bbf0764815a80703c8 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 5 Nov 2025 10:03:01 -0800
Subject: [PATCH 036/121] Optimize vector_load/vector_store in
 gauge_field_order.h to reduce 64-bit integer arithmetic

Convert vector_load and vector_store calls to use 3-operand and 4-operand
forms where applicable, separating large constant pointer offsets from
varying index calculations. This reduces 64-bit integer arithmetic in
favor of 32-bit operations on GPU.

Changes applied to:
- loadGhost() and saveGhost() remainder cases
- loadGhostEx() and saveGhostEx() all cases

The optimization uses implicit conversion to unsigned int for the offset
parameter, allowing more efficient address calculations.
---
 include/gauge_field_order.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 216857a63a..8127dff32d 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1697,7 +1697,7 @@ namespace quda {
 #pragma unroll
           for (int j = 0; j < N; j++) copy(vecTmp[j], tmp[i * N + j]);
           // second do vectorized copy into memory
-          vector_store(gauge + parity * offset + dir * (M * N + Nrem) * stride, x + i * stride, vecTmp);
+          vector_store(gauge, parity * offset + dir * (M * N + Nrem) * stride, x + i * stride, vecTmp);
         }
 
         // now save any remainder
@@ -1706,7 +1706,7 @@ namespace quda {
 #pragma unroll
           for (int j = 0; j < Nrem; j++) copy(vecTmp[j], tmp[M * N + j]);
           // second do vectorized copy into memory
-          vector_store(gauge + parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x, vecTmp);
+          vector_store(gauge, parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x, vecTmp);
         }
 
         if constexpr (hasPhase) {
@@ -1750,7 +1750,7 @@ namespace quda {
           // now load any remainder
           if constexpr (Nrem > 0) {
             auto vecTmp
-              = vector_load<Float, Nrem>(ghost[dir] + 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x);
+              = vector_load<Float, Nrem>(ghost[dir], 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x);
             copy_and_scale(tmp + M * N, vecTmp, combined_scale);
           }
 
@@ -1797,7 +1797,7 @@ namespace quda {
 #pragma unroll
             for (int j = 0; j < Nrem; j++) copy(vecTmp[j], tmp[M * N + j]);
             // second do vectorized copy into memory
-            vector_store(ghost[dir] + 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x, vecTmp);
+            vector_store(ghost[dir], 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x, vecTmp);
           }
 
           if constexpr (hasPhase) {
@@ -1848,7 +1848,7 @@ namespace quda {
 #pragma unroll
         for (int i = 0; i < M; i++) {
           // first do vectorized copy from memory
-          auto vecTmp = vector_load<Float, N>(ghost[dim] + dir * reconLen * 2 * geometry * R[dim] * faceVolumeCB[dim],
+          auto vecTmp = vector_load<Float, N>(ghost[dim], dir * reconLen * 2 * geometry * R[dim] * faceVolumeCB[dim],
                                               ((i * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
 
           // second do copy converting into register type with combined scaling
@@ -1858,7 +1858,7 @@ namespace quda {
         // now load any remainder
         if constexpr (Nrem > 0) {
           auto vecTmp
-            = vector_load<Float, Nrem>(ghost[dim] + (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim],
+            = vector_load<Float, Nrem>(ghost[dim], (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim],
                                        (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
 
           copy_and_scale(tmp + M * N, vecTmp, combined_scale);
@@ -1896,7 +1896,7 @@ namespace quda {
 #pragma unroll
           for (int j = 0; j < N; j++) copy(vecTmp[j], tmp[i * N + j]);
           // second do vectorized copy to memory
-          vector_store(ghost[dim] + dir * reconLen * 2 * geometry * R[dim] * faceVolumeCB[dim],
+          vector_store(ghost[dim], dir * reconLen * 2 * geometry * R[dim] * faceVolumeCB[dim],
                        ((i * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] + x, vecTmp);
         }
 
@@ -1906,7 +1906,7 @@ namespace quda {
 #pragma unroll
           for (int j = 0; j < Nrem; j++) copy(vecTmp[j], tmp[M * N + j]);
           // second do vectorized copy into memory
-          vector_store(ghost[dim] + (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim],
+          vector_store(ghost[dim], (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim],
                        (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x, vecTmp);
         }
 

From 6cfc18a6d1433b06cb04177f17fb7495beadb423 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 5 Nov 2025 13:22:08 -0800
Subject: [PATCH 037/121] Fix double-store dslash kernels when we have T
 partitioning - boundary condition needs to be tweaked

---
 include/gauge_field_order.h          | 112 ++++++++++++++-------------
 include/kernels/dslash_staggered.cuh |  17 ++--
 include/kernels/dslash_wilson.cuh    |   8 +-
 lib/gauge_shift.cu                   |   1 +
 4 files changed, 74 insertions(+), 64 deletions(-)

diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 8127dff32d..21b8106163 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -997,7 +997,7 @@ namespace quda {
          type)
       */
     template <int N, typename Float, QudaReconstructType, QudaGhostExchange ghostExchange_,
-              QudaStaggeredPhase = QUDA_STAGGERED_PHASE_NO>
+              QudaStaggeredPhase = QUDA_STAGGERED_PHASE_NO, bool = false>
     struct Reconstruct {
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
@@ -1048,36 +1048,40 @@ namespace quda {
          @param isLastTimeSlide if we're on the last time slice of nodes
          @param ghostExchange if the field is extended or not (determines indexing type)
       */
-      template <QudaGhostExchange ghostExchange_, typename T, typename I>
-      __device__ __host__ inline T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM],
-          T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound, bool isFirstTimeSlice,
-          bool isLastTimeSlice, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_NO)
-      {
+    template <QudaGhostExchange ghostExchange_, bool shifted, typename T, typename I>
+    __device__ __host__ inline T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM], T tBoundary,
+                                              T scale, int firstTimeSliceBound, int lastTimeSliceBound,
+                                              bool isFirstTimeSlice, bool isLastTimeSlice,
+                                              QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_NO)
+    {
 
-        // MWTODO: should this return tBoundary : scale or tBoundary*scale : scale
+      // MWTODO: should this return tBoundary : scale or tBoundary*scale : scale
 
-        if (ghostExchange_ == QUDA_GHOST_EXCHANGE_PAD
-            || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED)) {
-          if (idx >= firstTimeSliceBound) { // halo region on the first time slice
-            return isFirstTimeSlice ? tBoundary : scale;
-          } else if (idx >= lastTimeSliceBound) { // last link on the last time slice
-            return isLastTimeSlice ? tBoundary : scale;
-          } else {
-            return scale;
-          }
-        } else if (ghostExchange_ == QUDA_GHOST_EXCHANGE_EXTENDED
-            || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED)) {
-          if (idx >= (R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < R[3] * X[0] * X[1] * X[2] / 2) {
-            // the boundary condition is on the R[3]-1 time slice
-            return isFirstTimeSlice ? tBoundary : scale;
-          } else if (idx >= (X[3] - R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < (X[3] - R[3]) * X[0] * X[1] * X[2] / 2) {
-            // the boundary condition lies on the X[3]-R[3]-1 time slice
-            return isLastTimeSlice ? tBoundary : scale;
-          } else {
-            return scale;
-          }
+      if (ghostExchange_ == QUDA_GHOST_EXCHANGE_PAD
+          || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED)) {
+
+        if (!shifted && idx >= firstTimeSliceBound) { // halo region on the first time slice
+          return isFirstTimeSlice ? tBoundary : scale;
+        } else if (shifted && idx < firstTimeSliceBound) { // shifted link on first time slice
+          return isFirstTimeSlice ? tBoundary : scale;
+        } else if (!shifted && idx >= lastTimeSliceBound) { // last link on the last time slice
+          return isLastTimeSlice ? tBoundary : scale;
+        } else {
+          return scale;
+        }
+      } else if (ghostExchange_ == QUDA_GHOST_EXCHANGE_EXTENDED
+                 || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED)) {
+        if (idx >= (R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < R[3] * X[0] * X[1] * X[2] / 2) {
+          // the boundary condition is on the R[3]-1 time slice
+          return isFirstTimeSlice ? tBoundary : scale;
+        } else if (idx >= (X[3] - R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < (X[3] - R[3]) * X[0] * X[1] * X[2] / 2) {
+          // the boundary condition lies on the X[3]-R[3]-1 time slice
+          return isLastTimeSlice ? tBoundary : scale;
+        } else {
+          return scale;
         }
-        return scale;
+      }
+      return scale;
       }
 
       // not actually used - here for reference
@@ -1100,8 +1104,8 @@ namespace quda {
          @tparam ghostExchange_ optional template the ghostExchange
          type to avoid the run-time overhead
       */
-      template <typename Float, QudaGhostExchange ghostExchange_>
-      struct Reconstruct<18, Float, QUDA_RECONSTRUCT_12, ghostExchange_> {
+      template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase phase, bool shifted>
+      struct Reconstruct<18, Float, QUDA_RECONSTRUCT_12, ghostExchange_, phase, shifted> {
         using real = typename mapper<Float>::type;
         using complex = complex<real>;
         const real anisotropy;
@@ -1115,7 +1119,7 @@ namespace quda {
         Reconstruct(const GaugeField &u) :
           anisotropy(u.Anisotropy()),
           tBoundary(static_cast<real>(u.TBoundary())),
-          firstTimeSliceBound(u.VolumeCB()),
+          firstTimeSliceBound(!shifted ? u.VolumeCB() : u.X()[0] * u.X()[1] * u.X()[2] / 2),
           lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2),
           isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
           isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false),
@@ -1141,8 +1145,8 @@ namespace quda {
 
           const real u0 = dir < 3 ?
             anisotropy :
-            timeBoundary<ghostExchange_>(idx, X, R, tBoundary, static_cast<real>(1.0), firstTimeSliceBound,
-                                         lastTimeSliceBound, isFirstTimeSlice, isLastTimeSlice, ghostExchange);
+            timeBoundary<ghostExchange_, shifted>(idx, X, R, tBoundary, static_cast<real>(1.0), firstTimeSliceBound,
+                                                  lastTimeSliceBound, isFirstTimeSlice, isLastTimeSlice, ghostExchange);
 
           // out[6] = u0*conj(out[1]*out[5] - out[2]*out[4]);
           out[6] = cmul(out[2], out[4]);
@@ -1173,8 +1177,8 @@ namespace quda {
          @tparam ghostExchange_ optional template the ghostExchange
          type to avoid the run-time overhead
       */
-      template <typename Float, QudaGhostExchange ghostExchange_>
-      struct Reconstruct<18, Float, QUDA_RECONSTRUCT_10, ghostExchange_> {
+      template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase phase, bool shifted>
+      struct Reconstruct<18, Float, QUDA_RECONSTRUCT_10, ghostExchange_, phase, shifted> {
         using real = typename mapper<Float>::type;
         using complex = complex<real>;
 
@@ -1221,8 +1225,8 @@ namespace quda {
          @tparam ghostExchange_ optional template the ghostExchange
          type to avoid the run-time overhead
       */
-      template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase>
-      struct Reconstruct<18, Float, QUDA_RECONSTRUCT_13, ghostExchange_, stag_phase> {
+      template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase, bool shifted>
+      struct Reconstruct<18, Float, QUDA_RECONSTRUCT_13, ghostExchange_, stag_phase, shifted> {
         using real = typename mapper<Float>::type;
         using complex = complex<real>;
         const Reconstruct<18, Float, QUDA_RECONSTRUCT_12, ghostExchange_> reconstruct_12;
@@ -1298,8 +1302,8 @@ namespace quda {
          @tparam ghostExchange_ optional template the ghostExchange type
          to avoid the run-time overhead
       */
-      template <typename Float, QudaGhostExchange ghostExchange_>
-      struct Reconstruct<18, Float, QUDA_RECONSTRUCT_8, ghostExchange_> {
+      template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase, bool shifted>
+      struct Reconstruct<18, Float, QUDA_RECONSTRUCT_8, ghostExchange_, stag_phase, shifted> {
         using real = typename mapper<Float>::type;
         using complex = complex<real>;
         const complex anisotropy; // imaginary value stores inverse
@@ -1314,7 +1318,7 @@ namespace quda {
         Reconstruct(const GaugeField &u, real scale = 1.0) :
           anisotropy(u.Anisotropy() * scale, 1.0 / (u.Anisotropy() * scale)),
           tBoundary(static_cast<real>(u.TBoundary()) * scale, 1.0 / (static_cast<real>(u.TBoundary()) * scale)),
-          firstTimeSliceBound(u.VolumeCB()),
+          firstTimeSliceBound(!shifted ? u.VolumeCB() : u.X()[0] * u.X()[1] * u.X()[2] / 2),
           lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2),
           isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
           isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false),
@@ -1431,8 +1435,8 @@ namespace quda {
         {
           complex u = dir < 3 ?
             anisotropy :
-            timeBoundary<ghostExchange_>(idx, X, R, tBoundary, scale, firstTimeSliceBound, lastTimeSliceBound,
-                                         isFirstTimeSlice, isLastTimeSlice, ghostExchange);
+            timeBoundary<ghostExchange_, shifted>(idx, X, R, tBoundary, scale, firstTimeSliceBound, lastTimeSliceBound,
+                                                  isFirstTimeSlice, isLastTimeSlice, ghostExchange);
 
           Unpack(out, in, idx, dir, phase, X, R, scale, u);
         }
@@ -1448,11 +1452,11 @@ namespace quda {
          @tparam ghostExchange_ optional template the ghostExchange type
          to avoid the run-time overhead
       */
-      template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase>
-      struct Reconstruct<18, Float, QUDA_RECONSTRUCT_9, ghostExchange_, stag_phase> {
+      template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase, bool shifted>
+      struct Reconstruct<18, Float, QUDA_RECONSTRUCT_9, ghostExchange_, stag_phase, shifted> {
         using real = typename mapper<Float>::type;
         using complex = complex<real>;
-        const Reconstruct<18, Float, QUDA_RECONSTRUCT_8, ghostExchange_> reconstruct_8;
+        const Reconstruct<18, Float, QUDA_RECONSTRUCT_8, ghostExchange_, stag_phase, shifted> reconstruct_8;
         const real scale;
         const real scale_inv;
 
@@ -1549,16 +1553,16 @@ namespace quda {
 
       template <typename Float, int length_, QudaReconstructType recon,
                 QudaStaggeredPhase stag_phase = QUDA_STAGGERED_PHASE_NO, bool huge_alloc = default_huge_alloc,
-                QudaGhostExchange ghostExchange_ = QUDA_GHOST_EXCHANGE_INVALID, bool use_inphase = false>
+                QudaGhostExchange ghostExchange_ = QUDA_GHOST_EXCHANGE_INVALID, bool use_inphase = false, bool shifted = false>
       struct FloatNOrder {
-        using Accessor = FloatNOrder<Float, length_, recon, stag_phase, huge_alloc, ghostExchange_, use_inphase>;
+        using Accessor = FloatNOrder<Float, length_, recon, stag_phase, huge_alloc, ghostExchange_, use_inphase, shifted>;
 
         using store_t = Float;
         static constexpr int length = length_;
         using real = typename mapper<Float>::type;
         using complex = complex<real>;
         typedef typename AllocType<huge_alloc>::type AllocInt;
-        Reconstruct<length, Float, recon, ghostExchange_, stag_phase> reconstruct;
+        Reconstruct<length, Float, recon, ghostExchange_, stag_phase, shifted> reconstruct;
         static constexpr int reconLen = recon;
         static constexpr int hasPhase = (reconLen == 9 || reconLen == 13) ? 1 : 0;
         static constexpr bool loadPhase = hasPhase && !(static_phase<stag_phase>() && (reconLen == 13 || use_inphase));
@@ -2512,20 +2516,20 @@ namespace quda {
 
   template <typename T, QudaReconstructType recon, int N = 18, QudaStaggeredPhase stag = QUDA_STAGGERED_PHASE_NO,
             bool huge_alloc = gauge::default_huge_alloc, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_INVALID,
-            bool use_inphase = false, QudaGaugeFieldOrder order = QUDA_NATIVE_GAUGE_ORDER>
+            bool use_inphase = false, QudaGaugeFieldOrder order = QUDA_NATIVE_GAUGE_ORDER, bool shifted = false>
   struct gauge_mapper {
-    typedef gauge::FloatNOrder<T, N, recon, stag, huge_alloc, ghostExchange, use_inphase> type;
+    typedef gauge::FloatNOrder<T, N, recon, stag, huge_alloc, ghostExchange, use_inphase, shifted> type;
   };
 
   template <typename T, QudaReconstructType recon, int N, QudaStaggeredPhase stag, bool huge_alloc,
-            QudaGhostExchange ghostExchange, bool use_inphase>
-  struct gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_MILC_GAUGE_ORDER> {
+            QudaGhostExchange ghostExchange, bool use_inphase, bool shifted>
+  struct gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_MILC_GAUGE_ORDER, shifted> {
     typedef gauge::MILCOrder<T, N> type;
   };
 
   template <typename T, QudaReconstructType recon, int N, QudaStaggeredPhase stag, bool huge_alloc,
-            QudaGhostExchange ghostExchange, bool use_inphase>
-  struct gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_QDP_GAUGE_ORDER> {
+            QudaGhostExchange ghostExchange, bool use_inphase, bool shifted>
+  struct gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_QDP_GAUGE_ORDER, shifted> {
     typedef gauge::QDPOrder<T, N> type;
   };
 
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index d6c855a070..49d0c27b61 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -33,19 +33,22 @@ namespace quda
     static constexpr QudaGhostExchange ghost = QUDA_GHOST_EXCHANGE_PAD;
     static constexpr bool use_inphase = improved_ ? false : true;
     static constexpr QudaStaggeredPhase phase = phase_;
-    using GU = typename gauge_mapper<Float, reconstruct_u, 18, phase, gauge_direct_load, ghost, use_inphase>::type;
-    using GL =
-        typename gauge_mapper<Float, reconstruct_l, 18, QUDA_STAGGERED_PHASE_NO, gauge_direct_load, ghost, use_inphase>::type;
+    template <bool shifted>
+    using GU = typename gauge_mapper<Float, reconstruct_u, 18, phase, gauge_direct_load, ghost, use_inphase,
+                                     QUDA_NATIVE_GAUGE_ORDER, shifted>::type;
+    template <bool shifted>
+    using GL = typename gauge_mapper<Float, reconstruct_l, 18, QUDA_STAGGERED_PHASE_NO, gauge_direct_load, ghost,
+                                     use_inphase, QUDA_NATIVE_GAUGE_ORDER, shifted>::type;
 
     F out[MAX_MULTI_RHS];  /** output vector field */
     F in[MAX_MULTI_RHS];   /** input vector field */
     const Ghost halo_pack; /** accessor for writing the halo */
     const Ghost halo;      /** accessor for reading the halo */
     F x[MAX_MULTI_RHS];    /** input vector when doing xpay */
-    const GU U; /** the gauge field */
-    const GU Uback; /** the gauge field */
-    const GL L; /** the long gauge field */
-    const GL Lback; /** the long gauge field */
+    const GU<false> U;     /** the gauge field */
+    const GU<true> Uback;  /** the gauge field */
+    const GL<false> L;     /** the long gauge field */
+    const GL<true> Lback;  /** the long gauge field */
 
     const real a; /** xpay scale factor */
     const real tboundary; /** temporal boundary condition */
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index 49846273ce..d7bd2ab301 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -28,7 +28,9 @@ namespace quda
     static constexpr bool distance_pc = distance_pc_;
     static constexpr bool gauge_direct_load = false; // false means texture load
     static constexpr QudaGhostExchange ghost = QUDA_GHOST_EXCHANGE_PAD;
-    typedef typename gauge_mapper<Float, reconstruct, 18, QUDA_STAGGERED_PHASE_NO, gauge_direct_load, ghost>::type G;
+    template <bool shifted>
+    using G = typename gauge_mapper<Float, reconstruct, 18, QUDA_STAGGERED_PHASE_NO, gauge_direct_load, ghost, false,
+                                    QUDA_NATIVE_GAUGE_ORDER, shifted>::type;
 
     typedef typename mapper<Float>::type real;
 
@@ -37,8 +39,8 @@ namespace quda
     F x[MAX_MULTI_RHS];   /** input vector set when doing xpay */
     Ghost halo_pack;
     Ghost halo;
-    const G U;    /** the gauge field */
-    const G Uback; /** the backwards gauge field */
+    const G<false> U;    /** the gauge field */
+    const G<true> Uback; /** the backwards gauge field */
     const real a; /** xpay scale factor - can be -kappa or -kappa^2 */
     /** parameters for distance preconditioning */
     const real alpha0;
diff --git a/lib/gauge_shift.cu b/lib/gauge_shift.cu
index cc5997e4ee..1bf692a096 100644
--- a/lib/gauge_shift.cu
+++ b/lib/gauge_shift.cu
@@ -59,6 +59,7 @@ namespace quda
     GaugeFieldParam param(in);
     param.create = QUDA_NULL_FIELD_CREATE;
     GaugeField out(param);
+    const_cast<double&>(out.LinkMax()) = in.LinkMax();
     instantiate<GaugeShifter>(out, in, shift);
     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
     return out;

From 168f097a2c4c3e049e5ab12c6e785c6fe7351f12 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 5 Nov 2025 14:44:15 -0800
Subject: [PATCH 038/121] Fix performance when using double-store gauge field:
 shifted gauge field is now cached and owned by the unshifted field

---
 include/dslash.h                     |  4 +---
 include/dslash_helper.cuh            | 12 +++++++++---
 include/gauge_field.h                | 13 ++++++++++++-
 include/kernels/dslash_staggered.cuh |  8 ++++----
 include/kernels/dslash_wilson.cuh    |  6 +++---
 lib/dslash_improved_staggered.hpp    | 11 ++---------
 lib/dslash_staggered.hpp             |  9 ++-------
 lib/dslash_wilson.hpp                | 20 +++++++-------------
 lib/gauge_field.cpp                  |  7 +++++++
 9 files changed, 47 insertions(+), 43 deletions(-)

diff --git a/include/dslash.h b/include/dslash.h
index a33c9a821d..1084027993 100644
--- a/include/dslash.h
+++ b/include/dslash.h
@@ -70,9 +70,7 @@ namespace quda
       char tile_str[16];
       i32toa(tile_str, Arg::n_src_tile);
       strcat(aux_base, tile_str);
-#ifdef QUDA_DSLASH_DOUBLE_STORE
-      strcat(aux_base, ",double_store");
-#endif
+      if constexpr (dslash_double_store()) strcat(aux_base, ",double_store");
       if constexpr (Arg::prefetch_distance > 0) {
         strcat(aux_base, ",prefetch=");
         i32toa(tile_str, Arg::prefetch_distance);
diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index 55c1925933..c55a588486 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -20,6 +20,13 @@ constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::TRUE;
 
 namespace quda
 {
+
+#ifdef QUDA_DSLASH_DOUBLE_STORE
+  constexpr bool dslash_double_store() { return true; }
+#else
+  constexpr bool dslash_double_store() { return false; }
+#endif
+
   /**
      @brief Helper function to determine if we should do halo
      computation
@@ -301,9 +308,8 @@ namespace quda
     static constexpr bool spill_shared = false;    // whether a given kernel should use shared memory spilling
     static constexpr int prefetch_distance = 0;    // whether we are using prefetching in the dslash
     static constexpr int prefetch_tma = QUDA_DSLASH_PREFETCH_TMA;
-#ifndef QUDA_DSLASH_DOUBLE_STORE
-    static_assert(!prefetch_tma, "Cannot use TMA prefetching unless QUDA_DSLASH_DOUBLE_STORE is enabled");
-#endif
+    static_assert(!prefetch_tma || dslash_double_store(),
+                  "Cannot use TMA prefetching unless QUDA_DSLASH_DOUBLE_STORE is enabled");
     const int parity;  // only use this for single parity fields
     const int nParity; // number of parities we're working on
     const QudaReconstructType reconstruct;
diff --git a/include/gauge_field.h b/include/gauge_field.h
index 1f5846875e..38cc4b2de9 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <memory>
 #include <quda_internal.h>
 #include <quda.h>
 #include <lattice_field.h>
@@ -199,6 +200,8 @@ namespace quda {
     double tadpole = 0.0;
     double fat_link_max = 0.0;
 
+    mutable std::unique_ptr<GaugeField> shifted; // shifted copy of the gauge field, used for double-store enabled dslash
+
     mutable array<quda_ptr, 2 *QUDA_MAX_DIM> ghost
       = {}; // stores the ghost zone of the gauge field (non-native fields only)
 
@@ -653,6 +656,14 @@ namespace quda {
       }
     }
 
+    /**
+       @brief Return the shifted gauge field by shift in each
+       dimension.  Shifted field is cached for subsequent reuse.
+       @param[in] shift value (1 or 3 supported)
+       @return Reference to shifted field
+    */
+    GaugeField &shift(int shift) const;
+
     /**
      * @brief Print the site data
      * @param[in] parity Parity index
@@ -683,9 +694,9 @@ namespace quda {
      the resulting shifted field.  This is used to move the backwards
      links on to this site.  The input field must be a padded field
      with the ghost pre-exchanged if communications are enabled.
-     @param[out] out Output shifted field
      @param[in] in Input shifted field
      @param[in] shift value (1 or 3 supported)
+     @return Shifted field
    */
   GaugeField shift(const GaugeField &in, int shift);
 
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 49d0c27b61..e5fa03d75c 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -60,13 +60,13 @@ namespace quda
     const real dagger_scale;
 
     StaggeredArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                 const ColorSpinorField &halo, const GaugeField &U, const GaugeField &Uback, const GaugeField &L,
-                 const GaugeField &Lback, double a, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
-                 const int *comm_override) :
+                 const ColorSpinorField &halo, const GaugeField &U, const GaugeField &L, double a,
+                 cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override) :
       DslashArg < Float,
     nDim, DDArg, improved ? 3 : 1, n_src_tile
       > (out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, spin_project, comm_override),
-    halo_pack(halo, improved_ ? 3 : 1), halo(halo, improved_ ? 3 : 1), U(U), Uback(Uback), L(L), Lback(Lback), a(a),
+    halo_pack(halo, improved_ ? 3 : 1), halo(halo, improved_ ? 3 : 1), U(U),
+    Uback(dslash_double_store() ? U.shift(1) : U), L(L), Lback(dslash_double_store() ? L.shift(3) : L), a(a),
     tboundary(U.TBoundary()), is_first_time_slice(comm_coord(3) == 0 ? true : false),
     is_last_time_slice(comm_coord(3) == comm_dim(3) - 1 ? true : false),
     dagger_scale(dagger ? static_cast<real>(-1.0) : static_cast<real>(1.0))
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index d7bd2ab301..06352cfd0e 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -48,14 +48,14 @@ namespace quda
     static constexpr int prefetch_distance = QUDA_DSLASH_PREFETCH_DISTANCE_WILSON;
 
     WilsonArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo,
-              const GaugeField &U, const GaugeField &Uback, double a, cvector_ref<const ColorSpinorField> &x,
-              int parity, bool dagger, const int *comm_override, double alpha0 = 0.0, int t0 = -1) :
+              const GaugeField &U, double a, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
+              const int *comm_override, double alpha0 = 0.0, int t0 = -1) :
       DslashArg<Float, nDim, DDArg>(out, in, halo, U, x, parity, dagger, a != 0.0 ? true : false, spin_project,
                                     comm_override),
       halo_pack(halo),
       halo(halo),
       U(U),
-      Uback(Uback),
+      Uback(dslash_double_store() ? U.shift(1) : U),
       a(a),
       alpha0(alpha0),
       t0(t0)
diff --git a/lib/dslash_improved_staggered.hpp b/lib/dslash_improved_staggered.hpp
index 07b6396ee8..59bf28f46d 100644
--- a/lib/dslash_improved_staggered.hpp
+++ b/lib/dslash_improved_staggered.hpp
@@ -154,15 +154,8 @@ namespace quda
       constexpr QudaReconstructType recon_u = QUDA_RECONSTRUCT_NO;
       auto halo = ColorSpinorField::create_comms_batch(in, 3);
 
-#ifdef QUDA_DSLASH_DOUBLE_STORE
-      GaugeField Uback = shift(U, 1);
-      GaugeField Lback = shift(L, 3);
-#else
-      const GaugeField &Uback = U;
-      const GaugeField &Lback = L;
-#endif
-      StaggeredArg<Float, nColor, nDim, DDArg, recon_u, recon_l, improved> arg(out, in, halo, U, Uback, L, Lback, a, x,
-                                                                               parity, dagger, comm_override);
+      StaggeredArg<Float, nColor, nDim, DDArg, recon_u, recon_l, improved> arg(out, in, halo, U, L, a, x, parity,
+                                                                               dagger, comm_override);
       Staggered<decltype(arg)> staggered(arg, out, in, halo, L);
       dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, in, halo, profile);
     }
diff --git a/lib/dslash_staggered.hpp b/lib/dslash_staggered.hpp
index b14d259c8b..51a15c9ae4 100644
--- a/lib/dslash_staggered.hpp
+++ b/lib/dslash_staggered.hpp
@@ -49,17 +49,12 @@ namespace quda
       constexpr int nDim = 4;
       constexpr bool improved = false;
       auto halo = ColorSpinorField::create_comms_batch(in);
-#ifdef QUDA_DSLASH_DOUBLE_STORE
-      GaugeField Uback = shift(U, 1);
-#else
-      const GaugeField &Uback = U;
-#endif
 
       if (U.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC
           || (U.LinkType() == QUDA_GENERAL_LINKS && U.Reconstruct() == QUDA_RECONSTRUCT_NO)) {
         if constexpr (is_enabled<QUDA_MILC_GAUGE_ORDER>()) {
           StaggeredArg<Float, nColor, nDim, DDArg, recon_u, QUDA_RECONSTRUCT_NO, improved, QUDA_STAGGERED_PHASE_MILC> arg(
-            out, in, halo, U, Uback, U, Uback, a, x, parity, dagger, comm_override);
+            out, in, halo, U, U, a, x, parity, dagger, comm_override);
           Staggered<decltype(arg)> staggered(arg, out, in, halo);
 
           dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, in, halo, profile);
@@ -69,7 +64,7 @@ namespace quda
       } else if (U.StaggeredPhase() == QUDA_STAGGERED_PHASE_TIFR) {
         if constexpr (is_enabled<QUDA_TIFR_GAUGE_ORDER>()) {
           StaggeredArg<Float, nColor, nDim, DDArg, recon_u, QUDA_RECONSTRUCT_NO, improved, QUDA_STAGGERED_PHASE_TIFR> arg(
-            out, in, halo, U, Uback, U, Uback, a, x, parity, dagger, comm_override);
+            out, in, halo, U, U, a, x, parity, dagger, comm_override);
           Staggered<decltype(arg)> staggered(arg, out, in, halo);
 
           dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, in, halo, profile);
diff --git a/lib/dslash_wilson.hpp b/lib/dslash_wilson.hpp
index 682bc6ae3a..16f81bf391 100644
--- a/lib/dslash_wilson.hpp
+++ b/lib/dslash_wilson.hpp
@@ -19,12 +19,11 @@ namespace quda
   {
     using Dslash = Dslash<wilson, Arg>;
     const GaugeField &U;
-    const GaugeField &Uback;
 
   public:
-    Wilson(Arg &arg, const GaugeField &U, const GaugeField &Uback, cvector_ref<ColorSpinorField> &out,
-          cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo), U(U), Uback(Uback)
+    Wilson(Arg &arg, const GaugeField &U, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+           const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
@@ -33,7 +32,8 @@ namespace quda
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       Dslash::setParam(tp);
       const_cast<quda::gauge::tensor_desc_t&>(Dslash::arg.U.tensor_desc) = U.get_tensor_descriptor(tp.block.x);
-      const_cast<quda::gauge::tensor_desc_t&>(Dslash::arg.Uback.tensor_desc) = Uback.get_tensor_descriptor(tp.block.x);
+      const_cast<quda::gauge::tensor_desc_t &>(Dslash::arg.Uback.tensor_desc)
+        = (U.shift(1)).get_tensor_descriptor(tp.block.x);
       Dslash::template instantiate<packShmem>(tp, stream);
     }
   };
@@ -48,15 +48,9 @@ namespace quda
       constexpr int nDim = 4;
       auto halo = ColorSpinorField::create_comms_batch(in);
 
-#ifdef QUDA_DSLASH_DOUBLE_STORE
-      GaugeField Uback = shift(U, 1);
-#else
-      const GaugeField &Uback = U;
-#endif
-
-      WilsonArg<Float, nColor, nDim, DDArg, recon, distance_pc> arg(out, in, halo, U, Uback, a, x, parity, dagger,
+      WilsonArg<Float, nColor, nDim, DDArg, recon, distance_pc> arg(out, in, halo, U, a, x, parity, dagger,
                                                                     comm_override, alpha0, t0);
-      Wilson<decltype(arg)> wilson(arg, U, Uback, out, in, halo);
+      Wilson<decltype(arg)> wilson(arg, U, out, in, halo);
       dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, in, halo, profile);
     }
   };
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index ea1b2c049f..a5566a9a4f 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -1516,6 +1516,13 @@ namespace quda {
     }
   }
 
+  GaugeField& GaugeField::shift(int shift_offset) const
+  {
+    // Build and cache the shifted copy on first call; NOTE: later calls reuse it even if shift_offset differs
+    if (!shifted) shifted = std::make_unique<GaugeField>(::quda::shift(*this, shift_offset));
+    return *shifted;
+  }
+
   void GaugeField::PrintMatrix(int dim, int parity, unsigned int x_cb, int rank) const
   {
     genericPrintMatrix(*this, dim, parity, x_cb, rank);

From f11bd843761206aa2dbb9a085328bbefa3e9cf9b Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 5 Nov 2025 14:50:51 -0800
Subject: [PATCH 039/121] Dslash aux string should distinguish the TMA prefetch type

---
 include/dslash.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/dslash.h b/include/dslash.h
index 1084027993..5e8c12dbe6 100644
--- a/include/dslash.h
+++ b/include/dslash.h
@@ -75,6 +75,7 @@ namespace quda
         strcat(aux_base, ",prefetch=");
         i32toa(tile_str, Arg::prefetch_distance);
         strcat(aux_base, tile_str);
+        if constexpr (Arg::prefetch_tma) strcat(aux_base, Arg::prefetch_tma == 1 ? ",tma=bulk" : ",tma=tensor");
       }
     }
 

From a2a9b24a277b867a4fdb70d007cbcccde91c167f Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 6 Nov 2025 12:48:25 -0800
Subject: [PATCH 040/121] Added experimental optimization: replace parity *
 offset with bitmask approach to reduce IMAD count

---
 include/color_spinor_field_order.h   | 93 +++++++++++++++------------
 include/gauge_field_order.h          | 95 +++++++++++++++-------------
 include/kernels/dslash_staggered.cuh | 69 ++++++++++++++------
 3 files changed, 156 insertions(+), 101 deletions(-)

diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index 3298a48dd7..d0c0a59773 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -971,7 +971,8 @@ namespace quda
       }
     };
 
-    template <typename Float, int Ns, int Nc, bool spin_project = false, bool huge_alloc = false, bool disable_ghost = false>
+    template <typename Float, int Ns, int Nc, bool spin_project = false, bool huge_alloc = false,
+              bool disable_ghost = false, bool use_parity_mask = false>
     struct GhostNOrder {
       GhostNOrder() = default;
       GhostNOrder(const GhostNOrder &) = default;
@@ -979,15 +980,15 @@ namespace quda
       GhostNOrder &operator=(const GhostNOrder &) = default;
     };
 
-    template <typename Float, int Ns, int Nc, bool spin_project, bool huge_alloc>
-    struct GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, false> {
+    template <typename Float, int Ns, int Nc, bool spin_project, bool huge_alloc, bool use_parity_mask>
+    struct GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, false, use_parity_mask> {
       static constexpr int length = 2 * Ns * Nc;
       static constexpr int length_ghost = spin_project ? length / 2 : length;
       // if spin projecting, check that short vector length is compatible, if not halve the vector length
       static constexpr int N = colorspinor::get_vector_order<Float>(length_ghost);
       static constexpr int M = length_ghost / N;
       static constexpr int Nrem = length_ghost - M * N;
-      using Accessor = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc>;
+      using Accessor = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, false, use_parity_mask>;
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       using norm_type = float;
@@ -1025,19 +1026,20 @@ namespace quda
       __device__ __host__ inline void loadGhost(complex out[length_ghost / 2], int x, int dim, int dir, int parity = 0) const
       {
         real v[length_ghost];
+        const auto parity_offset = use_parity_mask ? (parity & faceVolumeCB[dim]) : (parity * faceVolumeCB[dim]);
         norm_type nrm
-          = isFixed<Float>::value ? vector_load<float, 1>(ghost_norm[2 * dim + dir], parity * faceVolumeCB[dim] + x)[0] : 0.0;
+          = isFixed<Float>::value ? vector_load<float, 1>(ghost_norm[2 * dim + dir], parity_offset + x)[0] : 0.0;
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
-          auto vecTmp = vector_load<Float, N>(ghost[2 * dim + dir] + parity * faceVolumeCB[dim] * length_ghost,
-                                              i * faceVolumeCB[dim] + x);
+          auto vecTmp
+            = vector_load<Float, N>(ghost[2 * dim + dir] + parity_offset * length_ghost, i * faceVolumeCB[dim] + x);
           copy_and_scale(v + i * N, vecTmp, nrm);
         }
 
         if constexpr (Nrem > 0) { // now load any remainder
           auto vecTmp = vector_load<Float, Nrem>(
-            ghost[2 * dim + dir] + parity * faceVolumeCB[dim] * length_ghost + faceVolumeCB[dim] * M * N, x);
+            ghost[2 * dim + dir] + parity_offset * length_ghost + faceVolumeCB[dim] * M * N, x);
           copy_and_scale(v + M * N, vecTmp, nrm);
         }
 
@@ -1055,6 +1057,7 @@ namespace quda
           v[2 * i + 1] = in[i].imag();
         }
 
+        const auto parity_offset = use_parity_mask ? (parity & faceVolumeCB[dim]) : (parity * faceVolumeCB[dim]);
         norm_type scale = 0.0;
         norm_type scale_inv = 0.0;
         if constexpr (isFixed<Float>::value) {
@@ -1065,7 +1068,7 @@ namespace quda
             max_[i] = fmaxf((norm_type)fabsf((norm_type)v[i]), (norm_type)fabsf((norm_type)v[i + length_ghost / 2]));
 #pragma unroll
           for (int i = 0; i < length_ghost / 2; i++) scale = fmaxf(max_[i], scale);
-          ghost_norm[2 * dim + dir][parity * faceVolumeCB[dim] + x] = scale * fixedInvMaxValue<Float>::value;
+          ghost_norm[2 * dim + dir][parity_offset + x] = scale * fixedInvMaxValue<Float>::value;
           scale_inv = fdividef(fixedMaxValue<Float>::value, scale);
         }
 
@@ -1075,15 +1078,14 @@ namespace quda
           // first do scalar copy converting into storage type
           copy_and_scale<Float, real, N>(vecTmp, v + i * N, scale_inv);
           // second do vectorized copy into memory
-          vector_store(ghost[2 * dim + dir] + parity * faceVolumeCB[dim] * length_ghost, i * faceVolumeCB[dim] + x,
-                       vecTmp);
+          vector_store(ghost[2 * dim + dir] + parity_offset * length_ghost, i * faceVolumeCB[dim] + x, vecTmp);
         }
 
         if constexpr (Nrem > 0) { // now load any remainder
           array<Float, Nrem> vecTmp;
           copy_and_scale<Float, real, Nrem>(vecTmp, v + M * N, scale_inv);
-          vector_store<Float, Nrem>(
-            ghost[2 * dim + dir] + parity * faceVolumeCB[dim] * length_ghost + faceVolumeCB[dim] * M * N, x, vecTmp);
+          vector_store<Float, Nrem>(ghost[2 * dim + dir] + parity_offset * length_ghost + faceVolumeCB[dim] * M * N, x,
+                                    vecTmp);
         }
       }
 
@@ -1115,14 +1117,15 @@ namespace quda
        pointer arithmetic for huge allocations (e.g., packed set of
        vectors).  Default is to use 32-bit pointer arithmetic.
      */
-    template <typename Float, int Ns, int Nc, bool spin_project = false, bool huge_alloc = false, bool disable_ghost = false>
-    struct FloatNOrder : GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost> {
+    template <typename Float, int Ns, int Nc, bool spin_project = false, bool huge_alloc = false,
+              bool disable_ghost = false, bool use_parity_mask = false>
+    struct FloatNOrder : GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost, use_parity_mask> {
       static constexpr int length = 2 * Ns * Nc;
       static constexpr int N = colorspinor::get_vector_order<Float>(length);
       static constexpr int M = length / N;
       static constexpr int Nrem = length - M * N;
-      using Accessor = FloatNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost>;
-      using GhostNOrder = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost>;
+      using Accessor = FloatNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost, use_parity_mask>;
+      using GhostNOrder = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost, use_parity_mask>;
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       using AllocInt = typename AllocType<huge_alloc>::type;
@@ -1164,19 +1167,21 @@ namespace quda
         auto norm_offset = offset / (sizeof(Float) < sizeof(float) ? sizeof(norm_type) / sizeof(Float) : 1);
         auto norm = reinterpret_cast<float *>(field + volumeCB * (2 * Nc * Ns));
 #endif
-        norm_type nrm = isFixed<Float>::value ? vector_load<float, 1>(norm, x + parity * norm_offset)[0] : 0.0;
+        const auto parity_norm_offset = use_parity_mask ? (parity & norm_offset) : (parity * norm_offset);
+        const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
+        norm_type nrm = isFixed<Float>::value ? vector_load<float, 1>(norm, x + parity_norm_offset)[0] : 0.0;
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
           // first load from memory
-          auto vecTmp = vector_load<Float, N>(field, parity * offset, volumeCB * i + x);
+          auto vecTmp = vector_load<Float, N>(field, parity_offset, volumeCB * i + x);
           // now copy into output and scale
           copy_and_scale(v + i * N, vecTmp, nrm);
         }
 
         // now load any remainder
         if constexpr (Nrem > 0) {
-          auto vecTmp = vector_load<Float, Nrem>(field, parity * offset + volumeCB * M * N, x);
+          auto vecTmp = vector_load<Float, Nrem>(field, parity_offset + volumeCB * M * N, x);
           copy_and_scale(v + M * N, vecTmp, nrm);
         }
 
@@ -1190,13 +1195,15 @@ namespace quda
         auto norm_offset = offset / (sizeof(Float) < sizeof(float) ? sizeof(norm_type) / sizeof(Float) : 1);
         auto norm = reinterpret_cast<float *>(field + volumeCB * (2 * Nc * Ns));
 #endif
-        if constexpr (isFixed<Float>::value) prefetch_cache_line(norm + (x + parity * norm_offset));
+        const auto parity_norm_offset = use_parity_mask ? (parity & norm_offset) : (parity * norm_offset);
+        const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
+        if constexpr (isFixed<Float>::value) prefetch_cache_line(norm + (x + parity_norm_offset));
 
 #pragma unroll
-        for (int i = 0; i < M; i++) prefetch_cache_line(field + (parity * offset + (volumeCB * i + x) * N));
+        for (int i = 0; i < M; i++) prefetch_cache_line(field + (parity_offset + (volumeCB * i + x) * N));
 
         // now load any remainder
-        if constexpr (Nrem > 0) prefetch_cache_line(field + (parity * offset + volumeCB * M * N + x * Nrem));
+        if constexpr (Nrem > 0) prefetch_cache_line(field + (parity_offset + volumeCB * M * N + x * Nrem));
       }
 
       __device__ __host__ inline void save(const complex in[length / 2], int x, int parity = 0) const
@@ -1212,6 +1219,8 @@ namespace quda
           v[2 * i + 1] = in[i].imag();
         }
 
+        const auto parity_norm_offset = use_parity_mask ? (parity & norm_offset) : (parity * norm_offset);
+        const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
         norm_type scale = 0.0;
         norm_type scale_inv = 0.0;
         if constexpr (isFixed<Float>::value) {
@@ -1222,7 +1231,7 @@ namespace quda
             max_[i] = fmaxf(fabsf((norm_type)v[i]), fabsf((norm_type)v[i + length / 2]));
 #pragma unroll
           for (int i = 0; i < length / 2; i++) scale = fmaxf(max_[i], scale);
-          norm[x + parity * norm_offset] = scale * fixedInvMaxValue<Float>::value;
+          norm[x + parity_norm_offset] = scale * fixedInvMaxValue<Float>::value;
           scale_inv = fdividef(fixedMaxValue<Float>::value, scale);
         }
 
@@ -1232,14 +1241,14 @@ namespace quda
           // first do scalar copy converting into storage type
           copy_and_scale<Float, real, N>(vecTmp, v + i * N, scale_inv);
           // second do vectorized copy into memory
-          vector_store(field, parity * offset, volumeCB * i + x, vecTmp);
+          vector_store(field, parity_offset, volumeCB * i + x, vecTmp);
         }
 
         if constexpr (Nrem > 0) {
           array<Float, Nrem> vecTmp;
           copy_and_scale<Float, real, Nrem>(vecTmp, v + M * N, scale_inv);
           // second do vectorized copy into memory
-          vector_store(field, parity * offset + volumeCB * M * N, x, vecTmp);
+          vector_store(field, parity_offset + volumeCB * M * N, x, vecTmp);
         }
       }
 
@@ -1260,12 +1269,13 @@ namespace quda
       size_t Bytes() const { return offset * 2ll * sizeof(Float) * N; }
     };
 
-    template <bool spin_project, bool huge_alloc> struct GhostNOrder<short, 1, 3, spin_project, huge_alloc, false> {
+    template <bool spin_project, bool huge_alloc, bool use_parity_mask>
+    struct GhostNOrder<short, 1, 3, spin_project, huge_alloc, false, use_parity_mask> {
       using Float = short;
       static constexpr int Ns = 1;
       static constexpr int Nc = 3;
       static constexpr int length_ghost = 2 * Ns * Nc;
-      using Accessor = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc>;
+      using Accessor = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, false, use_parity_mask>;
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       using norm_type = float;
@@ -1296,7 +1306,8 @@ namespace quda
       __device__ __host__ inline void loadGhost(complex out[length_ghost / 2], int x, int dim, int dir, int parity = 0) const
       {
         real v[length_ghost];
-        auto vecTmp = vector_load<Float, 8>(ghost[2 * dim + dir], parity * faceVolumeCB[dim] + x);
+        const auto parity_offset = use_parity_mask ? (parity & faceVolumeCB[dim]) : (parity * faceVolumeCB[dim]);
+        auto vecTmp = vector_load<Float, 8>(ghost[2 * dim + dir], parity_offset + x);
 
         // extract the norm
         norm_type nrm;
@@ -1339,7 +1350,8 @@ namespace quda
         array<Float, 6> vecTmp2;
         copy_and_scale<Float, real, 6>(vecTmp2, v, scale_inv);
         memcpy(&vecTmp, &vecTmp2, sizeof(vecTmp2));
-        vector_store(ghost[2 * dim + dir], parity * faceVolumeCB[dim] + x, vecTmp);
+        const auto parity_offset = use_parity_mask ? (parity & faceVolumeCB[dim]) : (parity * faceVolumeCB[dim]);
+        vector_store(ghost[2 * dim + dir], parity_offset + x, vecTmp);
       }
 
       /**
@@ -1369,15 +1381,15 @@ namespace quda
        pointer arithmetic for huge allocations (e.g., packed set of
        vectors).  Default is to use 32-bit pointer arithmetic.
      */
-    template <bool spin_project, bool huge_alloc, bool disable_ghost>
-    struct FloatNOrder<short, 1, 3, spin_project, huge_alloc, disable_ghost>
-      : GhostNOrder<short, 1, 3, spin_project, huge_alloc, disable_ghost> {
+    template <bool spin_project, bool huge_alloc, bool disable_ghost, bool use_parity_mask>
+    struct FloatNOrder<short, 1, 3, spin_project, huge_alloc, disable_ghost, use_parity_mask>
+      : GhostNOrder<short, 1, 3, spin_project, huge_alloc, disable_ghost, use_parity_mask> {
       using Float = short;
       static constexpr int Ns = 1;
       static constexpr int Nc = 3;
       static constexpr int length = 2 * Ns * Nc;
-      using Accessor = FloatNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost>;
-      using GhostNOrder = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost>;
+      using Accessor = FloatNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost, use_parity_mask>;
+      using GhostNOrder = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost, use_parity_mask>;
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       using AllocInt = typename AllocType<huge_alloc>::type;
@@ -1402,7 +1414,8 @@ namespace quda
       __device__ __host__ inline void load(complex out[length / 2], int x, int parity = 0) const
       {
         real v[length];
-        auto vecTmp = vector_load<Float, 8>(field, parity * offset + x);
+        const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
+        auto vecTmp = vector_load<Float, 8>(field, parity_offset + x);
 
         // extract the norm
         norm_type nrm;
@@ -1446,7 +1459,8 @@ namespace quda
         copy_and_scale<Float, real, 6>(vecTmp2, v, scale_inv);
         memcpy(&vecTmp, &vecTmp2, sizeof(vecTmp2));
 
-        vector_store(field, parity * offset + x, vecTmp);
+        const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
+        vector_store(field, parity_offset + x, vecTmp);
       }
 
       /**
@@ -1779,9 +1793,10 @@ namespace quda
 
   } // namespace colorspinor
 
-  template <typename T, int Ns, int Nc, bool project = false, bool huge_alloc = false, bool disable_ghost = false>
+  template <typename T, int Ns, int Nc, bool project = false, bool huge_alloc = false, bool disable_ghost = false,
+            bool use_parity_mask = false>
   struct colorspinor_mapper {
-    typedef colorspinor::FloatNOrder<T, Ns, Nc, project, huge_alloc, disable_ghost> type;
+    typedef colorspinor::FloatNOrder<T, Ns, Nc, project, huge_alloc, disable_ghost, use_parity_mask> type;
   };
 
   template <typename T, QudaFieldOrder order, int Ns, int Nc> struct colorspinor_order_mapper {
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 21b8106163..74a48cab99 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1551,11 +1551,12 @@ namespace quda {
         }
       }
 
-      template <typename Float, int length_, QudaReconstructType recon,
-                QudaStaggeredPhase stag_phase = QUDA_STAGGERED_PHASE_NO, bool huge_alloc = default_huge_alloc,
-                QudaGhostExchange ghostExchange_ = QUDA_GHOST_EXCHANGE_INVALID, bool use_inphase = false, bool shifted = false>
+      template <typename Float, int length_, QudaReconstructType recon, QudaStaggeredPhase stag_phase = QUDA_STAGGERED_PHASE_NO,
+                bool huge_alloc = default_huge_alloc, QudaGhostExchange ghostExchange_ = QUDA_GHOST_EXCHANGE_INVALID,
+                bool use_inphase = false, bool shifted = false, bool use_parity_mask = false>
       struct FloatNOrder {
-        using Accessor = FloatNOrder<Float, length_, recon, stag_phase, huge_alloc, ghostExchange_, use_inphase, shifted>;
+        using Accessor
+          = FloatNOrder<Float, length_, recon, stag_phase, huge_alloc, ghostExchange_, use_inphase, shifted, use_parity_mask>;
 
         using store_t = Float;
         static constexpr int length = length_;
@@ -1625,26 +1626,27 @@ namespace quda {
       __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real phase = 1.0) const
       {
         real tmp[reconLen];
+        const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
           // first load from memory
-          auto vecTmp = vector_load<Float, N>(gauge, parity * offset + dir * (M * N + Nrem) * stride, i * stride + x);
+          auto vecTmp = vector_load<Float, N>(gauge, parity_offset + dir * (M * N + Nrem) * stride, i * stride + x);
           // second do copy converting into register type with combined scaling
           copy_and_scale(tmp + i * N, vecTmp, combined_scale);
         }
 
         // now load any remainder
         if constexpr (Nrem > 0) {
-          auto vecTmp = vector_load<Float, Nrem>(gauge, parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x);
+          auto vecTmp = vector_load<Float, Nrem>(gauge, parity_offset + (dir * (M * N + Nrem) + M * N) * stride, x);
           copy_and_scale(tmp + M * N, vecTmp, combined_scale);
         }
 
         if constexpr (loadPhase) {
           if constexpr (isFixed<Float>::value) {
-            copy_and_scale(phase, gauge[parity * offset + phaseOffset + stride * dir + x], phase_scale);
+            copy_and_scale(phase, gauge[parity_offset + phaseOffset + stride * dir + x], phase_scale);
           } else {
-            copy(phase, gauge[parity * offset + phaseOffset + stride * dir + x]);
+            copy(phase, gauge[parity_offset + phaseOffset + stride * dir + x]);
             phase *= static_cast<real>(2.0);
           }
         }
@@ -1655,36 +1657,39 @@ namespace quda {
       template <int type> __device__ inline void prefetch(int x, int dir, int parity, int block_size = 0) const
       {
         if constexpr (type == 0) { // use per-thread prefetching
+          const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
 #pragma unroll
           for (int i = 0; i < M; i++)
-            prefetch_cache_line(gauge + (parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N));
+            prefetch_cache_line(gauge + (parity_offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N));
 
           // now load any remainder
           if constexpr (Nrem > 0)
-            prefetch_cache_line(gauge + (parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem));
+            prefetch_cache_line(gauge + (parity_offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem));
 
-          if constexpr (loadPhase) prefetch_cache_line(gauge + (parity * offset + phaseOffset + stride * dir + x));
+          if constexpr (loadPhase) prefetch_cache_line(gauge + (parity_offset + phaseOffset + stride * dir + x));
         } else if constexpr (type == 1) { // bulk prefetch
           if (block_size == 0) block_size = blockDim.x;
           if (target::is_thread_zero()) {
+            const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
 #pragma unroll
             for (int i = 0; i < M; i++)
-              prefetch_cache_bulk(gauge + (parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N),
+              prefetch_cache_bulk(gauge + (parity_offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N),
                                   block_size * N * sizeof(Float));
 
             // now load any remainder
             if constexpr (Nrem > 0)
-              prefetch_cache_bulk(gauge + (parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem),
+              prefetch_cache_bulk(gauge + (parity_offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem),
                                   block_size * Nrem * sizeof(Float));
 
             if constexpr (loadPhase)
-              prefetch_cache_bulk(gauge + (parity * offset + phaseOffset + stride * dir + x), block_size * sizeof(Float));
+              prefetch_cache_bulk(gauge + (parity_offset + phaseOffset + stride * dir + x), block_size * sizeof(Float));
           }
         } else { // n-d tensor prefetch
           if (target::is_thread_zero()) {
-            prefetch_cache_tensor_5d(tensor_desc.N, x, x / 16, 0, dir, parity);
-            if constexpr (Nrem > 0) prefetch_cache_tensor_4d(tensor_desc.Nrem, x, x / 16, dir, parity);
-            if constexpr (loadPhase) prefetch_cache_tensor_4d(tensor_desc.phase, x, x / 16, dir, parity);
+            const auto parity_idx = use_parity_mask ? (parity & 1) : parity;
+            prefetch_cache_tensor_5d(tensor_desc.N, x, x / 16, 0, dir, parity_idx);
+            if constexpr (Nrem > 0) prefetch_cache_tensor_4d(tensor_desc.Nrem, x, x / 16, dir, parity_idx);
+            if constexpr (loadPhase) prefetch_cache_tensor_4d(tensor_desc.phase, x, x / 16, dir, parity_idx);
           }
         }
       }
@@ -1693,6 +1698,7 @@ namespace quda {
       {
         real tmp[reconLen];
         reconstruct.Pack(tmp, v);
+        const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
@@ -1701,7 +1707,7 @@ namespace quda {
 #pragma unroll
           for (int j = 0; j < N; j++) copy(vecTmp[j], tmp[i * N + j]);
           // second do vectorized copy into memory
-          vector_store(gauge, parity * offset + dir * (M * N + Nrem) * stride, x + i * stride, vecTmp);
+          vector_store(gauge, parity_offset + dir * (M * N + Nrem) * stride, x + i * stride, vecTmp);
         }
 
         // now save any remainder
@@ -1710,12 +1716,12 @@ namespace quda {
 #pragma unroll
           for (int j = 0; j < Nrem; j++) copy(vecTmp[j], tmp[M * N + j]);
           // second do vectorized copy into memory
-          vector_store(gauge, parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x, vecTmp);
+          vector_store(gauge, parity_offset + (dir * (M * N + Nrem) + M * N) * stride, x, vecTmp);
         }
 
         if constexpr (hasPhase) {
           real phase = reconstruct.getPhase(v);
-          copy(gauge[parity * offset + phaseOffset + dir * stride + x], static_cast<real>(0.5) * phase);
+          copy(gauge[parity_offset + phaseOffset + dir * stride + x], static_cast<real>(0.5) * phase);
         }
       }
 
@@ -1741,11 +1747,12 @@ namespace quda {
           // This also works perfectly when phases are stored. No need to change this.
         } else {
           real tmp[reconLen];
+          const auto parity_offset = use_parity_mask ? (parity & faceVolumeCB[dir]) : (parity * faceVolumeCB[dir]);
 
 #pragma unroll
           for (int i = 0; i < M; i++) {
             // first do vectorized copy from memory into registers
-            auto vecTmp = vector_load<Float, N>(ghost[dir], (i * 2 + parity) * faceVolumeCB[dir] + x);
+            auto vecTmp = vector_load<Float, N>(ghost[dir], (i * 2) * faceVolumeCB[dir] + parity_offset + x);
 
             // second do copy converting into register type with combined scaling
             copy_and_scale(tmp + i * N, vecTmp, combined_scale);
@@ -1753,8 +1760,7 @@ namespace quda {
 
           // now load any remainder
           if constexpr (Nrem > 0) {
-            auto vecTmp
-              = vector_load<Float, Nrem>(ghost[dir], 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x);
+            auto vecTmp = vector_load<Float, Nrem>(ghost[dir], 2 * faceVolumeCB[dir] * M * N, parity_offset + x);
             copy_and_scale(tmp + M * N, vecTmp, combined_scale);
           }
 
@@ -1765,10 +1771,9 @@ namespace quda {
             //   phase = inphase < static_cast<real>(0) ? static_cast<real>(-0.5) : static_cast<real>(0.5);
             // } else {
             if constexpr (isFixed<Float>::value) {
-              copy_and_scale(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x],
-                             phase_scale);
+              copy_and_scale(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity_offset + x], phase_scale);
             } else {
-              copy(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x]);
+              copy(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity_offset + x]);
               phase *= static_cast<real>(2.0);
             }
             // }
@@ -1784,6 +1789,7 @@ namespace quda {
         } else {
           real tmp[reconLen];
           reconstruct.Pack(tmp, v);
+          const auto parity_offset = use_parity_mask ? (parity & faceVolumeCB[dir]) : (parity * faceVolumeCB[dir]);
 
 #pragma unroll
           for (int i = 0; i < M; i++) {
@@ -1792,7 +1798,7 @@ namespace quda {
 #pragma unroll
             for (int j = 0; j < N; j++) copy(vecTmp[j], tmp[i * N + j]);
             // second do vectorized copy into memory
-            vector_store(ghost[dir], (i * 2 + parity) * faceVolumeCB[dir] + x, vecTmp);
+            vector_store(ghost[dir], (i * 2) * faceVolumeCB[dir] + parity_offset + x, vecTmp);
           }
 
           // now save any remainder
@@ -1801,13 +1807,12 @@ namespace quda {
 #pragma unroll
             for (int j = 0; j < Nrem; j++) copy(vecTmp[j], tmp[M * N + j]);
             // second do vectorized copy into memory
-            vector_store(ghost[dir], 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x, vecTmp);
+            vector_store(ghost[dir], 2 * faceVolumeCB[dir] * M * N, parity_offset + x, vecTmp);
           }
 
           if constexpr (hasPhase) {
             real phase = reconstruct.getPhase(v);
-            copy(ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x],
-                 static_cast<real>(0.5) * phase);
+            copy(ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity_offset + x], static_cast<real>(0.5) * phase);
           }
         }
       }
@@ -1848,12 +1853,13 @@ namespace quda {
                                                   int g, int parity, const int R[]) const
       {
         real tmp[reconLen];
+        const auto parity_idx = use_parity_mask ? (parity & 1) : parity;
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
           // first do vectorized copy from memory
           auto vecTmp = vector_load<Float, N>(ghost[dim], dir * reconLen * 2 * geometry * R[dim] * faceVolumeCB[dim],
-                                              ((i * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
+                                              ((i * 2 + parity_idx) * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
 
           // second do copy converting into register type with combined scaling
           copy_and_scale(tmp + i * N, vecTmp, combined_scale);
@@ -1863,7 +1869,7 @@ namespace quda {
         if constexpr (Nrem > 0) {
           auto vecTmp
             = vector_load<Float, Nrem>(ghost[dim], (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim],
-                                       (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
+                                       (parity_idx * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
 
           copy_and_scale(tmp + M * N, vecTmp, combined_scale);
         }
@@ -1873,12 +1879,12 @@ namespace quda {
           if constexpr (isFixed<Float>::value) {
             copy_and_scale(phase,
                            ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim]
-                                      + (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x],
+                                      + (parity_idx * geometry + g) * R[dim] * faceVolumeCB[dim] + x],
                            phase_scale);
           } else {
             copy(phase,
                  ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim]
-                            + (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x]);
+                            + (parity_idx * geometry + g) * R[dim] * faceVolumeCB[dim] + x]);
             phase *= static_cast<real>(2.0);
           }
         }
@@ -1892,6 +1898,7 @@ namespace quda {
       {
         real tmp[reconLen];
         reconstruct.Pack(tmp, v);
+        const auto parity_idx = use_parity_mask ? (parity & 1) : parity;
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
@@ -1901,7 +1908,7 @@ namespace quda {
           for (int j = 0; j < N; j++) copy(vecTmp[j], tmp[i * N + j]);
           // second do vectorized copy to memory
           vector_store(ghost[dim], dir * reconLen * 2 * geometry * R[dim] * faceVolumeCB[dim],
-                       ((i * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] + x, vecTmp);
+                       ((i * 2 + parity_idx) * geometry + g) * R[dim] * faceVolumeCB[dim] + x, vecTmp);
         }
 
         // now save any remainder
@@ -1911,13 +1918,13 @@ namespace quda {
           for (int j = 0; j < Nrem; j++) copy(vecTmp[j], tmp[M * N + j]);
           // second do vectorized copy into memory
           vector_store(ghost[dim], (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim],
-                       (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x, vecTmp);
+                       (parity_idx * geometry + g) * R[dim] * faceVolumeCB[dim] + x, vecTmp);
         }
 
         if constexpr (hasPhase) {
           real phase = reconstruct.getPhase(v);
           copy(ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim]
-                          + (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x],
+                          + (parity_idx * geometry + g) * R[dim] * faceVolumeCB[dim] + x],
                static_cast<real>(0.5) * phase);
         }
       }
@@ -2516,20 +2523,22 @@ namespace quda {
 
   template <typename T, QudaReconstructType recon, int N = 18, QudaStaggeredPhase stag = QUDA_STAGGERED_PHASE_NO,
             bool huge_alloc = gauge::default_huge_alloc, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_INVALID,
-            bool use_inphase = false, QudaGaugeFieldOrder order = QUDA_NATIVE_GAUGE_ORDER, bool shifted = false>
+            bool use_inphase = false, QudaGaugeFieldOrder order = QUDA_NATIVE_GAUGE_ORDER, bool shifted = false,
+            bool use_parity_mask = false>
   struct gauge_mapper {
-    typedef gauge::FloatNOrder<T, N, recon, stag, huge_alloc, ghostExchange, use_inphase, shifted> type;
+    typedef gauge::FloatNOrder<T, N, recon, stag, huge_alloc, ghostExchange, use_inphase, shifted, use_parity_mask> type;
   };
 
   template <typename T, QudaReconstructType recon, int N, QudaStaggeredPhase stag, bool huge_alloc,
-            QudaGhostExchange ghostExchange, bool use_inphase, bool shifted>
-  struct gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_MILC_GAUGE_ORDER, shifted> {
+            QudaGhostExchange ghostExchange, bool use_inphase, bool shifted, bool use_parity_mask>
+  struct gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_MILC_GAUGE_ORDER, shifted,
+                      use_parity_mask> {
     typedef gauge::MILCOrder<T, N> type;
   };
 
   template <typename T, QudaReconstructType recon, int N, QudaStaggeredPhase stag, bool huge_alloc,
-            QudaGhostExchange ghostExchange, bool use_inphase, bool shifted>
-  struct gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_QDP_GAUGE_ORDER, shifted> {
+            QudaGhostExchange ghostExchange, bool use_inphase, bool shifted, bool use_parity_mask>
+  struct gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_QDP_GAUGE_ORDER, shifted, use_parity_mask> {
     typedef gauge::QDPOrder<T, N> type;
   };
 
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index e5fa03d75c..e88473c503 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -23,9 +23,12 @@ namespace quda
     static constexpr int nSpin = 1;
     static constexpr bool spin_project = false;
     static constexpr bool spinor_direct_load = false; // false means texture load
-    using F = typename colorspinor_mapper<Float, nSpin, nColor, spin_project, spinor_direct_load, true>::type;
+    static constexpr bool use_parity_mask = true;
+    using F =
+      typename colorspinor_mapper<Float, nSpin, nColor, spin_project, spinor_direct_load, true, use_parity_mask>::type;
 
-    using Ghost = typename colorspinor::GhostNOrder<Float, nSpin, nColor, spin_project, spinor_direct_load, false>;
+    using Ghost =
+      typename colorspinor::GhostNOrder<Float, nSpin, nColor, spin_project, spinor_direct_load, false, use_parity_mask>;
 
     static constexpr QudaReconstructType reconstruct_u = reconstruct_u_;
     static constexpr QudaReconstructType reconstruct_l = reconstruct_l_;
@@ -35,10 +38,10 @@ namespace quda
     static constexpr QudaStaggeredPhase phase = phase_;
     template <bool shifted>
     using GU = typename gauge_mapper<Float, reconstruct_u, 18, phase, gauge_direct_load, ghost, use_inphase,
-                                     QUDA_NATIVE_GAUGE_ORDER, shifted>::type;
+                                     QUDA_NATIVE_GAUGE_ORDER, shifted, use_parity_mask>::type;
     template <bool shifted>
     using GL = typename gauge_mapper<Float, reconstruct_l, 18, QUDA_STAGGERED_PHASE_NO, gauge_direct_load, ghost,
-                                     use_inphase, QUDA_NATIVE_GAUGE_ORDER, shifted>::type;
+                                     use_inphase, QUDA_NATIVE_GAUGE_ORDER, shifted, use_parity_mask>::type;
 
     F out[MAX_MULTI_RHS];  /** output vector field */
     F in[MAX_MULTI_RHS];   /** input vector field */
@@ -104,6 +107,15 @@ namespace quda
       x_cb = (Arg::nDim == 5 ? x_cb % arg.dc.volume_4d_cb : x_cb);
 
       int dim2 = step / 4;
+      // Compute opposite parity/mask depending on optimization mode
+      int opposite_parity;
+      if constexpr (Arg::use_parity_mask) {
+        const int parity_orig = (parity == 0) ? 0 : 1;
+        opposite_parity = -(1 - parity_orig);
+      } else {
+        opposite_parity = 1 - parity;
+      }
+
       switch (step % 4) {
       case 0: arg.U.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
       case 1: arg.L.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
@@ -112,10 +124,10 @@ namespace quda
       case 3: arg.Lback.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
 #else
       case 2:
-        arg.U.prefetch<Arg::prefetch_tma>(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity);
+        arg.U.prefetch<Arg::prefetch_tma>(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, opposite_parity);
         break;
       case 3:
-        arg.L.prefetch<Arg::prefetch_tma>(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity);
+        arg.L.prefetch<Arg::prefetch_tma>(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, opposite_parity);
         break;
 #endif
       }
@@ -138,7 +150,21 @@ namespace quda
   {
     typedef typename mapper<typename Arg::Float>::type real;
     typedef Matrix<complex<real>, Arg::nColor> Link;
-    const int their_spinor_parity = (arg.nParity == 2) ? 1 - parity : 0;
+    // Note: parity parameter is already converted to mask if use_parity_mask is true
+    // Compute their_spinor_parity and one_minus_parity in same format (mask or original)
+    int their_spinor_parity;
+    int one_minus_parity;
+    if constexpr (Arg::use_parity_mask) {
+      // parity is the parity mask (-parity_orig), recover original parity for logic operations
+      const int parity_orig = (parity == 0) ? 0 : 1;
+      their_spinor_parity = (arg.nParity == 2) ? 1 - parity_orig : 0;
+      their_spinor_parity = -their_spinor_parity; // convert to mask
+      one_minus_parity = -(1 - parity_orig);
+    } else {
+      // parity is the original parity value
+      their_spinor_parity = (arg.nParity == 2) ? 1 - parity : 0;
+      one_minus_parity = 1 - parity;
+    }
 
     Coord coord1 = coord;
     if constexpr (arg.improved) { // need to compute 1-hop in_boundary
@@ -220,8 +246,8 @@ namespace quda
           const Link U = arg.improved ? arg.Uback(d, coord.x_cb, parity) :
                                         arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg));
 #else
-          const Link U = arg.improved ? arg.U.Ghost(d, ghost_idx2, 1 - parity) :
-            arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg));
+          const Link U = arg.improved ? arg.U.Ghost(d, ghost_idx2, one_minus_parity) :
+                                        arg.U.Ghost(d, ghost_idx2, one_minus_parity, StaggeredPhase(coord, d, -1, arg));
 #endif
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
@@ -238,8 +264,8 @@ namespace quda
                                           arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg));
 #else
             const int gauge_idx = back_idx;
-            const Link U = arg.improved ? arg.U(d, gauge_idx, 1 - parity) :
-                                          arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg));
+            const Link U = arg.improved ? arg.U(d, gauge_idx, one_minus_parity) :
+                                          arg.U(d, gauge_idx, one_minus_parity, StaggeredPhase(coord, d, -1, arg));
 #endif
 #pragma unroll
             for (auto s = 0; s < n_src_tile; s++) {
@@ -259,7 +285,7 @@ namespace quda
 #ifdef QUDA_DSLASH_DOUBLE_STORE
           const Link L = arg.Lback(d, coord.x_cb, parity);
 #else
-          const Link L = arg.L.Ghost(d, ghost_idx, 1 - parity);
+          const Link L = arg.L.Ghost(d, ghost_idx, one_minus_parity);
 #endif
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
@@ -276,7 +302,7 @@ namespace quda
             const Link L = arg.Lback(d, coord.x_cb, parity);
 #else
             const int gauge_idx = back3_idx;
-            const Link L = arg.L(d, gauge_idx, 1 - parity);
+            const Link L = arg.L(d, gauge_idx, one_minus_parity);
 #endif
 #pragma unroll
             for (auto s = 0; s < n_src_tile; s++) {
@@ -307,17 +333,22 @@ namespace quda
         = mykernel_type == EXTERIOR_KERNEL_ALL ? false : true; // is thread active (non-trival for fused kernel only)
       int thread_dim;                                        // which dimension is thread working on (fused kernel only)
       auto coord = getCoords<QUDA_4D_PC, mykernel_type, Arg>(arg, idx, 0, parity, thread_dim);
-      const int my_spinor_parity = arg.nParity == 2 ? parity : 0;
+      int my_spinor_parity = arg.nParity == 2 ? parity : 0;
+      // Convert to parity mask for optimized indexing if enabled
+      int parity_for_load = Arg::use_parity_mask ? -parity : parity;
+      int my_spinor_parity_for_load = Arg::use_parity_mask ? -my_spinor_parity : my_spinor_parity;
 
       array<Vector, n_src_tile> out;
       if (arg.dd_out.isZero(coord)) {
         if (mykernel_type != EXTERIOR_KERNEL_ALL || active)
 #pragma unroll
-          for (auto s = 0; s < n_src_tile; s++) { arg.out[src_idx + s](coord.x_cb, my_spinor_parity) = out[s]; }
+          for (auto s = 0; s < n_src_tile; s++) {
+            arg.out[src_idx + s](coord.x_cb, my_spinor_parity_for_load) = out[s];
+          }
         return;
       }
 
-      applyStaggered<mykernel_type, n_src_tile>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
+      applyStaggered<mykernel_type, n_src_tile>(out, arg, coord, parity_for_load, idx, thread_dim, active, src_idx);
 
 #pragma unroll
       for (auto s = 0; s < n_src_tile; s++) out[s] *= arg.dagger_scale;
@@ -327,19 +358,19 @@ namespace quda
       } else if (xpay && mykernel_type == INTERIOR_KERNEL) {
 #pragma unroll
         for (auto s = 0; s < n_src_tile; s++) {
-          Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity);
+          Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity_for_load);
           out[s] = axpy(arg.a, x, -out[s]);
         }
       } else if (mykernel_type != INTERIOR_KERNEL) {
 #pragma unroll
         for (auto s = 0; s < n_src_tile; s++) {
-          Vector x = arg.out[src_idx + s](coord.x_cb, my_spinor_parity);
+          Vector x = arg.out[src_idx + s](coord.x_cb, my_spinor_parity_for_load);
           out[s] = xpay ? x - out[s] : x + out[s];
         }
       }
       if (mykernel_type != EXTERIOR_KERNEL_ALL || active) {
 #pragma unroll
-        for (auto s = 0; s < n_src_tile; s++) { arg.out[src_idx + s](coord.x_cb, my_spinor_parity) = out[s]; }
+        for (auto s = 0; s < n_src_tile; s++) { arg.out[src_idx + s](coord.x_cb, my_spinor_parity_for_load) = out[s]; }
       }
     }
 

From 7d17452440791c3be410ea4c1a9f44304a4a3419 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 7 Nov 2025 22:28:39 -0800
Subject: [PATCH 041/121] Optimization for staggered packing kernels: ensure we
 divide by int_fastdiv rather than plain int

---
 include/dslash_quda.h    | 2 +-
 include/index_helper.cuh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/dslash_quda.h b/include/dslash_quda.h
index 5091e28674..09404ddb5a 100644
--- a/include/dslash_quda.h
+++ b/include/dslash_quda.h
@@ -19,7 +19,7 @@ namespace quda
     int_fastdiv X[QUDA_MAX_DIM];
     int Ls;
 
-    int volume_4d;
+    int_fastdiv volume_4d;
     int_fastdiv volume_4d_cb;
 
     int_fastdiv face_X[4];
diff --git a/include/index_helper.cuh b/include/index_helper.cuh
index c27215ce4e..85b82367fa 100644
--- a/include/index_helper.cuh
+++ b/include/index_helper.cuh
@@ -822,7 +822,7 @@ namespace quda {
     int s = face_idx_in / arg.dc.face_XYZT[dim];
     int face_idx = face_idx_in - s * arg.dc.face_XYZT[dim];
 
-    int dims[3] = {};
+    std::remove_const_t<std::remove_reference_t<decltype(arg.dc.X[0])>> dims[3] = {};
     int d1 = 0;
 #pragma unroll 4
     for (int d2 = 0; d2 < 4; d2++) { // this will evaluate at compile time

From 27b725d40170097fcc9b4bd9c02e6a3240eb8be9 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 7 Nov 2025 22:36:35 -0800
Subject: [PATCH 042/121] Optimize scale_inv multiplication in gauge field
 reconstruction

Factor out the scale_inv multiplication to reduce operations:
- Remove scale_inv from the three conj() operations on out[6], out[7], out[8]
- Apply scale_inv once to the complex phase A in the dynamic phasing branch
- Apply scale_inv once to the phase scalar in the static phasing branch
- Reduces the number of scale_inv multiplications from 3 to 1
---
 include/gauge_field_order.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 21b8106163..bdfda7d896 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1249,25 +1249,27 @@ namespace quda {
 
           out[6] = cmul(out[2], out[4]);
           out[6] = cmac(out[1], out[5], -out[6]);
-          out[6] = scale_inv * conj(out[6]);
+          out[6] = conj(out[6]);
 
           out[7] = cmul(out[0], out[5]);
           out[7] = cmac(out[2], out[3], -out[7]);
-          out[7] = scale_inv * conj(out[7]);
+          out[7] = conj(out[7]);
 
           out[8] = cmul(out[1], out[3]);
           out[8] = cmac(out[0], out[4], -out[8]);
-          out[8] = scale_inv * conj(out[8]);
+          out[8] = conj(out[8]);
 
           if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phasing
             // Multiply the third row by exp(I*3*phase), since the cross product will end up in a scale factor of exp(-I*2*phase)
             real cos_sin[2];
             sincospi(static_cast<real>(3.0) * phase, &cos_sin[1], &cos_sin[0]);
             complex A(cos_sin[0], cos_sin[1]);
+            A *= scale_inv;
             out[6] = cmul(A, out[6]);
             out[7] = cmul(A, out[7]);
             out[8] = cmul(A, out[8]);
           } else { // phase is +/- 1 so real multiply is sufficient
+            phase *= scale_inv;
             out[6] *= phase;
             out[7] *= phase;
             out[8] *= phase;

From 2e12a2c26001e576524787d5245c3c0a0a8607a4 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Sun, 9 Nov 2025 17:58:48 -0800
Subject: [PATCH 043/121] Optimize the alternate path for i2f: with a
 pre-computed shift constant, we can use a single FMA instead of an add then
 multiply

---
 include/color_spinor_field_order.h | 18 ++++---
 include/convert.h                  | 86 ++++++++++++++++++++++++++++++
 include/gauge_field_order.h        | 24 +++++----
 3 files changed, 111 insertions(+), 17 deletions(-)

diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index 3298a48dd7..a1741b438e 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -1027,18 +1027,19 @@ namespace quda
         real v[length_ghost];
         norm_type nrm
           = isFixed<Float>::value ? vector_load<float, 1>(ghost_norm[2 * dim + dir], parity * faceVolumeCB[dim] + x)[0] : 0.0;
+        norm_type nrm_shift = -nrm * 12582912.0f;
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
           auto vecTmp = vector_load<Float, N>(ghost[2 * dim + dir] + parity * faceVolumeCB[dim] * length_ghost,
                                               i * faceVolumeCB[dim] + x);
-          copy_and_scale(v + i * N, vecTmp, nrm);
+          copy_and_scale(v + i * N, vecTmp, nrm, nrm_shift);
         }
 
         if constexpr (Nrem > 0) { // now load any remainder
           auto vecTmp = vector_load<Float, Nrem>(
             ghost[2 * dim + dir] + parity * faceVolumeCB[dim] * length_ghost + faceVolumeCB[dim] * M * N, x);
-          copy_and_scale(v + M * N, vecTmp, nrm);
+          copy_and_scale(v + M * N, vecTmp, nrm, nrm_shift);
         }
 
 #pragma unroll
@@ -1162,22 +1163,23 @@ namespace quda
         real v[length];
 #ifndef LEGACY_ACCESSOR_NORM
         auto norm_offset = offset / (sizeof(Float) < sizeof(float) ? sizeof(norm_type) / sizeof(Float) : 1);
-        auto norm = reinterpret_cast<float *>(field + volumeCB * (2 * Nc * Ns));
+        auto norm = reinterpret_cast<float *>(field + volumeCB * (2 * Nc * Ns)); // FIXME - optimize 64-bit indexing here
 #endif
         norm_type nrm = isFixed<Float>::value ? vector_load<float, 1>(norm, x + parity * norm_offset)[0] : 0.0;
+        norm_type nrm_shift = -nrm * 12582912.0f;
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
           // first load from memory
           auto vecTmp = vector_load<Float, N>(field, parity * offset, volumeCB * i + x);
           // now copy into output and scale
-          copy_and_scale(v + i * N, vecTmp, nrm);
+          copy_and_scale(v + i * N, vecTmp, nrm, nrm_shift);
         }
 
         // now load any remainder
         if constexpr (Nrem > 0) {
           auto vecTmp = vector_load<Float, Nrem>(field, parity * offset + volumeCB * M * N, x);
-          copy_and_scale(v + M * N, vecTmp, nrm);
+          copy_and_scale(v + M * N, vecTmp, nrm, nrm_shift);
         }
 
 #pragma unroll
@@ -1301,9 +1303,10 @@ namespace quda
         // extract the norm
         norm_type nrm;
         memcpy(&nrm, &vecTmp[6], sizeof(norm_type));
+        norm_type nrm_shift = -nrm * 12582912.0f;
         array<Float, 6> vecTmp2;
         memcpy(&vecTmp2, &vecTmp, sizeof(vecTmp2));
-        copy_and_scale(v, vecTmp2, nrm);
+        copy_and_scale(v, vecTmp2, nrm, nrm_shift);
 
 #pragma unroll
         for (int i = 0; i < length_ghost / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]);
@@ -1409,8 +1412,9 @@ namespace quda
         memcpy(&nrm, &vecTmp[6], sizeof(norm_type));
         array<Float, 6> vecTmp2;
         memcpy(&vecTmp2, &vecTmp, sizeof(vecTmp2));
+        norm_type nrm_shift = -nrm * 12582912.0f;
         // now copy into output and scale
-        copy_and_scale(v, vecTmp2, nrm);
+        copy_and_scale(v, vecTmp2, nrm, nrm_shift);
 
 #pragma unroll
         for (int i = 0; i < length / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]);
diff --git a/include/convert.h b/include/convert.h
index f56751873c..f87ef514f1 100644
--- a/include/convert.h
+++ b/include/convert.h
@@ -60,6 +60,14 @@ namespace quda
     }
   };
 
+  template <bool is_device> struct i2f_fma {
+    template <typename T> constexpr float operator()(int a, T, float b, float) { return static_cast<float>(a) * b; }
+    template <typename T> constexpr float2 operator()(int a1, int a2, T, float b, float)
+    {
+      return mul2(float2 {static_cast<float>(a1), static_cast<float>(a2)}, float2 {b, b});
+    }
+  };
+
   /**
      @brief This is a LUT which is used to determine whether a given
      int-to-float conversion in a array of numbers to be converted
@@ -116,6 +124,38 @@ namespace quda
     }
   };
 
+  template <> struct i2f_fma<true> {
+    template <typename T, typename alternative_t>
+    __device__ std::enable_if_t<std::is_same_v<alternative_t, std::integral_constant<bool, alternative_t::value>>, float>
+    operator()(T a, alternative_t, float b, float c)
+    {
+      if constexpr (!alternative_t::value) {
+        return b * static_cast<float>(a);
+      } else {
+        // will work for up to 23-bit int
+        int32_t i = a + 0x4B400000;
+        float f;
+        memcpy(&f, &i, sizeof(int32_t));
+        return b * f + c;
+      }
+    }
+
+    template <typename T, typename alternative_t>
+    __device__ std::enable_if_t<std::is_same_v<alternative_t, std::integral_constant<bool, alternative_t::value>>, float2>
+    operator()(const T &a1, const T &a2, alternative_t, float b, float c)
+    {
+      if constexpr (!alternative_t::value) {
+        return mul2(float2 {b, b}, float2 {static_cast<float>(a1), static_cast<float>(a2)});
+      } else {
+        // will work for up to 23-bit int
+        int2 i = {a1 + 0x4B400000, a2 + 0x4B400000};
+        float2 f;
+        memcpy(&f, &i, sizeof(int2));
+        return fma2({b, b}, f, {c, c});
+      }
+    }
+  };
+
   /**
      @brief Regular float-to-integer round used on the host
   */
@@ -287,6 +327,52 @@ namespace quda
     });
   }
 
+  /**
+     @brief Specialized variants of the copy_and_scale that passes the
+     alternative i2f constant to be subtracted (this allows for
+     optimal FMA issuance).  Note the scale factors are ignored unless
+     the input type (b) is either a short or char vector.
+  */
+  template <typename T1, typename T2, typename T3>
+  constexpr std::enable_if_t<!isFixed<T1>::value && !isFixed<T2>::value, void> copy_and_scale(T1 &a, const T2 &b,
+                                                                                              const T3 &, const T3 &)
+  {
+    copy(a, b);
+  }
+
+  template <typename T1, typename T2, typename T3>
+  constexpr std::enable_if_t<!isFixed<T1>::value && isFixed<T2>::value, void> copy_and_scale(T1 &a, const T2 &b,
+                                                                                             const T3 &c, const T3 &d)
+  {
+    a = target::dispatch<i2f_fma>(b, std::integral_constant<bool, i2f_i[0]>(), c, d);
+  }
+
+  template <typename T1, typename T2, int n, typename T3>
+  constexpr std::enable_if_t<!isFixed<T1>::value && !isFixed<T2>::value, void>
+  copy_and_scale(T1 *a, const array<T2, n> &b, const T3 &, const T3 &)
+  {
+    for (int i = 0; i < n; i++) copy(a[i], b[i]);
+  }
+
+  template <typename T1, typename T2, int n, typename T3>
+  constexpr std::enable_if_t<!isFixed<T1>::value && !isFixed<T2>::value, void>
+  copy_and_scale(array<T1, n> &a, const T2 *b, const T3 &, const T3 &)
+  {
+    for (int i = 0; i < n; i++) copy(a[i], b[i]);
+  }
+
+  template <typename T1, typename T2, int n, typename T3>
+  constexpr std::enable_if_t<!isFixed<T1>::value && isFixed<T2>::value, void>
+  copy_and_scale(T1 *a, const array<T2, n> &b, const T3 &c, const T3 &d)
+  {
+    static_assert(n % 2 == 0);
+    constexpr_for<0, n, 2>([&](auto i) {
+      auto ai = target::dispatch<i2f_fma>(b[i + 0], b[i + 1], std::integral_constant<bool, i2f_i[(i / 2) % 4]>(), c, d);
+      a[i + 0] = ai.x;
+      a[i + 1] = ai.y;
+    });
+  }
+
   template <class fixed_t, class float_t> __device__ __host__ fixed_t f2i_round(float_t f)
   {
 #if 1
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index bdfda7d896..dba8a4900d 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1587,7 +1587,9 @@ namespace quda {
         size_t bytes;
         gauge::tensor_desc_t tensor_desc;
         const real combined_scale; // Precomputed scale for copy_and_scale: fixedInvMaxValue * reconstruct.scale
+        const real combined_shift; // Precomputed shift for the alternate i2f_fma combined_scale * -12582912.0f
         const real phase_scale; // Precomputed scale for phase loading: fixedInvMaxValue * 2.0 (or just 2.0 for float)
+        const real phase_shift; // Precomputed shift for the alternative i2f_fma phase_scale *  -12582912.0f
 
         FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
           reconstruct(u),
@@ -1608,8 +1610,10 @@ namespace quda {
               return isFixed<Float>::value ? fixedInvMaxValue<Float>::value : 1.0;
             }
           }()),
+          combined_shift(combined_scale * -12582912.0f),
           phase_scale(isFixed<Float>::value ? fixedInvMaxValue<Float>::value * static_cast<real>(2.0) :
-                                              static_cast<real>(2.0))
+                                              static_cast<real>(2.0)),
+          phase_shift(phase_scale * -12582912.0f)
         {
           if (geometry == QUDA_COARSE_GEOMETRY)
             errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone");
@@ -1633,18 +1637,18 @@ namespace quda {
           // first load from memory
           auto vecTmp = vector_load<Float, N>(gauge, parity * offset + dir * (M * N + Nrem) * stride, i * stride + x);
           // second do copy converting into register type with combined scaling
-          copy_and_scale(tmp + i * N, vecTmp, combined_scale);
+          copy_and_scale(tmp + i * N, vecTmp, combined_scale, combined_shift);
         }
 
         // now load any remainder
         if constexpr (Nrem > 0) {
           auto vecTmp = vector_load<Float, Nrem>(gauge, parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x);
-          copy_and_scale(tmp + M * N, vecTmp, combined_scale);
+          copy_and_scale(tmp + M * N, vecTmp, combined_scale, combined_shift);
         }
 
         if constexpr (loadPhase) {
           if constexpr (isFixed<Float>::value) {
-            copy_and_scale(phase, gauge[parity * offset + phaseOffset + stride * dir + x], phase_scale);
+            copy_and_scale(phase, gauge[parity * offset + phaseOffset + stride * dir + x], phase_scale, phase_shift);
           } else {
             copy(phase, gauge[parity * offset + phaseOffset + stride * dir + x]);
             phase *= static_cast<real>(2.0);
@@ -1750,14 +1754,14 @@ namespace quda {
             auto vecTmp = vector_load<Float, N>(ghost[dir], (i * 2 + parity) * faceVolumeCB[dir] + x);
 
             // second do copy converting into register type with combined scaling
-            copy_and_scale(tmp + i * N, vecTmp, combined_scale);
+            copy_and_scale(tmp + i * N, vecTmp, combined_scale, combined_shift);
           }
 
           // now load any remainder
           if constexpr (Nrem > 0) {
             auto vecTmp
               = vector_load<Float, Nrem>(ghost[dir], 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x);
-            copy_and_scale(tmp + M * N, vecTmp, combined_scale);
+            copy_and_scale(tmp + M * N, vecTmp, combined_scale, combined_shift);
           }
 
           real phase = 0.;
@@ -1768,7 +1772,7 @@ namespace quda {
             // } else {
             if constexpr (isFixed<Float>::value) {
               copy_and_scale(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x],
-                             phase_scale);
+                             phase_scale, phase_shift);
             } else {
               copy(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x]);
               phase *= static_cast<real>(2.0);
@@ -1858,7 +1862,7 @@ namespace quda {
                                               ((i * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
 
           // second do copy converting into register type with combined scaling
-          copy_and_scale(tmp + i * N, vecTmp, combined_scale);
+          copy_and_scale(tmp + i * N, vecTmp, combined_scale, combined_shift);
         }
 
         // now load any remainder
@@ -1867,7 +1871,7 @@ namespace quda {
             = vector_load<Float, Nrem>(ghost[dim], (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim],
                                        (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
 
-          copy_and_scale(tmp + M * N, vecTmp, combined_scale);
+          copy_and_scale(tmp + M * N, vecTmp, combined_scale, combined_shift);
         }
 
         real phase = 0.;
@@ -1876,7 +1880,7 @@ namespace quda {
             copy_and_scale(phase,
                            ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim]
                                       + (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x],
-                           phase_scale);
+                           phase_scale, phase_shift);
           } else {
             copy(phase,
                  ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim]

From abed9acff8d36231c0e2228e32d963476135549c Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Sun, 9 Nov 2025 23:53:34 -0800
Subject: [PATCH 044/121] Revert "Added experimental optimization: replace
 parity * offset with bitmask approach to reduce IMAD count"

This reverts commit a2a9b24a277b867a4fdb70d007cbcccde91c167f.
---
 include/color_spinor_field_order.h   | 93 ++++++++++++---------------
 include/gauge_field_order.h          | 94 +++++++++++++---------------
 include/kernels/dslash_staggered.cuh | 69 ++++++--------------
 3 files changed, 100 insertions(+), 156 deletions(-)

diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index c57bbb97b0..a1741b438e 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -971,8 +971,7 @@ namespace quda
       }
     };
 
-    template <typename Float, int Ns, int Nc, bool spin_project = false, bool huge_alloc = false,
-              bool disable_ghost = false, bool use_parity_mask = false>
+    template <typename Float, int Ns, int Nc, bool spin_project = false, bool huge_alloc = false, bool disable_ghost = false>
     struct GhostNOrder {
       GhostNOrder() = default;
       GhostNOrder(const GhostNOrder &) = default;
@@ -980,15 +979,15 @@ namespace quda
       GhostNOrder &operator=(const GhostNOrder &) = default;
     };
 
-    template <typename Float, int Ns, int Nc, bool spin_project, bool huge_alloc, bool use_parity_mask>
-    struct GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, false, use_parity_mask> {
+    template <typename Float, int Ns, int Nc, bool spin_project, bool huge_alloc>
+    struct GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, false> {
       static constexpr int length = 2 * Ns * Nc;
       static constexpr int length_ghost = spin_project ? length / 2 : length;
       // if spin projecting, check that short vector length is compatible, if not halve the vector length
       static constexpr int N = colorspinor::get_vector_order<Float>(length_ghost);
       static constexpr int M = length_ghost / N;
       static constexpr int Nrem = length_ghost - M * N;
-      using Accessor = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, false, use_parity_mask>;
+      using Accessor = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc>;
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       using norm_type = float;
@@ -1026,21 +1025,20 @@ namespace quda
       __device__ __host__ inline void loadGhost(complex out[length_ghost / 2], int x, int dim, int dir, int parity = 0) const
       {
         real v[length_ghost];
-        const auto parity_offset = use_parity_mask ? (parity & faceVolumeCB[dim]) : (parity * faceVolumeCB[dim]);
         norm_type nrm
-          = isFixed<Float>::value ? vector_load<float, 1>(ghost_norm[2 * dim + dir], parity_offset + x)[0] : 0.0;
+          = isFixed<Float>::value ? vector_load<float, 1>(ghost_norm[2 * dim + dir], parity * faceVolumeCB[dim] + x)[0] : 0.0;
         norm_type nrm_shift = -nrm * 12582912.0f;
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
-          auto vecTmp
-            = vector_load<Float, N>(ghost[2 * dim + dir] + parity_offset * length_ghost, i * faceVolumeCB[dim] + x);
+          auto vecTmp = vector_load<Float, N>(ghost[2 * dim + dir] + parity * faceVolumeCB[dim] * length_ghost,
+                                              i * faceVolumeCB[dim] + x);
           copy_and_scale(v + i * N, vecTmp, nrm, nrm_shift);
         }
 
         if constexpr (Nrem > 0) { // now load any remainder
           auto vecTmp = vector_load<Float, Nrem>(
-            ghost[2 * dim + dir] + parity_offset * length_ghost + faceVolumeCB[dim] * M * N, x);
+            ghost[2 * dim + dir] + parity * faceVolumeCB[dim] * length_ghost + faceVolumeCB[dim] * M * N, x);
           copy_and_scale(v + M * N, vecTmp, nrm, nrm_shift);
         }
 
@@ -1058,7 +1056,6 @@ namespace quda
           v[2 * i + 1] = in[i].imag();
         }
 
-        const auto parity_offset = use_parity_mask ? (parity & faceVolumeCB[dim]) : (parity * faceVolumeCB[dim]);
         norm_type scale = 0.0;
         norm_type scale_inv = 0.0;
         if constexpr (isFixed<Float>::value) {
@@ -1069,7 +1066,7 @@ namespace quda
             max_[i] = fmaxf((norm_type)fabsf((norm_type)v[i]), (norm_type)fabsf((norm_type)v[i + length_ghost / 2]));
 #pragma unroll
           for (int i = 0; i < length_ghost / 2; i++) scale = fmaxf(max_[i], scale);
-          ghost_norm[2 * dim + dir][parity_offset + x] = scale * fixedInvMaxValue<Float>::value;
+          ghost_norm[2 * dim + dir][parity * faceVolumeCB[dim] + x] = scale * fixedInvMaxValue<Float>::value;
           scale_inv = fdividef(fixedMaxValue<Float>::value, scale);
         }
 
@@ -1079,14 +1076,15 @@ namespace quda
           // first do scalar copy converting into storage type
           copy_and_scale<Float, real, N>(vecTmp, v + i * N, scale_inv);
           // second do vectorized copy into memory
-          vector_store(ghost[2 * dim + dir] + parity_offset * length_ghost, i * faceVolumeCB[dim] + x, vecTmp);
+          vector_store(ghost[2 * dim + dir] + parity * faceVolumeCB[dim] * length_ghost, i * faceVolumeCB[dim] + x,
+                       vecTmp);
         }
 
         if constexpr (Nrem > 0) { // now load any remainder
           array<Float, Nrem> vecTmp;
           copy_and_scale<Float, real, Nrem>(vecTmp, v + M * N, scale_inv);
-          vector_store<Float, Nrem>(ghost[2 * dim + dir] + parity_offset * length_ghost + faceVolumeCB[dim] * M * N, x,
-                                    vecTmp);
+          vector_store<Float, Nrem>(
+            ghost[2 * dim + dir] + parity * faceVolumeCB[dim] * length_ghost + faceVolumeCB[dim] * M * N, x, vecTmp);
         }
       }
 
@@ -1118,15 +1116,14 @@ namespace quda
        pointer arithmetic for huge allocations (e.g., packed set of
        vectors).  Default is to use 32-bit pointer arithmetic.
      */
-    template <typename Float, int Ns, int Nc, bool spin_project = false, bool huge_alloc = false,
-              bool disable_ghost = false, bool use_parity_mask = false>
-    struct FloatNOrder : GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost, use_parity_mask> {
+    template <typename Float, int Ns, int Nc, bool spin_project = false, bool huge_alloc = false, bool disable_ghost = false>
+    struct FloatNOrder : GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost> {
       static constexpr int length = 2 * Ns * Nc;
       static constexpr int N = colorspinor::get_vector_order<Float>(length);
       static constexpr int M = length / N;
       static constexpr int Nrem = length - M * N;
-      using Accessor = FloatNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost, use_parity_mask>;
-      using GhostNOrder = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost, use_parity_mask>;
+      using Accessor = FloatNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost>;
+      using GhostNOrder = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost>;
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       using AllocInt = typename AllocType<huge_alloc>::type;
@@ -1168,22 +1165,20 @@ namespace quda
         auto norm_offset = offset / (sizeof(Float) < sizeof(float) ? sizeof(norm_type) / sizeof(Float) : 1);
         auto norm = reinterpret_cast<float *>(field + volumeCB * (2 * Nc * Ns)); // FIXME - optimize 64-bit indexing here
 #endif
-        const auto parity_norm_offset = use_parity_mask ? (parity & norm_offset) : (parity * norm_offset);
-        const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
-        norm_type nrm = isFixed<Float>::value ? vector_load<float, 1>(norm, x + parity_norm_offset)[0] : 0.0;
+        norm_type nrm = isFixed<Float>::value ? vector_load<float, 1>(norm, x + parity * norm_offset)[0] : 0.0;
         norm_type nrm_shift = -nrm * 12582912.0f;
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
           // first load from memory
-          auto vecTmp = vector_load<Float, N>(field, parity_offset, volumeCB * i + x);
+          auto vecTmp = vector_load<Float, N>(field, parity * offset, volumeCB * i + x);
           // now copy into output and scale
           copy_and_scale(v + i * N, vecTmp, nrm, nrm_shift);
         }
 
         // now load any remainder
         if constexpr (Nrem > 0) {
-          auto vecTmp = vector_load<Float, Nrem>(field, parity_offset + volumeCB * M * N, x);
+          auto vecTmp = vector_load<Float, Nrem>(field, parity * offset + volumeCB * M * N, x);
           copy_and_scale(v + M * N, vecTmp, nrm, nrm_shift);
         }
 
@@ -1197,15 +1192,13 @@ namespace quda
         auto norm_offset = offset / (sizeof(Float) < sizeof(float) ? sizeof(norm_type) / sizeof(Float) : 1);
         auto norm = reinterpret_cast<float *>(field + volumeCB * (2 * Nc * Ns));
 #endif
-        const auto parity_norm_offset = use_parity_mask ? (parity & norm_offset) : (parity * norm_offset);
-        const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
-        if constexpr (isFixed<Float>::value) prefetch_cache_line(norm + (x + parity_norm_offset));
+        if constexpr (isFixed<Float>::value) prefetch_cache_line(norm + (x + parity * norm_offset));
 
 #pragma unroll
-        for (int i = 0; i < M; i++) prefetch_cache_line(field + (parity_offset + (volumeCB * i + x) * N));
+        for (int i = 0; i < M; i++) prefetch_cache_line(field + (parity * offset + (volumeCB * i + x) * N));
 
         // now load any remainder
-        if constexpr (Nrem > 0) prefetch_cache_line(field + (parity_offset + volumeCB * M * N + x * Nrem));
+        if constexpr (Nrem > 0) prefetch_cache_line(field + (parity * offset + volumeCB * M * N + x * Nrem));
       }
 
       __device__ __host__ inline void save(const complex in[length / 2], int x, int parity = 0) const
@@ -1221,8 +1214,6 @@ namespace quda
           v[2 * i + 1] = in[i].imag();
         }
 
-        const auto parity_norm_offset = use_parity_mask ? (parity & norm_offset) : (parity * norm_offset);
-        const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
         norm_type scale = 0.0;
         norm_type scale_inv = 0.0;
         if constexpr (isFixed<Float>::value) {
@@ -1233,7 +1224,7 @@ namespace quda
             max_[i] = fmaxf(fabsf((norm_type)v[i]), fabsf((norm_type)v[i + length / 2]));
 #pragma unroll
           for (int i = 0; i < length / 2; i++) scale = fmaxf(max_[i], scale);
-          norm[x + parity_norm_offset] = scale * fixedInvMaxValue<Float>::value;
+          norm[x + parity * norm_offset] = scale * fixedInvMaxValue<Float>::value;
           scale_inv = fdividef(fixedMaxValue<Float>::value, scale);
         }
 
@@ -1243,14 +1234,14 @@ namespace quda
           // first do scalar copy converting into storage type
           copy_and_scale<Float, real, N>(vecTmp, v + i * N, scale_inv);
           // second do vectorized copy into memory
-          vector_store(field, parity_offset, volumeCB * i + x, vecTmp);
+          vector_store(field, parity * offset, volumeCB * i + x, vecTmp);
         }
 
         if constexpr (Nrem > 0) {
           array<Float, Nrem> vecTmp;
           copy_and_scale<Float, real, Nrem>(vecTmp, v + M * N, scale_inv);
           // second do vectorized copy into memory
-          vector_store(field, parity_offset + volumeCB * M * N, x, vecTmp);
+          vector_store(field, parity * offset + volumeCB * M * N, x, vecTmp);
         }
       }
 
@@ -1271,13 +1262,12 @@ namespace quda
       size_t Bytes() const { return offset * 2ll * sizeof(Float) * N; }
     };
 
-    template <bool spin_project, bool huge_alloc, bool use_parity_mask>
-    struct GhostNOrder<short, 1, 3, spin_project, huge_alloc, false, use_parity_mask> {
+    template <bool spin_project, bool huge_alloc> struct GhostNOrder<short, 1, 3, spin_project, huge_alloc, false> {
       using Float = short;
       static constexpr int Ns = 1;
       static constexpr int Nc = 3;
       static constexpr int length_ghost = 2 * Ns * Nc;
-      using Accessor = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, false, use_parity_mask>;
+      using Accessor = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc>;
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       using norm_type = float;
@@ -1308,8 +1298,7 @@ namespace quda
       __device__ __host__ inline void loadGhost(complex out[length_ghost / 2], int x, int dim, int dir, int parity = 0) const
       {
         real v[length_ghost];
-        const auto parity_offset = use_parity_mask ? (parity & faceVolumeCB[dim]) : (parity * faceVolumeCB[dim]);
-        auto vecTmp = vector_load<Float, 8>(ghost[2 * dim + dir], parity_offset + x);
+        auto vecTmp = vector_load<Float, 8>(ghost[2 * dim + dir], parity * faceVolumeCB[dim] + x);
 
         // extract the norm
         norm_type nrm;
@@ -1353,8 +1342,7 @@ namespace quda
         array<Float, 6> vecTmp2;
         copy_and_scale<Float, real, 6>(vecTmp2, v, scale_inv);
         memcpy(&vecTmp, &vecTmp2, sizeof(vecTmp2));
-        const auto parity_offset = use_parity_mask ? (parity & faceVolumeCB[dim]) : (parity * faceVolumeCB[dim]);
-        vector_store(ghost[2 * dim + dir], parity_offset + x, vecTmp);
+        vector_store(ghost[2 * dim + dir], parity * faceVolumeCB[dim] + x, vecTmp);
       }
 
       /**
@@ -1384,15 +1372,15 @@ namespace quda
        pointer arithmetic for huge allocations (e.g., packed set of
        vectors).  Default is to use 32-bit pointer arithmetic.
      */
-    template <bool spin_project, bool huge_alloc, bool disable_ghost, bool use_parity_mask>
-    struct FloatNOrder<short, 1, 3, spin_project, huge_alloc, disable_ghost, use_parity_mask>
-      : GhostNOrder<short, 1, 3, spin_project, huge_alloc, disable_ghost, use_parity_mask> {
+    template <bool spin_project, bool huge_alloc, bool disable_ghost>
+    struct FloatNOrder<short, 1, 3, spin_project, huge_alloc, disable_ghost>
+      : GhostNOrder<short, 1, 3, spin_project, huge_alloc, disable_ghost> {
       using Float = short;
       static constexpr int Ns = 1;
       static constexpr int Nc = 3;
       static constexpr int length = 2 * Ns * Nc;
-      using Accessor = FloatNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost, use_parity_mask>;
-      using GhostNOrder = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost, use_parity_mask>;
+      using Accessor = FloatNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost>;
+      using GhostNOrder = GhostNOrder<Float, Ns, Nc, spin_project, huge_alloc, disable_ghost>;
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       using AllocInt = typename AllocType<huge_alloc>::type;
@@ -1417,8 +1405,7 @@ namespace quda
       __device__ __host__ inline void load(complex out[length / 2], int x, int parity = 0) const
       {
         real v[length];
-        const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
-        auto vecTmp = vector_load<Float, 8>(field, parity_offset + x);
+        auto vecTmp = vector_load<Float, 8>(field, parity * offset + x);
 
         // extract the norm
         norm_type nrm;
@@ -1463,8 +1450,7 @@ namespace quda
         copy_and_scale<Float, real, 6>(vecTmp2, v, scale_inv);
         memcpy(&vecTmp, &vecTmp2, sizeof(vecTmp2));
 
-        const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
-        vector_store(field, parity_offset + x, vecTmp);
+        vector_store(field, parity * offset + x, vecTmp);
       }
 
       /**
@@ -1797,10 +1783,9 @@ namespace quda
 
   } // namespace colorspinor
 
-  template <typename T, int Ns, int Nc, bool project = false, bool huge_alloc = false, bool disable_ghost = false,
-            bool use_parity_mask = false>
+  template <typename T, int Ns, int Nc, bool project = false, bool huge_alloc = false, bool disable_ghost = false>
   struct colorspinor_mapper {
-    typedef colorspinor::FloatNOrder<T, Ns, Nc, project, huge_alloc, disable_ghost, use_parity_mask> type;
+    typedef colorspinor::FloatNOrder<T, Ns, Nc, project, huge_alloc, disable_ghost> type;
   };
 
   template <typename T, QudaFieldOrder order, int Ns, int Nc> struct colorspinor_order_mapper {
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index d4689d3a7b..dba8a4900d 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1553,12 +1553,11 @@ namespace quda {
         }
       }
 
-      template <typename Float, int length_, QudaReconstructType recon, QudaStaggeredPhase stag_phase = QUDA_STAGGERED_PHASE_NO,
-                bool huge_alloc = default_huge_alloc, QudaGhostExchange ghostExchange_ = QUDA_GHOST_EXCHANGE_INVALID,
-                bool use_inphase = false, bool shifted = false, bool use_parity_mask = false>
+      template <typename Float, int length_, QudaReconstructType recon,
+                QudaStaggeredPhase stag_phase = QUDA_STAGGERED_PHASE_NO, bool huge_alloc = default_huge_alloc,
+                QudaGhostExchange ghostExchange_ = QUDA_GHOST_EXCHANGE_INVALID, bool use_inphase = false, bool shifted = false>
       struct FloatNOrder {
-        using Accessor
-          = FloatNOrder<Float, length_, recon, stag_phase, huge_alloc, ghostExchange_, use_inphase, shifted, use_parity_mask>;
+        using Accessor = FloatNOrder<Float, length_, recon, stag_phase, huge_alloc, ghostExchange_, use_inphase, shifted>;
 
         using store_t = Float;
         static constexpr int length = length_;
@@ -1632,27 +1631,26 @@ namespace quda {
       __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real phase = 1.0) const
       {
         real tmp[reconLen];
-        const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
           // first load from memory
-          auto vecTmp = vector_load<Float, N>(gauge, parity_offset + dir * (M * N + Nrem) * stride, i * stride + x);
+          auto vecTmp = vector_load<Float, N>(gauge, parity * offset + dir * (M * N + Nrem) * stride, i * stride + x);
           // second do copy converting into register type with combined scaling
           copy_and_scale(tmp + i * N, vecTmp, combined_scale, combined_shift);
         }
 
         // now load any remainder
         if constexpr (Nrem > 0) {
-          auto vecTmp = vector_load<Float, Nrem>(gauge, parity_offset + (dir * (M * N + Nrem) + M * N) * stride, x);
+          auto vecTmp = vector_load<Float, Nrem>(gauge, parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x);
           copy_and_scale(tmp + M * N, vecTmp, combined_scale, combined_shift);
         }
 
         if constexpr (loadPhase) {
           if constexpr (isFixed<Float>::value) {
-            copy_and_scale(phase, gauge[parity_offset + phaseOffset + stride * dir + x], phase_scale, phase_shift);
+            copy_and_scale(phase, gauge[parity * offset + phaseOffset + stride * dir + x], phase_scale, phase_shift);
           } else {
-            copy(phase, gauge[parity_offset + phaseOffset + stride * dir + x]);
+            copy(phase, gauge[parity * offset + phaseOffset + stride * dir + x]);
             phase *= static_cast<real>(2.0);
           }
         }
@@ -1663,39 +1661,36 @@ namespace quda {
       template <int type> __device__ inline void prefetch(int x, int dir, int parity, int block_size = 0) const
       {
         if constexpr (type == 0) { // use per-thread prefetching
-          const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
 #pragma unroll
           for (int i = 0; i < M; i++)
-            prefetch_cache_line(gauge + (parity_offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N));
+            prefetch_cache_line(gauge + (parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N));
 
           // now load any remainder
           if constexpr (Nrem > 0)
-            prefetch_cache_line(gauge + (parity_offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem));
+            prefetch_cache_line(gauge + (parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem));
 
-          if constexpr (loadPhase) prefetch_cache_line(gauge + (parity_offset + phaseOffset + stride * dir + x));
+          if constexpr (loadPhase) prefetch_cache_line(gauge + (parity * offset + phaseOffset + stride * dir + x));
         } else if constexpr (type == 1) { // bulk prefetch
           if (block_size == 0) block_size = blockDim.x;
           if (target::is_thread_zero()) {
-            const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
 #pragma unroll
             for (int i = 0; i < M; i++)
-              prefetch_cache_bulk(gauge + (parity_offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N),
+              prefetch_cache_bulk(gauge + (parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N),
                                   block_size * N * sizeof(Float));
 
             // now load any remainder
             if constexpr (Nrem > 0)
-              prefetch_cache_bulk(gauge + (parity_offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem),
+              prefetch_cache_bulk(gauge + (parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem),
                                   block_size * Nrem * sizeof(Float));
 
             if constexpr (loadPhase)
-              prefetch_cache_bulk(gauge + (parity_offset + phaseOffset + stride * dir + x), block_size * sizeof(Float));
+              prefetch_cache_bulk(gauge + (parity * offset + phaseOffset + stride * dir + x), block_size * sizeof(Float));
           }
         } else { // n-d tensor prefetch
           if (target::is_thread_zero()) {
-            const auto parity_idx = use_parity_mask ? (parity & 1) : parity;
-            prefetch_cache_tensor_5d(tensor_desc.N, x, x / 16, 0, dir, parity_idx);
-            if constexpr (Nrem > 0) prefetch_cache_tensor_4d(tensor_desc.Nrem, x, x / 16, dir, parity_idx);
-            if constexpr (loadPhase) prefetch_cache_tensor_4d(tensor_desc.phase, x, x / 16, dir, parity_idx);
+            prefetch_cache_tensor_5d(tensor_desc.N, x, x / 16, 0, dir, parity);
+            if constexpr (Nrem > 0) prefetch_cache_tensor_4d(tensor_desc.Nrem, x, x / 16, dir, parity);
+            if constexpr (loadPhase) prefetch_cache_tensor_4d(tensor_desc.phase, x, x / 16, dir, parity);
           }
         }
       }
@@ -1704,7 +1699,6 @@ namespace quda {
       {
         real tmp[reconLen];
         reconstruct.Pack(tmp, v);
-        const auto parity_offset = use_parity_mask ? (parity & offset) : (parity * offset);
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
@@ -1713,7 +1707,7 @@ namespace quda {
 #pragma unroll
           for (int j = 0; j < N; j++) copy(vecTmp[j], tmp[i * N + j]);
           // second do vectorized copy into memory
-          vector_store(gauge, parity_offset + dir * (M * N + Nrem) * stride, x + i * stride, vecTmp);
+          vector_store(gauge, parity * offset + dir * (M * N + Nrem) * stride, x + i * stride, vecTmp);
         }
 
         // now save any remainder
@@ -1722,12 +1716,12 @@ namespace quda {
 #pragma unroll
           for (int j = 0; j < Nrem; j++) copy(vecTmp[j], tmp[M * N + j]);
           // second do vectorized copy into memory
-          vector_store(gauge, parity_offset + (dir * (M * N + Nrem) + M * N) * stride, x, vecTmp);
+          vector_store(gauge, parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x, vecTmp);
         }
 
         if constexpr (hasPhase) {
           real phase = reconstruct.getPhase(v);
-          copy(gauge[parity_offset + phaseOffset + dir * stride + x], static_cast<real>(0.5) * phase);
+          copy(gauge[parity * offset + phaseOffset + dir * stride + x], static_cast<real>(0.5) * phase);
         }
       }
 
@@ -1753,12 +1747,11 @@ namespace quda {
           // This also works perfectly when phases are stored. No need to change this.
         } else {
           real tmp[reconLen];
-          const auto parity_offset = use_parity_mask ? (parity & faceVolumeCB[dir]) : (parity * faceVolumeCB[dir]);
 
 #pragma unroll
           for (int i = 0; i < M; i++) {
             // first do vectorized copy from memory into registers
-            auto vecTmp = vector_load<Float, N>(ghost[dir], (i * 2) * faceVolumeCB[dir] + parity_offset + x);
+            auto vecTmp = vector_load<Float, N>(ghost[dir], (i * 2 + parity) * faceVolumeCB[dir] + x);
 
             // second do copy converting into register type with combined scaling
             copy_and_scale(tmp + i * N, vecTmp, combined_scale, combined_shift);
@@ -1766,7 +1759,8 @@ namespace quda {
 
           // now load any remainder
           if constexpr (Nrem > 0) {
-            auto vecTmp = vector_load<Float, Nrem>(ghost[dir], 2 * faceVolumeCB[dir] * M * N, parity_offset + x);
+            auto vecTmp
+              = vector_load<Float, Nrem>(ghost[dir], 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x);
             copy_and_scale(tmp + M * N, vecTmp, combined_scale, combined_shift);
           }
 
@@ -1777,10 +1771,10 @@ namespace quda {
             //   phase = inphase < static_cast<real>(0) ? static_cast<real>(-0.5) : static_cast<real>(0.5);
             // } else {
             if constexpr (isFixed<Float>::value) {
-              copy_and_scale(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity_offset + x],
+              copy_and_scale(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x],
                              phase_scale, phase_shift);
             } else {
-              copy(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity_offset + x]);
+              copy(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x]);
               phase *= static_cast<real>(2.0);
             }
             // }
@@ -1796,7 +1790,6 @@ namespace quda {
         } else {
           real tmp[reconLen];
           reconstruct.Pack(tmp, v);
-          const auto parity_offset = use_parity_mask ? (parity & faceVolumeCB[dir]) : (parity * faceVolumeCB[dir]);
 
 #pragma unroll
           for (int i = 0; i < M; i++) {
@@ -1805,7 +1798,7 @@ namespace quda {
 #pragma unroll
             for (int j = 0; j < N; j++) copy(vecTmp[j], tmp[i * N + j]);
             // second do vectorized copy into memory
-            vector_store(ghost[dir], (i * 2) * faceVolumeCB[dir] + parity_offset + x, vecTmp);
+            vector_store(ghost[dir], (i * 2 + parity) * faceVolumeCB[dir] + x, vecTmp);
           }
 
           // now save any remainder
@@ -1814,12 +1807,13 @@ namespace quda {
 #pragma unroll
             for (int j = 0; j < Nrem; j++) copy(vecTmp[j], tmp[M * N + j]);
             // second do vectorized copy into memory
-            vector_store(ghost[dir], 2 * faceVolumeCB[dir] * M * N, parity_offset + x, vecTmp);
+            vector_store(ghost[dir], 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x, vecTmp);
           }
 
           if constexpr (hasPhase) {
             real phase = reconstruct.getPhase(v);
-            copy(ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity_offset + x], static_cast<real>(0.5) * phase);
+            copy(ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x],
+                 static_cast<real>(0.5) * phase);
           }
         }
       }
@@ -1860,13 +1854,12 @@ namespace quda {
                                                   int g, int parity, const int R[]) const
       {
         real tmp[reconLen];
-        const auto parity_idx = use_parity_mask ? (parity & 1) : parity;
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
           // first do vectorized copy from memory
           auto vecTmp = vector_load<Float, N>(ghost[dim], dir * reconLen * 2 * geometry * R[dim] * faceVolumeCB[dim],
-                                              ((i * 2 + parity_idx) * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
+                                              ((i * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
 
           // second do copy converting into register type with combined scaling
           copy_and_scale(tmp + i * N, vecTmp, combined_scale, combined_shift);
@@ -1876,7 +1869,7 @@ namespace quda {
         if constexpr (Nrem > 0) {
           auto vecTmp
             = vector_load<Float, Nrem>(ghost[dim], (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim],
-                                       (parity_idx * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
+                                       (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
 
           copy_and_scale(tmp + M * N, vecTmp, combined_scale, combined_shift);
         }
@@ -1886,12 +1879,12 @@ namespace quda {
           if constexpr (isFixed<Float>::value) {
             copy_and_scale(phase,
                            ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim]
-                                      + (parity_idx * geometry + g) * R[dim] * faceVolumeCB[dim] + x],
+                                      + (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x],
                            phase_scale, phase_shift);
           } else {
             copy(phase,
                  ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim]
-                            + (parity_idx * geometry + g) * R[dim] * faceVolumeCB[dim] + x]);
+                            + (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x]);
             phase *= static_cast<real>(2.0);
           }
         }
@@ -1905,7 +1898,6 @@ namespace quda {
       {
         real tmp[reconLen];
         reconstruct.Pack(tmp, v);
-        const auto parity_idx = use_parity_mask ? (parity & 1) : parity;
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
@@ -1915,7 +1907,7 @@ namespace quda {
           for (int j = 0; j < N; j++) copy(vecTmp[j], tmp[i * N + j]);
           // second do vectorized copy to memory
           vector_store(ghost[dim], dir * reconLen * 2 * geometry * R[dim] * faceVolumeCB[dim],
-                       ((i * 2 + parity_idx) * geometry + g) * R[dim] * faceVolumeCB[dim] + x, vecTmp);
+                       ((i * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] + x, vecTmp);
         }
 
         // now save any remainder
@@ -1925,13 +1917,13 @@ namespace quda {
           for (int j = 0; j < Nrem; j++) copy(vecTmp[j], tmp[M * N + j]);
           // second do vectorized copy into memory
           vector_store(ghost[dim], (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim],
-                       (parity_idx * geometry + g) * R[dim] * faceVolumeCB[dim] + x, vecTmp);
+                       (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x, vecTmp);
         }
 
         if constexpr (hasPhase) {
           real phase = reconstruct.getPhase(v);
           copy(ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim]
-                          + (parity_idx * geometry + g) * R[dim] * faceVolumeCB[dim] + x],
+                          + (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x],
                static_cast<real>(0.5) * phase);
         }
       }
@@ -2530,22 +2522,20 @@ namespace quda {
 
   template <typename T, QudaReconstructType recon, int N = 18, QudaStaggeredPhase stag = QUDA_STAGGERED_PHASE_NO,
             bool huge_alloc = gauge::default_huge_alloc, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_INVALID,
-            bool use_inphase = false, QudaGaugeFieldOrder order = QUDA_NATIVE_GAUGE_ORDER, bool shifted = false,
-            bool use_parity_mask = false>
+            bool use_inphase = false, QudaGaugeFieldOrder order = QUDA_NATIVE_GAUGE_ORDER, bool shifted = false>
   struct gauge_mapper {
-    typedef gauge::FloatNOrder<T, N, recon, stag, huge_alloc, ghostExchange, use_inphase, shifted, use_parity_mask> type;
+    typedef gauge::FloatNOrder<T, N, recon, stag, huge_alloc, ghostExchange, use_inphase, shifted> type;
   };
 
   template <typename T, QudaReconstructType recon, int N, QudaStaggeredPhase stag, bool huge_alloc,
-            QudaGhostExchange ghostExchange, bool use_inphase, bool shifted, bool use_parity_mask>
-  struct gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_MILC_GAUGE_ORDER, shifted,
-                      use_parity_mask> {
+            QudaGhostExchange ghostExchange, bool use_inphase, bool shifted>
+  struct gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_MILC_GAUGE_ORDER, shifted> {
     typedef gauge::MILCOrder<T, N> type;
   };
 
   template <typename T, QudaReconstructType recon, int N, QudaStaggeredPhase stag, bool huge_alloc,
-            QudaGhostExchange ghostExchange, bool use_inphase, bool shifted, bool use_parity_mask>
-  struct gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_QDP_GAUGE_ORDER, shifted, use_parity_mask> {
+            QudaGhostExchange ghostExchange, bool use_inphase, bool shifted>
+  struct gauge_mapper<T, recon, N, stag, huge_alloc, ghostExchange, use_inphase, QUDA_QDP_GAUGE_ORDER, shifted> {
     typedef gauge::QDPOrder<T, N> type;
   };
 
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index e88473c503..e5fa03d75c 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -23,12 +23,9 @@ namespace quda
     static constexpr int nSpin = 1;
     static constexpr bool spin_project = false;
     static constexpr bool spinor_direct_load = false; // false means texture load
-    static constexpr bool use_parity_mask = true;
-    using F =
-      typename colorspinor_mapper<Float, nSpin, nColor, spin_project, spinor_direct_load, true, use_parity_mask>::type;
+    using F = typename colorspinor_mapper<Float, nSpin, nColor, spin_project, spinor_direct_load, true>::type;
 
-    using Ghost =
-      typename colorspinor::GhostNOrder<Float, nSpin, nColor, spin_project, spinor_direct_load, false, use_parity_mask>;
+    using Ghost = typename colorspinor::GhostNOrder<Float, nSpin, nColor, spin_project, spinor_direct_load, false>;
 
     static constexpr QudaReconstructType reconstruct_u = reconstruct_u_;
     static constexpr QudaReconstructType reconstruct_l = reconstruct_l_;
@@ -38,10 +35,10 @@ namespace quda
     static constexpr QudaStaggeredPhase phase = phase_;
     template <bool shifted>
     using GU = typename gauge_mapper<Float, reconstruct_u, 18, phase, gauge_direct_load, ghost, use_inphase,
-                                     QUDA_NATIVE_GAUGE_ORDER, shifted, use_parity_mask>::type;
+                                     QUDA_NATIVE_GAUGE_ORDER, shifted>::type;
     template <bool shifted>
     using GL = typename gauge_mapper<Float, reconstruct_l, 18, QUDA_STAGGERED_PHASE_NO, gauge_direct_load, ghost,
-                                     use_inphase, QUDA_NATIVE_GAUGE_ORDER, shifted, use_parity_mask>::type;
+                                     use_inphase, QUDA_NATIVE_GAUGE_ORDER, shifted>::type;
 
     F out[MAX_MULTI_RHS];  /** output vector field */
     F in[MAX_MULTI_RHS];   /** input vector field */
@@ -107,15 +104,6 @@ namespace quda
       x_cb = (Arg::nDim == 5 ? x_cb % arg.dc.volume_4d_cb : x_cb);
 
       int dim2 = step / 4;
-      // Compute opposite parity/mask depending on optimization mode
-      int opposite_parity;
-      if constexpr (Arg::use_parity_mask) {
-        const int parity_orig = (parity == 0) ? 0 : 1;
-        opposite_parity = -(1 - parity_orig);
-      } else {
-        opposite_parity = 1 - parity;
-      }
-
       switch (step % 4) {
       case 0: arg.U.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
       case 1: arg.L.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
@@ -124,10 +112,10 @@ namespace quda
       case 3: arg.Lback.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
 #else
       case 2:
-        arg.U.prefetch<Arg::prefetch_tma>(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, opposite_parity);
+        arg.U.prefetch<Arg::prefetch_tma>(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity);
         break;
       case 3:
-        arg.L.prefetch<Arg::prefetch_tma>(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, opposite_parity);
+        arg.L.prefetch<Arg::prefetch_tma>(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity);
         break;
 #endif
       }
@@ -150,21 +138,7 @@ namespace quda
   {
     typedef typename mapper<typename Arg::Float>::type real;
     typedef Matrix<complex<real>, Arg::nColor> Link;
-    // Note: parity parameter is already converted to mask if use_parity_mask is true
-    // Compute their_spinor_parity and one_minus_parity in same format (mask or original)
-    int their_spinor_parity;
-    int one_minus_parity;
-    if constexpr (Arg::use_parity_mask) {
-      // parity is the parity mask (-parity_orig), recover original parity for logic operations
-      const int parity_orig = (parity == 0) ? 0 : 1;
-      their_spinor_parity = (arg.nParity == 2) ? 1 - parity_orig : 0;
-      their_spinor_parity = -their_spinor_parity; // convert to mask
-      one_minus_parity = -(1 - parity_orig);
-    } else {
-      // parity is the original parity value
-      their_spinor_parity = (arg.nParity == 2) ? 1 - parity : 0;
-      one_minus_parity = 1 - parity;
-    }
+    const int their_spinor_parity = (arg.nParity == 2) ? 1 - parity : 0;
 
     Coord coord1 = coord;
     if constexpr (arg.improved) { // need to compute 1-hop in_boundary
@@ -246,8 +220,8 @@ namespace quda
           const Link U = arg.improved ? arg.Uback(d, coord.x_cb, parity) :
                                         arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg));
 #else
-          const Link U = arg.improved ? arg.U.Ghost(d, ghost_idx2, one_minus_parity) :
-                                        arg.U.Ghost(d, ghost_idx2, one_minus_parity, StaggeredPhase(coord, d, -1, arg));
+          const Link U = arg.improved ? arg.U.Ghost(d, ghost_idx2, 1 - parity) :
+            arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg));
 #endif
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
@@ -264,8 +238,8 @@ namespace quda
                                           arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg));
 #else
             const int gauge_idx = back_idx;
-            const Link U = arg.improved ? arg.U(d, gauge_idx, one_minus_parity) :
-                                          arg.U(d, gauge_idx, one_minus_parity, StaggeredPhase(coord, d, -1, arg));
+            const Link U = arg.improved ? arg.U(d, gauge_idx, 1 - parity) :
+                                          arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg));
 #endif
 #pragma unroll
             for (auto s = 0; s < n_src_tile; s++) {
@@ -285,7 +259,7 @@ namespace quda
 #ifdef QUDA_DSLASH_DOUBLE_STORE
           const Link L = arg.Lback(d, coord.x_cb, parity);
 #else
-          const Link L = arg.L.Ghost(d, ghost_idx, one_minus_parity);
+          const Link L = arg.L.Ghost(d, ghost_idx, 1 - parity);
 #endif
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
@@ -302,7 +276,7 @@ namespace quda
             const Link L = arg.Lback(d, coord.x_cb, parity);
 #else
             const int gauge_idx = back3_idx;
-            const Link L = arg.L(d, gauge_idx, one_minus_parity);
+            const Link L = arg.L(d, gauge_idx, 1 - parity);
 #endif
 #pragma unroll
             for (auto s = 0; s < n_src_tile; s++) {
@@ -333,22 +307,17 @@ namespace quda
         = mykernel_type == EXTERIOR_KERNEL_ALL ? false : true; // is thread active (non-trival for fused kernel only)
       int thread_dim;                                        // which dimension is thread working on (fused kernel only)
       auto coord = getCoords<QUDA_4D_PC, mykernel_type, Arg>(arg, idx, 0, parity, thread_dim);
-      int my_spinor_parity = arg.nParity == 2 ? parity : 0;
-      // Convert to parity mask for optimized indexing if enabled
-      int parity_for_load = Arg::use_parity_mask ? -parity : parity;
-      int my_spinor_parity_for_load = Arg::use_parity_mask ? -my_spinor_parity : my_spinor_parity;
+      const int my_spinor_parity = arg.nParity == 2 ? parity : 0;
 
       array<Vector, n_src_tile> out;
       if (arg.dd_out.isZero(coord)) {
         if (mykernel_type != EXTERIOR_KERNEL_ALL || active)
 #pragma unroll
-          for (auto s = 0; s < n_src_tile; s++) {
-            arg.out[src_idx + s](coord.x_cb, my_spinor_parity_for_load) = out[s];
-          }
+          for (auto s = 0; s < n_src_tile; s++) { arg.out[src_idx + s](coord.x_cb, my_spinor_parity) = out[s]; }
         return;
       }
 
-      applyStaggered<mykernel_type, n_src_tile>(out, arg, coord, parity_for_load, idx, thread_dim, active, src_idx);
+      applyStaggered<mykernel_type, n_src_tile>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
 
 #pragma unroll
       for (auto s = 0; s < n_src_tile; s++) out[s] *= arg.dagger_scale;
@@ -358,19 +327,19 @@ namespace quda
       } else if (xpay && mykernel_type == INTERIOR_KERNEL) {
 #pragma unroll
         for (auto s = 0; s < n_src_tile; s++) {
-          Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity_for_load);
+          Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity);
           out[s] = axpy(arg.a, x, -out[s]);
         }
       } else if (mykernel_type != INTERIOR_KERNEL) {
 #pragma unroll
         for (auto s = 0; s < n_src_tile; s++) {
-          Vector x = arg.out[src_idx + s](coord.x_cb, my_spinor_parity_for_load);
+          Vector x = arg.out[src_idx + s](coord.x_cb, my_spinor_parity);
           out[s] = xpay ? x - out[s] : x + out[s];
         }
       }
       if (mykernel_type != EXTERIOR_KERNEL_ALL || active) {
 #pragma unroll
-        for (auto s = 0; s < n_src_tile; s++) { arg.out[src_idx + s](coord.x_cb, my_spinor_parity_for_load) = out[s]; }
+        for (auto s = 0; s < n_src_tile; s++) { arg.out[src_idx + s](coord.x_cb, my_spinor_parity) = out[s]; }
       }
     }
 

From 50cc09a5fa870b490e2e87a3fefbd1c63e730a43 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Sun, 16 Nov 2025 15:03:59 -0800
Subject: [PATCH 045/121] Optimize FFMA2 issuance

---
 include/complex_quda.h      | 4 ++--
 include/gauge_field_order.h | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/complex_quda.h b/include/complex_quda.h
index 3e21b7fe25..d0a686a6a9 100644
--- a/include/complex_quda.h
+++ b/include/complex_quda.h
@@ -928,14 +928,14 @@ namespace quda
   template <typename real> __host__ __device__ inline complex<real> cmul(const complex<real> &x, const complex<real> &y)
   {
     complex<real> rtn = mul2({x.real(), x.real()}, y);
-    return fma2({x.imag(), x.imag()}, {-y.imag(), y.real()}, rtn);
+    return fma2({-x.imag(), x.imag()}, {y.imag(), y.real()}, rtn);
   }
 
   template <typename real>
   __host__ __device__ inline complex<real> cmac(const complex<real> &x, const complex<real> &y, const complex<real> &z)
   {
     complex<real> w = fma2({x.real(), x.real()}, y, z);
-    return fma2({x.imag(), x.imag()}, {-y.imag(), y.real()}, w);
+    return fma2({-x.imag(), x.imag()}, {y.imag(), y.real()}, w);
   }
 
   template <typename T1, typename T2, typename T3>
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index dba8a4900d..b38f4de39c 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1265,9 +1265,9 @@ namespace quda {
             sincospi(static_cast<real>(3.0) * phase, &cos_sin[1], &cos_sin[0]);
             complex A(cos_sin[0], cos_sin[1]);
             A *= scale_inv;
-            out[6] = cmul(A, out[6]);
-            out[7] = cmul(A, out[7]);
-            out[8] = cmul(A, out[8]);
+            out[6] = cmul(out[6], A);
+            out[7] = cmul(out[7], A);
+            out[8] = cmul(out[8], A);
           } else { // phase is +/- 1 so real multiply is sufficient
             phase *= scale_inv;
             out[6] *= phase;

From 4c9fa83da9bd8c78fbb22e9e65b1b1ebb68ce220 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 17 Nov 2025 13:37:13 -0800
Subject: [PATCH 046/121] Add experiment with L1 prefetching for staggered
 dslash

---
 include/gauge_field_order.h          | 12 +++++++++-
 include/kernels/dslash_staggered.cuh | 35 ++++++++++++++++------------
 include/targets/cuda/inline_ptx.h    |  8 ++++++-
 include/targets/cuda/load_store.h    | 11 +++++++++
 include/targets/generic/load_store.h |  9 +++++++
 5 files changed, 58 insertions(+), 17 deletions(-)

diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index b38f4de39c..dc6dd85bbe 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1686,12 +1686,22 @@ namespace quda {
             if constexpr (loadPhase)
               prefetch_cache_bulk(gauge + (parity * offset + phaseOffset + stride * dir + x), block_size * sizeof(Float));
           }
-        } else { // n-d tensor prefetch
+        } else if constexpr (type == 2) { // n-d tensor prefetch
           if (target::is_thread_zero()) {
             prefetch_cache_tensor_5d(tensor_desc.N, x, x / 16, 0, dir, parity);
             if constexpr (Nrem > 0) prefetch_cache_tensor_4d(tensor_desc.Nrem, x, x / 16, dir, parity);
             if constexpr (loadPhase) prefetch_cache_tensor_4d(tensor_desc.phase, x, x / 16, dir, parity);
           }
+        } else { // L1 prefetching
+#pragma unroll
+          for (int i = 0; i < M; i++)
+            prefetch_L1_cache_line(gauge + (parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N));
+
+          // now load any remainder
+          if constexpr (Nrem > 0)
+            prefetch_L1_cache_line(gauge + (parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem));
+
+          if constexpr (loadPhase) prefetch_L1_cache_line(gauge + (parity * offset + phaseOffset + stride * dir + x));
         }
       }
 
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index e5fa03d75c..17224936ac 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -56,6 +56,7 @@ namespace quda
     const bool is_last_time_slice; /** are we on the last (global) time slice */
     static constexpr bool improved = improved_;
     static constexpr int prefetch_distance = QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED;
+    static constexpr int prefetch_distance_l1 = 1;
 
     const real dagger_scale;
 
@@ -89,39 +90,43 @@ namespace quda
      @param[in] parity Partiry that we are working on
      @param[in] arg Paramter struct
    */
-  template <class coord_t, class Arg>
+  template <int prefetch_type, int distance, class coord_t, class Arg>
   __device__ __host__ void prefetch(int dim, int dir, int hop, const coord_t &coord, const coord_t &coord1, int parity,
                                     const Arg &arg)
   {
-    if constexpr (arg.prefetch_distance == 0) return;
-
     if constexpr (arg.improved) {
-      int step = 4 * dim + 2 * dir + hop + arg.prefetch_distance;
+      int step = 4 * dim + 2 * dir + hop + distance;
       if (step >= 16) return;
 
       // if using a bulk prefetch we need to use block's first coordinate
-      auto x_cb = arg.prefetch_tma ? coord.x_cb_0 : coord.x_cb;
+      auto x_cb = (prefetch_type == 1 || prefetch_type == 2) ? coord.x_cb_0 : coord.x_cb;
       x_cb = (Arg::nDim == 5 ? x_cb % arg.dc.volume_4d_cb : x_cb);
 
       int dim2 = step / 4;
       switch (step % 4) {
-      case 0: arg.U.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
-      case 1: arg.L.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
+      case 0: arg.U.prefetch<prefetch_type>(x_cb, dim2, parity); break;
+      case 1: arg.L.prefetch<prefetch_type>(x_cb, dim2, parity); break;
 #ifdef QUDA_DSLASH_DOUBLE_STORE
-      case 2: arg.Uback.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
-      case 3: arg.Lback.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
+      case 2: arg.Uback.prefetch<prefetch_type>(x_cb, dim2, parity); break;
+      case 3: arg.Lback.prefetch<prefetch_type>(x_cb, dim2, parity); break;
 #else
-      case 2:
-        arg.U.prefetch<Arg::prefetch_tma>(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity);
-        break;
-      case 3:
-        arg.L.prefetch<Arg::prefetch_tma>(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity);
-        break;
+      case 2: arg.U.prefetch<prefetch_type>(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity); break;
+      case 3: arg.L.prefetch<prefetch_type>(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity); break;
 #endif
       }
     }
   }
 
+  template <class coord_t, class Arg>
+  __device__ __host__ void prefetch(int dim, int dir, int hop, const coord_t &coord, const coord_t &coord1, int parity,
+                                    const Arg &arg)
+  {
+    if constexpr (Arg::prefetch_distance_l1 > 0) // L1 prefetch
+      prefetch<3, Arg::prefetch_distance_l1>(dim, dir, hop, coord, coord1, parity, arg);
+    if constexpr (Arg::prefetch_distance > 0) // L2 prefetch
+      prefetch<Arg::prefetch_tma, Arg::prefetch_distance>(dim, dir, hop, coord, coord1, parity, arg);
+  };
+
   /**
      @brief Applies the off-diagonal part of the Staggered / Asqtad
      operator.
diff --git a/include/targets/cuda/inline_ptx.h b/include/targets/cuda/inline_ptx.h
index b5a94266ba..bd8b8a2021 100644
--- a/include/targets/cuda/inline_ptx.h
+++ b/include/targets/cuda/inline_ptx.h
@@ -476,7 +476,13 @@ namespace quda {
     asm("st.cs.global.v2.s16 [%0+0], {%1, %2};" :: __PTR(addr), "h"(x), "h"(y));
   }
 
-  __device__ __forceinline__ void prefetch_L1(const void *p) { asm volatile("prefetch.global.L1 [%0];" ::"l"(p)); }
+  __device__ inline void prefetch_L1(void *smem_ptr_, const void *gmem_ptr)
+  {
+    uint32_t smem_ptr = __cvta_generic_to_shared(smem_ptr_);
+    asm volatile("cp.async.ca.shared.global [%0], [%1], 4;\n" ::"r"(smem_ptr), "l"(gmem_ptr));
+  }
+
+  __device__ __forceinline__ void prefetch_L1(const void *p) { asm volatile("prefetch.global.L2 [%0];" ::"l"(p)); }
 
   __device__ __forceinline__ void prefetch_L2(const void *p) { asm volatile("prefetch.global.L2 [%0];" ::"l"(p)); }
 
diff --git a/include/targets/cuda/load_store.h b/include/targets/cuda/load_store.h
index 90c83ef59a..ddadf3bff0 100644
--- a/include/targets/cuda/load_store.h
+++ b/include/targets/cuda/load_store.h
@@ -164,6 +164,17 @@ namespace quda
     __device__ inline void operator()(const void *p) { prefetch_L2(p); }
   };
 
+  // pre-declaration of the prefetch_cache that we wish to specialize
+  template <bool> struct prefetch_L1_cache_line_imp;
+
+  template <> struct prefetch_L1_cache_line_imp<true> {
+    __device__ inline void operator()(const void *p)
+    {
+      static __shared__ float smem[1]; // dummy shared memory allocation
+      prefetch_L1(smem, p);
+    }
+  };
+
   // pre-declaration of the prefetch_cache that we wish to specialize
   template <bool> struct prefetch_cache_bulk_imp;
   template <bool> struct prefetch_cache_tensor_3d_imp;
diff --git a/include/targets/generic/load_store.h b/include/targets/generic/load_store.h
index 1562c9095f..355890c0d6 100644
--- a/include/targets/generic/load_store.h
+++ b/include/targets/generic/load_store.h
@@ -82,6 +82,15 @@ namespace quda
 
   __device__ __host__ inline void prefetch_cache_line(const void *p) { target::dispatch<prefetch_cache_line_imp>(p); }
 
+  template <bool is_device> struct prefetch_L1_cache_line_imp {
+    __device__ __host__ inline void operator()(const void *) { }
+  };
+
+  __device__ __host__ inline void prefetch_L1_cache_line(const void *p)
+  {
+    target::dispatch<prefetch_L1_cache_line_imp>(p);
+  }
+
   template <bool is_device> struct prefetch_cache_bulk_imp {
     constexpr void operator()(const void *, size_t) { }
   };

From 9daba3ff5fb7ecce874989d1171059c6c2f0bae9 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 20 Nov 2025 12:35:14 -0800
Subject: [PATCH 047/121] No bank conflicts when doing L1 prefetch

---
 include/targets/cuda/load_store.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/targets/cuda/load_store.h b/include/targets/cuda/load_store.h
index ddadf3bff0..d2d5fd3552 100644
--- a/include/targets/cuda/load_store.h
+++ b/include/targets/cuda/load_store.h
@@ -171,7 +171,9 @@ namespace quda
     __device__ inline void operator()(const void *p)
     {
       static __shared__ float smem[1]; // dummy shared memory allocation
-      prefetch_L1(smem, p);
+      auto tid = target::thread_idx_linear<dim>();
+      auto lane_id = tid & 31;
+      prefetch_L1(smem, p + lane_id);
     }
   };
 

From 84273238e728165347892055a6776118adc38c4b Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 20 Nov 2025 12:41:53 -0800
Subject: [PATCH 048/121] Fix last commit

---
 include/targets/cuda/load_store.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/targets/cuda/load_store.h b/include/targets/cuda/load_store.h
index d2d5fd3552..7ecc2d1843 100644
--- a/include/targets/cuda/load_store.h
+++ b/include/targets/cuda/load_store.h
@@ -171,7 +171,7 @@ namespace quda
     __device__ inline void operator()(const void *p)
     {
       static __shared__ float smem[1]; // dummy shared memory allocation
-      auto tid = target::thread_idx_linear<dim>();
+      auto tid = target::thread_idx_linear<3>();
       auto lane_id = tid & 31;
       prefetch_L1(smem, p + lane_id);
     }

From daa5a4fe2bac31994449559364759d551f9864b4 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 20 Nov 2025 14:32:38 -0800
Subject: [PATCH 049/121] Disable L1 prefetch experiment in dslash_staggered

---
 include/kernels/dslash_staggered.cuh | 2 +-
 include/targets/cuda/load_store.h    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 17224936ac..7a8cc9a774 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -56,7 +56,7 @@ namespace quda
     const bool is_last_time_slice; /** are we on the last (global) time slice */
     static constexpr bool improved = improved_;
     static constexpr int prefetch_distance = QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED;
-    static constexpr int prefetch_distance_l1 = 1;
+    static constexpr int prefetch_distance_l1 = 0;
 
     const real dagger_scale;
 
diff --git a/include/targets/cuda/load_store.h b/include/targets/cuda/load_store.h
index 7ecc2d1843..d699b3f4fd 100644
--- a/include/targets/cuda/load_store.h
+++ b/include/targets/cuda/load_store.h
@@ -170,10 +170,10 @@ namespace quda
   template <> struct prefetch_L1_cache_line_imp<true> {
     __device__ inline void operator()(const void *p)
     {
-      static __shared__ float smem[1]; // dummy shared memory allocation
+      static __shared__ float smem[32]; // dummy shared memory allocation
       auto tid = target::thread_idx_linear<3>();
       auto lane_id = tid & 31;
-      prefetch_L1(smem, p + lane_id);
+      prefetch_L1(smem + lane_id, p);
     }
   };
 

From 4b0600a5b1fe045bb815472a199adc298b1e8589 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 4 Dec 2025 13:59:11 -0800
Subject: [PATCH 050/121] Fix 32-byte alignment when gauge field is padded

---
 lib/lattice_field.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp
index 5b51f03014..5ee5acfb0e 100644
--- a/lib/lattice_field.cpp
+++ b/lib/lattice_field.cpp
@@ -158,6 +158,7 @@ namespace quda {
     volumeCB = (siteSubset == QUDA_FULL_SITE_SUBSET) ? volume / 2 : volume;
     localVolumeCB = (siteSubset == QUDA_FULL_SITE_SUBSET) ? localVolume / 2 : localVolume;
     stride = volumeCB + pad;
+    stride = (stride + 31) & ~31; // round up to be a multiple of 32 to guarantee alignment
 
     // for parity fields the factor of half is present for all surfaces dimensions except x, so add it manually
     for (int i = 0; i < nDim; i++) {

From bbd8ac69f7881d3e5ebc1385fe8f77dea2aba792 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 8 Dec 2025 23:25:29 -0800
Subject: [PATCH 051/121] Fix a double4 compiler conflict

---
 lib/restrictor_mma.in.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/restrictor_mma.in.cu b/lib/restrictor_mma.in.cu
index 71b9efa235..d36b7390d8 100644
--- a/lib/restrictor_mma.in.cu
+++ b/lib/restrictor_mma.in.cu
@@ -1,3 +1,4 @@
+#include <quda_internal.h>
 #ifdef QUDA_MMA_AVAILABLE
 #include <cub/block/block_reduce.cuh>
 #endif

From 1ed2db17491f8174b561b206ed1e519706a1979b Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 9 Dec 2025 13:03:36 -0800
Subject: [PATCH 052/121] Fix conflict between block_size definitions

---
 include/kernels/block_orthogonalize.cuh       | 4 ++--
 include/targets/cuda/block_reduction_kernel.h | 8 ++++----
 include/targets/hip/block_reduction_kernel.h  | 8 ++++----
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/kernels/block_orthogonalize.cuh b/include/kernels/block_orthogonalize.cuh
index 3a70a4a096..f84967a443 100644
--- a/include/kernels/block_orthogonalize.cuh
+++ b/include/kernels/block_orthogonalize.cuh
@@ -80,7 +80,7 @@ namespace quda {
   };
 
   template <typename Arg> struct BlockOrtho_Params {
-    static constexpr int mVec = tile_size<Arg::nColor, Arg::nVec, Arg::block_size>();
+    static constexpr int mVec = tile_size<Arg::nColor, Arg::nVec, Arg::block_size_cxpr>();
     using dot_t = array<complex<typename Arg::sum_t>, mVec>;
     static constexpr int block_dim = 1;
     using BlockReduceDot = BlockReduce<dot_t, block_dim>;
@@ -90,7 +90,7 @@ namespace quda {
 
   template <typename Arg> struct BlockOrtho_ : BlockOrtho_Params<Arg>::Ops {
     const Arg &arg;
-    static constexpr unsigned block_size = Arg::block_size;
+    static constexpr unsigned block_size = Arg::block_size_cxpr;
     static constexpr int fineSpin = Arg::fineSpin;
     static constexpr int spinBlock = (fineSpin == 1) ? 1 : fineSpin / Arg::coarseSpin; // size of spin block
     static constexpr int nColor = Arg::nColor;
diff --git a/include/targets/cuda/block_reduction_kernel.h b/include/targets/cuda/block_reduction_kernel.h
index bf41cde6d3..639501c421 100644
--- a/include/targets/cuda/block_reduction_kernel.h
+++ b/include/targets/cuda/block_reduction_kernel.h
@@ -61,9 +61,9 @@ namespace quda
      @tparam block_size x-dimension block-size
      @param[in] arg Kernel argument
    */
-  template <unsigned int block_size_, typename Arg_> struct BlockKernelArg : Arg_ {
+  template <unsigned int block_size, typename Arg_> struct BlockKernelArg : Arg_ {
     using Arg = Arg_;
-    static constexpr unsigned int block_size = block_size_;
+    static constexpr unsigned int block_size_cxpr = block_size;
     BlockKernelArg(const Arg &arg) : Arg(arg) { }
   };
 
@@ -112,7 +112,7 @@ namespace quda
    */
   template <template <typename> class Functor, typename Arg, bool grid_stride = false>
   __launch_bounds__(Arg::launch_bounds ?
-                      Arg::block_size :
+                      Arg::block_size_cxpr :
                       0) __global__ std::enable_if_t<device::use_kernel_arg<Arg>(), void> BlockKernel2D(Arg arg)
   {
     static_assert(!grid_stride, "grid_stride not supported for BlockKernel");
@@ -135,7 +135,7 @@ namespace quda
    */
   template <template <typename> class Functor, typename Arg, bool grid_stride = false>
   __launch_bounds__(Arg::launch_bounds ?
-                      Arg::block_size :
+                      Arg::block_size_cxpr :
                       0) __global__ std::enable_if_t<!device::use_kernel_arg<Arg>(), void> BlockKernel2D()
   {
     static_assert(!grid_stride, "grid_stride not supported for BlockKernel");
diff --git a/include/targets/hip/block_reduction_kernel.h b/include/targets/hip/block_reduction_kernel.h
index d81c213dd9..0daff25af4 100644
--- a/include/targets/hip/block_reduction_kernel.h
+++ b/include/targets/hip/block_reduction_kernel.h
@@ -43,9 +43,9 @@ namespace quda
      size to be set statically at launch time in the actual argument
      class that is passed to the kernel.
    */
-  template <unsigned int block_size_, typename Arg_> struct BlockKernelArg : Arg_ {
+  template <unsigned int block_size, typename Arg_> struct BlockKernelArg : Arg_ {
     using Arg = Arg_;
-    static constexpr unsigned int block_size = block_size_;
+    static constexpr unsigned int block_size_cxpr = block_size;
     BlockKernelArg(const Arg &arg) : Arg(arg) { }
   };
 
@@ -89,7 +89,7 @@ namespace quda
      @param[in] arg Kernel argument
    */
   template <template <typename> class Functor, typename Arg, bool grid_stride = false>
-  __launch_bounds__(Arg::block_size)
+  __launch_bounds__(Arg::block_size_cxpr)
     __global__ std::enable_if_t<device::use_kernel_arg<Arg>() && (Arg::launch_bounds), void> BlockKernel2D(Arg arg)
   {
     static_assert(!grid_stride, "grid_stride not supported for BlockKernel");
@@ -132,7 +132,7 @@ namespace quda
      @param[in] arg Kernel argument
    */
   template <template <typename> class Functor, typename Arg, bool grid_stride = false>
-  __launch_bounds__(Arg::block_size)
+  __launch_bounds__(Arg::block_size_cxpr)
     __global__ std::enable_if_t<(!device::use_kernel_arg<Arg>()) && (Arg::launch_bounds), void> BlockKernel2D()
   {
     static_assert(!grid_stride, "grid_stride not supported for BlockKernel");

From 9de5021248245e5b8f14e48e545d3f5a9cb76936 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 9 Dec 2025 14:16:13 -0800
Subject: [PATCH 053/121] Forbid combining NVSHMEM with TMA prefetching.  Fix
 autotuner so that only valid configs are chosen when doing full dslash
 (whether or not TMA is used)

---
 include/dslash.h          | 13 +++++++++++++
 include/dslash_helper.cuh | 10 ++++++----
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/include/dslash.h b/include/dslash.h
index 5e8c12dbe6..d8993731c1 100644
--- a/include/dslash.h
+++ b/include/dslash.h
@@ -12,6 +12,10 @@
 namespace quda
 {
 
+#if defined(NVSHMEM_COMMS) && QUDA_DSLASH_PREFETCH_TMA > 0
+#error NVSHMEM cannot be used in combination with TMA prefetching at present
+#endif
+
   /**
      @brief This is the generic driver for launching Dslash kernels
      (the base kernel of which is defined in dslash_helper.cuh).  This
@@ -226,6 +230,15 @@ namespace quda
       }
     }
 
+    virtual bool advanceBlockDim(TuneParam &param) const override
+    {
+      // if TMA is enabled we must keep parity separate in the block (2-d tuning)
+      if constexpr (QUDA_DSLASH_PREFETCH_TMA > 0)
+        return TunableKernel2D::advanceBlockDim(param);
+      else
+        return TunableKernel3D::advanceBlockDim(param);
+    }
+
     virtual bool advanceTuneParam(TuneParam &param) const override
     {
       return advanceAux(param) || advanceSharedBytes(param) || advanceBlockDim(param) || advanceSharedCarveOut(param)
diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index c55a588486..5aeaa9e6a5 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -750,10 +750,12 @@ namespace quda
     {
       typename Arg::D dslash(*this);
 
-      // FIXME need warp uniform parity which is not composable with
-      // NVSHMEM since the latter requires blockDim.y and blockDim.z to
-      // cover the entire extent
-      parity = target::block_idx().z; // ensure parity is warp uniform
+      if constexpr (QUDA_DSLASH_PREFETCH_TMA > 0) {
+        // FIXME need warp uniform parity which is not composable with
+        // NVSHMEM since the latter requires blockDim.y and blockDim.z to
+        // cover the entire extent
+        parity = target::block_idx().z; // ensure parity is warp uniform
+      }
 
       // for full fields set parity from z thread index else use arg setting
       if (arg.nParity == 1) parity = arg.parity;

From 30ae50294bfdbd9893bf52ab8f254f3957567acf Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 11 Dec 2025 11:45:30 -0800
Subject: [PATCH 054/121] Fix ambiguity from multi-inheritance with fused DWF
 kernel

---
 include/kernels/dslash_domain_wall_4d_fused_m5.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/kernels/dslash_domain_wall_4d_fused_m5.cuh b/include/kernels/dslash_domain_wall_4d_fused_m5.cuh
index 46e0ae876a..ea90177228 100644
--- a/include/kernels/dslash_domain_wall_4d_fused_m5.cuh
+++ b/include/kernels/dslash_domain_wall_4d_fused_m5.cuh
@@ -25,6 +25,7 @@ namespace quda
     using DomainWall4DArg::threads;
     using DomainWall4DArg::x;
     using DomainWall4DArg::xpay;
+    using DomainWall4DArg::block_size;
 
     using F = typename DomainWall4DArg::F;
 

From 79934bbed4ce5cb419040b4f5b98cde7f2bcd0f6 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 11 Dec 2025 12:49:35 -0800
Subject: [PATCH 055/121] Cleanup of abstraction of TMA to allow for clean
 building on modern and legacy architectures.  Updated some deprecated calls
 to modern equivalents

---
 include/dslash.h                       |   2 +-
 include/gauge_field.h                  |  10 ---
 include/gauge_field_order.h            |   1 +
 include/kernels/dslash_coarse_mma.cuh  |   2 +-
 include/targets/cuda/inline_ptx.h      |   6 +-
 include/targets/cuda/load_store.h      |  13 +--
 include/targets/cuda/tma_helper.hpp    |  41 ++++++++-
 include/targets/generic/load_store.h   |  13 +--
 include/targets/generic/tma_helper.hpp |  20 +++++
 lib/dslash_wilson.hpp                  |   5 +-
 lib/gauge_field.cpp                    |  97 ---------------------
 lib/targets/cuda/CMakeLists.txt        |   2 +-
 lib/targets/cuda/tma_helper.cpp        | 112 +++++++++++++++++++++++++
 13 files changed, 192 insertions(+), 132 deletions(-)
 create mode 100644 include/targets/generic/tma_helper.hpp
 create mode 100644 lib/targets/cuda/tma_helper.cpp

diff --git a/include/dslash.h b/include/dslash.h
index d8993731c1..ed2cc91655 100644
--- a/include/dslash.h
+++ b/include/dslash.h
@@ -234,7 +234,7 @@ namespace quda
     {
       // if TMA is enabled we must keep parity separate in the block (2-d tuning)
       if constexpr (QUDA_DSLASH_PREFETCH_TMA > 0)
-        return TunableKernel2D::advanceBlockDim(param);
+        return TunableKernel2D_base<false>::advanceBlockDim(param);
       else
         return TunableKernel3D::advanceBlockDim(param);
     }
diff --git a/include/gauge_field.h b/include/gauge_field.h
index 38cc4b2de9..1d0fe72098 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -4,7 +4,6 @@
 #include <quda_internal.h>
 #include <quda.h>
 #include <lattice_field.h>
-
 #include <comm_key.h>
 
 namespace quda {
@@ -44,12 +43,6 @@ namespace quda {
       return 0;
     }
 
-    struct tensor_desc_t {
-      CUtensorMap N;
-      CUtensorMap Nrem;
-      CUtensorMap phase;
-    };
-
   } // namespace gauge
 
   struct GaugeFieldParam : public LatticeFieldParam {
@@ -673,9 +666,6 @@ namespace quda {
      */
     void PrintMatrix(int dim, int parity, unsigned int x_cb, int rank = 0) const;
 
-    gauge::tensor_desc_t create_tensor_descriptor(uint32_t block_size) const;
-    gauge::tensor_desc_t &get_tensor_descriptor(uint32_t block_size) const;
-
     friend struct GaugeFieldParam;
   };
 
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index dc6dd85bbe..c533dd7dd7 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -23,6 +23,7 @@
 #include <load_store.h>
 #include <aos.h>
 #include <transform_reduce.h>
+#include <tma_helper.hpp>
 
 namespace quda {
 
diff --git a/include/kernels/dslash_coarse_mma.cuh b/include/kernels/dslash_coarse_mma.cuh
index 0a8d5ea9ac..0cc3ac31e5 100644
--- a/include/kernels/dslash_coarse_mma.cuh
+++ b/include/kernels/dslash_coarse_mma.cuh
@@ -216,7 +216,7 @@ namespace quda
       // Initialize barrier. All `blockDim.x` threads in block participate.
       init(bar, blockDim.x * blockDim.y * blockDim.z);
       // Make initialized barrier visible in async proxy.
-      cde::fence_proxy_async_shared_cta();
+      cuda::ptx::fence_proxy_async();
     }
     // Syncthreads so initialized barrier is visible to all threads.
     __syncthreads();
diff --git a/include/targets/cuda/inline_ptx.h b/include/targets/cuda/inline_ptx.h
index bd8b8a2021..0fe9d743d3 100644
--- a/include/targets/cuda/inline_ptx.h
+++ b/include/targets/cuda/inline_ptx.h
@@ -493,21 +493,21 @@ namespace quda {
 
   using tensor_desc_t = CUtensorMap;
 
-  __device__ __forceinline__ void prefetch_tma_3d(const tensor_desc_t &tensor_map, int x, int y, int z)
+  __device__ __forceinline__ void prefetch_tma_3d(const CUtensorMap &tensor_map, int x, int y, int z)
   {
     asm volatile("cp.async.bulk.prefetch.tensor.3d.L2.global.tile [%0, {%1, %2, %3}];" ::"l"(&tensor_map), "r"(x),
                  "r"(y), "r"(z)
                  : "memory");
   }
 
-  __device__ __forceinline__ void prefetch_tma_4d(const tensor_desc_t &tensor_map, int x, int y, int z, int w)
+  __device__ __forceinline__ void prefetch_tma_4d(const CUtensorMap &tensor_map, int x, int y, int z, int w)
   {
     asm volatile("cp.async.bulk.prefetch.tensor.4d.L2.global.tile [%0, {%1, %2, %3, %4}];" ::"l"(&tensor_map), "r"(x),
                  "r"(y), "r"(z), "r"(w)
                  : "memory");
   }
 
-  __device__ __forceinline__ void prefetch_tma_5d(const tensor_desc_t &tensor_map, int x, int y, int z, int w, int u)
+  __device__ __forceinline__ void prefetch_tma_5d(const CUtensorMap &tensor_map, int x, int y, int z, int w, int u)
   {
     asm volatile("cp.async.bulk.prefetch.tensor.5d.L2.global.tile [%0, {%1, %2, %3, %4, %5}];" ::"l"(&tensor_map),
                  "r"(x), "r"(y), "r"(z), "r"(w), "r"(u)
diff --git a/include/targets/cuda/load_store.h b/include/targets/cuda/load_store.h
index d699b3f4fd..1cd57007ca 100644
--- a/include/targets/cuda/load_store.h
+++ b/include/targets/cuda/load_store.h
@@ -2,6 +2,7 @@
 
 #include <register_traits.h>
 #include <inline_ptx.h>
+#include <tma_helper.hpp>
 
 namespace quda
 {
@@ -191,25 +192,25 @@ namespace quda
 
   // CUDA specialization of the prefetch_cache_tensor_3d that uses TMA (requires Hopper+)
   template <> struct prefetch_cache_tensor_3d_imp<true> {
-    __device__ inline void operator()(const tensor_desc_t &desc, int x, int y, int z)
+    __device__ inline void operator()(const tma_descriptor_t &desc, int x, int y, int z)
     {
-      prefetch_tma_3d(desc, x, y, z);
+      prefetch_tma_3d(desc.map, x, y, z);
     }
   };
 
   // CUDA specialization of the prefetch_cache_tensor_4d that uses TMA (requires Hopper+)
   template <> struct prefetch_cache_tensor_4d_imp<true> {
-    __device__ inline void operator()(const tensor_desc_t &desc, int x, int y, int z, int w)
+    __device__ inline void operator()(const tma_descriptor_t &desc, int x, int y, int z, int w)
     {
-      prefetch_tma_4d(desc, x, y, z, w);
+      prefetch_tma_4d(desc.map, x, y, z, w);
     }
   };
 
   // CUDA specialization of the prefetch_cache_tensor_5d that uses TMA (requires Hopper+)
   template <> struct prefetch_cache_tensor_5d_imp<true> {
-    __device__ inline void operator()(const tensor_desc_t &desc, int x, int y, int z, int w, int u)
+    __device__ inline void operator()(const tma_descriptor_t &desc, int x, int y, int z, int w, int u)
     {
-      prefetch_tma_5d(desc, x, y, z, w, u);
+      prefetch_tma_5d(desc.map, x, y, z, w, u);
     }
   };
 #endif
diff --git a/include/targets/cuda/tma_helper.hpp b/include/targets/cuda/tma_helper.hpp
index 39adae2abe..423a696653 100644
--- a/include/targets/cuda/tma_helper.hpp
+++ b/include/targets/cuda/tma_helper.hpp
@@ -1,17 +1,24 @@
 #pragma once
 
 #include <quda_define.h>
+#include <gauge_field.h>
+#include <complex_quda.h>
 
 #if (__COMPUTE_CAPABILITY__ >= 900) && (CUDA_VERSION >= 12060)
 #define USE_TENSOR_MEMORY_ACCELERATOR
 #endif
 
-#ifdef USE_TENSOR_MEMORY_ACCELERATOR
+#ifndef USE_TENSOR_MEMORY_ACCELERATOR
+
+#include "../generic/tma_helper.hpp"
+
+#else
 #include <cuda.h>
 #include <unordered_map>
+#include <cuda/ptx>
+#include <cuda/barrier>
 
 using barrier_t = cuda::barrier<cuda::thread_scope_block>;
-namespace cde = cuda::device::experimental;
 
 namespace quda
 {
@@ -175,9 +182,13 @@ namespace quda
   __device__ void inline tma_load_gmem_5d_box_2d(complex<T> *smem_ptr, const CUtensorMap *map, int offset_a,
                                                  int offset_b, int offset_c, int offset_d, int offset_e, barrier_t *bar)
   {
+#ifdef __CUDACC__
     static_assert(box_a <= tma_box_limit);
     static_assert(box_b <= tma_box_limit);
-    cde::cp_async_bulk_tensor_5d_global_to_shared(smem_ptr, map, offset_a, offset_b, offset_c, offset_d, offset_e, *bar);
+    int32_t coords[5] = {offset_a, offset_b, offset_c, offset_d, offset_e};
+    cuda::ptx::cp_async_bulk_tensor(cuda::ptx::space_shared, cuda::ptx::space_global, smem_ptr, map, coords,
+                                    reinterpret_cast<uint64_t *>(bar));
+#endif
   }
 
   /**
@@ -194,11 +205,33 @@ namespace quda
   __device__ void inline tma_load_gmem_4d_box_2d(complex<T> *smem_ptr, const CUtensorMap *map, int offset_a,
                                                  int offset_b, int offset_c, int offset_d, barrier_t *bar)
   {
+#ifdef __CUDACC__
     static_assert(box_a <= tma_box_limit);
     static_assert(box_b <= tma_box_limit);
-    cde::cp_async_bulk_tensor_4d_global_to_shared(smem_ptr, map, offset_a, offset_b, offset_c, offset_d, *bar);
+    int32_t coords[4] = {offset_a, offset_b, offset_c, offset_d};
+    cuda::ptx::cp_async_bulk_tensor(cuda::ptx::space_shared, cuda::ptx::space_global, smem_ptr, map, coords,
+                                    reinterpret_cast<uint64_t *>(bar));
+#endif
   }
 
+  namespace gauge
+  {
+
+    struct tensor_desc_t {
+      tma_descriptor_t N;
+      tma_descriptor_t Nrem;
+      tma_descriptor_t phase;
+    };
+
+  } // namespace gauge
+
+  /**
+   * @brief Get (creating and caching it on first use) the tensor descriptor for a GaugeField and block size
+   * @param[in] u the gauge field we are getting the descriptor for
+   * @param[in] block_size the thread block size we associate with this descriptor
+   */
+  gauge::tensor_desc_t &get_tensor_descriptor(const GaugeField &u, uint32_t block_size);
+
 } // namespace quda
 
 #endif
diff --git a/include/targets/generic/load_store.h b/include/targets/generic/load_store.h
index 355890c0d6..9c4d263b11 100644
--- a/include/targets/generic/load_store.h
+++ b/include/targets/generic/load_store.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <target_device.h>
+#include <tma_helper.hpp>
 
 namespace quda
 {
@@ -101,28 +102,28 @@ namespace quda
   }
 
   template <bool is_device> struct prefetch_cache_tensor_3d_imp {
-    constexpr void operator()(const tensor_desc_t &, int, int, int) { }
+    constexpr void operator()(const tma_descriptor_t &, int, int, int) { }
   };
 
-  __device__ __host__ inline void prefetch_cache_tensor_3d(const tensor_desc_t &desc, int x, int y, int z)
+  __device__ __host__ inline void prefetch_cache_tensor_3d(const tma_descriptor_t &desc, int x, int y, int z)
   {
     target::dispatch<prefetch_cache_tensor_3d_imp>(desc, x, y, z);
   }
 
   template <bool is_device> struct prefetch_cache_tensor_4d_imp {
-    constexpr void operator()(const tensor_desc_t &, int, int, int, int) { }
+    constexpr void operator()(const tma_descriptor_t &, int, int, int, int) { }
   };
 
-  __device__ __host__ inline void prefetch_cache_tensor_4d(const tensor_desc_t &desc, int x, int y, int z, int w)
+  __device__ __host__ inline void prefetch_cache_tensor_4d(const tma_descriptor_t &desc, int x, int y, int z, int w)
   {
     target::dispatch<prefetch_cache_tensor_4d_imp>(desc, x, y, z, w);
   }
 
   template <bool is_device> struct prefetch_cache_tensor_5d_imp {
-    constexpr void operator()(const tensor_desc_t &, int, int, int, int, int) { }
+    constexpr void operator()(const tma_descriptor_t &, int, int, int, int, int) { }
   };
 
-  __device__ __host__ inline void prefetch_cache_tensor_5d(const tensor_desc_t &desc, int x, int y, int z, int w, int u)
+  __device__ __host__ inline void prefetch_cache_tensor_5d(const tma_descriptor_t &desc, int x, int y, int z, int w, int u)
   {
     target::dispatch<prefetch_cache_tensor_5d_imp>(desc, x, y, z, w, u);
   }
diff --git a/include/targets/generic/tma_helper.hpp b/include/targets/generic/tma_helper.hpp
new file mode 100644
index 0000000000..0acb5fe298
--- /dev/null
+++ b/include/targets/generic/tma_helper.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+namespace quda
+{
+
+  struct tma_descriptor_t {
+  };
+
+  namespace gauge
+  {
+    struct tensor_desc_t {
+      tma_descriptor_t N;
+      tma_descriptor_t Nrem;
+      tma_descriptor_t phase;
+    };
+  } // namespace gauge
+
+  inline gauge::tensor_desc_t get_tensor_descriptor(const GaugeField &, uint32_t) { return gauge::tensor_desc_t {}; }
+
+} // namespace quda
diff --git a/lib/dslash_wilson.hpp b/lib/dslash_wilson.hpp
index 16f81bf391..a4b96d42ab 100644
--- a/lib/dslash_wilson.hpp
+++ b/lib/dslash_wilson.hpp
@@ -31,9 +31,8 @@ namespace quda
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       Dslash::setParam(tp);
-      const_cast<quda::gauge::tensor_desc_t&>(Dslash::arg.U.tensor_desc) = U.get_tensor_descriptor(tp.block.x);
-      const_cast<quda::gauge::tensor_desc_t &>(Dslash::arg.Uback.tensor_desc)
-        = (U.shift(1)).get_tensor_descriptor(tp.block.x);
+      const_cast<quda::gauge::tensor_desc_t &>(Dslash::arg.U.tensor_desc) = get_tensor_descriptor(U, tp.block.x);
+      const_cast<quda::gauge::tensor_desc_t &>(Dslash::arg.Uback.tensor_desc) = get_tensor_descriptor(U.shift(1), tp.block.x);
       Dslash::template instantiate<packShmem>(tp, stream);
     }
   };
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index a5566a9a4f..a1db10ee0d 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -239,103 +239,6 @@ namespace quda {
     if (param.compute_fat_link_max) fat_link_max = this->abs_max();
   }
 
-  static std::map<int, gauge::tensor_desc_t> tensor_map;
-
-  gauge::tensor_desc_t GaugeField::create_tensor_descriptor(uint32_t block_size) const
-  {
-    gauge::tensor_desc_t tensor;
-
-#if __COMPUTE_CAPABILITY__ >= 900
-    auto get_tensor_data_type = [&](size_t word_size) {
-      switch (word_size) {
-      case 1: return CU_TENSOR_MAP_DATA_TYPE_UINT8;
-      case 2: return CU_TENSOR_MAP_DATA_TYPE_UINT16;
-      case 4: return CU_TENSOR_MAP_DATA_TYPE_UINT32;
-      case 8: return CU_TENSOR_MAP_DATA_TYPE_UINT64;
-      default: errorQuda("Unsupported word size %d", precision);
-      }
-      return CU_TENSOR_MAP_DATA_TYPE_UINT8;
-    };
-
-    auto hasPhase = reconstruct == 9 || reconstruct == 13;
-    uint32_t N = gauge::get_vector_order(precision, reconstruct - hasPhase);
-    uint32_t M = (reconstruct - hasPhase) / N;
-    uint32_t Nrem = reconstruct - hasPhase - M * N;
-
-    CUtensorMapDataType dtype = get_tensor_data_type(precision);
-    {
-      if (stride % 16 != 0) errorQuda("Volume requirements not met: stride mod 16 = %lu", stride % 16);
-      uint64_t global_dim[] = {16llu * N, uint64_t(stride / 16), uint64_t(M), uint64_t(geometry), 2llu};
-      uint64_t global_stride[]
-        = {precision * 16llu * N, precision * stride * N, precision * stride * (N * M + Nrem), bytes / 2};
-      uint32_t box_dim[] = {16u * N, std::max(1u, block_size / 16), M, 1, 1};
-      uint32_t element_stride[] = {1, 1, 1, 1, 1};
-      auto data = this->data();
-      if (reinterpret_cast<uintptr_t>(data) % 16 != 0) errorQuda("Pointer is not 16-byte aligned");
-      auto res = cuTensorMapEncodeTiled(&tensor.N, dtype, 5, data, global_dim, global_stride, box_dim, element_stride,
-                                        CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
-                                        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
-      if (res != CUDA_SUCCESS) {
-        const char *errStr = nullptr;
-        cuGetErrorString(res, &errStr);
-        errorQuda("cuTensorMapEncodeTiled failed: %s", errStr);
-      }
-    }
-
-    if (Nrem > 0) {
-      if (stride % 16 != 0) errorQuda("Volume requirements not met: stride mod 16 = %lu", stride % 16);
-      uint64_t global_dim[]
-        = {16llu * Nrem, uint64_t(stride / 16), uint64_t(geometry), 2llu}; // can remove the M dimension?
-      uint64_t global_stride[] = {precision * 16llu * Nrem, precision * stride * (N * M + Nrem), bytes / 2};
-      uint32_t box_dim[] = {16u * Nrem, std::max(1u, block_size / 16), 1, 1, 1};
-      uint32_t element_stride[] = {1, 1, 1, 1};
-      auto data = this->data<char *>() + M * N * stride * precision;
-      if (reinterpret_cast<uintptr_t>(data) % 16 != 0) errorQuda("Pointer is not 16-byte aligned");
-      auto res = cuTensorMapEncodeTiled(&tensor.Nrem, dtype, 4, data, global_dim, global_stride, box_dim,
-                                        element_stride, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
-                                        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
-      if (res != CUDA_SUCCESS) {
-        const char *errStr = nullptr;
-        cuGetErrorString(res, &errStr);
-        errorQuda("cuTensorMapEncodeTiled failed: %s box = {%u, %u, %u, %u}", errStr, box_dim[0], box_dim[1],
-                  box_dim[2], box_dim[3]);
-      }
-    }
-
-    if (hasPhase) {
-      if (stride % 16 != 0) errorQuda("Volume requirements not met: stride mod 16 = %lu", stride % 16);
-      uint64_t global_dim[] = {16llu, uint64_t(stride / 16), uint64_t(geometry), 2llu};
-      uint64_t global_stride[] = {precision * 16llu, precision * stride, bytes / 2};
-      uint32_t box_dim[] = {16u, std::max(1u, block_size / 16u), 1, 1};
-      uint32_t element_stride[] = {1, 1, 1, 1};
-      auto data = this->data<char *>() + PhaseOffset();
-      if (reinterpret_cast<uintptr_t>(data) % 16 != 0) errorQuda("Pointer is not 16-byte aligned");
-      auto res = cuTensorMapEncodeTiled(&tensor.phase, dtype, 4, data, global_dim, global_stride, box_dim,
-                                        element_stride, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
-                                        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
-      if (res != CUDA_SUCCESS) {
-        const char *errStr = nullptr;
-        cuGetErrorString(res, &errStr);
-        errorQuda("cuTensorMapEncodeTiled failed: %s box = {%u, %u, %u, %u}", errStr, box_dim[0], box_dim[1],
-                  box_dim[2], box_dim[3]);
-      }
-    }
-#endif // __COMPUTE_CAPABILITY__ >= 900
-
-    return tensor;
-  }
-
-  gauge::tensor_desc_t &GaugeField::get_tensor_descriptor(uint32_t block_size) const
-  {
-    auto tensor = tensor_map.find(block_size);
-    if (tensor != tensor_map.end()) {
-      return tensor->second;
-    } else {
-      tensor_map[block_size] = create_tensor_descriptor(block_size);
-    }
-    return tensor_map[block_size];
-  }
-
   void GaugeField::move(GaugeField &&src)
   {
     init = std::exchange(src.init, {});
diff --git a/lib/targets/cuda/CMakeLists.txt b/lib/targets/cuda/CMakeLists.txt
index c735036a7a..c3eddee81f 100644
--- a/lib/targets/cuda/CMakeLists.txt
+++ b/lib/targets/cuda/CMakeLists.txt
@@ -1,6 +1,6 @@
 # ######################################################################################################################
 # additonal sources
-target_sources(quda_cpp PRIVATE quda_api.cpp device.cpp malloc.cpp blas_lapack_cublas.cpp comm_target.cpp)
+target_sources(quda_cpp PRIVATE quda_api.cpp device.cpp malloc.cpp blas_lapack_cublas.cpp comm_target.cpp tma_helper.cpp)
 
 if(QUDA_JITIFY)
   target_sources(quda_cpp PRIVATE jitify_helper.cpp)
diff --git a/lib/targets/cuda/tma_helper.cpp b/lib/targets/cuda/tma_helper.cpp
new file mode 100644
index 0000000000..67e5b34895
--- /dev/null
+++ b/lib/targets/cuda/tma_helper.cpp
@@ -0,0 +1,112 @@
+#include <cuda.h>
+#include <tma_helper.hpp>
+#include <map>
+
+#ifdef USE_TENSOR_MEMORY_ACCELERATOR
+
+namespace quda
+{
+
+  auto create_descriptor(const GaugeField &u, uint32_t block_size)
+  {
+    auto precision = u.Precision();
+    auto reconstruct = u.Reconstruct();
+    auto stride = u.Stride();
+    auto geometry = u.Geometry();
+
+    auto get_tensor_data_type = [&](size_t word_size) {
+      switch (word_size) {
+      case 1: return CU_TENSOR_MAP_DATA_TYPE_UINT8;
+      case 2: return CU_TENSOR_MAP_DATA_TYPE_UINT16;
+      case 4: return CU_TENSOR_MAP_DATA_TYPE_UINT32;
+      case 8: return CU_TENSOR_MAP_DATA_TYPE_UINT64;
+      default: errorQuda("Unsupported word size %d", precision);
+      }
+      return CU_TENSOR_MAP_DATA_TYPE_UINT8;
+    };
+
+    auto hasPhase = reconstruct == 9 || reconstruct == 13;
+    uint32_t N = gauge::get_vector_order(precision, reconstruct - hasPhase);
+    uint32_t M = (reconstruct - hasPhase) / N;
+    uint32_t Nrem = reconstruct - hasPhase - M * N;
+
+    CUtensorMapDataType dtype = get_tensor_data_type(precision);
+    gauge::tensor_desc_t tensor;
+
+    {
+      if (stride % 16 != 0) errorQuda("Volume requirements not met: stride mod 16 = %lu", stride % 16);
+      uint64_t global_dim[] = {16llu * N, uint64_t(stride / 16), uint64_t(M), uint64_t(geometry), 2llu};
+      uint64_t global_stride[]
+        = {precision * 16llu * N, precision * stride * N, precision * stride * (N * M + Nrem), u.Bytes() / 2};
+      uint32_t box_dim[] = {16u * N, std::max(1u, block_size / 16), M, 1, 1};
+      uint32_t element_stride[] = {1, 1, 1, 1, 1};
+      auto data = u.data();
+      if (reinterpret_cast<uintptr_t>(data) % 16 != 0) errorQuda("Pointer is not 16-byte aligned");
+      auto res = cuTensorMapEncodeTiled(&tensor.N.map, dtype, 5, data, global_dim, global_stride, box_dim,
+                                        element_stride, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+                                        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+      if (res != CUDA_SUCCESS) {
+        const char *errStr = nullptr;
+        cuGetErrorString(res, &errStr);
+        errorQuda("cuTensorMapEncodeTiled failed: %s", errStr);
+      }
+    }
+
+    if (Nrem > 0) {
+      if (stride % 16 != 0) errorQuda("Volume requirements not met: stride mod 16 = %lu", stride % 16);
+      uint64_t global_dim[]
+        = {16llu * Nrem, uint64_t(stride / 16), uint64_t(geometry), 2llu}; // can remove the M dimension?
+      uint64_t global_stride[] = {precision * 16llu * Nrem, precision * stride * (N * M + Nrem), u.Bytes() / 2};
+      uint32_t box_dim[] = {16u * Nrem, std::max(1u, block_size / 16), 1, 1, 1};
+      uint32_t element_stride[] = {1, 1, 1, 1};
+      auto data = u.data<char *>() + M * N * stride * precision;
+      if (reinterpret_cast<uintptr_t>(data) % 16 != 0) errorQuda("Pointer is not 16-byte aligned");
+      auto res = cuTensorMapEncodeTiled(&tensor.Nrem.map, dtype, 4, data, global_dim, global_stride, box_dim,
+                                        element_stride, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+                                        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+      if (res != CUDA_SUCCESS) {
+        const char *errStr = nullptr;
+        cuGetErrorString(res, &errStr);
+        errorQuda("cuTensorMapEncodeTiled failed: %s box = {%u, %u, %u, %u}", errStr, box_dim[0], box_dim[1],
+                  box_dim[2], box_dim[3]);
+      }
+    }
+
+    if (hasPhase) {
+      if (stride % 16 != 0) errorQuda("Volume requirements not met: stride mod 16 = %lu", stride % 16);
+      uint64_t global_dim[] = {16llu, uint64_t(stride / 16), uint64_t(geometry), 2llu};
+      uint64_t global_stride[] = {precision * 16llu, precision * stride, u.Bytes() / 2};
+      uint32_t box_dim[] = {16u, std::max(1u, block_size / 16u), 1, 1};
+      uint32_t element_stride[] = {1, 1, 1, 1};
+      auto data = u.data<char *>() + u.PhaseOffset();
+      if (reinterpret_cast<uintptr_t>(data) % 16 != 0) errorQuda("Pointer is not 16-byte aligned");
+      auto res = cuTensorMapEncodeTiled(&tensor.phase.map, dtype, 4, data, global_dim, global_stride, box_dim,
+                                        element_stride, CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
+                                        CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+      if (res != CUDA_SUCCESS) {
+        const char *errStr = nullptr;
+        cuGetErrorString(res, &errStr);
+        errorQuda("cuTensorMapEncodeTiled failed: %s box = {%u, %u, %u, %u}", errStr, box_dim[0], box_dim[1],
+                  box_dim[2], box_dim[3]);
+      }
+    }
+
+    return tensor;
+  }
+
+  static std::map<int, gauge::tensor_desc_t> tensor_map;
+
+  gauge::tensor_desc_t &get_tensor_descriptor(const GaugeField &u, uint32_t block_size)
+  {
+    auto tensor = tensor_map.find(block_size);
+    if (tensor != tensor_map.end()) {
+      return tensor->second;
+    } else {
+      tensor_map[block_size] = create_descriptor(u, block_size);
+    }
+    return tensor_map[block_size];
+  }
+
+} // namespace quda
+
+#endif

From 04b4faebf92e35265104a76c3afc51b5334465d5 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 11 Dec 2025 19:46:34 -0800
Subject: [PATCH 056/121] Only align the stride for native gauge fields

---
 include/lattice_field.h |  5 +++--
 lib/gauge_field.cpp     |  2 +-
 lib/lattice_field.cpp   | 15 +++++++++------
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/include/lattice_field.h b/include/lattice_field.h
index da7538d680..3ad257d251 100644
--- a/include/lattice_field.h
+++ b/include/lattice_field.h
@@ -160,8 +160,9 @@ namespace quda {
     /**
        @brief Create the field as specified by the param
        @param[in] Parameter struct
+       @param[in] is_native_gauge Whether the field is a native gauge field
     */
-    void create(const LatticeFieldParam &param);
+    void create(const LatticeFieldParam &param, bool is_native_gauge);
 
     /**
        @brief Move the contents of a field to this
@@ -500,7 +501,7 @@ namespace quda {
        @brief Constructor for creating a LatticeField from a LatticeFieldParam
        @param param Contains the metadata for creating the field
     */
-    LatticeField(const LatticeFieldParam &param);
+    LatticeField(const LatticeFieldParam &param, bool is_native_gauge = false);
 
     /**
        @brief Destructor for LatticeField
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 7ffae620c2..e3bf6058d4 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -7,7 +7,7 @@ namespace quda {
 
   GaugeFieldParam::GaugeFieldParam(const GaugeField &u) : LatticeFieldParam(u) { u.fill(*this); }
 
-  GaugeField::GaugeField(const GaugeFieldParam &param) : LatticeField(param)
+  GaugeField::GaugeField(const GaugeFieldParam &param) : LatticeField(param, param.order == QUDA_NATIVE_GAUGE_ORDER)
   {
     create(param);
 
diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp
index 5ee5acfb0e..18ea207220 100644
--- a/lib/lattice_field.cpp
+++ b/lib/lattice_field.cpp
@@ -25,7 +25,7 @@ namespace quda {
     }
   }
 
-  LatticeField::LatticeField(const LatticeFieldParam &param) :
+  LatticeField::LatticeField(const LatticeFieldParam &param, bool is_native_gauge) :
     volume(1),
     localVolume(1),
     pad(param.pad),
@@ -57,7 +57,7 @@ namespace quda {
     mh_send_rdma {},
     mem_type(param.mem_type)
   {
-    create(param);
+    create(param, is_native_gauge);
   }
 
   LatticeField::LatticeField(const LatticeField &field) noexcept :
@@ -98,7 +98,7 @@ namespace quda {
   {
     LatticeFieldParam param;
     field.fill(param);
-    create(param);
+    create(param, field.isNative());
   }
 
   LatticeField::LatticeField(LatticeField &&field) noexcept { move(std::move(field)); }
@@ -111,7 +111,7 @@ namespace quda {
       destroyComms();
       LatticeFieldParam param;
       src.fill(param);
-      create(param);
+      create(param, src.isNative());
     }
     return *this;
   }
@@ -125,7 +125,7 @@ namespace quda {
     return *this;
   }
 
-  void LatticeField::create(const LatticeFieldParam &param)
+  void LatticeField::create(const LatticeFieldParam &param, bool is_native_gauge)
   {
     if (param.location == QUDA_INVALID_FIELD_LOCATION) errorQuda("Invalid field location");
     location = param.location;
@@ -158,7 +158,10 @@ namespace quda {
     volumeCB = (siteSubset == QUDA_FULL_SITE_SUBSET) ? volume / 2 : volume;
     localVolumeCB = (siteSubset == QUDA_FULL_SITE_SUBSET) ? localVolume / 2 : localVolume;
     stride = volumeCB + pad;
-    stride = (stride + 31) & ~31; // round up to be a multiple of 32 to guarantee alignment
+    if (is_native_gauge) { // if a native gauge field we need to ensure padded volume is aligned
+      stride = (stride + 31) & ~31; // round up to be a multiple of 32 to guarantee alignment
+      pad = stride - volumeCB;
+    }
 
     // for parity fields the factor of half is present for all surfaces dimensions except x, so add it manually
     for (int i = 0; i < nDim; i++) {

From 0cf1286c0f3e67213b1dd40592d7e71198677b73 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 12 Dec 2025 16:56:22 -0800
Subject: [PATCH 057/121] Remove FMA optimized I2F, as it introduces floating
 point rounding that can lead to catastrophic cancellation

---
 include/color_spinor_field_order.h | 16 +++---
 include/convert.h                  | 81 ++----------------------------
 include/gauge_field_order.h        | 24 ++++-----
 3 files changed, 19 insertions(+), 102 deletions(-)

diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index e2d1316521..46ae45c645 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -1027,19 +1027,18 @@ namespace quda
         real v[length_ghost];
         norm_type nrm
           = isFixed<Float>::value ? vector_load<float, 1>(ghost_norm[2 * dim + dir], parity * faceVolumeCB[dim] + x)[0] : 0.0;
-        norm_type nrm_shift = -nrm * 12582912.0f;
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
           auto vecTmp = vector_load<Float, N>(ghost[2 * dim + dir] + parity * faceVolumeCB[dim] * length_ghost,
                                               i * faceVolumeCB[dim] + x);
-          copy_and_scale(v + i * N, vecTmp, nrm, nrm_shift);
+          copy_and_scale(v + i * N, vecTmp, nrm);
         }
 
         if constexpr (Nrem > 0) { // now load any remainder
           auto vecTmp = vector_load<Float, Nrem>(
             ghost[2 * dim + dir] + parity * faceVolumeCB[dim] * length_ghost + faceVolumeCB[dim] * M * N, x);
-          copy_and_scale(v + M * N, vecTmp, nrm, nrm_shift);
+          copy_and_scale(v + M * N, vecTmp, nrm);
         }
 
 #pragma unroll
@@ -1166,20 +1165,19 @@ namespace quda
         auto norm = reinterpret_cast<float *>(field + volumeCB * (2 * Nc * Ns)); // FIXME - optimize 64-bit indexing here
 #endif
         norm_type nrm = isFixed<Float>::value ? vector_load<float, 1>(norm, x + parity * norm_offset)[0] : 0.0;
-        norm_type nrm_shift = -nrm * 12582912.0f;
 
 #pragma unroll
         for (int i = 0; i < M; i++) {
           // first load from memory
           auto vecTmp = vector_load<Float, N>(field, parity * offset, volumeCB * i + x);
           // now copy into output and scale
-          copy_and_scale(v + i * N, vecTmp, nrm, nrm_shift);
+          copy_and_scale(v + i * N, vecTmp, nrm);
         }
 
         // now load any remainder
         if constexpr (Nrem > 0) {
           auto vecTmp = vector_load<Float, Nrem>(field, parity * offset + volumeCB * M * N, x);
-          copy_and_scale(v + M * N, vecTmp, nrm, nrm_shift);
+          copy_and_scale(v + M * N, vecTmp, nrm);
         }
 
 #pragma unroll
@@ -1303,10 +1301,9 @@ namespace quda
         // extract the norm
         norm_type nrm;
         memcpy(&nrm, &vecTmp[6], sizeof(norm_type));
-        norm_type nrm_shift = -nrm * 12582912.0f;
         array<Float, 6> vecTmp2;
         memcpy(&vecTmp2, &vecTmp, sizeof(vecTmp2));
-        copy_and_scale(v, vecTmp2, nrm, nrm_shift);
+        copy_and_scale(v, vecTmp2, nrm);
 
 #pragma unroll
         for (int i = 0; i < length_ghost / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]);
@@ -1412,9 +1409,8 @@ namespace quda
         memcpy(&nrm, &vecTmp[6], sizeof(norm_type));
         array<Float, 6> vecTmp2;
         memcpy(&vecTmp2, &vecTmp, sizeof(vecTmp2));
-        norm_type nrm_shift = -nrm * 12582912.0f;
         // now copy into output and scale
-        copy_and_scale(v, vecTmp2, nrm, nrm_shift);
+        copy_and_scale(v, vecTmp2, nrm);
 
 #pragma unroll
         for (int i = 0; i < length / 2; i++) out[i] = complex(v[2 * i + 0], v[2 * i + 1]);
diff --git a/include/convert.h b/include/convert.h
index f87ef514f1..6c608117da 100644
--- a/include/convert.h
+++ b/include/convert.h
@@ -104,6 +104,7 @@ namespace quda
         int32_t i = a + 0x4B400000;
         float f;
         memcpy(&f, &i, sizeof(int32_t));
+        assert(f - 12582912.0f == static_cast<float>(a));
         return f - 12582912.0f;
       }
     }
@@ -119,43 +120,13 @@ namespace quda
         int2 i = {a + 0x4B400000, b + 0x4B400000};
         float2 f;
         memcpy(&f, &i, sizeof(int2));
+        assert(f.x - 12582912.0f == static_cast<float>(a));
+        assert(f.y - 12582912.0f == static_cast<float>(b));
         return add2(f, {-12582912.0f, -12582912.0f});
       }
     }
   };
 
-  template <> struct i2f_fma<true> {
-    template <typename T, typename alternative_t>
-    __device__ std::enable_if_t<std::is_same_v<alternative_t, std::integral_constant<bool, alternative_t::value>>, float>
-    operator()(T a, alternative_t, float b, float c)
-    {
-      if constexpr (!alternative_t::value) {
-        return b * static_cast<float>(a);
-      } else {
-        // will work for up to 23-bit int
-        int32_t i = a + 0x4B400000;
-        float f;
-        memcpy(&f, &i, sizeof(int32_t));
-        return b * f + c;
-      }
-    }
-
-    template <typename T, typename alternative_t>
-    __device__ std::enable_if_t<std::is_same_v<alternative_t, std::integral_constant<bool, alternative_t::value>>, float2>
-    operator()(const T &a1, const T &a2, alternative_t, float b, float c)
-    {
-      if constexpr (!alternative_t::value) {
-        return mul2(float2 {b, b}, float2 {static_cast<float>(a1), static_cast<float>(a2)});
-      } else {
-        // will work for up to 23-bit int
-        int2 i = {a1 + 0x4B400000, a2 + 0x4B400000};
-        float2 f;
-        memcpy(&f, &i, sizeof(int2));
-        return fma2({b, b}, f, {c, c});
-      }
-    }
-  };
-
   /**
      @brief Regular float-to-integer round used on the host
   */
@@ -327,52 +298,6 @@ namespace quda
     });
   }
 
-  /**
-     @brief Specialized variants of the copy_and_scale that passes the
-     alternative i2f constant to be subtracted (this allows for
-     optimal FMA issuance).  Note the scale factors are ignored unless
-     the input type (b) is either a short or char vector.
-  */
-  template <typename T1, typename T2, typename T3>
-  constexpr std::enable_if_t<!isFixed<T1>::value && !isFixed<T2>::value, void> copy_and_scale(T1 &a, const T2 &b,
-                                                                                              const T3 &, const T3 &)
-  {
-    copy(a, b);
-  }
-
-  template <typename T1, typename T2, typename T3>
-  constexpr std::enable_if_t<!isFixed<T1>::value && isFixed<T2>::value, void> copy_and_scale(T1 &a, const T2 &b,
-                                                                                             const T3 &c, const T3 &d)
-  {
-    a = target::dispatch<i2f_fma>(b, std::integral_constant<bool, i2f_i[0]>(), c, d);
-  }
-
-  template <typename T1, typename T2, int n, typename T3>
-  constexpr std::enable_if_t<!isFixed<T1>::value && !isFixed<T2>::value, void>
-  copy_and_scale(T1 *a, const array<T2, n> &b, const T3 &, const T3 &)
-  {
-    for (int i = 0; i < n; i++) copy(a[i], b[i]);
-  }
-
-  template <typename T1, typename T2, int n, typename T3>
-  constexpr std::enable_if_t<!isFixed<T1>::value && !isFixed<T2>::value, void>
-  copy_and_scale(array<T1, n> &a, const T2 *b, const T3 &, const T3 &)
-  {
-    for (int i = 0; i < n; i++) copy(a[i], b[i]);
-  }
-
-  template <typename T1, typename T2, int n, typename T3>
-  constexpr std::enable_if_t<!isFixed<T1>::value && isFixed<T2>::value, void>
-  copy_and_scale(T1 *a, const array<T2, n> &b, const T3 &c, const T3 &d)
-  {
-    static_assert(n % 2 == 0);
-    constexpr_for<0, n, 2>([&](auto i) {
-      auto ai = target::dispatch<i2f_fma>(b[i + 0], b[i + 1], std::integral_constant<bool, i2f_i[(i / 2) % 4]>(), c, d);
-      a[i + 0] = ai.x;
-      a[i + 1] = ai.y;
-    });
-  }
-
   template <class fixed_t, class float_t> __device__ __host__ fixed_t f2i_round(float_t f)
   {
 #if 1
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 01ff66447d..b0011b87a6 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1588,9 +1588,7 @@ namespace quda {
         size_t bytes;
         gauge::tensor_desc_t tensor_desc;
         const real combined_scale; // Precomputed scale for copy_and_scale: fixedInvMaxValue * reconstruct.scale
-        const real combined_shift; // Precomputed shift for the alternate i2f_fma combined_scale * -12582912.0f
         const real phase_scale; // Precomputed scale for phase loading: fixedInvMaxValue * 2.0 (or just 2.0 for float)
-        const real phase_shift; // Precomputed shift for the alternative i2f_fma phase_scale *  -12582912.0f
 
         FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) :
           reconstruct(u),
@@ -1611,10 +1609,8 @@ namespace quda {
               return isFixed<Float>::value ? fixedInvMaxValue<Float>::value : 1.0;
             }
           }()),
-          combined_shift(combined_scale * -12582912.0f),
           phase_scale(isFixed<Float>::value ? fixedInvMaxValue<Float>::value * static_cast<real>(2.0) :
-                                              static_cast<real>(2.0)),
-          phase_shift(phase_scale * -12582912.0f)
+                                              static_cast<real>(2.0))
         {
           if (geometry == QUDA_COARSE_GEOMETRY)
             errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone");
@@ -1638,18 +1634,18 @@ namespace quda {
           // first load from memory
           auto vecTmp = vector_load<Float, N>(gauge, parity * offset + dir * (M * N + Nrem) * stride, i * stride + x);
           // second do copy converting into register type with combined scaling
-          copy_and_scale(tmp + i * N, vecTmp, combined_scale, combined_shift);
+          copy_and_scale(tmp + i * N, vecTmp, combined_scale);
         }
 
         // now load any remainder
         if constexpr (Nrem > 0) {
           auto vecTmp = vector_load<Float, Nrem>(gauge, parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x);
-          copy_and_scale(tmp + M * N, vecTmp, combined_scale, combined_shift);
+          copy_and_scale(tmp + M * N, vecTmp, combined_scale);
         }
 
         if constexpr (loadPhase) {
           if constexpr (isFixed<Float>::value) {
-            copy_and_scale(phase, gauge[parity * offset + phaseOffset + stride * dir + x], phase_scale, phase_shift);
+            copy_and_scale(phase, gauge[parity * offset + phaseOffset + stride * dir + x], phase_scale);
           } else {
             copy(phase, gauge[parity * offset + phaseOffset + stride * dir + x]);
             phase *= static_cast<real>(2.0);
@@ -1765,14 +1761,14 @@ namespace quda {
             auto vecTmp = vector_load<Float, N>(ghost[dir], (i * 2 + parity) * faceVolumeCB[dir] + x);
 
             // second do copy converting into register type with combined scaling
-            copy_and_scale(tmp + i * N, vecTmp, combined_scale, combined_shift);
+            copy_and_scale(tmp + i * N, vecTmp, combined_scale);
           }
 
           // now load any remainder
           if constexpr (Nrem > 0) {
             auto vecTmp
               = vector_load<Float, Nrem>(ghost[dir], 2 * faceVolumeCB[dir] * M * N, parity * faceVolumeCB[dir] + x);
-            copy_and_scale(tmp + M * N, vecTmp, combined_scale, combined_shift);
+            copy_and_scale(tmp + M * N, vecTmp, combined_scale);
           }
 
           real phase = 0.;
@@ -1783,7 +1779,7 @@ namespace quda {
             // } else {
             if constexpr (isFixed<Float>::value) {
               copy_and_scale(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x],
-                             phase_scale, phase_shift);
+                             phase_scale);
             } else {
               copy(phase, ghost[dir][2 * faceVolumeCB[dir] * (reconLen - 1) + parity * faceVolumeCB[dir] + x]);
               phase *= static_cast<real>(2.0);
@@ -1873,7 +1869,7 @@ namespace quda {
                                               ((i * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
 
           // second do copy converting into register type with combined scaling
-          copy_and_scale(tmp + i * N, vecTmp, combined_scale, combined_shift);
+          copy_and_scale(tmp + i * N, vecTmp, combined_scale);
         }
 
         // now load any remainder
@@ -1882,7 +1878,7 @@ namespace quda {
             = vector_load<Float, Nrem>(ghost[dim], (dir * reconLen + M * N) * 2 * geometry * R[dim] * faceVolumeCB[dim],
                                        (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x);
 
-          copy_and_scale(tmp + M * N, vecTmp, combined_scale, combined_shift);
+          copy_and_scale(tmp + M * N, vecTmp, combined_scale);
         }
 
         real phase = 0.;
@@ -1891,7 +1887,7 @@ namespace quda {
             copy_and_scale(phase,
                            ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim]
                                       + (parity * geometry + g) * R[dim] * faceVolumeCB[dim] + x],
-                           phase_scale, phase_shift);
+                           phase_scale);
           } else {
             copy(phase,
                  ghost[dim][(dir * reconLen + M * N + Nrem) * 2 * geometry * R[dim] * faceVolumeCB[dim]

From aaa629db44b84d9d84b7e2dab7970eeba578b538 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 12 Dec 2025 16:57:01 -0800
Subject: [PATCH 058/121] We only ever need to resize the pad when creating a
 gauge field from fresh, not copying or moving it

---
 lib/lattice_field.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp
index 18ea207220..8c87cf8f4d 100644
--- a/lib/lattice_field.cpp
+++ b/lib/lattice_field.cpp
@@ -98,7 +98,7 @@ namespace quda {
   {
     LatticeFieldParam param;
     field.fill(param);
-    create(param, field.isNative());
+    create(param, false);
   }
 
   LatticeField::LatticeField(LatticeField &&field) noexcept { move(std::move(field)); }
@@ -111,7 +111,7 @@ namespace quda {
       destroyComms();
       LatticeFieldParam param;
       src.fill(param);
-      create(param, src.isNative());
+      create(param, false);
     }
     return *this;
   }

From 5653947a74fc898f8da88c24f03178ce8de01042 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 15 Dec 2025 11:05:53 -0800
Subject: [PATCH 059/121] Tweak block CG tolerance for staggered eigensolver.
 Laplace eigensolver tests should all now pass

---
 tests/staggered_eigensolve_test_gtest.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/staggered_eigensolve_test_gtest.hpp b/tests/staggered_eigensolve_test_gtest.hpp
index a0e89006e5..62c3776802 100644
--- a/tests/staggered_eigensolve_test_gtest.hpp
+++ b/tests/staggered_eigensolve_test_gtest.hpp
@@ -173,6 +173,9 @@ TEST_P(StaggeredEigensolveTest, verify)
     tol *= 5;
   }
 
+  // with block TRLM some of eigenvectors can have a small deviation
+  if (::testing::get<1>(GetParam()) == QUDA_EIG_BLK_TR_LANCZOS) tol *= 2;
+
   // account for summation error scaling with number of processors
   auto dof = 6lu * dim[0] * dim[1] * dim[2] * dim[3];
   tol *= (1 + log(quda::comm_size()) / log(dof));

From c5cd6693671cfdc384efd58925068941bb543d50 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 15 Dec 2025 22:06:52 -0800
Subject: [PATCH 060/121] Fix issue with MRHS Shamir DWF operator (pre-computed
 constant should not include RHS dimension).  Remove legacy dslash constants
 no longer used

---
 include/dslash_quda.h                     | 5 -----
 include/kernels/dslash_domain_wall_5d.cuh | 2 +-
 lib/color_spinor_field.cpp                | 4 ----
 3 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/include/dslash_quda.h b/include/dslash_quda.h
index 09404ddb5a..4017baa69f 100644
--- a/include/dslash_quda.h
+++ b/include/dslash_quda.h
@@ -36,11 +36,6 @@ namespace quda
     int X3X2X1;
     int X4X3X2X1;
     int X5X4X3X2X1;
-
-    int X2X1mX1;
-    int X3X2X1mX2X1;
-    int X4X3X2X1mX3X2X1;
-    int X5X4X3X2X1mX4X3X2X1;
   };
 
   /**
diff --git a/include/kernels/dslash_domain_wall_5d.cuh b/include/kernels/dslash_domain_wall_5d.cuh
index 0cb3190293..e1f9171763 100644
--- a/include/kernels/dslash_domain_wall_5d.cuh
+++ b/include/kernels/dslash_domain_wall_5d.cuh
@@ -25,7 +25,7 @@ namespace quda
     {
       // remove the batch dimension from these constants, since these are used for 5-d checkerboard indexing
       DslashArg<Float, nDim, DDArg>::dc.X[4] = in.X(4);
-      DslashArg<Float, nDim, DDArg>::dc.X5X4X3X2X1mX4X3X2X1 = (in.X(4) - 1) * DslashArg<Float, nDim, DDArg>::dc.X4X3X2X1;
+      DslashArg<Float, nDim, DDArg>::dc.X5X4X3X2X1 = in.X(4) * DslashArg<Float, nDim, DDArg>::dc.X4X3X2X1;
     }
   };
 
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index e598695c6e..6022340fe4 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -385,10 +385,6 @@ namespace quda
       dc.X3X2X1 = X[2] * X[1] * X[0];
       dc.X4X3X2X1 = X[3] * X[2] * X[1] * X[0];
       dc.X5X4X3X2X1 = X[4] * X[3] * X[2] * X[1] * X[0];
-      dc.X2X1mX1 = (X[1] - 1) * X[0];
-      dc.X3X2X1mX2X1 = (X[2] - 1) * X[1] * X[0];
-      dc.X4X3X2X1mX3X2X1 = (X[3] - 1) * X[2] * X[1] * X[0];
-      dc.X5X4X3X2X1mX4X3X2X1 = (X[4] - 1) * X[3] * X[2] * X[1] * X[0];
     }
 
     spin_project_allocated = spin_project;

From 20a70e430a618cf219697dc8984b47653bb54419 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 15 Dec 2025 23:45:47 -0800
Subject: [PATCH 061/121] Fix warning

---
 include/targets/cuda/load_store.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/targets/cuda/load_store.h b/include/targets/cuda/load_store.h
index 1cd57007ca..943f2bfe2d 100644
--- a/include/targets/cuda/load_store.h
+++ b/include/targets/cuda/load_store.h
@@ -65,19 +65,19 @@ namespace quda
     }
 
     template <size_t prefetch_size>
-    __device__ inline void operator()(short2 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &prefetch)
+    __device__ inline void operator()(short2 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
     {
       load_cached_short2<prefetch_size>(value, reinterpret_cast<const short2 *>(ptr) + idx);      
     }
 
     template <size_t prefetch_size>
-    __device__ inline void operator()(short4 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &prefetch)
+    __device__ inline void operator()(short4 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
     {
       load_cached_short4<prefetch_size>(value, reinterpret_cast<const short4 *>(ptr) + idx);      
     }
 
     template <size_t prefetch_size>
-    __device__ inline void operator()(short8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &prefetch)
+    __device__ inline void operator()(short8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
     {
       float4 tmp;
       operator()(tmp, ptr, idx, prefetch);
@@ -85,7 +85,7 @@ namespace quda
     }
 
     template <size_t prefetch_size>
-    __device__ inline void operator()(char8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &prefetch)
+    __device__ inline void operator()(char8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
     {
       float2 tmp;
       operator()(tmp, ptr, idx, prefetch);

From 74dd48874994150346a518081e94de13a7e80974 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 16 Dec 2025 15:39:26 -0800
Subject: [PATCH 062/121] Fix bug in mdw_dslash5_tensor_core (was ignorant of
 the reworked accessors)

---
 include/kernels/dslash_mdw_fused.cuh          |   2 +-
 include/targets/cuda/load_store.h             |   4 +-
 .../targets/cuda/mdw_dslash5_tensor_core.cuh  | 101 +++++-------------
 3 files changed, 28 insertions(+), 79 deletions(-)

diff --git a/include/kernels/dslash_mdw_fused.cuh b/include/kernels/dslash_mdw_fused.cuh
index 2b57d5b0a9..65bfeb6c76 100644
--- a/include/kernels/dslash_mdw_fused.cuh
+++ b/include/kernels/dslash_mdw_fused.cuh
@@ -37,7 +37,7 @@ namespace quda {
       static constexpr bool reload = reload_;
       static constexpr bool spin_project = true;
       static constexpr bool spinor_direct_load = true; // false means texture load
-      using F = typename colorspinor_mapper<storage_type, 4, nColor, spin_project, spinor_direct_load>::type; // color spin field order
+      using F = typename colorspinor_mapper<storage_type, 4, nColor, spin_project, spinor_direct_load, true>::type; // color spin field order
       static constexpr bool gauge_direct_load = true;                          // false means texture load
       static constexpr QudaGhostExchange ghost = QUDA_GHOST_EXCHANGE_EXTENDED; // gauge field used is an extended one
       using G = typename gauge_mapper<storage_type, recon, 18, QUDA_STAGGERED_PHASE_NO, gauge_direct_load, ghost>::type; // gauge field order
diff --git a/include/targets/cuda/load_store.h b/include/targets/cuda/load_store.h
index 943f2bfe2d..4b11df34b3 100644
--- a/include/targets/cuda/load_store.h
+++ b/include/targets/cuda/load_store.h
@@ -77,7 +77,7 @@ namespace quda
     }
 
     template <size_t prefetch_size>
-    __device__ inline void operator()(short8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
+    __device__ inline void operator()(short8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &prefetch)
     {
       float4 tmp;
       operator()(tmp, ptr, idx, prefetch);
@@ -85,7 +85,7 @@ namespace quda
     }
 
     template <size_t prefetch_size>
-    __device__ inline void operator()(char8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
+    __device__ inline void operator()(char8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &prefetch)
     {
       float2 tmp;
       operator()(tmp, ptr, idx, prefetch);
diff --git a/include/targets/cuda/mdw_dslash5_tensor_core.cuh b/include/targets/cuda/mdw_dslash5_tensor_core.cuh
index 0f1a1abc95..4517731b2f 100644
--- a/include/targets/cuda/mdw_dslash5_tensor_core.cuh
+++ b/include/targets/cuda/mdw_dslash5_tensor_core.cuh
@@ -186,31 +186,6 @@ namespace quda
     }
   }
 
-  template <class integer_vec> __device__ inline integer_vec __2half22integer4_rn(const half2 &a, const half2 &b)
-  {
-    integer_vec c;
-    c.x = __half2short_rn(a.x);
-    c.y = __half2short_rn(a.y);
-    c.z = __half2short_rn(b.x);
-    c.w = __half2short_rn(b.y);
-    return c;
-  }
-
-  template <class integer_vec>
-  __device__ inline integer_vec __4half22integer8_rn(const half2 &a, const half2 &b, const half2 &c, const half2 &d)
-  {
-    integer_vec e;
-    e.x.x = __half2short_rn(a.x);
-    e.x.y = __half2short_rn(a.y);
-    e.x.z = __half2short_rn(b.x);
-    e.x.w = __half2short_rn(b.y);
-    e.y.x = __half2short_rn(c.x);
-    e.y.y = __half2short_rn(c.y);
-    e.y.z = __half2short_rn(d.x);
-    e.y.w = __half2short_rn(d.y);
-    return e;
-  }
-
   __device__ inline void __half_max_abs_half2__(half &max, const half2 &input)
   {
     half2 lh = habs2(input);
@@ -309,57 +284,31 @@ namespace quda
     norm[sid] = __half2float(max_) * scale * fixedInvMaxValue<storage_type>::value;
 
     const half2 max_i_div_max2_ = __half2half2(__hdiv(fixedMaxValue<storage_type>::value, max_));
-#if QUDA_ORDER_FP == 8 // use float8/short8
-    typedef typename VectorType<storage_type, 8>::type storage_vec;
-    storage_vec *out = reinterpret_cast<storage_vec *>(output.field);
-    half2 a, b, c, d;
-
-    a = __hmul2(sm_b[(threadIdx.y * 4 + 0) * N_sm_d2 + 3 * threadIdx.x + 0], max_i_div_max2_);
-    b = __hmul2(sm_b[(threadIdx.y * 4 + 0) * N_sm_d2 + 3 * threadIdx.x + 1], max_i_div_max2_);
-    c = __hmul2(sm_b[(threadIdx.y * 4 + 0) * N_sm_d2 + 3 * threadIdx.x + 2], max_i_div_max2_);
-    d = __hmul2(sm_b[(threadIdx.y * 4 + 1) * N_sm_d2 + 3 * threadIdx.x + 0], max_i_div_max2_);
-    vector_store(&out[sid + 0 * output.volumeCB], 0, __4half22integer8_rn<storage_vec>(a, b, c, d));
-
-    a = __hmul2(sm_b[(threadIdx.y * 4 + 1) * N_sm_d2 + 3 * threadIdx.x + 1], max_i_div_max2_);
-    b = __hmul2(sm_b[(threadIdx.y * 4 + 1) * N_sm_d2 + 3 * threadIdx.x + 2], max_i_div_max2_);
-    c = __hmul2(sm_b[(threadIdx.y * 4 + 2) * N_sm_d2 + 3 * threadIdx.x + 0], max_i_div_max2_);
-    d = __hmul2(sm_b[(threadIdx.y * 4 + 2) * N_sm_d2 + 3 * threadIdx.x + 1], max_i_div_max2_);
-    vector_store(&out[sid + 1 * output.volumeCB], 0, __4half22integer8_rn<storage_vec>(a, b, c, d));
-
-    a = __hmul2(sm_b[(threadIdx.y * 4 + 2) * N_sm_d2 + 3 * threadIdx.x + 2], max_i_div_max2_);
-    b = __hmul2(sm_b[(threadIdx.y * 4 + 3) * N_sm_d2 + 3 * threadIdx.x + 0], max_i_div_max2_);
-    c = __hmul2(sm_b[(threadIdx.y * 4 + 3) * N_sm_d2 + 3 * threadIdx.x + 1], max_i_div_max2_);
-    d = __hmul2(sm_b[(threadIdx.y * 4 + 3) * N_sm_d2 + 3 * threadIdx.x + 2], max_i_div_max2_);
-    vector_store(&out[sid + 2 * output.volumeCB], 0, __4half22integer8_rn<storage_vec>(a, b, c, d));
-#elif QUDA_ORDER_FP == 4
-    typedef typename VectorType<storage_type, 4>::type storage_vec;
-    storage_vec *out = reinterpret_cast<storage_vec *>(output.field);
-    half2 a, b;
-
-    a = __hmul2(sm_b[(threadIdx.y * 4 + 0) * N_sm_d2 + 3 * threadIdx.x + 0], max_i_div_max2_);
-    b = __hmul2(sm_b[(threadIdx.y * 4 + 0) * N_sm_d2 + 3 * threadIdx.x + 1], max_i_div_max2_);
-    out[sid + 0 * output.volumeCB] = __2half22integer4_rn<storage_vec>(a, b);
-
-    a = __hmul2(sm_b[(threadIdx.y * 4 + 0) * N_sm_d2 + 3 * threadIdx.x + 2], max_i_div_max2_);
-    b = __hmul2(sm_b[(threadIdx.y * 4 + 1) * N_sm_d2 + 3 * threadIdx.x + 0], max_i_div_max2_);
-    out[sid + 1 * output.volumeCB] = __2half22integer4_rn<storage_vec>(a, b);
-
-    a = __hmul2(sm_b[(threadIdx.y * 4 + 1) * N_sm_d2 + 3 * threadIdx.x + 1], max_i_div_max2_);
-    b = __hmul2(sm_b[(threadIdx.y * 4 + 1) * N_sm_d2 + 3 * threadIdx.x + 2], max_i_div_max2_);
-    out[sid + 2 * output.volumeCB] = __2half22integer4_rn<storage_vec>(a, b);
-
-    a = __hmul2(sm_b[(threadIdx.y * 4 + 2) * N_sm_d2 + 3 * threadIdx.x + 0], max_i_div_max2_);
-    b = __hmul2(sm_b[(threadIdx.y * 4 + 2) * N_sm_d2 + 3 * threadIdx.x + 1], max_i_div_max2_);
-    out[sid + 3 * output.volumeCB] = __2half22integer4_rn<storage_vec>(a, b);
-
-    a = __hmul2(sm_b[(threadIdx.y * 4 + 2) * N_sm_d2 + 3 * threadIdx.x + 2], max_i_div_max2_);
-    b = __hmul2(sm_b[(threadIdx.y * 4 + 3) * N_sm_d2 + 3 * threadIdx.x + 0], max_i_div_max2_);
-    out[sid + 4 * output.volumeCB] = __2half22integer4_rn<storage_vec>(a, b);
-
-    a = __hmul2(sm_b[(threadIdx.y * 4 + 3) * N_sm_d2 + 3 * threadIdx.x + 1], max_i_div_max2_);
-    b = __hmul2(sm_b[(threadIdx.y * 4 + 3) * N_sm_d2 + 3 * threadIdx.x + 2], max_i_div_max2_);
-    out[sid + 5 * output.volumeCB] = __2half22integer4_rn<storage_vec>(a, b);
-#endif
+    array<short2, 12> o;
+    for (int s = 0; s < 4; s++) {
+#pragma unroll
+      for (int c = 0; c < 3; c++) {
+        auto tmp = __hmul2(sm_b[(threadIdx.y * 4 + s) * N_sm_d2 + 3 * threadIdx.x + c], max_i_div_max2_);
+        o[s * 3 + c] = {__half2short_rn(tmp.x), __half2short_rn(tmp.y)};
+      }
+    }
+
+    constexpr int N = colorspinor::get_vector_order<storage_type>(24);
+    constexpr int M = 24 / N;
+    constexpr int Nrem = 24 - N * M;
+
+    array<short, N> outN;
+#pragma unroll
+    for (int i = 0; i < M; i++) {
+      memcpy(&outN, &o[i * N / 2], sizeof(outN));
+      vector_store(output.field, i * output.volumeCB + sid, outN);
+    }
+
+    if constexpr (Nrem > 0) {
+      array<short, Nrem> outNrem;
+      memcpy(&outNrem, &o[N * M / 2], sizeof(outNrem));
+      vector_store(output.field, N * M * output.volumeCB, sid, outNrem);
+    }
   }
 
   template <class mma_t, int BlockDimX, int Ls, int M, int N, int M_PAD, int N_PAD, bool reload, class T>

From b2e6e882399e2f0a6f5a954007bdd441d55f5821 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 17 Dec 2025 09:56:46 -0800
Subject: [PATCH 063/121] Minor optimization mdw_dslash5_tensor_core.cuh and
 fix quarter precision

---
 .../targets/cuda/mdw_dslash5_tensor_core.cuh  | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/targets/cuda/mdw_dslash5_tensor_core.cuh b/include/targets/cuda/mdw_dslash5_tensor_core.cuh
index 4517731b2f..4d9ab8d8c5 100644
--- a/include/targets/cuda/mdw_dslash5_tensor_core.cuh
+++ b/include/targets/cuda/mdw_dslash5_tensor_core.cuh
@@ -251,14 +251,14 @@ namespace quda
     }
     if (store) {
       scale = block_wise_reduce_vector(ftor, v);
+      auto scale_inv = __fdividef(1.0f, scale);
 #pragma unroll
       for (int spin = 0; spin < 4; spin++) {
 #pragma unroll
         for (int color = 0; color < 3; color++) {
-          float real = v(spin, color).real() / scale;
-          float imag = v(spin, color).imag() / scale;
+          auto c = v(spin, color) * scale_inv;
           int idx = (threadIdx.y * 4 + spin) * N_sm_d2 + 3 * threadIdx.x + color;
-          sm_b[idx] = __floats2half2_rn(real, imag);
+          sm_b[idx] = __floats2half2_rn(c.real(), c.imag());
         }
       }
     }
@@ -266,7 +266,7 @@ namespace quda
 
   // Store results(scaled short/char values and scale) in shared memroy to global
   // memroy.
-  template <class storage_type, int N_sm, class Output>
+  template <class store_t, int N_sm, class Output>
   __device__ inline void store_matrix_c(Output &output, half2 *sm_b, int sid, const float scale)
   {
     half max_ = 0.0f;
@@ -281,23 +281,23 @@ namespace quda
     }
 
     auto norm = reinterpret_cast<float *>(output.field + output.volumeCB * 24);
-    norm[sid] = __half2float(max_) * scale * fixedInvMaxValue<storage_type>::value;
+    norm[sid] = __half2float(max_) * scale * fixedInvMaxValue<store_t>::value;
 
-    const half2 max_i_div_max2_ = __half2half2(__hdiv(fixedMaxValue<storage_type>::value, max_));
-    array<short2, 12> o;
+    const half2 max_i_div_max2_ = __half2half2(__hdiv(fixedMaxValue<store_t>::value, max_));
+    array<typename VectorType<store_t, 2>::type, 12> o;
     for (int s = 0; s < 4; s++) {
 #pragma unroll
       for (int c = 0; c < 3; c++) {
         auto tmp = __hmul2(sm_b[(threadIdx.y * 4 + s) * N_sm_d2 + 3 * threadIdx.x + c], max_i_div_max2_);
-        o[s * 3 + c] = {__half2short_rn(tmp.x), __half2short_rn(tmp.y)};
+        o[s * 3 + c] = {static_cast<store_t>(__half2short_rn(tmp.x)), static_cast<store_t>(__half2short_rn(tmp.y))};
       }
     }
 
-    constexpr int N = colorspinor::get_vector_order<storage_type>(24);
+    constexpr int N = colorspinor::get_vector_order<store_t>(24);
     constexpr int M = 24 / N;
     constexpr int Nrem = 24 - N * M;
 
-    array<short, N> outN;
+    array<store_t, N> outN;
 #pragma unroll
     for (int i = 0; i < M; i++) {
       memcpy(&outN, &o[i * N / 2], sizeof(outN));
@@ -305,7 +305,7 @@ namespace quda
     }
 
     if constexpr (Nrem > 0) {
-      array<short, Nrem> outNrem;
+      array<store_t, Nrem> outNrem;
       memcpy(&outNrem, &o[N * M / 2], sizeof(outNrem));
       vector_store(output.field, N * M * output.volumeCB, sid, outNrem);
     }

From 9b5545fbd14e35a231ac8e397131b50c150ee08d Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 17 Dec 2025 09:59:43 -0800
Subject: [PATCH 064/121] Reduce carve-out autotuner overhead - default carve
 out step size is 100, e.g., we only tune over max L1 or max shared mem.  No
 observed effect on performance, and default can be overridden with an envarg

---
 lib/tune.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/tune.cpp b/lib/tune.cpp
index 68ece2aa39..362801db46 100644
--- a/lib/tune.cpp
+++ b/lib/tune.cpp
@@ -746,7 +746,7 @@ namespace quda
   }
 
   static std::string carve_out_step_str;
-  static int carve_out_step = 25; // default is 25% increment
+  static int carve_out_step = 100; // default is 100% increment
 
   void set_carve_out_step()
   {

From d7568e6741530079a602e02e026bb121bb26097f Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 17 Dec 2025 12:00:38 -0800
Subject: [PATCH 065/121] Backwards gauge tensor descriptor copy only done if
 double store enabled

---
 lib/dslash_wilson.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/dslash_wilson.hpp b/lib/dslash_wilson.hpp
index 3aeeb720d0..0c2cd4a1a8 100644
--- a/lib/dslash_wilson.hpp
+++ b/lib/dslash_wilson.hpp
@@ -32,7 +32,8 @@ namespace quda
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       Dslash::setParam(tp);
       const_cast<quda::gauge::tensor_desc_t &>(Dslash::arg.U.tensor_desc) = get_tensor_descriptor(U, tp.block.x);
-      const_cast<quda::gauge::tensor_desc_t &>(Dslash::arg.Uback.tensor_desc) = get_tensor_descriptor(U.shift(1), tp.block.x);
+      if constexpr (dslash_double_store())
+        const_cast<quda::gauge::tensor_desc_t &>(Dslash::arg.Uback.tensor_desc) = get_tensor_descriptor(U.shift(1), tp.block.x);
       Dslash::template instantiate<packShmem>(tp, stream);
     }
   };

From c92f3cd835dd3adb4e4b3fe00c1d3b02ca5df74d Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 17 Dec 2025 17:09:28 -0800
Subject: [PATCH 066/121] Hopefully fix compiler warning

---
 include/kernels/dslash_staggered.cuh | 15 ++++++++-------
 include/kernels/dslash_wilson.cuh    | 19 ++++++++-----------
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 7a8cc9a774..4cfd521683 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -106,13 +106,14 @@ namespace quda
       switch (step % 4) {
       case 0: arg.U.prefetch<prefetch_type>(x_cb, dim2, parity); break;
       case 1: arg.L.prefetch<prefetch_type>(x_cb, dim2, parity); break;
-#ifdef QUDA_DSLASH_DOUBLE_STORE
-      case 2: arg.Uback.prefetch<prefetch_type>(x_cb, dim2, parity); break;
-      case 3: arg.Lback.prefetch<prefetch_type>(x_cb, dim2, parity); break;
-#else
-      case 2: arg.U.prefetch<prefetch_type>(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity); break;
-      case 3: arg.L.prefetch<prefetch_type>(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity); break;
-#endif
+      case 2:
+        if constexpr (dslash_double_store()) arg.Uback.prefetch<prefetch_type>(x_cb, dim2, parity);
+        else arg.U.prefetch<prefetch_type>(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity);
+        break;
+      case 3:
+        if constexpr (dslash_double_store()) arg.Lback.prefetch<prefetch_type>(x_cb, dim2, parity);
+        else arg.L.prefetch<prefetch_type>(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity);
+        break;
       }
     }
   }
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index 06352cfd0e..9327f6a770 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -84,9 +84,7 @@ namespace quda
     int step = 2 * dim + dir + arg.prefetch_distance;
     if (step >= 8) return;
 
-    // for TMA use arg.block_size
     int dim2 = step / 2;
-    // need warp uniform variants of these and parity
 
     // if using a bulk prefetch we need to use block's first coordinate
     auto x_cb = arg.prefetch_tma ? coord.x_cb_0 : coord.x_cb;
@@ -94,15 +92,14 @@ namespace quda
 
     switch (step % 2) {
     case 0: arg.U.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
-#ifdef QUDA_DSLASH_DOUBLE_STORE
-    case 1: arg.Uback.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
-#else
-    case 1: {
-      const int back_idx = getNeighborIndexCB(coord, dim2, -1, arg.dc);
-      const int idx1 = (Arg::nDim == 5 ? back_idx % arg.dc.volume_4d_cb : back_idx);
-      arg.U.prefetch<Arg::prefetch_tma>(idx1, dim2, 1 - parity);
-    } break;
-#endif
+    case 1:
+      if (dslash_double_store()) {
+        arg.Uback.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity);
+      } else {
+        int idx = getNeighborIndexCB(coord, dim2, -1, arg.dc);
+        arg.U.prefetch<Arg::prefetch_tma>(Arg::nDim == 5 ? idx % arg.dc.volume_4d_cb : idx, dim2, 1 - parity);
+      }
+      break;
     }
   }
 

From 35da04f171230ac9f02f4881e1ae4c7c8fcede44 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 17 Dec 2025 17:30:05 -0800
Subject: [PATCH 067/121] Fix HIP compilation: remove stray template keyword
 from is_lane_zero()

---
 include/targets/hip/target_device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/targets/hip/target_device.h b/include/targets/hip/target_device.h
index 9274a5e6ed..63b0a1cd36 100644
--- a/include/targets/hip/target_device.h
+++ b/include/targets/hip/target_device.h
@@ -149,7 +149,7 @@ namespace quda
       return thread_idx_linear<dim>() == 0;
     }
 
-    template __device__ __host__ inline bool is_lane_zero()
+    __device__ __host__ inline bool is_lane_zero()
     {
       return (thread_idx_linear<3>() % 64) == 0; // switch this to warp_size
     }

From 6041ec68e06319a24cf43f5162033b2e384bcde9 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 17 Dec 2025 17:32:55 -0800
Subject: [PATCH 068/121] Always use ::cuda::maximum() now that we install our
 own CCCL

---
 include/targets/cuda/mma_tensor_op/gmem_loader.cuh | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/include/targets/cuda/mma_tensor_op/gmem_loader.cuh b/include/targets/cuda/mma_tensor_op/gmem_loader.cuh
index 0d4023bdbc..2a8a9c722e 100644
--- a/include/targets/cuda/mma_tensor_op/gmem_loader.cuh
+++ b/include/targets/cuda/mma_tensor_op/gmem_loader.cuh
@@ -504,11 +504,7 @@ namespace quda
             // block all-reduce thread_max
             using block_reduce_t = cub::BlockReduce<float, 1, cub::BLOCK_REDUCE_WARP_REDUCTIONS, block_y, block_z>;
             __shared__ typename block_reduce_t::TempStorage temp_storage;
-#if CUDA_VERSION >= 12090
             float block_max = block_reduce_t(temp_storage).Reduce(thread_max, ::cuda::maximum());
-#else
-            float block_max = block_reduce_t(temp_storage).Reduce(thread_max, ::cub::Max());
-#endif
 
             __shared__ float block_max_all;
             if (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z) == 0) {
@@ -670,11 +666,7 @@ namespace quda
             // block all-reduce thread_max
             using block_reduce_t = cub::BlockReduce<float, 1, cub::BLOCK_REDUCE_WARP_REDUCTIONS, block_y, block_z>;
             __shared__ typename block_reduce_t::TempStorage temp_storage;
-#if CUDA_VERSION >= 12090
             float block_max = block_reduce_t(temp_storage).Reduce(thread_max, ::cuda::maximum());
-#else
-            float block_max = block_reduce_t(temp_storage).Reduce(thread_max, cub::Max());
-#endif
 
             __shared__ float block_max_all;
             if (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z) == 0) {

From 982f41b84a0203c2d9c02e72d69b735e3d4505f7 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 17 Dec 2025 17:38:26 -0800
Subject: [PATCH 069/121] Always use ::cuda::maximum() now that we install our
 own CCCL

---
 include/kernels/restrictor_mma.cuh | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/kernels/restrictor_mma.cuh b/include/kernels/restrictor_mma.cuh
index 73f7f16b17..a1501a8ea2 100644
--- a/include/kernels/restrictor_mma.cuh
+++ b/include/kernels/restrictor_mma.cuh
@@ -174,11 +174,7 @@ namespace quda
       // block all-reduce thread_max
       using block_reduce_t = cub::BlockReduce<float, 1, cub::BLOCK_REDUCE_WARP_REDUCTIONS, Arg::block_y, Arg::block_z>;
       __shared__ typename block_reduce_t::TempStorage temp_storage;
-#if CUDA_VERSION >= 12090
       float block_max = block_reduce_t(temp_storage).Reduce(thread_max, ::cuda::maximum());
-#else
-      float block_max = block_reduce_t(temp_storage).Reduce(thread_max, ::cub::Max());
-#endif
 
       __shared__ float block_max_all;
       if (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z) == 0) {

From 60a746b435849428a902cdc6377c025be6e9e22c Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 17 Dec 2025 18:09:48 -0800
Subject: [PATCH 070/121] Update cub block interfaces

---
 include/targets/cuda/block_reduce_helper.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/targets/cuda/block_reduce_helper.h b/include/targets/cuda/block_reduce_helper.h
index 8b9dcc16e6..7b742b5331 100644
--- a/include/targets/cuda/block_reduce_helper.h
+++ b/include/targets/cuda/block_reduce_helper.h
@@ -100,7 +100,7 @@ namespace quda
     template <typename T, typename reducer_t, typename param_t>
     __device__ inline T operator()(const T &value_, bool all, const reducer_t &r, const param_t &)
     {
-      using warp_reduce_t = cub::WarpReduce<T, param_t::width, __COMPUTE_CAPABILITY__>;
+      using warp_reduce_t = cub::WarpReduce<T, param_t::width>;
       typename warp_reduce_t::TempStorage dummy_storage;
       warp_reduce_t warp_reduce(dummy_storage);
       T value = {};
@@ -111,7 +111,7 @@ namespace quda
       }
 
       if (all) {
-        using warp_scan_t = cub::WarpScan<T, param_t::width, __COMPUTE_CAPABILITY__>;
+        using warp_scan_t = cub::WarpScan<T, param_t::width>;
         typename warp_scan_t::TempStorage dummy_storage;
         warp_scan_t warp_scan(dummy_storage);
         value = warp_scan.Broadcast(value, 0);

From 4918c98e6ffdca632525f8ad23262512fc870043 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 17 Dec 2025 18:13:57 -0800
Subject: [PATCH 071/121] Fix HIP load_store.h: add prefetch_t argument to
 vector_load_impl overloads

---
 include/targets/hip/load_store.h | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/targets/hip/load_store.h b/include/targets/hip/load_store.h
index d1bfe4a955..e4be469664 100644
--- a/include/targets/hip/load_store.h
+++ b/include/targets/hip/load_store.h
@@ -14,21 +14,27 @@ namespace quda
   // pre-declaration of vector_load that we wish to specialize
   template <bool> struct vector_load_impl;
 
-  // CUDA specializations of the vector_load
+  // pre-declaration of the prefetch type
+  template <size_t prefetch> struct prefetch_t;
+
+  // HIP specializations of the vector_load
   template <> struct vector_load_impl<true> {
-    template <typename T> __device__ inline void operator()(T &value, const void *ptr, int idx)
+    template <typename T, size_t prefetch_size>
+    __device__ inline void operator()(T &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
     {
       value = reinterpret_cast<const T *>(ptr)[idx];
     }
 
-    __device__ inline void operator()(short8 &value, const void *ptr, int idx)
+    template <size_t prefetch_size>
+    __device__ inline void operator()(short8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
     {
       float4 tmp;
       operator()(tmp, ptr, idx);
       memcpy(&value, &tmp, sizeof(float4));
     }
 
-    __device__ inline void operator()(char8 &value, const void *ptr, int idx)
+    template <size_t prefetch_size>
+    __device__ inline void operator()(char8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
     {
       float2 tmp;
       operator()(tmp, ptr, idx);

From af2be33d8986b528de82b79f05680f32eab713d9 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 17 Dec 2025 18:20:10 -0800
Subject: [PATCH 072/121] Fix compilation warning with CUDA clang

---
 include/targets/cuda/tma_helper.hpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/targets/cuda/tma_helper.hpp b/include/targets/cuda/tma_helper.hpp
index 423a696653..1bee8d087a 100644
--- a/include/targets/cuda/tma_helper.hpp
+++ b/include/targets/cuda/tma_helper.hpp
@@ -167,6 +167,7 @@ namespace quda
     return get_tma_descriptor<T, 4>(key);
   }
 
+#ifdef QUDA_CUDA_CC
   /**
     @brief Launch TMA load from a 5-d tensor in global memory to a 2-d box in shared memory.
     @param smem_ptr The destination shared memory pointer
@@ -182,13 +183,11 @@ namespace quda
   __device__ void inline tma_load_gmem_5d_box_2d(complex<T> *smem_ptr, const CUtensorMap *map, int offset_a,
                                                  int offset_b, int offset_c, int offset_d, int offset_e, barrier_t *bar)
   {
-#ifdef __CUDACC__
     static_assert(box_a <= tma_box_limit);
     static_assert(box_b <= tma_box_limit);
     int32_t coords[5] = {offset_a, offset_b, offset_c, offset_d, offset_e};
     cuda::ptx::cp_async_bulk_tensor(cuda::ptx::space_shared, cuda::ptx::space_global, smem_ptr, map, coords,
                                     reinterpret_cast<uint64_t *>(bar));
-#endif
   }
 
   /**
@@ -205,14 +204,13 @@ namespace quda
   __device__ void inline tma_load_gmem_4d_box_2d(complex<T> *smem_ptr, const CUtensorMap *map, int offset_a,
                                                  int offset_b, int offset_c, int offset_d, barrier_t *bar)
   {
-#ifdef __CUDACC__
     static_assert(box_a <= tma_box_limit);
     static_assert(box_b <= tma_box_limit);
     int32_t coords[4] = {offset_a, offset_b, offset_c, offset_d};
     cuda::ptx::cp_async_bulk_tensor(cuda::ptx::space_shared, cuda::ptx::space_global, smem_ptr, map, coords,
                                     reinterpret_cast<uint64_t *>(bar));
-#endif
   }
+#endif
 
   namespace gauge
   {

From 02baeaa67ab06021a3c09d8f2462dfa32f0229bb Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 17 Dec 2025 18:28:49 -0800
Subject: [PATCH 073/121] Add missing target_device.h

---
 include/targets/cuda/tma_helper.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/targets/cuda/tma_helper.hpp b/include/targets/cuda/tma_helper.hpp
index 1bee8d087a..6620d58648 100644
--- a/include/targets/cuda/tma_helper.hpp
+++ b/include/targets/cuda/tma_helper.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <quda_define.h>
+#include <target_device.h>
 #include <gauge_field.h>
 #include <complex_quda.h>
 

From 4b8352c8680cfba4b254f39e655db61836867b48 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 17 Dec 2025 23:15:44 -0800
Subject: [PATCH 074/121] Fix clang warning: remove the real type alias from
 the gauge_shift kernel

---
 include/kernels/gauge_shift.cuh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/kernels/gauge_shift.cuh b/include/kernels/gauge_shift.cuh
index dced98e50c..81c79b540f 100644
--- a/include/kernels/gauge_shift.cuh
+++ b/include/kernels/gauge_shift.cuh
@@ -33,7 +33,6 @@ namespace quda
 
     __device__ __host__ void operator()(int x_cb, int parity, int dir)
     {
-      using real = typename Arg::real;
       using Link = typename Arg::Link;
 
       byte_array<int8_t, 4> x = {};

From 13a192b4ddb75d22796000b82e28dbc5ea37b207 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 17 Dec 2025 23:17:38 -0800
Subject: [PATCH 075/121] Fix HIP function call: forward prefetch argument in
 short8/char8 vector loads

---
 include/targets/hip/load_store.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/targets/hip/load_store.h b/include/targets/hip/load_store.h
index e4be469664..0d9058f098 100644
--- a/include/targets/hip/load_store.h
+++ b/include/targets/hip/load_store.h
@@ -26,18 +26,18 @@ namespace quda
     }
 
     template <size_t prefetch_size>
-    __device__ inline void operator()(short8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
+    __device__ inline void operator()(short8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &prefetch)
     {
       float4 tmp;
-      operator()(tmp, ptr, idx);
+      operator()(tmp, ptr, idx, prefetch);
       memcpy(&value, &tmp, sizeof(float4));
     }
 
     template <size_t prefetch_size>
-    __device__ inline void operator()(char8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &)
+    __device__ inline void operator()(char8 &value, const void *ptr, int idx, const prefetch_t<prefetch_size> &prefetch)
     {
       float2 tmp;
-      operator()(tmp, ptr, idx);
+      operator()(tmp, ptr, idx, prefetch);
       memcpy(&value, &tmp, sizeof(float2));
     }
   };

From 274cbad266bb71b8654c66429ae281f4a5d862f6 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 17 Dec 2025 23:20:35 -0800
Subject: [PATCH 076/121] Fix TMA instruction exposure: guard on CUDA compiler
 macros instead of QUDA_CUDA_CC

---
 include/targets/cuda/tma_helper.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/targets/cuda/tma_helper.hpp b/include/targets/cuda/tma_helper.hpp
index 6620d58648..173bb89f8e 100644
--- a/include/targets/cuda/tma_helper.hpp
+++ b/include/targets/cuda/tma_helper.hpp
@@ -168,7 +168,7 @@ namespace quda
     return get_tma_descriptor<T, 4>(key);
   }
 
-#ifdef QUDA_CUDA_CC
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA) || (defined(__clang__) && defined(__CUDA__))
   /**
     @brief Launch TMA load from a 5-d tensor in global memory to a 2-d box in shared memory.
     @param smem_ptr The destination shared memory pointer

From 89e8886b8f2512e754d2b44b372c7de87a9d9ac7 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 18 Dec 2025 17:01:28 -0800
Subject: [PATCH 077/121] Fix clang warning: use Arg:: for prefetch_distance
 and prefetch_tma in Wilson prefetch

---
 include/kernels/dslash_wilson.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index 9327f6a770..4cd7bc7aff 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -79,15 +79,15 @@ namespace quda
   template <class coord_t, class Arg>
   __device__ __host__ void prefetch(int dim, int dir, const coord_t &coord, int parity, const Arg &arg)
   {
-    if constexpr (arg.prefetch_distance == 0) return;
+    if constexpr (Arg::prefetch_distance == 0) return;
 
-    int step = 2 * dim + dir + arg.prefetch_distance;
+    int step = 2 * dim + dir + Arg::prefetch_distance;
     if (step >= 8) return;
 
     int dim2 = step / 2;
 
     // if using a bulk prefetch we need to use block's first coordinate
-    auto x_cb = arg.prefetch_tma ? coord.x_cb_0 : coord.x_cb;
+    auto x_cb = Arg::prefetch_tma ? coord.x_cb_0 : coord.x_cb;
     x_cb = (Arg::nDim == 5 ? x_cb % arg.dc.volume_4d_cb : x_cb);
 
     switch (step % 2) {

From 866a38998f4cb00262e59449183c1232086815d3 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 18 Dec 2025 17:55:01 -0800
Subject: [PATCH 078/121] Fix clang error: add template keyword to dependent
 prefetch calls

---
 include/kernels/dslash_staggered.cuh | 16 ++++++++++------
 include/kernels/dslash_wilson.cuh    |  6 +++---
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 4cfd521683..53ad1264b9 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -104,15 +104,19 @@ namespace quda
 
       int dim2 = step / 4;
       switch (step % 4) {
-      case 0: arg.U.prefetch<prefetch_type>(x_cb, dim2, parity); break;
-      case 1: arg.L.prefetch<prefetch_type>(x_cb, dim2, parity); break;
+      case 0: arg.U.template prefetch<prefetch_type>(x_cb, dim2, parity); break;
+      case 1: arg.L.template prefetch<prefetch_type>(x_cb, dim2, parity); break;
       case 2:
-        if constexpr (dslash_double_store()) arg.Uback.prefetch<prefetch_type>(x_cb, dim2, parity);
-        else arg.U.prefetch<prefetch_type>(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity);
+        if constexpr (dslash_double_store())
+          arg.Uback.template prefetch<prefetch_type>(x_cb, dim2, parity);
+        else
+          arg.U.template prefetch<prefetch_type>(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity);
         break;
       case 3:
-        if constexpr (dslash_double_store()) arg.Lback.prefetch<prefetch_type>(x_cb, dim2, parity);
-        else arg.L.prefetch<prefetch_type>(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity);
+        if constexpr (dslash_double_store())
+          arg.Lback.template prefetch<prefetch_type>(x_cb, dim2, parity);
+        else
+          arg.L.template prefetch<prefetch_type>(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity);
         break;
       }
     }
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index 4cd7bc7aff..37b1e28461 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -91,13 +91,13 @@ namespace quda
     x_cb = (Arg::nDim == 5 ? x_cb % arg.dc.volume_4d_cb : x_cb);
 
     switch (step % 2) {
-    case 0: arg.U.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
+    case 0: arg.U.template prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
     case 1:
       if (dslash_double_store()) {
-        arg.Uback.prefetch<Arg::prefetch_tma>(x_cb, dim2, parity);
+        arg.Uback.template prefetch<Arg::prefetch_tma>(x_cb, dim2, parity);
       } else {
         int idx = getNeighborIndexCB(coord, dim2, -1, arg.dc);
-        arg.U.prefetch<Arg::prefetch_tma>(Arg::nDim == 5 ? idx % arg.dc.volume_4d_cb : idx, dim2, 1 - parity);
+        arg.U.template prefetch<Arg::prefetch_tma>(Arg::nDim == 5 ? idx % arg.dc.volume_4d_cb : idx, dim2, 1 - parity);
       }
       break;
     }

From 63b97b9bff5b04f8fd61bd7f4c63725a787059f3 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 18 Dec 2025 19:15:47 -0800
Subject: [PATCH 079/121] Fix another clang error: use Arg::improved in
 staggered prefetch

---
 include/kernels/dslash_staggered.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 53ad1264b9..3e147c3665 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -94,7 +94,7 @@ namespace quda
   __device__ __host__ void prefetch(int dim, int dir, int hop, const coord_t &coord, const coord_t &coord1, int parity,
                                     const Arg &arg)
   {
-    if constexpr (arg.improved) {
+    if constexpr (Arg::improved) {
       int step = 4 * dim + 2 * dir + hop + distance;
       if (step >= 16) return;
 

From bcfaa5064368deeac3ccd2d5250c696ddc034a72 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 18 Dec 2025 20:24:36 -0800
Subject: [PATCH 080/121] Hopefully the last clang error: use Arg::improved in
 dslash_staggered.cuh

---
 include/kernels/dslash_staggered.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 3e147c3665..be5ac3d1cb 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -151,7 +151,7 @@ namespace quda
     const int their_spinor_parity = (arg.nParity == 2) ? 1 - parity : 0;
 
     Coord coord1 = coord;
-    if constexpr (arg.improved) { // need to compute 1-hop in_boundary
+    if constexpr (Arg::improved) { // need to compute 1-hop in_boundary
 #pragma unroll
       for (int d = 0; d < 4; d++) {
         coord1.in_boundary[1][d] = -(coord[d] + 1 >= arg.dc.X[d]);

From b95f9b42fcd272686080fb73f4cf53f3b795cf48 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 8 Jan 2026 12:51:33 -0800
Subject: [PATCH 081/121] I2F is encoded in half precision fields

---
 lib/clover_field.cpp       | 1 +
 lib/color_spinor_field.cpp | 1 +
 lib/gauge_field.cpp        | 1 +
 3 files changed, 3 insertions(+)

diff --git a/lib/clover_field.cpp b/lib/clover_field.cpp
index 2f0078aacf..3985511e8b 100644
--- a/lib/clover_field.cpp
+++ b/lib/clover_field.cpp
@@ -187,6 +187,7 @@ namespace quda {
     std::stringstream aux_ss;
     aux_ss << "vol=" << volume << "precision=" << precision << "Nc=" << nColor << ",order=" << order;
     if (isNative()) aux_ss << ",N=" << clover::get_vector_order(precision, 128);
+    if (precision < QUDA_SINGLE_PRECISION) aux_ss << ",alt_i2f=" << QUDA_ALTERNATIVE_I_TO_F;
     aux_string = aux_ss.str();
     if (aux_string.size() >= TuneKey::aux_n / 2) errorQuda("Aux string too large %lu", aux_string.size());
   }
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index 6022340fe4..ea387eb44b 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -298,6 +298,7 @@ namespace quda
       aux_ss << "vol=" << volume << ",parity=" << siteSubset << ",precision=" << precision << ",Ns=" << nSpin
              << ",Nc=" << nColor << ",order=" << fieldOrder;
       if (isNative()) aux_ss << ",N=" << colorspinor::get_vector_order(precision, 128);
+      if (precision < QUDA_SINGLE_PRECISION) aux_ss << ",alt_i2f=" << QUDA_ALTERNATIVE_I_TO_F;
       if (nVec > 1) aux_ss << ",nVec=" << nVec;
       if (twistFlavor != QUDA_TWIST_NO && twistFlavor != QUDA_TWIST_INVALID) aux_ss << ",TwistFlavor=" << twistFlavor;
       aux_string = aux_ss.str();
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index e3bf6058d4..f0d1bd1783 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -321,6 +321,7 @@ namespace quda {
     aux_ss << "vol=" << volume << ",stride=" << stride << ",precision=" << precision << ",geometry=" << geometry
            << ",Nc=" << nColor << ",order=" << order;
     if (isNative()) aux_ss << ",N=" << gauge::get_vector_order(precision, 128);
+    if (precision < QUDA_SINGLE_PRECISION) aux_ss << ",alt_i2f=" << QUDA_ALTERNATIVE_I_TO_F;
     if (ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED) aux_ss << ",r=" << r[0] << r[1] << r[2] << r[3];
     aux_string = aux_ss.str();
     if (aux_string.size() >= TuneKey::aux_n / 2) errorQuda("Aux string too large %lu", aux_string.size());

From 1b7364364dd99efc08208d0d69f62d89306f2866 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 9 Jan 2026 11:50:16 -0800
Subject: [PATCH 082/121] Remove LEGACY_ACCESSOR_NORM path from
 colorspinor::FloatNOrder, and optimize the norm index computation

---
 include/color_spinor_field_order.h | 47 ++++++++----------------------
 1 file changed, 12 insertions(+), 35 deletions(-)

diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index 46ae45c645..cba059e199 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -1126,16 +1126,9 @@ namespace quda
       using real = typename mapper<Float>::type;
       using complex = complex<real>;
       using AllocInt = typename AllocType<huge_alloc>::type;
-      using norm_type = float;
+      using norm_t = float;
       Float *field = nullptr;
-      //#define LEGACY_ACCESSOR_NORM // legacy code where norm pointer and offset are stored instead of computed
-#ifdef LEGACY_ACCESSOR_NORM
-      norm_type *norm = nullptr;
-#endif
       AllocInt offset = 0; // offset can be 32-bit or 64-bit
-#ifdef LEGACY_ACCESSOR_NORM
-      AllocInt norm_offset = 0;
-#endif
       int volumeCB = 0;
 
       FloatNOrder() = default;
@@ -1144,14 +1137,7 @@ namespace quda
       FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) :
         GhostNOrder(a, nFace, ghost_),
         field(buffer ? buffer : a.data<Float *>()),
-#ifdef LEGACY_ACCESSOR_NORM
-        norm(buffer ? reinterpret_cast<norm_type *>(reinterpret_cast<char *>(buffer) + a.NormOffset()) :
-                      const_cast<norm_type *>(reinterpret_cast<const norm_type *>(a.Norm()))),
-#endif
         offset(a.Bytes() / (2 * sizeof(Float))),
-#ifdef LEGACY_ACCESSOR_NORM
-        norm_offset(a.Bytes() / (2 * sizeof(norm_type))),
-#endif
         volumeCB(a.VolumeCB())
       {
       }
@@ -1160,12 +1146,8 @@ namespace quda
       __device__ __host__ inline void load(complex out[length / 2], int x, int parity = 0) const
       {
         real v[length];
-#ifndef LEGACY_ACCESSOR_NORM
-        auto norm_offset = offset / (sizeof(Float) < sizeof(float) ? sizeof(norm_type) / sizeof(Float) : 1);
-        auto norm = reinterpret_cast<float *>(field + volumeCB * (2 * Nc * Ns)); // FIXME - optimize 64-bit indexing here
-#endif
-        norm_type nrm = isFixed<Float>::value ? vector_load<float, 1>(norm, x + parity * norm_offset)[0] : 0.0;
-
+        auto norm_offset = (volumeCB * 2 * Nc * Ns + parity * offset) * sizeof(Float) / sizeof(norm_t);
+        norm_t nrm = isFixed<Float>::value ? vector_load<norm_t, 1>(field, x + norm_offset)[0] : 0.0;
 #pragma unroll
         for (int i = 0; i < M; i++) {
           // first load from memory
@@ -1186,11 +1168,8 @@ namespace quda
 
       __device__ __host__ inline void prefetch(int x, int parity = 0) const
       {
-#ifndef LEGACY_ACCESSOR_NORM
-        auto norm_offset = offset / (sizeof(Float) < sizeof(float) ? sizeof(norm_type) / sizeof(Float) : 1);
-        auto norm = reinterpret_cast<float *>(field + volumeCB * (2 * Nc * Ns));
-#endif
-        if constexpr (isFixed<Float>::value) prefetch_cache_line(norm + (x + parity * norm_offset));
+        auto norm_offset = (volumeCB * 2 * Nc * Ns + parity * offset) * sizeof(Float) / sizeof(norm_t);
+        if constexpr (isFixed<Float>::value) prefetch_cache_line(reinterpret_cast<norm_t*>(field) + (x + norm_offset));
 
 #pragma unroll
         for (int i = 0; i < M; i++) prefetch_cache_line(field + (parity * offset + (volumeCB * i + x) * N));
@@ -1202,27 +1181,25 @@ namespace quda
       __device__ __host__ inline void save(const complex in[length / 2], int x, int parity = 0) const
       {
         real v[length];
-#ifndef LEGACY_ACCESSOR_NORM
-        auto norm_offset = offset / (sizeof(Float) < sizeof(float) ? sizeof(norm_type) / sizeof(Float) : 1);
-        auto norm = reinterpret_cast<float *>(field + (volumeCB * 2 * Nc * Ns));
-#endif
+        auto norm_offset = (volumeCB * 2 * Nc * Ns + parity * offset) * sizeof(Float) / sizeof(norm_t);
+
 #pragma unroll
         for (int i = 0; i < length / 2; i++) {
           v[2 * i + 0] = in[i].real();
           v[2 * i + 1] = in[i].imag();
         }
 
-        norm_type scale = 0.0;
-        norm_type scale_inv = 0.0;
+        norm_t scale = 0.0;
+        norm_t scale_inv = 0.0;
         if constexpr (isFixed<Float>::value) {
-          norm_type max_[length / 2];
+          norm_t max_[length / 2];
           // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued)
 #pragma unroll
           for (int i = 0; i < length / 2; i++)
-            max_[i] = fmaxf(fabsf((norm_type)v[i]), fabsf((norm_type)v[i + length / 2]));
+            max_[i] = fmaxf(fabsf((norm_t)v[i]), fabsf((norm_t)v[i + length / 2]));
 #pragma unroll
           for (int i = 0; i < length / 2; i++) scale = fmaxf(max_[i], scale);
-          norm[x + parity * norm_offset] = scale * fixedInvMaxValue<Float>::value;
+          reinterpret_cast<norm_t*>(field)[x + norm_offset] = scale * fixedInvMaxValue<Float>::value;
           scale_inv = fdividef(fixedMaxValue<Float>::value, scale);
         }
 

From 510b0a274dafd41ee4d01ed79a6f72c9ca2f9527 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 9 Jan 2026 11:50:42 -0800
Subject: [PATCH 083/121] Use CCCL 3.1.4 instead of latest main branch commit

---
 lib/targets/cuda/target_cuda.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake
index 4bdcb2ea09..3611ca5a36 100644
--- a/lib/targets/cuda/target_cuda.cmake
+++ b/lib/targets/cuda/target_cuda.cmake
@@ -422,7 +422,7 @@ endif()
 CPMAddPackage(
     NAME CCCL
     GITHUB_REPOSITORY nvidia/cccl
-    GIT_TAG main # Fetches the latest commit on the main branch
+    GIT_TAG v3.1.4 # Fetches this tagged commit
 )
 target_link_libraries(quda PRIVATE CCCL::CCCL)
 

From 55ee7cc06b476f76fbdca242df94a85bd6cb4b33 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 9 Jan 2026 14:53:07 -0800
Subject: [PATCH 084/121] Add some clarifying comments

---
 include/color_spinor_field_order.h   | 7 +++----
 include/kernels/dslash_staggered.cuh | 8 ++++----
 include/kernels/dslash_wilson.cuh    | 6 +++---
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index cba059e199..1d63a37900 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -1169,7 +1169,7 @@ namespace quda
       __device__ __host__ inline void prefetch(int x, int parity = 0) const
       {
         auto norm_offset = (volumeCB * 2 * Nc * Ns + parity * offset) * sizeof(Float) / sizeof(norm_t);
-        if constexpr (isFixed<Float>::value) prefetch_cache_line(reinterpret_cast<norm_t*>(field) + (x + norm_offset));
+        if constexpr (isFixed<Float>::value) prefetch_cache_line(reinterpret_cast<norm_t *>(field) + (x + norm_offset));
 
 #pragma unroll
         for (int i = 0; i < M; i++) prefetch_cache_line(field + (parity * offset + (volumeCB * i + x) * N));
@@ -1195,11 +1195,10 @@ namespace quda
           norm_t max_[length / 2];
           // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued)
 #pragma unroll
-          for (int i = 0; i < length / 2; i++)
-            max_[i] = fmaxf(fabsf((norm_t)v[i]), fabsf((norm_t)v[i + length / 2]));
+          for (int i = 0; i < length / 2; i++) max_[i] = fmaxf(fabsf((norm_t)v[i]), fabsf((norm_t)v[i + length / 2]));
 #pragma unroll
           for (int i = 0; i < length / 2; i++) scale = fmaxf(max_[i], scale);
-          reinterpret_cast<norm_t*>(field)[x + norm_offset] = scale * fixedInvMaxValue<Float>::value;
+          reinterpret_cast<norm_t *>(field)[x + norm_offset] = scale * fixedInvMaxValue<Float>::value;
           scale_inv = fdividef(fixedMaxValue<Float>::value, scale);
         }
 
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index be5ac3d1cb..1e130777a8 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -187,7 +187,7 @@ namespace quda
               out[s] = mv_add(U, in, out[s]);
             }
           }
-          prefetch(d, 0, 0, coord, coord1, parity, arg);
+          prefetch(d, 0, 0, coord, coord1, parity, arg); // prefetch the gauge link Arg::prefetch_distance ahead
         }
       }
 
@@ -215,7 +215,7 @@ namespace quda
               out[s] = mv_add(L, in, out[s]);
             }
           }
-          prefetch(d, 0, 1, coord, coord1, parity, arg);
+          prefetch(d, 0, 1, coord, coord1, parity, arg); // prefetch the gauge link Arg::prefetch_distance ahead
         }
       }
 
@@ -257,7 +257,7 @@ namespace quda
               out[s] = mv_sub(conj(U), in, out[s]);
             }
           }
-          prefetch(d, 1, 0, coord, coord1, parity, arg);
+          prefetch(d, 1, 0, coord, coord1, parity, arg); // prefetch the gauge link Arg::prefetch_distance ahead
         }
       }
 
@@ -294,7 +294,7 @@ namespace quda
               out[s] = mv_sub(conj(L), in, out[s]);
             }
           }
-          prefetch(d, 1, 1, coord, coord1, parity, arg);
+          prefetch(d, 1, 1, coord, coord1, parity, arg); // prefetch the gauge link Arg::prefetch_distance ahead
         }
       }
     } // nDim
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index 37b1e28461..6157b20af7 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -164,7 +164,7 @@ namespace quda
             out += fwd_coeff * (U * in.project(d, proj_dir)).reconstruct(d, proj_dir);
           }
 
-          prefetch(d, 0, coord, parity, arg);
+          prefetch(d, 0, coord, parity, arg); // prefetch the gauge link Arg::prefetch_distance ahead
         }
       }
 
@@ -199,7 +199,7 @@ namespace quda
           out += bwd_coeff * (conj(U) * in).reconstruct(d, proj_dir);
         }
 
-        if (doBulk<kernel_type>()) {
+        if constexpr (doBulk<kernel_type>()) {
           if (!ghost) {
 #ifdef QUDA_DSLASH_DOUBLE_STORE
             Link U = arg.Uback(d, gauge_idx, gauge_parity);
@@ -210,7 +210,7 @@ namespace quda
             out += bwd_coeff * (conj(U) * in.project(d, proj_dir)).reconstruct(d, proj_dir);
           }
 
-          prefetch(d, 1, coord, parity, arg);
+          prefetch(d, 1, coord, parity, arg); // prefetch the gauge link Arg::prefetch_distance ahead
         }
       }
     } // nDim

From 9d2175203c8579f40b86a9f98cd1ec64a8657447 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 9 Jan 2026 15:23:50 -0800
Subject: [PATCH 085/121] Fix compiler warning in domain_decomposition.h

---
 include/domain_decomposition.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/domain_decomposition.h b/include/domain_decomposition.h
index 24e653ac37..8ada3ae905 100644
--- a/include/domain_decomposition.h
+++ b/include/domain_decomposition.h
@@ -39,8 +39,7 @@ namespace quda
       flags[(int)flag] = true;
 
       if ((int)flag == (int)DD::reset) {
-#pragma unroll
-        for (auto i = 0u; i < (int)DD::size; i++) flags[i] = 0;
+        flags = {};
         type = QUDA_DD_NO;
       } else if ((int)flag >= (int)DD::red_black_type) {
         type = QUDA_DD_RED_BLACK;

From 8d04ac1d22995a06db00e358cb91aa5b64b49e54 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 9 Jan 2026 16:17:27 -0800
Subject: [PATCH 086/121] Add prefetching support for native staggered

---
 include/kernels/dslash_staggered.cuh | 23 +++++++++++++++++------
 include/kernels/dslash_wilson.cuh    |  2 +-
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 1e130777a8..957df948dc 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -94,14 +94,14 @@ namespace quda
   __device__ __host__ void prefetch(int dim, int dir, int hop, const coord_t &coord, const coord_t &coord1, int parity,
                                     const Arg &arg)
   {
-    if constexpr (Arg::improved) {
-      int step = 4 * dim + 2 * dir + hop + distance;
-      if (step >= 16) return;
+    int step = 4 * dim + 2 * dir + hop + distance; // position, `distance` steps ahead, in the per-site link sequence
+    if (step >= (Arg::improved ? 16 : 8)) return; // parentheses required: `>=` binds tighter than `?:`
 
-      // if using a bulk prefetch we need to use block's first coordinate
-      auto x_cb = (prefetch_type == 1 || prefetch_type == 2) ? coord.x_cb_0 : coord.x_cb;
-      x_cb = (Arg::nDim == 5 ? x_cb % arg.dc.volume_4d_cb : x_cb);
+    // if using a bulk prefetch we need to use block's first coordinate
+    auto x_cb = (prefetch_type == 1 || prefetch_type == 2) ? coord.x_cb_0 : coord.x_cb;
+    x_cb = (Arg::nDim == 5 ? x_cb % arg.dc.volume_4d_cb : x_cb);
 
+    if constexpr (Arg::improved) {
       int dim2 = step / 4;
       switch (step % 4) {
       case 0: arg.U.template prefetch<prefetch_type>(x_cb, dim2, parity); break;
@@ -119,6 +119,17 @@ namespace quda
           arg.L.template prefetch<prefetch_type>(getNeighborIndexCB<3>(coord, dim2, -1, arg.dc), dim2, 1 - parity);
         break;
       }
+    } else {
+      int dim2 = step / 2;
+      switch (step % 2) {
+      case 0: arg.U.template prefetch<prefetch_type>(x_cb, dim2, parity); break;
+      case 1:
+        if constexpr (dslash_double_store())
+          arg.Uback.template prefetch<prefetch_type>(x_cb, dim2, parity);
+        else
+          arg.U.template prefetch<prefetch_type>(getNeighborIndexCB<1>(coord1, dim2, -1, arg.dc), dim2, 1 - parity);
+        break;
+      }
     }
   }
 
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index 6157b20af7..d08f42908f 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -93,7 +93,7 @@ namespace quda
     switch (step % 2) {
     case 0: arg.U.template prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
     case 1:
-      if (dslash_double_store()) {
+      if constexpr (dslash_double_store()) {
         arg.Uback.template prefetch<Arg::prefetch_tma>(x_cb, dim2, parity);
       } else {
         int idx = getNeighborIndexCB(coord, dim2, -1, arg.dc);

From ca2a85a2a9e1ebd3dc15481a45151d5249ac0c23 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 13 Jan 2026 13:05:41 -0800
Subject: [PATCH 087/121] Remove stray debug asserts

---
 include/convert.h | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/include/convert.h b/include/convert.h
index 6c608117da..f56751873c 100644
--- a/include/convert.h
+++ b/include/convert.h
@@ -60,14 +60,6 @@ namespace quda
     }
   };
 
-  template <bool is_device> struct i2f_fma {
-    template <typename T> constexpr float operator()(int a, T, float b, float) { return static_cast<float>(a) * b; }
-    template <typename T> constexpr float2 operator()(int a1, int a2, T, float b, float)
-    {
-      return mul2(float2 {static_cast<float>(a1), static_cast<float>(a2)}, float2 {b, b});
-    }
-  };
-
   /**
      @brief This is a LUT which is used to determine whether a given
      int-to-float conversion in a array of numbers to be converted
@@ -104,7 +96,6 @@ namespace quda
         int32_t i = a + 0x4B400000;
         float f;
         memcpy(&f, &i, sizeof(int32_t));
-        assert(f - 12582912.0f == static_cast<float>(a));
         return f - 12582912.0f;
       }
     }
@@ -120,8 +111,6 @@ namespace quda
         int2 i = {a + 0x4B400000, b + 0x4B400000};
         float2 f;
         memcpy(&f, &i, sizeof(int2));
-        assert(f.x - 12582912.0f == static_cast<float>(a));
-        assert(f.y - 12582912.0f == static_cast<float>(b));
         return add2(f, {-12582912.0f, -12582912.0f});
       }
     }

From 0bc3ad35adde1819bbe0c703920158cfac9fa4b2 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 21 Jan 2026 15:54:53 -0800
Subject: [PATCH 088/121] Small clean up to tune_key

---
 include/tune_key.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/tune_key.h b/include/tune_key.h
index 28c6b7668b..0cbad5b01a 100644
--- a/include/tune_key.h
+++ b/include/tune_key.h
@@ -7,9 +7,9 @@ namespace quda {
 
   struct TuneKey {
 
-    static const int volume_n = 32;
-    static const int name_n = 512;
-    static const int aux_n = 256;
+    static constexpr int volume_n = 32;
+    static constexpr int name_n = 512;
+    static constexpr int aux_n = 256;
     char volume[volume_n];
     char name[name_n];
     char aux[aux_n];

From 44b90008ff0cc0be1d414cbecb79d5fd86fdcf73 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 21 Jan 2026 16:07:12 -0800
Subject: [PATCH 089/121] tensor descriptor cache should work as expected now

---
 lib/targets/cuda/tma_helper.cpp | 63 ++++++++++++++++++++++++++++-----
 1 file changed, 54 insertions(+), 9 deletions(-)

diff --git a/lib/targets/cuda/tma_helper.cpp b/lib/targets/cuda/tma_helper.cpp
index 67e5b34895..4df0f7c43b 100644
--- a/lib/targets/cuda/tma_helper.cpp
+++ b/lib/targets/cuda/tma_helper.cpp
@@ -1,6 +1,7 @@
+#include <map>
+#include <array>
 #include <cuda.h>
 #include <tma_helper.hpp>
-#include <map>
 
 #ifdef USE_TENSOR_MEMORY_ACCELERATOR
 
@@ -94,17 +95,61 @@ namespace quda
     return tensor;
   }
 
-  static std::map<int, gauge::tensor_desc_t> tensor_map;
+  struct tensor_key_t { // ordered key type of tensor_map, the cache of tensor descriptors
+    static constexpr std::size_t volume_n = 32; // byte capacity of `volume` (same value as TuneKey::volume_n)
+    static constexpr std::size_t aux_n = 256; // byte capacity of `aux` (same value as TuneKey::aux_n)
+
+    uint32_t block_size {}; // block size the descriptor was created for
+    std::array<char, volume_n> volume {}; // zero-filled
+    std::array<char, aux_n> aux {};       // zero-filled
+    void *ptr {}; // data pointer of the gauge field the descriptor refers to
+    // lexicographic order over (block_size, volume bytes, aux bytes, ptr); this is what std::map uses to sort keys
+    bool operator<(const tensor_key_t &other) const noexcept
+    {
+      if (block_size != other.block_size) return block_size < other.block_size;
+      int vc = std::memcmp(volume.data(), other.volume.data(), tensor_key_t::volume_n);
+      if (vc != 0) return vc < 0;
+      int ac = std::memcmp(aux.data(), other.aux.data(), tensor_key_t::aux_n);
+      if (ac != 0) return ac < 0;
+      // Required for strict weak ordering on arbitrary pointers
+      return std::less<void *> {}(ptr, other.ptr);
+    }
+    // stream the key in readable form; each buffer is written up to (not including) its first NUL byte
+    friend std::ostream &operator<<(std::ostream &os, const tensor_key_t &key)
+    {
+      auto print_buf = [&](auto const &buf) {
+        auto end = std::find(buf.begin(), buf.end(), '\0');
+        os.write(buf.data(), std::distance(buf.begin(), end));
+      };
+
+      os << "block_size=" << key.block_size << ", volume=\"";
+      print_buf(key.volume);
+      os << "\", aux=\"";
+      print_buf(key.aux);
+      os << "\", ptr=" << key.ptr;
+      return os;
+    }
+  };
+
+  static std::map<tensor_key_t, gauge::tensor_desc_t> tensor_map;
 
   gauge::tensor_desc_t &get_tensor_descriptor(const GaugeField &u, uint32_t block_size)
   {
-    auto tensor = tensor_map.find(block_size);
-    if (tensor != tensor_map.end()) {
-      return tensor->second;
-    } else {
-      tensor_map[block_size] = create_descriptor(u, block_size);
-    }
-    return tensor_map[block_size];
+    tensor_key_t key {}; // zero-inits arrays + ptr
+    key.block_size = block_size;
+    key.ptr = u.data();
+    // clamp the copy lengths to the key buffers: longer strings are truncated, shorter ones stay zero-padded
+    const std::size_t vlen = std::min(u.VolString().size(), tensor_key_t::volume_n);
+    const std::size_t alen = std::min(u.AuxString().size(), tensor_key_t::aux_n);
+
+    std::memcpy(key.volume.data(), u.VolString().data(), vlen);
+    std::memcpy(key.aux.data(), u.AuxString().data(), alen);
+    // return the cached descriptor if this (block_size, volume, aux, ptr) combination has been seen before
+    auto it = tensor_map.find(key);
+    if (it != tensor_map.end()) return it->second;
+    // otherwise create the descriptor, store it in the cache and return a reference to the stored copy
+    auto [ins_it, inserted] = tensor_map.emplace(key, create_descriptor(u, block_size));
+    return ins_it->second;
   }
 
 } // namespace quda

From 96a39129611785acd35d58632e509f23b42422df Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 21 Jan 2026 16:08:07 -0800
Subject: [PATCH 090/121] CMake will error out if TMA prefetch is requested but
 double-store is not enabled

---
 CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 90947f3406..9115ac0f12 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -158,6 +158,9 @@ mark_as_advanced(QUDA_DSLASH_DOUBLE_STORE)
 set(QUDA_DSLASH_PREFETCH_TMA "0" CACHE STRING "enable TMA prefetching (Hopper+, 0 - disable, 1 - bulk, 2 - tensor)")
 set_property(CACHE QUDA_DSLASH_PREFETCH_TMA PROPERTY STRINGS 0 1 2)
 mark_as_advanced(QUDA_DSLASH_PREFETCH_TMA)
+if(QUDA_DSLASH_PREFETCH_TMA GREATER 0 AND NOT QUDA_DSLASH_DOUBLE_STORE)
+  message(SEND_ERROR "QUDA_DSLASH_PREFETCH_TMA cannot be enabled without QUDA_DSLASH_DOUBLE_STORE")
+endif()
 
 set(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON "0" CACHE STRING "set prefetch distance for Wilson-like fermions")
 set(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED "0" CACHE STRING "set prefetch distance for staggered-like fermions")

From 6360e16ba72bf15151a7f061ca90f73fc2cdc779 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 26 Jan 2026 16:09:28 -0800
Subject: [PATCH 091/121] Small cleanup to Wilson dslash

---
 include/kernels/dslash_wilson.cuh | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index d08f42908f..86f2a5d933 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -172,11 +172,8 @@ namespace quda
       if (arg.dd_in.doHopping(coord, d, -1)) {
         const real bwd_coeff = (d < 3) ? 1.0 : bwd_coeff_3;
         const int back_idx = getNeighborIndexCB(coord, d, -1, arg.dc);
-#ifdef QUDA_DSLASH_DOUBLE_STORE
-        const int gauge_idx = (Arg::nDim == 5 ? coord.x_cb % arg.dc.volume_4d_cb : coord.x_cb);
-#else
-        const int gauge_idx = (Arg::nDim == 5 ? back_idx % arg.dc.volume_4d_cb : back_idx);
-#endif
+        const int gauge_idx = dslash_double_store() ? coord.x_cb : back_idx;
+        if constexpr (Arg::nDim == 5) gauge_idx = gauge_idx % arg.dc.volume_4d_cb;
         constexpr int proj_dir = dagger ? -1 : +1;
 
         const bool ghost = coord.in_boundary[0][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
@@ -188,11 +185,8 @@ namespace quda
             idx;
 
           const int gauge_ghost_idx = (Arg::nDim == 5 ? ghost_idx % arg.dc.ghostFaceCB[d] : ghost_idx);
-#ifdef QUDA_DSLASH_DOUBLE_STORE
-          Link U = arg.Uback(d, gauge_idx, gauge_parity);
-#else
-          Link U = arg.U.Ghost(d, gauge_ghost_idx, 1 - gauge_parity);
-#endif
+          Link U = dslash_double_store() ? static_cast<const Link&>(arg.Uback(d, gauge_idx, gauge_parity)) :
+                                           static_cast<const Link &>(arg.U.Ghost(d, gauge_ghost_idx, 1 - gauge_parity));
           HalfVector in = arg.halo.Ghost(d, 0, ghost_idx + (src_idx * arg.Ls + coord.s) * arg.dc.ghostFaceCB[d],
                                          their_spinor_parity);
 
@@ -201,11 +195,8 @@ namespace quda
 
         if constexpr (doBulk<kernel_type>()) {
           if (!ghost) {
-#ifdef QUDA_DSLASH_DOUBLE_STORE
-            Link U = arg.Uback(d, gauge_idx, gauge_parity);
-#else
-            Link U = arg.U(d, gauge_idx, 1 - gauge_parity);
-#endif
+            Link U = dslash_double_store() ? static_cast<const Link &>(arg.Uback(d, gauge_idx, gauge_parity)) :
+                                             static_cast<const Link &>(arg.U(d, gauge_idx, 1 - gauge_parity));
             Vector in = arg.in[src_idx](back_idx + coord.s * arg.dc.volume_4d_cb, their_spinor_parity);
             out += bwd_coeff * (conj(U) * in.project(d, proj_dir)).reconstruct(d, proj_dir);
           }

From 48e870b1dc22572cc30f07a67641176374ecd2cf Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 27 Jan 2026 11:19:44 -0800
Subject: [PATCH 092/121] indexFromFaceIndexStaggered should not be constexpr

---
 include/index_helper.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/index_helper.cuh b/include/index_helper.cuh
index 425f5ec690..7eff25a882 100644
--- a/include/index_helper.cuh
+++ b/include/index_helper.cuh
@@ -807,7 +807,7 @@ namespace quda {
   // int idx = indexFromFaceIndex<4,QUDA_4D_PC,dim,nFace,0>(ghost_idx, parity, arg);
 
   template <int nDim, typename Arg>
-  constexpr int indexFromFaceIndexStaggered(int dim, int face_num, int face_idx_in, int parity, int nLayers, QudaPCType, const Arg &arg)
+  __host__ __device__ inline int indexFromFaceIndexStaggered(int dim, int face_num, int face_idx_in, int parity, int nLayers, QudaPCType, const Arg &arg)
   {
     const auto *X = arg.dc.X;            // grid dimension
     const auto &V4 = arg.dc.volume_4d;   // 4-d volume
@@ -866,7 +866,7 @@ namespace quda {
   }
 
   template <int nDim, QudaPCType type, int dim, int nLayers, int face_num, typename Arg>
-  constexpr int indexFromFaceIndexStaggered(int face_idx_in, int parity, const Arg &arg)
+  __host__ __device__ int indexFromFaceIndexStaggered(int face_idx_in, int parity, const Arg &arg)
   {
     return indexFromFaceIndexStaggered<nDim>(dim, face_num, face_idx_in, parity, nLayers, type, arg);
   }

From 051dd43d738bb1f7cf75fb850815cecde14925db Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 27 Jan 2026 15:33:34 -0800
Subject: [PATCH 093/121] Fix compilation issue tripping up some CI

---
 include/kernels/dslash_wilson.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index 86f2a5d933..fa5821c46a 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -172,7 +172,7 @@ namespace quda
       if (arg.dd_in.doHopping(coord, d, -1)) {
         const real bwd_coeff = (d < 3) ? 1.0 : bwd_coeff_3;
         const int back_idx = getNeighborIndexCB(coord, d, -1, arg.dc);
-        const int gauge_idx = dslash_double_store() ? coord.x_cb : back_idx;
+        int gauge_idx = dslash_double_store() ? coord.x_cb : back_idx;
         if constexpr (Arg::nDim == 5) gauge_idx = gauge_idx % arg.dc.volume_4d_cb;
         constexpr int proj_dir = dagger ? -1 : +1;
 

From 16a787c35080a117efa93c5ea2b745b975bb78af Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 27 Jan 2026 15:52:44 -0800
Subject: [PATCH 094/121] Add 2-d TMA prefetch accessors

---
 include/targets/cuda/inline_ptx.h    | 6 ++++++
 include/targets/cuda/load_store.h    | 6 ++++++
 include/targets/generic/load_store.h | 9 +++++++++
 3 files changed, 21 insertions(+)

diff --git a/include/targets/cuda/inline_ptx.h b/include/targets/cuda/inline_ptx.h
index 0fe9d743d3..2ad01cd3ac 100644
--- a/include/targets/cuda/inline_ptx.h
+++ b/include/targets/cuda/inline_ptx.h
@@ -493,6 +493,12 @@ namespace quda {
 
   using tensor_desc_t = CUtensorMap;
 
+  __device__ __forceinline__ void prefetch_tma_2d(const CUtensorMap &tensor_map, int x, int y)
+  { // issues a 2-d tensor-map tile prefetch into L2 at tile coordinates (x, y); two-coordinate sibling of prefetch_tma_3d
+    asm volatile("cp.async.bulk.prefetch.tensor.2d.L2.global.tile [%0, {%1, %2}];" ::"l"(&tensor_map), "r"(x), "r"(y)
+                 : "memory");
+  }
+
   __device__ __forceinline__ void prefetch_tma_3d(const CUtensorMap &tensor_map, int x, int y, int z)
   {
     asm volatile("cp.async.bulk.prefetch.tensor.3d.L2.global.tile [%0, {%1, %2, %3}];" ::"l"(&tensor_map), "r"(x),
diff --git a/include/targets/cuda/load_store.h b/include/targets/cuda/load_store.h
index 4b11df34b3..c441ba9232 100644
--- a/include/targets/cuda/load_store.h
+++ b/include/targets/cuda/load_store.h
@@ -180,6 +180,7 @@ namespace quda
 
   // pre-declaration of the prefetch_cache that we wish to specialize
   template <bool> struct prefetch_cache_bulk_imp;
+  template <bool> struct prefetch_cache_tensor_2d_imp;
   template <bool> struct prefetch_cache_tensor_3d_imp;
   template <bool> struct prefetch_cache_tensor_4d_imp;
   template <bool> struct prefetch_cache_tensor_5d_imp;
@@ -190,6 +191,11 @@ namespace quda
     __device__ inline void operator()(const void *p, size_t bytes) { prefetch_tma(p, bytes); }
   };
 
+  // CUDA specialization of the prefetch_cache_tensor_2d that uses TMA (requires Hopper+)
+  template <> struct prefetch_cache_tensor_2d_imp<true> {
+    __device__ inline void operator()(const tma_descriptor_t &desc, int x, int y) { prefetch_tma_2d(desc.map, x, y); }
+  };
+
   // CUDA specialization of the prefetch_cache_tensor_3d that uses TMA (requires Hopper+)
   template <> struct prefetch_cache_tensor_3d_imp<true> {
     __device__ inline void operator()(const tma_descriptor_t &desc, int x, int y, int z)
diff --git a/include/targets/generic/load_store.h b/include/targets/generic/load_store.h
index 9c4d263b11..80c89439b9 100644
--- a/include/targets/generic/load_store.h
+++ b/include/targets/generic/load_store.h
@@ -101,6 +101,15 @@ namespace quda
     target::dispatch<prefetch_cache_bulk_imp>(p, bytes);
   }
 
+  template <bool is_device> struct prefetch_cache_tensor_2d_imp { // generic fallback for the 2-d tensor prefetch
+    constexpr void operator()(const tma_descriptor_t &, int, int) { } // no-op: all arguments are ignored
+  };
+
+  __device__ __host__ inline void prefetch_cache_tensor_2d(const tma_descriptor_t &desc, int x, int y)
+  { // 2-d counterpart of the 3-d/4-d/5-d tensor prefetch entry points: hands (desc, x, y) to the _imp functor
+    target::dispatch<prefetch_cache_tensor_2d_imp>(desc, x, y);
+  }
+
   template <bool is_device> struct prefetch_cache_tensor_3d_imp {
     constexpr void operator()(const tma_descriptor_t &, int, int, int) { }
   };

From 32fd0c3c9f2a3dc992d58e68576163dc9b95eba2 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 27 Jan 2026 16:55:40 -0800
Subject: [PATCH 095/121] Add run-time launch check when TMA is enabled to
 ensure parity is block separated

---
 include/dslash.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/dslash.h b/include/dslash.h
index ed2cc91655..a521ac9fab 100644
--- a/include/dslash.h
+++ b/include/dslash.h
@@ -288,6 +288,7 @@ namespace quda
     inline void launch(TuneParam &tp, const qudaStream_t &stream)
     {
       tp.set_max_shared_bytes = true;
+      if (QUDA_DSLASH_PREFETCH_TMA > 0 && tp.block.z > 1) errorQuda("Z-dimension block size must be 1 when using TMA");
       launch_device<dslash_functor>(
         tp, stream, dslash_functor_arg<D, P, dagger, xpay, kernel_type, Arg>(arg, tp.block.x * tp.grid.x));
     }

From 9fb3260d25bd08eedaa465148e38938d916ab390 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 28 Jan 2026 10:08:32 -0800
Subject: [PATCH 096/121] Cleanup of staggered dslash kernel

---
 include/kernels/dslash_staggered.cuh | 46 +++++++++-------------------
 1 file changed, 14 insertions(+), 32 deletions(-)

diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 957df948dc..11490f9e4f 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -1,12 +1,10 @@
 #pragma once
 
-#include <dslash_helper.cuh>
 #include <color_spinor_field_order.h>
 #include <gauge_field_order.h>
 #include <color_spinor.h>
 #include <dslash_helper.cuh>
-#include <index_helper.cuh>
-#include <kernels/dslash_pack.cuh> // forthe packing kernel
+#include <kernels/dslash_pack.cuh> // for the packing kernel
 
 namespace quda
 {
@@ -179,7 +177,7 @@ namespace quda
 
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dc.X, d, 1);
-          const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) : arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg));
+          const Link U = arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg));
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
             Vector in = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.dc.ghostFaceCB[d], their_spinor_parity);
@@ -190,8 +188,7 @@ namespace quda
         if constexpr (doBulk<kernel_type>()) {
           if (!ghost) {
             const int fwd_idx = getNeighborIndexCB<1>(coord1, d, 1, arg.dc);
-            const Link U = arg.improved ? arg.U(d, coord.x_cb, parity) :
-                                          arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg));
+            const Link U = arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg));
 #pragma unroll
             for (auto s = 0; s < n_src_tile; s++) {
               Vector in = arg.in[src_idx + s](fwd_idx, their_spinor_parity);
@@ -237,13 +234,9 @@ namespace quda
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx2 = ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 1);
           const int ghost_idx = arg.improved ? ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 3) : ghost_idx2;
-#ifdef QUDA_DSLASH_DOUBLE_STORE
-          const Link U = arg.improved ? arg.Uback(d, coord.x_cb, parity) :
-                                        arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg));
-#else
-          const Link U = arg.improved ? arg.U.Ghost(d, ghost_idx2, 1 - parity) :
-            arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg));
-#endif
+          const Link U = dslash_double_store() ?
+            static_cast<const Link>(arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg))) :
+            static_cast<const Link>(arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg)));
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
             Vector in = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.dc.ghostFaceCB[d], their_spinor_parity);
@@ -254,14 +247,9 @@ namespace quda
         if constexpr (doBulk<kernel_type>()) {
           if (!ghost) {
             const int back_idx = getNeighborIndexCB<1>(coord1, d, -1, arg.dc);
-#ifdef QUDA_DSLASH_DOUBLE_STORE
-            const Link U = arg.improved ? arg.Uback(d, coord.x_cb, parity) :
-                                          arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg));
-#else
-            const int gauge_idx = back_idx;
-            const Link U = arg.improved ? arg.U(d, gauge_idx, 1 - parity) :
-                                          arg.U(d, gauge_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg));
-#endif
+            const Link U = dslash_double_store() ?
+              static_cast<const Link>(arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg))) :
+              static_cast<const Link>(arg.U(d, back_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg)));
 #pragma unroll
             for (auto s = 0; s < n_src_tile; s++) {
               Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity);
@@ -277,11 +265,8 @@ namespace quda
         const bool ghost = coord.in_boundary[0][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 1);
-#ifdef QUDA_DSLASH_DOUBLE_STORE
-          const Link L = arg.Lback(d, coord.x_cb, parity);
-#else
-          const Link L = arg.L.Ghost(d, ghost_idx, 1 - parity);
-#endif
+          const Link L = dslash_double_store() ? static_cast<const Link>(arg.Lback(d, coord.x_cb, parity)) :
+                                                 static_cast<const Link>(arg.L.Ghost(d, ghost_idx, 1 - parity));
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
             const Vector in
@@ -293,12 +278,8 @@ namespace quda
         if constexpr (doBulk<kernel_type>()) {
           if (!ghost) {
             const int back3_idx = getNeighborIndexCB<3>(coord, d, -1, arg.dc);
-#ifdef QUDA_DSLASH_DOUBLE_STORE
-            const Link L = arg.Lback(d, coord.x_cb, parity);
-#else
-            const int gauge_idx = back3_idx;
-            const Link L = arg.L(d, gauge_idx, 1 - parity);
-#endif
+            const Link L = dslash_double_store() ? static_cast<const Link>(arg.Lback(d, coord.x_cb, parity)) :
+                                                   static_cast<const Link>(arg.L(d, back3_idx, 1 - parity));
 #pragma unroll
             for (auto s = 0; s < n_src_tile; s++) {
               const Vector in = arg.in[src_idx + s](back3_idx, their_spinor_parity);
@@ -308,6 +289,7 @@ namespace quda
           prefetch(d, 1, 1, coord, coord1, parity, arg); // prefetch the gauge link Arg::prefetch_distance ahead
         }
       }
+
     } // nDim
   }
 

From cc6e837566a0f0f3518e8153080844d927457f0e Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 28 Jan 2026 14:14:35 -0800
Subject: [PATCH 097/121] Add FloatNOrder raw_load and raw_save functions

---
 include/gauge_field_order.h | 42 +++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index b0011b87a6..31ee60ddd6 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1655,6 +1655,25 @@ namespace quda {
         reconstruct.Unpack(v, tmp, x, dir, phase, X, R);
       }
 
+      __device__ __host__ inline void raw_load(array<store_t, reconLen> &v, int x, int dir, int parity) const
+      { // copies the stored elements of link (x, dir, parity) into v byte-for-byte; no Unpack step is applied here
+#pragma unroll
+        for (int i = 0; i < M; i++) {
+          // vector-load the i-th chunk of N elements, then byte-copy it into v[i * N ...]
+          auto vecTmp = vector_load<Float, N>(gauge, parity * offset + dir * (M * N + Nrem) * stride, i * stride + x);
+          memcpy(&v[i * N], &vecTmp, sizeof(vecTmp));
+        }
+
+        // now load any remainder (the Nrem elements that follow the M full chunks)
+        if constexpr (Nrem > 0) {
+          auto vecTmp = vector_load<Float, Nrem>(gauge, parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x);
+          memcpy(&v[M * N], &vecTmp, sizeof(vecTmp));
+        }
+        // NOTE(review): raw_save gates its phase copy on hasPhase rather than loadPhase -- confirm the asymmetry
+        if constexpr (loadPhase)
+          memcpy(&v[M * N + Nrem], &gauge[parity * offset + phaseOffset + stride * dir + x], sizeof(store_t));
+      }
+
       template <int type> __device__ inline void prefetch(int x, int dir, int parity, int block_size = 0) const
       {
         if constexpr (type == 0) { // use per-thread prefetching
@@ -1732,6 +1751,29 @@ namespace quda {
         }
       }
 
+      __device__ __host__ inline void raw_save(const array<store_t, reconLen> &v, int x, int dir, int parity) const
+      { // writes the elements of v to link (x, dir, parity) byte-for-byte; mirror image of raw_load
+#pragma unroll
+        for (int i = 0; i < M; i++) {
+          array<Float, N> vecTmp;
+          // first byte-copy the i-th chunk of N elements out of v (plain memcpy, no value conversion)
+          memcpy(&vecTmp, &v[i * N], sizeof(vecTmp));
+          // second do vectorized copy into memory
+          vector_store(gauge, parity * offset + dir * (M * N + Nrem) * stride, x + i * stride, vecTmp);
+        }
+
+        // now save any remainder (the Nrem elements that follow the M full chunks)
+        if constexpr (Nrem > 0) {
+          array<Float, Nrem> vecTmp;
+          memcpy(&vecTmp, &v[M * N], sizeof(vecTmp));
+          // second do vectorized copy into memory
+          vector_store(gauge, parity * offset + (dir * (M * N + Nrem) + M * N) * stride, x, vecTmp);
+        }
+        // the phase, when present, is the single element of v that follows the M * N + Nrem packed elements
+        if constexpr (hasPhase)
+          memcpy(&gauge[parity * offset + phaseOffset + dir * stride + x], &v[M * N + Nrem], sizeof(store_t));
+      }
+
       /**
 	 @brief This accessor routine returns a gauge_wrapper to this object,
 	 allowing us to overload various operators for manipulating at

From 3229363d4df2d6e71b65ee656905ce3ae5dd6c10 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 28 Jan 2026 14:16:17 -0800
Subject: [PATCH 098/121] Gauge shift now operates on raw packed elements

---
 include/kernels/gauge_shift.cuh | 17 +++++++++--------
 lib/gauge_shift.cu              |  1 +
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/kernels/gauge_shift.cuh b/include/kernels/gauge_shift.cuh
index 81c79b540f..72b790f9f6 100644
--- a/include/kernels/gauge_shift.cuh
+++ b/include/kernels/gauge_shift.cuh
@@ -11,16 +11,17 @@ namespace quda
 
   template <typename store_t, int nColor, QudaReconstructType recon> struct GaugeShiftArg : kernel_param<> {
     using real = typename mapper<store_t>::type;
-    using Link = Matrix<complex<real>, nColor>;
+    using RawLink = array<store_t, recon>;
     using Gauge = typename gauge_mapper<store_t, recon>::type;
 
     int X[4]; // true grid dimensions
     Gauge out;
     const Gauge in;
     int shift;
+    int volume_cb;
 
     GaugeShiftArg(GaugeField &out, const GaugeField &in, int shift) :
-      kernel_param(dim3(in.VolumeCB(), 2, 4)), out(out), in(in), shift(shift)
+      kernel_param(dim3(in.VolumeCB(), 2, 4)), out(out), in(in), shift(shift), volume_cb(in.VolumeCB())
     {
       for (int dir = 0; dir < 4; dir++) X[dir] = in.X()[dir];
     }
@@ -33,21 +34,21 @@ namespace quda
 
     __device__ __host__ void operator()(int x_cb, int parity, int dir)
     {
-      using Link = typename Arg::Link;
-
       byte_array<int8_t, 4> x = {};
       getCoords(x, x_cb, arg.X, parity);
 
+      typename Arg::RawLink link;
+
       if (x[dir] < arg.shift && arg.comms_dim[dir] > 1) { // on the boundary so we need to fetch from the ghost zone
         const int ghost_idx = ghostFaceIndex<0, 4>(x, arg.X, dir, arg.shift);
-        Link U = arg.in.Ghost(dir, ghost_idx, 1 - parity);
-        arg.out(dir, x_cb, parity) = U;
+        arg.in.raw_load(link, arg.volume_cb + ghost_idx, dir, 1 - parity);
+        arg.out.raw_save(link, x_cb, dir, parity);
       } else { // simple shift
         byte_array<int8_t, 4> dx = {};
         dx[dir] = dx[dir] - arg.shift;
         int x_cb_back = linkIndexShift(x, dx, arg.X);
-        Link U = arg.in(dir, x_cb_back, 1 - parity);
-        arg.out(dir, x_cb, parity) = U;
+        arg.in.raw_load(link, x_cb_back, dir, 1 - parity);
+        arg.out.raw_save(link, x_cb, dir, parity);
       }
     }
   };
diff --git a/lib/gauge_shift.cu b/lib/gauge_shift.cu
index 1bf692a096..2991f445ea 100644
--- a/lib/gauge_shift.cu
+++ b/lib/gauge_shift.cu
@@ -62,6 +62,7 @@ namespace quda
     const_cast<double&>(out.LinkMax()) = in.LinkMax();
     instantiate<GaugeShifter>(out, in, shift);
     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
+    if (out.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) out.exchangeGhost();
     return out;
   }
 

From 35e734a8a2345fc70fc8fc32bbdac9e39c70c9ca Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 29 Jan 2026 10:33:04 -0800
Subject: [PATCH 099/121] Matrix::L1/L2/Linf methods should be const qualified

---
 include/quda_matrix.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/quda_matrix.h b/include/quda_matrix.h
index 8eb579dab3..3eef975308 100644
--- a/include/quda_matrix.h
+++ b/include/quda_matrix.h
@@ -103,7 +103,8 @@ namespace quda {
            the absolute column sums.
            @return Compute L1 norm
         */
-        __device__ __host__ inline real L1() {
+        __device__ __host__ inline real L1() const
+        {
           real l1 = 0;
 #pragma unroll
           for (int j=0; j<N; j++) {
@@ -122,7 +123,8 @@ namespace quda {
            Frobenius norm which is an upper bound on the L2 norm.
            @return Computed L2 norm
         */
-        __device__ __host__ inline real L2() {
+        __device__ __host__ inline real L2() const
+        {
           real l2 = 0;
 #pragma unroll
           for (int j=0; j<N; j++) {
@@ -139,7 +141,8 @@ namespace quda {
            the absolute row sums.
            @return Computed Linfinity norm
         */
-        __device__ __host__ inline real Linf() {
+        __device__ __host__ inline real Linf() const
+        {
           real linf = 0;
 #pragma unroll
           for (int i=0; i<N; i++) {

From a5055cc9750c5c4d0cf7763bd7609f21a759ee1e Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 29 Jan 2026 13:24:08 -0800
Subject: [PATCH 100/121] Fix printing bug with LatticeField

---
 lib/lattice_field.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/lattice_field.cpp b/lib/lattice_field.cpp
index 8c87cf8f4d..1bc10cd0f5 100644
--- a/lib/lattice_field.cpp
+++ b/lib/lattice_field.cpp
@@ -640,7 +640,7 @@ namespace quda {
     output << "localVolume = " << field.localVolume << std::endl;
     output << "localVolumeCB = " << field.localVolumeCB << std::endl;
     output << "stride = " << field.stride << std::endl;
-    output << "pad = " << field.stride << std::endl;
+    output << "pad = " << field.pad << std::endl;
     output << "total_bytes = " << field.total_bytes << std::endl;
     output << "nDim = " << field.nDim << std::endl;
     output << "x = " << field.x << std::endl;

From e38501a75ec2b26122f581760923e556e18cc106 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Thu, 29 Jan 2026 13:35:17 -0800
Subject: [PATCH 101/121] Add kernel_param::comms_dim_partitioned which mirrors
 comm_dim_partitioned to allow for kernels to query

---
 include/kernel_helper.h                 | 5 ++++-
 include/kernels/extract_gauge_ghost.cuh | 6 ++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/kernel_helper.h b/include/kernel_helper.h
index bf8fd17d2a..075295f9b6 100644
--- a/include/kernel_helper.h
+++ b/include/kernel_helper.h
@@ -26,6 +26,7 @@ namespace quda
     int comms_rank_global; /** per process value comm_rank_global() */
     int comms_coord[4];    /** array storing {comm_coord(0), ..., comm_coord(3)} */
     int comms_dim[4];      /** array storing {comm_dim(0), ..., comm_dim(3)} */
+    int comms_dim_partitioned[4]; /** array storing {comm_dim_partitioned(0), ..., comm_dim_partitioned(3)} */
 
     constexpr kernel_param() = default;
 
@@ -34,7 +35,9 @@ namespace quda
       comms_rank(comm_rank()),
       comms_rank_global(comm_rank_global()),
       comms_coord {comm_coord(0), comm_coord(1), comm_coord(2), comm_coord(3)},
-      comms_dim {comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3)}
+      comms_dim {comm_dim(0), comm_dim(1), comm_dim(2), comm_dim(3)},
+      comms_dim_partitioned {comm_dim_partitioned(0), comm_dim_partitioned(1), comm_dim_partitioned(2),
+                             comm_dim_partitioned(3)}
     {
     }
 
diff --git a/include/kernels/extract_gauge_ghost.cuh b/include/kernels/extract_gauge_ghost.cuh
index 42fef0b4ae..5ea2cdaa3f 100644
--- a/include/kernels/extract_gauge_ghost.cuh
+++ b/include/kernels/extract_gauge_ghost.cuh
@@ -24,7 +24,6 @@ namespace quda {
     int f[nDim][nDim];
     bool localParity[nDim];
     int faceVolumeCB[nDim];
-    int comm_dim[QUDA_MAX_DIM];
     const int offset;
     ExtractGhostArg(const GaugeField &u, Float **Ghost, int offset, uint64_t size) :
       kernel_param(dim3(size, 1, 1)),
@@ -34,7 +33,6 @@ namespace quda {
     {
       for (int d=0; d<nDim; d++) {
 	X[d] = u.X()[d];
-	comm_dim[d] = comm_dim_partitioned(d);
 	faceVolumeCB[d] = u.SurfaceCB(d)*u.Nface();
       }
 
@@ -79,7 +77,7 @@ namespace quda {
       int dim = parity_dim % Arg::nDim;
 
       // for now we never inject unless we have partitioned in that dimension
-      if (!arg.comm_dim[dim] && !Arg::extract) return;
+      if (!arg.comms_dim_partitioned[dim] && !Arg::extract) return;
 
       // linear index used for writing into ghost buffer
       if (X >= 2*arg.faceVolumeCB[dim]) return;
@@ -128,7 +126,7 @@ namespace quda {
       int dim = parity_dim % Arg::nDim;
 
       // for now we never inject unless we have partitioned in that dimension
-      if (!arg.comm_dim[dim] && !Arg::extract) return;
+      if (!arg.comms_dim_partitioned[dim] && !Arg::extract) return;
 
       // linear index used for writing into ghost buffer
       if (X >= 2*arg.faceVolumeCB[dim]) return;

From a8d4a0ac15e997f965afab2c76c5eff43c1695c3 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 30 Jan 2026 11:44:21 -0800
Subject: [PATCH 102/121] Gauge shift kernel now fills in the ghost region of
 the shifted field, placing the end face (which is otherwise lost) into the
 ghost

---
 include/kernels/gauge_shift.cuh | 60 ++++++++++++++++++++++++++-------
 lib/gauge_shift.cu              | 41 ++++++++++++----------
 2 files changed, 71 insertions(+), 30 deletions(-)

diff --git a/include/kernels/gauge_shift.cuh b/include/kernels/gauge_shift.cuh
index 72b790f9f6..f64134d09b 100644
--- a/include/kernels/gauge_shift.cuh
+++ b/include/kernels/gauge_shift.cuh
@@ -9,10 +9,13 @@
 namespace quda
 {
 
-  template <typename store_t, int nColor, QudaReconstructType recon> struct GaugeShiftArg : kernel_param<> {
+  template <typename store_t, int nColor, QudaReconstructType recon, bool verify_ = false>
+  struct GaugeShiftArg : kernel_param<> {
     using real = typename mapper<store_t>::type;
+    using Link = Matrix<complex<real>, nColor>;
     using RawLink = array<store_t, recon>;
     using Gauge = typename gauge_mapper<store_t, recon>::type;
+    static constexpr bool verify = verify_;
 
     int X[4]; // true grid dimensions
     Gauge out;
@@ -37,18 +40,49 @@ namespace quda
       byte_array<int8_t, 4> x = {};
       getCoords(x, x_cb, arg.X, parity);
 
-      typename Arg::RawLink link;
-
-      if (x[dir] < arg.shift && arg.comms_dim[dir] > 1) { // on the boundary so we need to fetch from the ghost zone
-        const int ghost_idx = ghostFaceIndex<0, 4>(x, arg.X, dir, arg.shift);
-        arg.in.raw_load(link, arg.volume_cb + ghost_idx, dir, 1 - parity);
-        arg.out.raw_save(link, x_cb, dir, parity);
-      } else { // simple shift
-        byte_array<int8_t, 4> dx = {};
-        dx[dir] = dx[dir] - arg.shift;
-        int x_cb_back = linkIndexShift(x, dx, arg.X);
-        arg.in.raw_load(link, x_cb_back, dir, 1 - parity);
-        arg.out.raw_save(link, x_cb, dir, parity);
+      if constexpr (!Arg::verify) {
+        typename Arg::RawLink link;
+        if (x[dir] < arg.shift
+            && arg.comms_dim_partitioned[dir]) { // on the boundary so we need to fetch from the ghost zone
+          const int ghost_idx = ghostFaceIndexStaggered<0>(x, arg.X, dir, arg.shift);
+          arg.in.raw_load(link, arg.volume_cb + ghost_idx, dir, 1 - parity);
+          arg.out.raw_save(link, x_cb, dir, parity);
+        } else { // simple shift
+          byte_array<int8_t, 4> dx = {};
+          dx[dir] = dx[dir] - arg.shift;
+          int x_cb_back = linkIndexShift(x, dx, arg.X);
+          arg.in.raw_load(link, x_cb_back, dir, 1 - parity);
+          arg.out.raw_save(link, x_cb, dir, parity);
+
+          if (x[dir] >= arg.X[dir] - arg.shift && arg.comms_dim_partitioned[dir]) { // write the ghost
+            const int ghost_idx = ghostFaceIndexStaggered<1>(x, arg.X, dir, arg.shift);
+            arg.in.raw_load(link, x_cb, dir, parity);
+            arg.out.raw_save(link, arg.volume_cb + ghost_idx, dir, 1 - parity);
+          }
+        }
+      } else {
+        // verify the shifting has worked
+        using Link = typename Arg::Link;
+        if (x[dir] < arg.shift && arg.comms_dim_partitioned[dir]) {
+          const int ghost_idx = ghostFaceIndexStaggered<0>(x, arg.X, dir, arg.shift);
+          Link in = arg.in(dir, arg.volume_cb + ghost_idx, 1 - parity);
+          Link out = arg.out(dir, x_cb, parity);
+          assert(in.L1() == out.L1());
+        } else {
+          byte_array<int8_t, 4> dx = {};
+          dx[dir] = dx[dir] - arg.shift;
+          int x_cb_back = linkIndexShift(x, dx, arg.X);
+          Link in = arg.in(dir, x_cb_back, 1 - parity);
+          Link out = arg.out(dir, x_cb, parity);
+          assert(in.L1() == out.L1());
+
+          if (x[dir] >= arg.X[dir] - arg.shift && arg.comms_dim_partitioned[dir]) {
+            const int ghost_idx = ghostFaceIndexStaggered<1>(x, arg.X, dir, arg.shift);
+            Link in = arg.in(dir, x_cb, parity);
+            Link out = arg.out.Ghost(dir, ghost_idx, 1 - parity);
+            assert(in.L1() == out.L1());
+          }
+        }
       }
     }
   };
diff --git a/lib/gauge_shift.cu b/lib/gauge_shift.cu
index 2991f445ea..db96c23718 100644
--- a/lib/gauge_shift.cu
+++ b/lib/gauge_shift.cu
@@ -11,41 +11,46 @@ namespace quda
     GaugeField &out;
     const GaugeField &in;
     int shift;
+    bool verify;
     unsigned int minThreads() const { return in.VolumeCB(); }
 
   public:
-    GaugeShifter(GaugeField &out, const GaugeField &in, int shift) :
-      TunableKernel3D(in, 2, 4), out(out), in(in), shift(shift)
+    GaugeShifter(GaugeField &out, const GaugeField &in, int shift, bool verify) :
+      TunableKernel3D(in, 2, 4), out(out), in(in), shift(shift), verify(verify)
     {
       assert(shift == 1 || shift == 3);
       strcat(aux, ",shift=");
       char shift_str[16];
       u32toa(shift_str, shift);
       strcat(aux, shift_str);
+      strcat(aux, verify ? ",verify" : "");
       apply(device::get_default_stream());
     }
 
-    void apply(const qudaStream_t &stream)
+    template <bool verify> void instantiate(TuneParam &tp, const qudaStream_t &stream)
     {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       if (in.Reconstruct() == QUDA_RECONSTRUCT_NO) {
-        GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_NO> arg(out, in, shift);
-        launch<GaugeShift>(tp, stream, arg);
+        launch<GaugeShift>(tp, stream, GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_NO, verify>(out, in, shift));
       } else if (in.Reconstruct() == QUDA_RECONSTRUCT_13) {
-        GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_13> arg(out, in, shift);
-        launch<GaugeShift>(tp, stream, arg);
+        launch<GaugeShift>(tp, stream, GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_13, verify>(out, in, shift));
       } else if (in.Reconstruct() == QUDA_RECONSTRUCT_12) {
-        GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_12> arg(out, in, shift);
-        launch<GaugeShift>(tp, stream, arg);
+        launch<GaugeShift>(tp, stream, GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_12, verify>(out, in, shift));
       } else if (in.Reconstruct() == QUDA_RECONSTRUCT_9) {
-        GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_9> arg(out, in, shift);
-        launch<GaugeShift>(tp, stream, arg);
+        launch<GaugeShift>(tp, stream, GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_9, verify>(out, in, shift));
       } else if (in.Reconstruct() == QUDA_RECONSTRUCT_8) {
-        GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_8> arg(out, in, shift);
-        launch<GaugeShift>(tp, stream, arg);
+        launch<GaugeShift>(tp, stream, GaugeShiftArg<Float, nColor, QUDA_RECONSTRUCT_8, verify>(out, in, shift));
       }
     }
 
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      if (verify)
+        instantiate<true>(tp, stream);
+      else
+        instantiate<false>(tp, stream);
+    }
+
     long long bytes() const { return out.Bytes() + in.Bytes(); }
   };
 
@@ -57,12 +62,14 @@ namespace quda
     if (in.GhostExchange() == QUDA_GHOST_EXCHANGE_NO && comm_partitioned())
       errorQuda("comm_dim_partition() == true requires we have GhostExchange = QUDA_GHOST_EXCHANGE_PAD");
     GaugeFieldParam param(in);
-    param.create = QUDA_NULL_FIELD_CREATE;
+    param.create = QUDA_ZERO_FIELD_CREATE;
     GaugeField out(param);
     const_cast<double&>(out.LinkMax()) = in.LinkMax();
-    instantiate<GaugeShifter>(out, in, shift);
+    instantiate<GaugeShifter>(out, in, shift, false);
+#if 0 // set to 1 to run verification
+    instantiate<GaugeShifter>(out, in, shift, true);
+#endif
     getProfile().TPSTOP(QUDA_PROFILE_COMPUTE);
-    if (out.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD) out.exchangeGhost();
     return out;
   }
 

From 73f46afc757ff6386e9f97519fbecf3e7de913cd Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 30 Jan 2026 12:14:11 -0800
Subject: [PATCH 103/121] When double-store is enabled, the halo update now
 always reads the gauge field from the ghost region - ensures coalesced
 access regardless of partitioning

---
 include/kernels/dslash_staggered.cuh | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 11490f9e4f..a54de238e9 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -177,7 +177,10 @@ namespace quda
 
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dc.X, d, 1);
-          const Link U = arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg));
+          const Link U = dslash_double_store() ?
+            static_cast<const Link>(arg.Uback.Ghost(d, ghost_idx, 1 - parity, StaggeredPhase(coord, d, +1, arg))) :
+            static_cast<const Link>(arg.U(d, coord.x_cb, parity, StaggeredPhase(coord, d, +1, arg)));
+
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
             Vector in = arg.halo.Ghost(d, 1, ghost_idx + (src_idx + s) * arg.dc.ghostFaceCB[d], their_spinor_parity);
@@ -204,7 +207,8 @@ namespace quda
         const bool ghost = coord.in_boundary[1][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dc.X, d, arg.nFace);
-          const Link L = arg.L(d, coord.x_cb, parity);
+          const Link L = dslash_double_store() ? static_cast<const Link>(arg.Lback.Ghost(d, ghost_idx, 1 - parity)) :
+                                                 static_cast<const Link>(arg.L(d, coord.x_cb, parity));
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
             const Vector in
@@ -234,9 +238,9 @@ namespace quda
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx2 = ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 1);
           const int ghost_idx = arg.improved ? ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 3) : ghost_idx2;
-          const Link U = dslash_double_store() ?
-            static_cast<const Link>(arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg))) :
-            static_cast<const Link>(arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg)));
+          const Link U
+            = static_cast<const Link>(arg.U.Ghost(d, ghost_idx2, 1 - parity, StaggeredPhase(coord, d, -1, arg)));
+
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
             Vector in = arg.halo.Ghost(d, 0, ghost_idx + (src_idx + s) * arg.dc.ghostFaceCB[d], their_spinor_parity);
@@ -250,6 +254,7 @@ namespace quda
             const Link U = dslash_double_store() ?
               static_cast<const Link>(arg.Uback(d, coord.x_cb, parity, StaggeredPhase(coord, d, -1, arg))) :
               static_cast<const Link>(arg.U(d, back_idx, 1 - parity, StaggeredPhase(coord, d, -1, arg)));
+
 #pragma unroll
             for (auto s = 0; s < n_src_tile; s++) {
               Vector in = arg.in[src_idx + s](back_idx, their_spinor_parity);
@@ -265,8 +270,7 @@ namespace quda
         const bool ghost = coord.in_boundary[0][d] & isActive<kernel_type>(active, thread_dim, d, coord, arg);
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 1);
-          const Link L = dslash_double_store() ? static_cast<const Link>(arg.Lback(d, coord.x_cb, parity)) :
-                                                 static_cast<const Link>(arg.L.Ghost(d, ghost_idx, 1 - parity));
+          const Link L = static_cast<const Link>(arg.L.Ghost(d, ghost_idx, 1 - parity));
 #pragma unroll
           for (auto s = 0; s < n_src_tile; s++) {
             const Vector in

From 37cfc7b7df7f2f80483a954845960003accd6ce1 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 30 Jan 2026 13:36:14 -0800
Subject: [PATCH 104/121] Fix bug with staggered dslash test where partitioning
 was being reset - comms partitioning was effectively disabled for testing

---
 tests/staggered_dslash_ctest.cpp    | 1 +
 tests/staggered_dslash_test_utils.h | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp
index cd8b670eef..9ab1d42547 100644
--- a/tests/staggered_dslash_ctest.cpp
+++ b/tests/staggered_dslash_ctest.cpp
@@ -80,6 +80,7 @@ class StaggeredDslashTest
   {
     if (skip()) GTEST_SKIP();
     dslash_test_wrapper.end();
+    commDimPartitionedReset();
   }
 
   static void SetUpTestCase() { initQuda(device_ordinal); }
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index e3b0455283..aa62b19995 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -316,7 +316,6 @@ struct StaggeredDslashTestWrapper {
     freeGaugeQuda();
     cpuFat = {};
     cpuLong = {};
-    commDimPartitionedReset();
   }
 
   static void destroy()

From e223bfa798734413507a1dcbba494fb90a9dce11 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 2 Feb 2026 23:02:43 -0800
Subject: [PATCH 105/121] Selecting the type of prefetching to use is now more
 verbose.

---
 CMakeLists.txt                         | 20 -------------
 include/dslash.h                       | 15 +++++-----
 include/dslash_helper.cuh              | 34 +++++++++++++++++++---
 include/gauge_field_order.h            | 10 ++++---
 include/kernels/dslash_staggered.cuh   |  8 +++---
 include/kernels/dslash_wilson.cuh      |  8 +++---
 include/quda_define.h.in               | 14 +++++----
 include/targets/cuda/tma_helper.hpp    |  2 ++
 include/targets/generic/tma_helper.hpp |  2 ++
 lib/targets/cuda/target_cuda.cmake     | 39 ++++++++++++++++++++++++++
 10 files changed, 104 insertions(+), 48 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9115ac0f12..cb1a3ef606 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -153,26 +153,6 @@ option(QUDA_DIRAC_COVDEV "build code for covariant derivative" ${QUDA_DIRAC_DEFA
 option(QUDA_DIRAC_DISTANCE_PRECONDITIONING "build code for distance preconditioned Wilson/clover Dirac operators" OFF)
 set(QUDA_DOMAIN_DECOMPOSITION "0" CACHE STRING "which domain decomposition to instantiate in QUDA (1-bit number - RedBlack)")
 
-option(QUDA_DSLASH_DOUBLE_STORE "store a forwards shifted copy of the gauge fields for simplified Dslash indexing" OFF)
-mark_as_advanced(QUDA_DSLASH_DOUBLE_STORE)
-set(QUDA_DSLASH_PREFETCH_TMA "0" CACHE STRING "enable TMA prefetching (Hopper+, 0 - disable, 1 - bulk, 2 - tensor)")
-set_property(CACHE QUDA_DSLASH_PREFETCH_TMA PROPERTY STRINGS 0 1 2)
-mark_as_advanced(QUDA_DSLASH_PREFETCH_TMA)
-if(QUDA_DSLASH_PREFETCH_TMA GREATER 0 AND NOT QUDA_DSLASH_DOUBLE_STORE)
-  message(SEND_ERROR "QUDA_DSLASH_PREFETCH_TMA cannot be enabled without QUDA_DSLASH_DOUBLE_STORE")
-endif()
-
-set(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON "0" CACHE STRING "set prefetch distance for Wilson-like fermions")
-set(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED "0" CACHE STRING "set prefetch distance for staggered-like fermions")
-mark_as_advanced(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON)
-mark_as_advanced(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED)
-if(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON GREATER 7)
-  message(SEND_ERROR "QUDA_DSLASH_PREFETCH_DISTANCE_WILSON is greater than pipeline length")
-endif()
-if(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED GREATER 15)
-  message(SEND_ERROR "QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED is greater than pipeline length")
-endif()
-
 option(QUDA_QIO "build QIO code for binary I/O" OFF)
 
 # Multi-GPU options
diff --git a/include/dslash.h b/include/dslash.h
index a521ac9fab..6ea801ce45 100644
--- a/include/dslash.h
+++ b/include/dslash.h
@@ -12,10 +12,6 @@
 namespace quda
 {
 
-#if defined(NVSHMEM_COMMS) && QUDA_DSLASH_PREFETCH_TMA > 0
-#error NVSHMEM cannot be used in combination with TMA prefetching at present
-#endif
-
   /**
      @brief This is the generic driver for launching Dslash kernels
      (the base kernel of which is defined in dslash_helper.cuh).  This
@@ -79,7 +75,12 @@ namespace quda
         strcat(aux_base, ",prefetch=");
         i32toa(tile_str, Arg::prefetch_distance);
         strcat(aux_base, tile_str);
-        if constexpr (Arg::prefetch_tma) strcat(aux_base, Arg::prefetch_tma == 1 ? ",tma=bulk" : ",tma=tensor");
+        if constexpr (dslash_prefetch_type() == PrefetchType::THREAD)
+          strcat(aux_base, ",prefetch=thread");
+        else if constexpr (dslash_prefetch_type() == PrefetchType::BULK)
+          strcat(aux_base, ",prefetch=bulk");
+        else if constexpr (dslash_prefetch_type() == PrefetchType::TENSOR)
+          strcat(aux_base, ",prefetch=tensor");
       }
     }
 
@@ -233,7 +234,7 @@ namespace quda
     virtual bool advanceBlockDim(TuneParam &param) const override
     {
       // if TMA is enabled we must keep parity separate in the block (2-d tuning)
-      if constexpr (QUDA_DSLASH_PREFETCH_TMA > 0)
+      if constexpr (dslash_prefetch_tma())
         return TunableKernel2D_base<false>::advanceBlockDim(param);
       else
         return TunableKernel3D::advanceBlockDim(param);
@@ -288,7 +289,7 @@ namespace quda
     inline void launch(TuneParam &tp, const qudaStream_t &stream)
     {
       tp.set_max_shared_bytes = true;
-      if (QUDA_DSLASH_PREFETCH_TMA > 0 && tp.block.z > 1) errorQuda("Z-dimension block size must be 1 when using TMA");
+      if (dslash_prefetch_tma() && tp.block.z > 1) errorQuda("Z-dimension block size must be 1 when using TMA");
       launch_device<dslash_functor>(
         tp, stream, dslash_functor_arg<D, P, dagger, xpay, kernel_type, Arg>(arg, tp.block.x * tp.grid.x));
     }
diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index b123b7bf85..32d44fa6d3 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -13,6 +13,7 @@
 #include <tune_quda.h>
 #include <domain_decomposition_helper.cuh>
 #include <kernel_ops.h>
+#include <tma_helper.hpp>
 
 constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::TRUE;
 
@@ -27,6 +28,33 @@ namespace quda
   constexpr bool dslash_double_store() { return false; }
 #endif
 
+  constexpr PrefetchType dslash_prefetch_type()
+  {
+#if defined(QUDA_DSLASH_PREFETCH_TYPE_NONE)
+    return PrefetchType::NONE;
+#elif defined(QUDA_DSLASH_PREFETCH_TYPE_THREAD)
+    return PrefetchType::THREAD;
+#elif defined(QUDA_DSLASH_PREFETCH_TYPE_BULK)
+    return PrefetchType::BULK;
+#elif defined(QUDA_DSLASH_PREFETCH_TYPE_TENSOR)
+    return PrefetchType::TENSOR;
+#else
+#error "Invalid or missing QUDA_DSLASH_PREFETCH_TYPE"
+#endif
+  }
+
+#if defined(NVSHMEM_COMMS) && (defined(QUDA_DSLASH_PREFETCH_TYPE_BULK) || defined(QUDA_DSLASH_PREFETCH_TYPE_TENSOR))
+#error NVSHMEM cannot be used in combination with TMA prefetching at present
+#endif
+
+  constexpr bool dslash_prefetch_tma()
+  {
+    return (dslash_prefetch_type() == PrefetchType::BULK || dslash_prefetch_type() == PrefetchType::TENSOR);
+  }
+
+  static_assert(!dslash_prefetch_tma() || dslash_double_store(),
+                "Cannot use TMA prefetching unless QUDA_DSLASH_DOUBLE_STORE is enabled");
+
   /**
      @brief Helper function to determine if we should do halo
      computation
@@ -307,9 +335,7 @@ namespace quda
     static constexpr int max_regs = 0;             // by default we don't limit register count
     static constexpr bool spill_shared = false;    // whether a given kernel should use shared memory spilling
     static constexpr int prefetch_distance = 0;    // whether we are using prefetching in the dslash
-    static constexpr int prefetch_tma = QUDA_DSLASH_PREFETCH_TMA;
-    static_assert(!prefetch_tma || dslash_double_store(),
-                  "Cannot use TMA prefetching unless QUDA_DSLASH_DOUBLE_STORE is enabled");
+    static constexpr PrefetchType prefetch_type = dslash_prefetch_type();
     const int parity;  // only use this for single parity fields
     const int nParity; // number of parities we're working on
     const QudaReconstructType reconstruct;
@@ -754,7 +780,7 @@ namespace quda
     {
       typename Arg::D dslash(*this);
 
-      if constexpr (QUDA_DSLASH_PREFETCH_TMA > 0) {
+      if constexpr (dslash_prefetch_tma()) {
         // FIXME need warp uniform parity which is not composable with
         // NVSHMEM since the latter requires blockDim.y and blockDim.z to
         // cover the entire extent
diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h
index 31ee60ddd6..938d0b4ea0 100644
--- a/include/gauge_field_order.h
+++ b/include/gauge_field_order.h
@@ -1674,9 +1674,9 @@ namespace quda {
           memcpy(&v[M * N + Nrem], &gauge[parity * offset + phaseOffset + stride * dir + x], sizeof(store_t));
       }
 
-      template <int type> __device__ inline void prefetch(int x, int dir, int parity, int block_size = 0) const
+      template <PrefetchType type> __device__ inline void prefetch(int x, int dir, int parity, int block_size = 0) const
       {
-        if constexpr (type == 0) { // use per-thread prefetching
+        if constexpr (type == PrefetchType::THREAD) { // use per-thread prefetching
 #pragma unroll
           for (int i = 0; i < M; i++)
             prefetch_cache_line(gauge + (parity * offset + dir * (M * N + Nrem) * stride + (i * stride + x) * N));
@@ -1686,7 +1686,7 @@ namespace quda {
             prefetch_cache_line(gauge + (parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem));
 
           if constexpr (loadPhase) prefetch_cache_line(gauge + (parity * offset + phaseOffset + stride * dir + x));
-        } else if constexpr (type == 1) { // bulk prefetch
+        } else if constexpr (type == PrefetchType::BULK) { // bulk prefetch
           if (block_size == 0) block_size = blockDim.x;
           if (target::is_thread_zero()) {
 #pragma unroll
@@ -1702,12 +1702,13 @@ namespace quda {
             if constexpr (loadPhase)
               prefetch_cache_bulk(gauge + (parity * offset + phaseOffset + stride * dir + x), block_size * sizeof(Float));
           }
-        } else if constexpr (type == 2) { // n-d tensor prefetch
+        } else if constexpr (type == PrefetchType::TENSOR) { // n-d tensor prefetch
           if (target::is_thread_zero()) {
             prefetch_cache_tensor_5d(tensor_desc.N, x, x / 16, 0, dir, parity);
             if constexpr (Nrem > 0) prefetch_cache_tensor_4d(tensor_desc.Nrem, x, x / 16, dir, parity);
             if constexpr (loadPhase) prefetch_cache_tensor_4d(tensor_desc.phase, x, x / 16, dir, parity);
           }
+#if 0 // L1 prefetching is a disabled experiment
         } else { // L1 prefetching
 #pragma unroll
           for (int i = 0; i < M; i++)
@@ -1718,6 +1719,7 @@ namespace quda {
             prefetch_L1_cache_line(gauge + (parity * offset + (dir * (M * N + Nrem) + M * N) * stride + x * Nrem));
 
           if constexpr (loadPhase) prefetch_L1_cache_line(gauge + (parity * offset + phaseOffset + stride * dir + x));
+#endif
         }
       }
 
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index a54de238e9..31a4b8974f 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -88,15 +88,15 @@ namespace quda
      @param[in] parity Partiry that we are working on
      @param[in] arg Paramter struct
    */
-  template <int prefetch_type, int distance, class coord_t, class Arg>
+  template <PrefetchType prefetch_type, int distance, class coord_t, class Arg>
   __device__ __host__ void prefetch(int dim, int dir, int hop, const coord_t &coord, const coord_t &coord1, int parity,
                                     const Arg &arg)
   {
     int step = 4 * dim + 2 * dir + hop + distance;
     if (step >= Arg::improved ? 16 : 8) return;
 
-    // if using a bulk prefetch we need to use block's first coordinate
-    auto x_cb = (prefetch_type == 1 || prefetch_type == 2) ? coord.x_cb_0 : coord.x_cb;
+    // if using a TMA prefetch we need to use block's first coordinate
+    auto x_cb = dslash_prefetch_tma() ? coord.x_cb_0 : coord.x_cb;
     x_cb = (Arg::nDim == 5 ? x_cb % arg.dc.volume_4d_cb : x_cb);
 
     if constexpr (Arg::improved) {
@@ -138,7 +138,7 @@ namespace quda
     if constexpr (Arg::prefetch_distance_l1 > 0) // L1 prefetch
       prefetch<3, Arg::prefetch_distance_l1>(dim, dir, hop, coord, coord1, parity, arg);
     if constexpr (Arg::prefetch_distance > 0) // L2 prefetch
-      prefetch<Arg::prefetch_tma, Arg::prefetch_distance>(dim, dir, hop, coord, coord1, parity, arg);
+      prefetch<Arg::prefetch_type, Arg::prefetch_distance>(dim, dir, hop, coord, coord1, parity, arg);
   };
 
   /**
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index fa5821c46a..80eb97fd91 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -87,17 +87,17 @@ namespace quda
     int dim2 = step / 2;
 
     // if using a bulk prefetch we need to use block's first coordinate
-    auto x_cb = Arg::prefetch_tma ? coord.x_cb_0 : coord.x_cb;
+    auto x_cb = dslash_prefetch_tma() ? coord.x_cb_0 : coord.x_cb;
     x_cb = (Arg::nDim == 5 ? x_cb % arg.dc.volume_4d_cb : x_cb);
 
     switch (step % 2) {
-    case 0: arg.U.template prefetch<Arg::prefetch_tma>(x_cb, dim2, parity); break;
+    case 0: arg.U.template prefetch<Arg::prefetch_type>(x_cb, dim2, parity); break;
     case 1:
       if constexpr (dslash_double_store()) {
-        arg.Uback.template prefetch<Arg::prefetch_tma>(x_cb, dim2, parity);
+        arg.Uback.template prefetch<Arg::prefetch_type>(x_cb, dim2, parity);
       } else {
         int idx = getNeighborIndexCB(coord, dim2, -1, arg.dc);
-        arg.U.template prefetch<Arg::prefetch_tma>(Arg::nDim == 5 ? idx % arg.dc.volume_4d_cb : idx, dim2, 1 - parity);
+        arg.U.template prefetch<Arg::prefetch_type>(Arg::nDim == 5 ? idx % arg.dc.volume_4d_cb : idx, dim2, 1 - parity);
       }
       break;
     }
diff --git a/include/quda_define.h.in b/include/quda_define.h.in
index e55dabb14a..98e9177557 100644
--- a/include/quda_define.h.in
+++ b/include/quda_define.h.in
@@ -176,11 +176,15 @@
 #cmakedefine QUDA_DSLASH_DOUBLE_STORE
 
 /**
- * @def QUDA_DSLASH_PREFETCH_TMA @brief This macro sets whether to use
- * the TMA for L2 prefetching: 0 - no TMA, 1 - use bulk prefetch, 2 -
- * use tensor prefetch
- */
-#define QUDA_DSLASH_PREFETCH_TMA @QUDA_DSLASH_PREFETCH_TMA@
+ * @def QUDA_DSLASH_PREFETCH_TYPE
+ * @brief This macro selects the type of L2 prefetching
+ * used by the dslash kernels:
+ * NONE - no prefetch
+ * THREAD - per thread prefetch
+ * BULK - TMA bulk prefetch
+ * TENSOR - TMA tensor descriptor prefetch
+ */
+#define QUDA_DSLASH_PREFETCH_TYPE_@QUDA_DSLASH_PREFETCH_TYPE@
 
 /**
  * @def QUDA_DSLASH_PREFETCH_DISTANCE_WILSON
diff --git a/include/targets/cuda/tma_helper.hpp b/include/targets/cuda/tma_helper.hpp
index 173bb89f8e..852a0a42cf 100644
--- a/include/targets/cuda/tma_helper.hpp
+++ b/include/targets/cuda/tma_helper.hpp
@@ -24,6 +24,8 @@ using barrier_t = cuda::barrier<cuda::thread_scope_block>;
 namespace quda
 {
 
+  enum class PrefetchType { NONE, THREAD, BULK, TENSOR };
+
   struct tma_descriptor_t {
     CUtensorMap map;
   };
diff --git a/include/targets/generic/tma_helper.hpp b/include/targets/generic/tma_helper.hpp
index 0acb5fe298..761768ce98 100644
--- a/include/targets/generic/tma_helper.hpp
+++ b/include/targets/generic/tma_helper.hpp
@@ -3,6 +3,8 @@
 namespace quda
 {
 
+  enum class PrefetchType { NONE, THREAD, BULK, TENSOR };
+
   struct tma_descriptor_t {
   };
 
diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake
index 3611ca5a36..5e5b7bc669 100644
--- a/lib/targets/cuda/target_cuda.cmake
+++ b/lib/targets/cuda/target_cuda.cmake
@@ -176,6 +176,45 @@ option(QUDA_SHARED_MEMORY_SPILL "enable shared memory spilling?" OFF)
 mark_as_advanced(QUDA_SHARED_MEMORY_SPILL)
 message(STATUS "Shared memory spilling: ${QUDA_SHARED_MEMORY_SPILL}")
 
+# Dslash prefetching
+option(QUDA_DSLASH_DOUBLE_STORE "store a forwards shifted copy of the gauge fields for simplified Dslash indexing" OFF)
+mark_as_advanced(QUDA_DSLASH_DOUBLE_STORE)
+message(STATUS "QUDA_DSLASH_DOUBLE_STORE: ${QUDA_DSLASH_DOUBLE_STORE}")
+
+set(QUDA_DSLASH_PREFETCH_TYPE "NONE" CACHE STRING "enable Dslash prefetching (NONE, THREAD, BULK, TENSOR)")
+set_property(CACHE QUDA_DSLASH_PREFETCH_TYPE PROPERTY STRINGS NONE THREAD BULK TENSOR)
+set(_valid_prefetch NONE THREAD BULK TENSOR)
+if(NOT QUDA_DSLASH_PREFETCH_TYPE IN_LIST _valid_prefetch)
+  message(FATAL_ERROR
+    "Invalid QUDA_DSLASH_PREFETCH_TYPE='${QUDA_DSLASH_PREFETCH_TYPE}'. "
+    "Allowed: ${_valid_prefetch}")
+endif()
+message(STATUS "QUDA_DSLASH_PREFETCH_TYPE: ${QUDA_DSLASH_PREFETCH_TYPE}")
+mark_as_advanced(QUDA_DSLASH_PREFETCH_TYPE)
+
+if(QUDA_DSLASH_PREFETCH_TYPE GREATER 0 AND NOT QUDA_DSLASH_DOUBLE_STORE)
+  message(SEND_ERROR "QUDA_DSLASH_PREFETCH_TYPE with TMA cannot be enabled without QUDA_DSLASH_DOUBLE_STORE=ON")
+endif()
+
+set(_tma_modes BULK TENSOR)
+if(QUDA_DSLASH_PREFETCH_TYPE IN_LIST _tma_modes AND
+   QUDA_COMPUTE_CAPABILITY LESS 90)
+  message(FATAL_ERROR
+    "QUDA_DSLASH_PREFETCH_TYPE=${QUDA_DSLASH_PREFETCH_TYPE} "
+    "requires QUDA_GPU_ARCH=sm_90 or newer")
+endif()
+
+set(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON "0" CACHE STRING "set prefetch distance for Wilson-like fermions")
+set(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED "0" CACHE STRING "set prefetch distance for staggered-like fermions")
+mark_as_advanced(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON)
+mark_as_advanced(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED)
+if(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON GREATER 7)
+  message(SEND_ERROR "QUDA_DSLASH_PREFETCH_DISTANCE_WILSON is greater than pipeline length")
+endif()
+if(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED GREATER 15)
+  message(SEND_ERROR "QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED is greater than pipeline length")
+endif()
+
 # QUDA_HASH for tunecache
 set(HASH cpu_arch=${CPU_ARCH},gpu_arch=${QUDA_GPU_ARCH},cuda_version=${CMAKE_CUDA_COMPILER_VERSION})
 set(GITVERSION "${PROJECT_VERSION}-${GITVERSION}-${QUDA_GPU_ARCH}")

From ea36ceddcad9474152f777c8b264b9efff0a0e91 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 2 Feb 2026 23:11:47 -0800
Subject: [PATCH 106/121] Runtime warning if dslash prefetch distance exceeds
 max for naive staggered

---
 include/kernels/dslash_staggered.cuh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 31a4b8974f..3e2f8d7df9 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -70,6 +70,9 @@ namespace quda
     is_last_time_slice(comm_coord(3) == comm_dim(3) - 1 ? true : false),
     dagger_scale(dagger ? static_cast<real>(-1.0) : static_cast<real>(1.0))
     {
+      if (!improved && prefetch_distance > 7)
+        warningQuda("dslash prefetch distance %d is greater than pipeline length for naive staggered", prefetch_distance);
+
       for (auto i = 0u; i < out.size(); i++) {
         this->out[i] = out[i];
         this->in[i] = in[i];

From 3b25ff53c1d11adb0871d758a159b4b0d4bd6498 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 2 Feb 2026 23:35:52 -0800
Subject: [PATCH 107/121] Fix ROCm compilation

---
 include/dslash_helper.cuh | 1 +
 lib/CMakeLists.txt        | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index 32d44fa6d3..02d2fe2f6c 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -41,6 +41,7 @@ namespace quda
 #else
 #error "Invalid or missing QUDA_DSLASH_PREFETCH_TYPE"
 #endif
+    return PrefetchType::NONE;
   }
 
 #if defined(NVSHMEM_COMMS) && (defined(QUDA_DSLASH_PREFETCH_TYPE_BULK) || defined(QUDA_DSLASH_PREFETCH_TYPE_TENSOR))
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 319aa79042..6338e269f4 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -519,6 +519,13 @@ if(NOT DEFINED QUDA_MAX_MULTI_RHS)
   message(STATUS "Max number of rhs per kernel: ${QUDA_MAX_MULTI_RHS}")
 endif()
 
+# Disable dslash prefetching if not already set
+if(NOT QUDA_DSLASH_PREFETCH_TYPE)
+  set(QUDA_DSLASH_PREFETCH_TYPE "NONE" CACHE STRING "enable Dslash prefetching" FORCE)
+  set(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON "0" CACHE STRING "set prefetch distance for Wilson-like fermions" FORCE)
+  set(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED "0" CACHE STRING "set prefetch distance for staggered-like fermions" FORCE)
+endif()
+
 # make one library
 target_sources(quda PRIVATE $<TARGET_OBJECTS:quda_cpp> $<$<TARGET_EXISTS:quda_pack>:$<TARGET_OBJECTS:quda_pack>>
                             ${QUDA_CU_OBJS})

From 9b83fdecd67987131120d657a66138bf784c9551 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Mon, 2 Feb 2026 23:38:10 -0800
Subject: [PATCH 108/121] Make HIP shared memory helpers match CUDA versions

---
 include/targets/hip/shared_memory_cache_helper.h | 1 -
 include/targets/hip/shared_memory_helper.h       | 5 +++--
 2 files changed, 3 insertions(+), 3 deletions(-)
 delete mode 100644 include/targets/hip/shared_memory_cache_helper.h

diff --git a/include/targets/hip/shared_memory_cache_helper.h b/include/targets/hip/shared_memory_cache_helper.h
deleted file mode 100644
index 73be0cd01b..0000000000
--- a/include/targets/hip/shared_memory_cache_helper.h
+++ /dev/null
@@ -1 +0,0 @@
-#include "../generic/shared_memory_cache_helper.h"
diff --git a/include/targets/hip/shared_memory_helper.h b/include/targets/hip/shared_memory_helper.h
index 69d8c095ce..3b4b46a132 100644
--- a/include/targets/hip/shared_memory_helper.h
+++ b/include/targets/hip/shared_memory_helper.h
@@ -80,8 +80,9 @@ namespace quda
     /**
        @brief Constructor for SharedMemory object.
     */
-    template <typename... U>
-    constexpr SharedMemory(const KernelOps<U...> &) : data(cache(get_offset(target::block_dim())))
+    template <typename... U, typename... Arg>
+    constexpr SharedMemory(const KernelOps<U...> &, const Arg &...arg) :
+      data(cache(get_offset(target::block_dim(), arg...)))
     {
     }
 

From 709b7f9f2452e9e61d24cd1390712b34ab793daa Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 3 Feb 2026 16:46:46 -0800
Subject: [PATCH 109/121] Blackwell now defaults to using BULK TMA prefetching
 with a prefetch distance of 2

---
 lib/targets/cuda/target_cuda.cmake | 72 ++++++++++++++++++++++++------
 1 file changed, 58 insertions(+), 14 deletions(-)

diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake
index 5e5b7bc669..8aa7c18936 100644
--- a/lib/targets/cuda/target_cuda.cmake
+++ b/lib/targets/cuda/target_cuda.cmake
@@ -176,38 +176,81 @@ option(QUDA_SHARED_MEMORY_SPILL "enable shared memory spilling?" OFF)
 mark_as_advanced(QUDA_SHARED_MEMORY_SPILL)
 message(STATUS "Shared memory spilling: ${QUDA_SHARED_MEMORY_SPILL}")
 
-# Dslash prefetching
-option(QUDA_DSLASH_DOUBLE_STORE "store a forwards shifted copy of the gauge fields for simplified Dslash indexing" OFF)
+
+# ---------------------------
+# Set Dslash prefetching
+# ---------------------------
+
+# Arch-dependent defaults
+set(_dslash_double_store_default OFF)
+set(_dslash_prefetch_type_default NONE)
+set(_dslash_prefetch_dist_w_default 0)
+set(_dslash_prefetch_dist_s_default 0)
+
+# These are expected Blackwell+ defaults
+if(QUDA_COMPUTE_CAPABILITY GREATER_EQUAL 100)
+  set(_dslash_double_store_default ON)
+  set(_dslash_prefetch_type_default BULK)
+  set(_dslash_prefetch_dist_w_default 2)
+  set(_dslash_prefetch_dist_s_default 2)
+endif()
+
+# Cache variables (set only if not already defined)
+if(NOT DEFINED QUDA_DSLASH_DOUBLE_STORE)
+  set(QUDA_DSLASH_DOUBLE_STORE ${_dslash_double_store_default}
+      CACHE BOOL "store a forwards shifted copy of the gauge fields for simplified Dslash indexing")
+endif()
 mark_as_advanced(QUDA_DSLASH_DOUBLE_STORE)
 message(STATUS "QUDA_DSLASH_DOUBLE_STORE: ${QUDA_DSLASH_DOUBLE_STORE}")
 
-set(QUDA_DSLASH_PREFETCH_TYPE "NONE" CACHE STRING "enable Dslash prefetching (NONE, THREAD, BULK, TENSOR)")
+if(NOT DEFINED QUDA_DSLASH_PREFETCH_TYPE)
+  set(QUDA_DSLASH_PREFETCH_TYPE ${_dslash_prefetch_type_default}
+      CACHE STRING "enable Dslash prefetching (NONE, THREAD, BULK, TENSOR)")
+endif()
 set_property(CACHE QUDA_DSLASH_PREFETCH_TYPE PROPERTY STRINGS NONE THREAD BULK TENSOR)
+mark_as_advanced(QUDA_DSLASH_PREFETCH_TYPE)
+message(STATUS "QUDA_DSLASH_PREFETCH_TYPE: ${QUDA_DSLASH_PREFETCH_TYPE}")
+
+if(NOT DEFINED QUDA_DSLASH_PREFETCH_DISTANCE_WILSON)
+  set(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON ${_dslash_prefetch_dist_w_default}
+      CACHE STRING "Dslash prefetch distance for Wilson kernels")
+endif()
+mark_as_advanced(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON)
+message(STATUS "QUDA_DSLASH_PREFETCH_DISTANCE_WILSON: ${QUDA_DSLASH_PREFETCH_DISTANCE_WILSON}")
+
+if(NOT DEFINED QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED)
+  set(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED ${_dslash_prefetch_dist_s_default}
+      CACHE STRING "Dslash prefetch distance for Staggered kernels")
+endif()
+mark_as_advanced(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED)
+message(STATUS "QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED: ${QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED}")
+
+# Validate prefetch type
 set(_valid_prefetch NONE THREAD BULK TENSOR)
 if(NOT QUDA_DSLASH_PREFETCH_TYPE IN_LIST _valid_prefetch)
   message(FATAL_ERROR
     "Invalid QUDA_DSLASH_PREFETCH_TYPE='${QUDA_DSLASH_PREFETCH_TYPE}'. "
     "Allowed: ${_valid_prefetch}")
 endif()
-message(STATUS "QUDA_DSLASH_PREFETCH_TYPE: ${QUDA_DSLASH_PREFETCH_TYPE}")
-mark_as_advanced(QUDA_DSLASH_PREFETCH_TYPE)
 
-if(QUDA_DSLASH_PREFETCH_TYPE GREATER 0 AND NOT QUDA_DSLASH_DOUBLE_STORE)
-  message(SEND_ERROR "QUDA_DSLASH_PREFETCH_TYPE with TMA cannot be enabled without QUDA_DSLASH_DOUBLE_STORE=ON")
+# Prefetch types that are implemented using the TMA
+set(_tma_modes BULK TENSOR)
+
+# TMA prefetching requires double store
+if(QUDA_DSLASH_PREFETCH_TYPE IN_LIST _tma_modes AND NOT QUDA_DSLASH_DOUBLE_STORE)
+  message(FATAL_ERROR
+    "QUDA_DSLASH_PREFETCH_TYPE=${QUDA_DSLASH_PREFETCH_TYPE} "
+    "requires QUDA_DSLASH_DOUBLE_STORE=ON")
 endif()
 
-set(_tma_modes BULK TENSOR)
-if(QUDA_DSLASH_PREFETCH_TYPE IN_LIST _tma_modes AND
-   QUDA_COMPUTE_CAPABILITY LESS 90)
+# TMA prefetching requires sm_90+
+if(QUDA_DSLASH_PREFETCH_TYPE IN_LIST _tma_modes AND QUDA_COMPUTE_CAPABILITY LESS 90)
   message(FATAL_ERROR
     "QUDA_DSLASH_PREFETCH_TYPE=${QUDA_DSLASH_PREFETCH_TYPE} "
     "requires QUDA_GPU_ARCH=sm_90 or newer")
 endif()
 
-set(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON "0" CACHE STRING "set prefetch distance for Wilson-like fermions")
-set(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED "0" CACHE STRING "set prefetch distance for staggered-like fermions")
-mark_as_advanced(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON)
-mark_as_advanced(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED)
+# validate prefetching distances
 if(QUDA_DSLASH_PREFETCH_DISTANCE_WILSON GREATER 7)
   message(SEND_ERROR "QUDA_DSLASH_PREFETCH_DISTANCE_WILSON is greater than pipeline length")
 endif()
@@ -215,6 +258,7 @@ if(QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED GREATER 15)
   message(SEND_ERROR "QUDA_DSLASH_PREFETCH_DISTANCE_STAGGERED is greater than pipeline length")
 endif()
 
+
 # QUDA_HASH for tunecache
 set(HASH cpu_arch=${CPU_ARCH},gpu_arch=${QUDA_GPU_ARCH},cuda_version=${CMAKE_CUDA_COMPILER_VERSION})
 set(GITVERSION "${PROJECT_VERSION}-${GITVERSION}-${QUDA_GPU_ARCH}")

From 305884e60326bbe34469837070f27f2ae18d5887 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 3 Feb 2026 18:17:28 -0800
Subject: [PATCH 110/121] Significant cleanup of TENSOR variant of prefetching.
 Descriptor is now not created unless TENSOR prefetching type is enabled

---
 include/dslash.h                     | 13 ++++++++++++-
 include/gauge_field.h                |  5 +++--
 include/kernels/dslash_staggered.cuh |  8 ++++----
 include/kernels/dslash_wilson.cuh    |  4 ++--
 include/kernels/laplace.cuh          |  2 +-
 lib/dslash_improved_staggered.hpp    |  9 +++++----
 lib/dslash_staggered.hpp             | 11 ++++++-----
 lib/dslash_wilson.hpp                |  5 +----
 lib/gauge_field.cpp                  |  1 +
 lib/laplace.hpp                      | 11 ++++++-----
 lib/staggered_quark_smearing.cu      |  9 +++++----
 11 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/include/dslash.h b/include/dslash.h
index 6ea801ce45..372790f420 100644
--- a/include/dslash.h
+++ b/include/dslash.h
@@ -8,6 +8,7 @@
 #include <tunable_nd.h>
 #include <instantiate.h>
 #include <instantiate_dslash.h>
+#include <tma_helper.hpp>
 
 namespace quda
 {
@@ -142,7 +143,7 @@ namespace quda
       }
     }
 
-    inline void setParam(TuneParam &tp)
+    template <bool improved = false> inline void setParam(TuneParam &tp, const GaugeField &U, const GaugeField &L = {})
     {
       // Need to reset ghost pointers prior to every call since the
       // ghost buffer may have been changed during policy tuning.
@@ -185,6 +186,16 @@ namespace quda
           0;
         tp.grid.x += arg.exterior_blocks;
       }
+
+      if constexpr (dslash_prefetch_type() == PrefetchType::TENSOR && Arg::prefetch_distance > 0) {
+        Dslash::arg.U.tensor_desc = get_tensor_descriptor(U, tp.block.x);
+        Dslash::arg.Uback.tensor_desc = get_tensor_descriptor(U.shift(), tp.block.x);
+        if constexpr (improved) {
+          assert(!L.empty()); // L is defaulted to {}: improved callers must pass the long gauge field
+          Dslash::arg.L.tensor_desc = get_tensor_descriptor(L, tp.block.x);
+          Dslash::arg.Lback.tensor_desc = get_tensor_descriptor(L.shift(), tp.block.x);
+        }
+      }
     }
 
     virtual int blockStep() const override { return (arg.shmem & 64) ? 8 : 16; }
diff --git a/include/gauge_field.h b/include/gauge_field.h
index 1d0fe72098..4b2905a1a0 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -652,10 +652,11 @@ namespace quda {
     /**
        @brief Return the shifted gauge field by shift in each
        dimension.  Shifted field is cached for subsequent reuse.
-       @param[in] shift value (1 or 3 supported)
+       @param[in] shift value (1 or 3 supported).  If no argument is
+       passed, the shift is set to nFace.
        @return Reference to shifted field
     */
-    GaugeField &shift(int shift) const;
+    GaugeField &shift(int shift = -1) const;
 
     /**
      * @brief Print the site data
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index 3e2f8d7df9..efaf33c5d7 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -43,10 +43,10 @@ namespace quda
     const Ghost halo_pack; /** accessor for writing the halo */
     const Ghost halo;      /** accessor for reading the halo */
     F x[MAX_MULTI_RHS];    /** input vector when doing xpay */
-    const GU<false> U;     /** the gauge field */
-    const GU<true> Uback;  /** the gauge field */
-    const GL<false> L;     /** the long gauge field */
-    const GL<true> Lback;  /** the long gauge field */
+    mutable GU<false> U;     /** the gauge field */
+    mutable GU<true> Uback;  /** the gauge field */
+    mutable GL<false> L;     /** the long gauge field */
+    mutable GL<true> Lback;  /** the long gauge field */
 
     const real a; /** xpay scale factor */
     const real tboundary; /** temporal boundary condition */
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index 80eb97fd91..75d5ce041c 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -39,8 +39,8 @@ namespace quda
     F x[MAX_MULTI_RHS];   /** input vector set when doing xpay */
     Ghost halo_pack;
     Ghost halo;
-    const G<false> U;    /** the gauge field */
-    const G<true> Uback; /** the backwards gauge field */
+    mutable G<false> U;    /** the gauge field */
+    mutable G<true> Uback; /** the backwards gauge field */
     const real a; /** xpay scale factor - can be -kappa or -kappa^2 */
     /** parameters for distance preconditioning */
     const real alpha0;
diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh
index b45ac9774f..bff5bc3c36 100644
--- a/include/kernels/laplace.cuh
+++ b/include/kernels/laplace.cuh
@@ -36,7 +36,7 @@ namespace quda
     const Ghost halo_pack; /** accessor used for writing the halo field */
     const Ghost halo;      /** accessor used for reading the halo field */
     F x[MAX_MULTI_RHS];    /** input vector field for xpay*/
-    const G U;    /** the gauge field */
+    mutable G U;           /** the gauge field */
     const real a; /** xpay scale factor - can be -kappa or -kappa^2 */
     const real b; /** used by Wuppetal smearing kernel */
     int dir;      /** The direction from which to omit the derivative */
diff --git a/lib/dslash_improved_staggered.hpp b/lib/dslash_improved_staggered.hpp
index 6a717adb35..25f9f673f4 100644
--- a/lib/dslash_improved_staggered.hpp
+++ b/lib/dslash_improved_staggered.hpp
@@ -24,19 +24,20 @@ namespace quda
     using Dslash::arg;
     using Dslash::halo;
     using Dslash::in;
+    const GaugeField &U;
     const GaugeField &L;
 
   public:
     Staggered(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-              const ColorSpinorField &halo, const GaugeField &L) :
-      Dslash(arg, out, in, halo), L(L)
+              const ColorSpinorField &halo, const GaugeField &U, const GaugeField &L) :
+      Dslash(arg, out, in, halo), U(U), L(L)
     {
     }
 
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::template setParam<true>(tp, U, L);
       // operator is anti-Hermitian so do not instantiate dagger
       if (arg.xpay)
         Dslash::template instantiate<packStaggeredShmem, false, true>(tp, stream);
@@ -156,7 +157,7 @@ namespace quda
 
       StaggeredArg<Float, nColor, nDim, DDArg, recon_u, recon_l, improved> arg(out, in, halo, U, L, a, x, parity,
                                                                                dagger, comm_override);
-      Staggered<decltype(arg)> staggered(arg, out, in, halo, L);
+      Staggered<decltype(arg)> staggered(arg, out, in, halo, U, L);
       dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, out, in, halo, profile);
     }
   };
diff --git a/lib/dslash_staggered.hpp b/lib/dslash_staggered.hpp
index 31c372d834..24965e00d0 100644
--- a/lib/dslash_staggered.hpp
+++ b/lib/dslash_staggered.hpp
@@ -21,18 +21,19 @@ namespace quda
   {
     using Dslash = Dslash<staggered, Arg>;
     using Dslash::arg;
+    const GaugeField &U;
 
   public:
     Staggered(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-              const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+              const ColorSpinorField &halo, const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       // operator is anti-Hermitian so do not instantiate dagger
       if (arg.xpay)
         Dslash::template instantiate<packStaggeredShmem, false, true>(tp, stream);
@@ -55,7 +56,7 @@ namespace quda
         if constexpr (is_enabled<QUDA_MILC_GAUGE_ORDER>()) {
           StaggeredArg<Float, nColor, nDim, DDArg, recon_u, QUDA_RECONSTRUCT_NO, improved, QUDA_STAGGERED_PHASE_MILC> arg(
             out, in, halo, U, U, a, x, parity, dagger, comm_override);
-          Staggered<decltype(arg)> staggered(arg, out, in, halo);
+          Staggered<decltype(arg)> staggered(arg, out, in, halo, U);
 
           dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, out, in, halo, profile);
         } else {
@@ -65,7 +66,7 @@ namespace quda
         if constexpr (is_enabled<QUDA_TIFR_GAUGE_ORDER>()) {
           StaggeredArg<Float, nColor, nDim, DDArg, recon_u, QUDA_RECONSTRUCT_NO, improved, QUDA_STAGGERED_PHASE_TIFR> arg(
             out, in, halo, U, U, a, x, parity, dagger, comm_override);
-          Staggered<decltype(arg)> staggered(arg, out, in, halo);
+          Staggered<decltype(arg)> staggered(arg, out, in, halo, U);
 
           dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, out, in, halo, profile);
         } else {
diff --git a/lib/dslash_wilson.hpp b/lib/dslash_wilson.hpp
index 0c2cd4a1a8..514e723ff8 100644
--- a/lib/dslash_wilson.hpp
+++ b/lib/dslash_wilson.hpp
@@ -30,10 +30,7 @@ namespace quda
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-      const_cast<quda::gauge::tensor_desc_t &>(Dslash::arg.U.tensor_desc) = get_tensor_descriptor(U, tp.block.x);
-      if constexpr (dslash_double_store())
-        const_cast<quda::gauge::tensor_desc_t &>(Dslash::arg.Uback.tensor_desc) = get_tensor_descriptor(U.shift(1), tp.block.x);
+      Dslash::setParam(tp, U);
       Dslash::template instantiate<packShmem>(tp, stream);
     }
   };
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index f0d1bd1783..3638c2a64a 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -1443,6 +1443,7 @@ namespace quda {
 
   GaugeField& GaugeField::shift(int shift_offset) const
   {
+    if (shift_offset == -1) shift_offset = nFace;
     // If we don't yet have a cached shifted copy or the shift value changed
     if (!shifted) shifted = std::make_unique<GaugeField>(::quda::shift(*this, shift_offset));
     return *shifted;
diff --git a/lib/laplace.hpp b/lib/laplace.hpp
index 54fa2b1901..cdf0f0741e 100644
--- a/lib/laplace.hpp
+++ b/lib/laplace.hpp
@@ -25,18 +25,19 @@ namespace quda
     using Dslash::arg;
     using Dslash::halo;
     using Dslash::in;
+    const GaugeField &U;
 
   public:
     Laplace(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-            const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+            const ColorSpinorField &halo, const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
     void apply(const qudaStream_t &stream) override
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
 
       // operator is Hermitian so do not instantiate dagger
       if (arg.xpay)
@@ -151,12 +152,12 @@ namespace quda
       if (in.Nspin() == 1) {
         constexpr int nSpin = 1;
         LaplaceArg<Float, nSpin, nColor, nDim, DDArg, recon> arg(out, in, halo, U, dir, a, b, x, parity, comm_override);
-        Laplace<decltype(arg)> laplace(arg, out, in, halo);
+        Laplace<decltype(arg)> laplace(arg, out, in, halo, U);
         dslash::DslashPolicyTune<decltype(laplace)> policy(laplace, out, in, halo, profile);
       } else if (in.Nspin() == 4) {
         constexpr int nSpin = 4;
         LaplaceArg<Float, nSpin, nColor, nDim, DDArg, recon> arg(out, in, halo, U, dir, a, b, x, parity, comm_override);
-        Laplace<decltype(arg)> laplace(arg, out, in, halo);
+        Laplace<decltype(arg)> laplace(arg, out, in, halo, U);
         dslash::DslashPolicyTune<decltype(laplace)> policy(laplace, out, in, halo, profile);
       } else {
         errorQuda("Unsupported nSpin= %d", in.Nspin());
diff --git a/lib/staggered_quark_smearing.cu b/lib/staggered_quark_smearing.cu
index 760855f6e7..6c31a79ae6 100644
--- a/lib/staggered_quark_smearing.cu
+++ b/lib/staggered_quark_smearing.cu
@@ -25,11 +25,12 @@ namespace quda
     using Dslash::arg;
     using Dslash::halo;
     using Dslash::in;
+    const GaugeField &U;
 
   public:
     StaggeredQSmear(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                    const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+                    const ColorSpinorField &halo, const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
@@ -53,7 +54,7 @@ namespace quda
       }
 
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
 
       // operator is Hermitian so do not instantiate dagger
       Dslash::template instantiate<packStaggeredShmem, false, false>(tp, stream);
@@ -194,7 +195,7 @@ namespace quda
         auto halo = ColorSpinorField::create_comms_batch(in, 3);
         StaggeredQSmearArg<Float, nSpin, nColor, nDim, DDArg, recon> arg(out, in, halo, U, t0, is_tslice_kernel, parity,
                                                                          dir, dagger, comm_override);
-        StaggeredQSmear<decltype(arg)> staggered_qsmear(arg, out, in, halo);
+        StaggeredQSmear<decltype(arg)> staggered_qsmear(arg, out, in, halo, U);
         dslash::DslashPolicyTune<decltype(staggered_qsmear)> policy(staggered_qsmear, out, in, halo, profile);
       } else {
         errorQuda("Unsupported nSpin = %d", in.Nspin());

From 06413d098a4de14e904cfb08dd44486360a718f5 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 3 Feb 2026 23:21:19 -0800
Subject: [PATCH 111/121] Fix CI

---
 lib/covariant_derivative.cu                   | 11 +++++-----
 lib/dslash_domain_wall_4d.hpp                 |  9 +++++----
 lib/dslash_domain_wall_4d_fused_m5.hpp        |  9 +++++----
 lib/dslash_domain_wall_5d.hpp                 |  9 +++++----
 lib/dslash_ndeg_twisted_clover.hpp            |  9 +++++----
 ...ash_ndeg_twisted_clover_preconditioned.hpp | 11 +++++-----
 lib/dslash_ndeg_twisted_mass.hpp              |  9 +++++----
 ...slash_ndeg_twisted_mass_preconditioned.hpp | 11 +++++-----
 lib/dslash_twisted_clover.hpp                 |  9 +++++----
 lib/dslash_twisted_clover_preconditioned.hpp  |  9 +++++----
 lib/dslash_twisted_mass.hpp                   |  9 +++++----
 lib/dslash_twisted_mass_preconditioned.hpp    | 11 +++++-----
 lib/dslash_wilson_clover.hpp                  |  9 +++++----
 lib/dslash_wilson_clover_hasenbusch_twist.hpp |  9 +++++----
 ...clover_hasenbusch_twist_preconditioned.hpp | 20 +++++++++++--------
 lib/dslash_wilson_clover_preconditioned.hpp   |  9 +++++----
 16 files changed, 91 insertions(+), 72 deletions(-)

diff --git a/lib/covariant_derivative.cu b/lib/covariant_derivative.cu
index df7767cdd1..fb30b149fb 100644
--- a/lib/covariant_derivative.cu
+++ b/lib/covariant_derivative.cu
@@ -25,18 +25,19 @@ namespace quda
     using Dslash::arg;
     using Dslash::halo;
     using Dslash::in;
+    const GaugeField &U;
 
   public:
     CovDev(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-           const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+           const ColorSpinorField &halo, const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
     void apply(const qudaStream_t &stream) override
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       if (arg.xpay) errorQuda("Covariant derivative operator only defined without xpay");
       if (arg.nParity != 2) errorQuda("Covariant derivative operator only defined for full field");
 
@@ -143,11 +144,11 @@ namespace quda
       auto halo = ColorSpinorField::create_comms_batch(in, 1, false);
       if (in.Nspin() == 4) {
         CovDevArg<Float, 4, nColor, DDArg, recon, nDim> arg(out, in, halo, U, mu, parity, dagger, comm_override);
-        CovDev<decltype(arg)> covDev(arg, out, in, halo);
+        CovDev<decltype(arg)> covDev(arg, out, in, halo, U);
         dslash::DslashPolicyTune<decltype(covDev)> policy(covDev, out, in, halo, profile);
       } else if (in.Nspin() == 1) {
         CovDevArg<Float, 1, nColor, DDArg, recon, nDim> arg(out, in, halo, U, mu, parity, dagger, comm_override);
-        CovDev<decltype(arg)> covDev(arg, out, in, halo);
+        CovDev<decltype(arg)> covDev(arg, out, in, halo, U);
         dslash::DslashPolicyTune<decltype(covDev)> policy(covDev, out, in, halo, profile);
       } else {
         errorQuda("Spin not supported");
diff --git a/lib/dslash_domain_wall_4d.hpp b/lib/dslash_domain_wall_4d.hpp
index b8ebf9b7e4..3b22cbf467 100644
--- a/lib/dslash_domain_wall_4d.hpp
+++ b/lib/dslash_domain_wall_4d.hpp
@@ -21,18 +21,19 @@ namespace quda
     using Dslash = Dslash<domainWall4D, Arg>;
     using Dslash::arg;
     using Dslash::in;
+    const GaugeField &U;
 
   public:
     DomainWall4D(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                 const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+                 const ColorSpinorField &halo, const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       Dslash::template instantiate<packShmem>(tp, stream);
     }
   };
@@ -47,7 +48,7 @@ namespace quda
       auto halo = ColorSpinorField::create_comms_batch(in);
       DomainWall4DArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, a, m_5, b_5, c_5, a != 0.0, x, parity,
                                                              dagger, comm_override);
-      DomainWall4D<decltype(arg)> dwf(arg, out, in, halo);
+      DomainWall4D<decltype(arg)> dwf(arg, out, in, halo, U);
       dslash::DslashPolicyTune<decltype(dwf)> policy(dwf, out, in, halo, profile);
     }
   };
diff --git a/lib/dslash_domain_wall_4d_fused_m5.hpp b/lib/dslash_domain_wall_4d_fused_m5.hpp
index fb835eda8f..97ebd5fd7d 100644
--- a/lib/dslash_domain_wall_4d_fused_m5.hpp
+++ b/lib/dslash_domain_wall_4d_fused_m5.hpp
@@ -20,6 +20,7 @@ namespace quda
     using Dslash::aux_base;
     using Dslash::in;
     cvector_ref<ColorSpinorField> &y;
+    const GaugeField &U;
 
     inline std::string get_app_base()
     {
@@ -42,8 +43,8 @@ namespace quda
 
   public:
     DomainWall4DFusedM5(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                        const ColorSpinorField &halo, cvector_ref<ColorSpinorField> &y) :
-      Dslash(arg, out, in, halo, get_app_base()), y(y)
+                        const ColorSpinorField &halo, cvector_ref<ColorSpinorField> &y, const GaugeField &U) :
+      Dslash(arg, out, in, halo, get_app_base()), y(y), U(U)
     {
       TunableKernel3D::resizeStep(in.X(4), 1); // keep Ls local to the thread block
     }
@@ -51,7 +52,7 @@ namespace quda
     void apply(const qudaStream_t &stream) override
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       Dslash::template instantiate<packShmem>(tp, stream);
     }
 
@@ -129,7 +130,7 @@ namespace quda
       auto halo = ColorSpinorField::create_comms_batch(in);
       using Arg = DomainWall4DFusedM5Arg<Float, nColor, nDim, DDArg, recon, dslash5_type_impl>;
       Arg arg(out, in, halo, U, a, m_5, b_5, c_5, a != 0.0, x, y, parity, dagger, comm_override, m_f);
-      DomainWall4DFusedM5<Arg> dwf(arg, out, in, halo, y);
+      DomainWall4DFusedM5<Arg> dwf(arg, out, in, halo, y, U);
       dslash::DslashPolicyTune<decltype(dwf)> policy(dwf, out, in, halo, profile);
     }
   };
diff --git a/lib/dslash_domain_wall_5d.hpp b/lib/dslash_domain_wall_5d.hpp
index 6ea9d0d144..ffeb183534 100644
--- a/lib/dslash_domain_wall_5d.hpp
+++ b/lib/dslash_domain_wall_5d.hpp
@@ -18,18 +18,19 @@ namespace quda
     using Dslash = Dslash<domainWall5D, Arg>;
     using Dslash::arg;
     using Dslash::in;
+    const GaugeField &U;
 
   public:
     DomainWall5D(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                 const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+                 const ColorSpinorField &halo, const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       Dslash::template instantiate<packShmem>(tp, stream);
     }
 
@@ -74,7 +75,7 @@ namespace quda
       auto halo = ColorSpinorField::create_comms_batch(in);
       DomainWall5DArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, a, m_f, a != 0.0, x, parity, dagger,
                                                              comm_override);
-      DomainWall5D<decltype(arg)> dwf(arg, out, in, halo);
+      DomainWall5D<decltype(arg)> dwf(arg, out, in, halo, U);
       dslash::DslashPolicyTune<decltype(dwf)> policy(dwf, out, in, halo, profile);
     }
   };
diff --git a/lib/dslash_ndeg_twisted_clover.hpp b/lib/dslash_ndeg_twisted_clover.hpp
index 78d1a48484..d13d0df0ab 100644
--- a/lib/dslash_ndeg_twisted_clover.hpp
+++ b/lib/dslash_ndeg_twisted_clover.hpp
@@ -21,6 +21,7 @@ namespace quda
     using Dslash::arg;
     using Dslash::halo;
     using Dslash::in;
+    const GaugeField &U;
 
     unsigned int sharedBytesPerThread() const
     {
@@ -32,8 +33,8 @@ namespace quda
 
   public:
     NdegTwistedClover(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                      const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+                      const ColorSpinorField &halo, const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
       TunableKernel3D::resizeStep(2, 1);
     }
@@ -41,7 +42,7 @@ namespace quda
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       if (arg.xpay)
         Dslash::template instantiate<packShmem, true>(tp, stream);
       else
@@ -87,7 +88,7 @@ namespace quda
       auto halo = ColorSpinorField::create_comms_batch(in);
       NdegTwistedCloverArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, A, a, b, c, x, parity, dagger,
                                                                   comm_override);
-      NdegTwistedClover<decltype(arg)> twisted(arg, out, in, halo);
+      NdegTwistedClover<decltype(arg)> twisted(arg, out, in, halo, U);
       dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, out, in, halo, profile);
     }
   };
diff --git a/lib/dslash_ndeg_twisted_clover_preconditioned.hpp b/lib/dslash_ndeg_twisted_clover_preconditioned.hpp
index 89c7165055..4e91b7ae22 100644
--- a/lib/dslash_ndeg_twisted_clover_preconditioned.hpp
+++ b/lib/dslash_ndeg_twisted_clover_preconditioned.hpp
@@ -20,6 +20,7 @@ namespace quda
     using Dslash::arg;
     using Dslash::halo;
     using Dslash::in;
+    const GaugeField &U;
 
     unsigned int sharedBytesPerThread() const
     {
@@ -27,9 +28,9 @@ namespace quda
     }
 
   public:
-    NdegTwistedCloverPreconditioned(Arg &arg, cvector_ref<ColorSpinorField> &out,
-                                    cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+    NdegTwistedCloverPreconditioned(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                    const ColorSpinorField &halo, const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
       TunableKernel3D::resizeStep(2, 1); // this will force flavor to be contained in the block
     }
@@ -37,7 +38,7 @@ namespace quda
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       if (arg.nParity != 1)
         errorQuda("Preconditioned non-degenerate twisted-clover operator not defined nParity=%d", arg.nParity);
 
@@ -106,7 +107,7 @@ namespace quda
       auto halo = ColorSpinorField::create_comms_batch(in);
       NdegTwistedCloverPreconditionedArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, A, a, b, c, xpay, x,
                                                                                 parity, dagger, comm_override);
-      NdegTwistedCloverPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
+      NdegTwistedCloverPreconditioned<decltype(arg)> twisted(arg, out, in, halo, U);
       dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, out, in, halo, profile);
     }
   };
diff --git a/lib/dslash_ndeg_twisted_mass.hpp b/lib/dslash_ndeg_twisted_mass.hpp
index 5461de6311..b0b0dc21e7 100644
--- a/lib/dslash_ndeg_twisted_mass.hpp
+++ b/lib/dslash_ndeg_twisted_mass.hpp
@@ -19,18 +19,19 @@ namespace quda
     using Dslash = Dslash<nDegTwistedMass, Arg>;
     using Dslash::arg;
     using Dslash::in;
+    const GaugeField &U;
 
   public:
     NdegTwistedMass(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                    const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+                    const ColorSpinorField &halo, const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       if (arg.xpay)
         Dslash::template instantiate<packShmem, true>(tp, stream);
       else
@@ -61,7 +62,7 @@ namespace quda
       auto halo = ColorSpinorField::create_comms_batch(in);
       NdegTwistedMassArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, a, b, c, x, parity, dagger,
                                                                 comm_override);
-      NdegTwistedMass<decltype(arg)> twisted(arg, out, in, halo);
+      NdegTwistedMass<decltype(arg)> twisted(arg, out, in, halo, U);
       dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, out, in, halo, profile);
     }
   };
diff --git a/lib/dslash_ndeg_twisted_mass_preconditioned.hpp b/lib/dslash_ndeg_twisted_mass_preconditioned.hpp
index c3ab903c23..25d4deeb2d 100644
--- a/lib/dslash_ndeg_twisted_mass_preconditioned.hpp
+++ b/lib/dslash_ndeg_twisted_mass_preconditioned.hpp
@@ -28,6 +28,7 @@ namespace quda
     using Dslash::arg;
     using Dslash::halo;
     using Dslash::in;
+    const GaugeField &U;
 
   protected:
     bool shared;
@@ -38,8 +39,8 @@ namespace quda
 
   public:
     NdegTwistedMassPreconditioned(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                                  const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo), shared(arg.asymmetric || !arg.dagger)
+                                  const ColorSpinorField &halo, const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U), shared(arg.asymmetric || !arg.dagger)
     {
       if (shared) TunableKernel3D::resizeStep(2, 1); // this will force flavor to be contained in the block
     }
@@ -47,7 +48,7 @@ namespace quda
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       if (arg.asymmetric && !arg.dagger) errorQuda("asymmetric operator only defined for dagger");
       if (arg.asymmetric && arg.xpay) errorQuda("asymmetric operator not defined for xpay");
       if (arg.nParity != 1)
@@ -105,12 +106,12 @@ namespace quda
       if (asymmetric) {
         NdegTwistedMassArg<Float, nColor, nDim, DDArg, recon, true> arg(out, in, halo, U, a, b, c, xpay, x, parity,
                                                                         dagger, comm_override);
-        NdegTwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
+        NdegTwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo, U);
         dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, out, in, halo, profile);
       } else {
         NdegTwistedMassArg<Float, nColor, nDim, DDArg, recon, false> arg(out, in, halo, U, a, b, c, xpay, x, parity,
                                                                          dagger, comm_override);
-        NdegTwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
+        NdegTwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo, U);
         dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, out, in, halo, profile);
       }
     }
diff --git a/lib/dslash_twisted_clover.hpp b/lib/dslash_twisted_clover.hpp
index 80b54887c0..30cc88a4b2 100644
--- a/lib/dslash_twisted_clover.hpp
+++ b/lib/dslash_twisted_clover.hpp
@@ -20,18 +20,19 @@ namespace quda
     using Dslash::arg;
     using Dslash::halo;
     using Dslash::in;
+    const GaugeField &U;
 
   public:
     TwistedClover(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                  const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+                  const ColorSpinorField &halo, const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       if (arg.xpay)
         Dslash::template instantiate<packShmem, true>(tp, stream);
       else
@@ -76,7 +77,7 @@ namespace quda
       auto halo = ColorSpinorField::create_comms_batch(in);
       WilsonCloverArg<Float, nColor, nDim, DDArg, recon, true> arg(out, in, halo, U, C, a, b, x, parity, dagger,
                                                                    comm_override);
-      TwistedClover<decltype(arg)> twisted(arg, out, in, halo);
+      TwistedClover<decltype(arg)> twisted(arg, out, in, halo, U);
       dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, out, in, halo, profile);
     }
   };
diff --git a/lib/dslash_twisted_clover_preconditioned.hpp b/lib/dslash_twisted_clover_preconditioned.hpp
index 0b8676c4fd..c8f0ee847a 100644
--- a/lib/dslash_twisted_clover_preconditioned.hpp
+++ b/lib/dslash_twisted_clover_preconditioned.hpp
@@ -20,18 +20,19 @@ namespace quda
     using Dslash::arg;
     using Dslash::halo;
     using Dslash::in;
+    const GaugeField &U;
 
   public:
     TwistedCloverPreconditioned(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                                const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+                                const ColorSpinorField &halo, const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       // specialize here to constrain the template instantiation
       if (arg.nParity == 1) {
         if (arg.xpay) {
@@ -123,7 +124,7 @@ namespace quda
       auto halo = ColorSpinorField::create_comms_batch(in);
       TwistedCloverArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, C, a, b, xpay, x, parity, dagger,
                                                               comm_override);
-      TwistedCloverPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
+      TwistedCloverPreconditioned<decltype(arg)> twisted(arg, out, in, halo, U);
       dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, out, in, halo, profile);
     }
   };
diff --git a/lib/dslash_twisted_mass.hpp b/lib/dslash_twisted_mass.hpp
index 255550a2b6..d5bfd95cfb 100644
--- a/lib/dslash_twisted_mass.hpp
+++ b/lib/dslash_twisted_mass.hpp
@@ -18,18 +18,19 @@ namespace quda
     using Dslash = Dslash<twistedMass, Arg>;
     using Dslash::arg;
     using Dslash::in;
+    const GaugeField &U;
 
   public:
     TwistedMass(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+                const ColorSpinorField &halo, const GaugeField(U)) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       if (arg.xpay)
         Dslash::template instantiate<packShmem, true>(tp, stream);
       else
@@ -59,7 +60,7 @@ namespace quda
       constexpr int nDim = 4;
       auto halo = ColorSpinorField::create_comms_batch(in);
       TwistedMassArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, a, b, x, parity, dagger, comm_override);
-      TwistedMass<decltype(arg)> twisted(arg, out, in, halo);
+      TwistedMass<decltype(arg)> twisted(arg, out, in, halo, U);
       dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, out, in, halo, profile);
     }
   };
diff --git a/lib/dslash_twisted_mass_preconditioned.hpp b/lib/dslash_twisted_mass_preconditioned.hpp
index 9aec1d634b..9fcd22ce78 100644
--- a/lib/dslash_twisted_mass_preconditioned.hpp
+++ b/lib/dslash_twisted_mass_preconditioned.hpp
@@ -26,18 +26,19 @@ namespace quda
     using Dslash = Dslash<twistedMassPreconditioned, Arg>;
     using Dslash::arg;
     using Dslash::in;
+    const GaugeField &U;
 
   public:
     TwistedMassPreconditioned(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                              const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+                              const ColorSpinorField &halo, const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       if (arg.asymmetric && !arg.dagger) errorQuda("asymmetric operator only defined for dagger");
       if (arg.asymmetric && arg.xpay) errorQuda("asymmetric operator not defined for xpay");
       if (arg.nParity != 1) errorQuda("Preconditioned twisted-mass operator not defined nParity=%d", arg.nParity);
@@ -82,13 +83,13 @@ namespace quda
       if (asymmetric) {
         TwistedMassArg<Float, nColor, nDim, DDArg, recon, true> arg(out, in, halo, U, a, b, xpay, x, parity, dagger,
                                                                     comm_override);
-        TwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
+        TwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo, U);
 
         dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, out, in, halo, profile);
       } else {
         TwistedMassArg<Float, nColor, nDim, DDArg, recon, false> arg(out, in, halo, U, a, b, xpay, x, parity, dagger,
                                                                      comm_override);
-        TwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
+        TwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo, U);
 
         dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, out, in, halo, profile);
       }
diff --git a/lib/dslash_wilson_clover.hpp b/lib/dslash_wilson_clover.hpp
index 1b34b48814..931b109132 100644
--- a/lib/dslash_wilson_clover.hpp
+++ b/lib/dslash_wilson_clover.hpp
@@ -20,19 +20,20 @@ namespace quda
     using Dslash::arg;
     using Dslash::halo;
     using Dslash::in;
+    const GaugeField &U;
     const CloverField &A;
 
   public:
     WilsonClover(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                 const ColorSpinorField &halo, const CloverField &A) :
-      Dslash(arg, out, in, halo), A(A)
+                 const ColorSpinorField &halo, const GaugeField &U, const CloverField &A) :
+      Dslash(arg, out, in, halo), U(U), A(A)
     {
     }
 
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       if (arg.xpay)
         Dslash::template instantiate<packShmem, true>(tp, stream);
       else
@@ -79,7 +80,7 @@ namespace quda
       auto halo = ColorSpinorField::create_comms_batch(in);
       WilsonCloverArg<Float, nColor, nDim, DDArg, recon, false, distance_pc> arg(out, in, halo, U, A, a, 0.0, x, parity,
                                                                                  dagger, comm_override, alpha0, t0);
-      WilsonClover<decltype(arg)> wilson(arg, out, in, halo, A);
+      WilsonClover<decltype(arg)> wilson(arg, out, in, halo, U, A);
 
       dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, out, in, halo, profile);
     }
diff --git a/lib/dslash_wilson_clover_hasenbusch_twist.hpp b/lib/dslash_wilson_clover_hasenbusch_twist.hpp
index d618673307..3ef2a571a7 100644
--- a/lib/dslash_wilson_clover_hasenbusch_twist.hpp
+++ b/lib/dslash_wilson_clover_hasenbusch_twist.hpp
@@ -19,18 +19,19 @@ namespace quda
     using Dslash = Dslash<cloverHasenbusch, Arg>;
     using Dslash::arg;
     using Dslash::in;
+    const GaugeField &U;
 
   public:
     WilsonCloverHasenbuschTwist(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                                const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+                                const ColorSpinorField &halo, const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       if (arg.xpay)
         Dslash::template instantiate<packShmem, true>(tp, stream);
       else
@@ -83,7 +84,7 @@ namespace quda
       auto halo = ColorSpinorField::create_comms_batch(in);
       WilsonCloverHasenbuschTwistArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, A, a, b, x, parity,
                                                                             dagger, comm_override);
-      WilsonCloverHasenbuschTwist<decltype(arg)> wilson(arg, out, in, halo);
+      WilsonCloverHasenbuschTwist<decltype(arg)> wilson(arg, out, in, halo, U);
       dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, out, in, halo, profile);
     }
   };
diff --git a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.hpp b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.hpp
index da290e3259..42d214512a 100644
--- a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.hpp
+++ b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.hpp
@@ -20,18 +20,20 @@ namespace quda
     using Dslash::arg;
     using Dslash::halo;
     using Dslash::in;
+    const GaugeField &U;
 
   public:
     WilsonCloverHasenbuschTwistPCNoClovInv(Arg &arg, cvector_ref<ColorSpinorField> &out,
-                                           cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+                                           cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo,
+                                           const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
 
       // specialize here to constrain the template instantiation
       if (arg.nParity != 1) errorQuda("Operator not defined nParity=%d", arg.nParity);
@@ -126,7 +128,7 @@ namespace quda
       auto halo = ColorSpinorField::create_comms_batch(in);
       using ArgType = WilsonCloverHasenbuschTwistPCArg<Float, nColor, nDim, DDArg, recon, false>;
       ArgType arg(out, in, halo, U, A, a, b, x, parity, dagger, comm_override);
-      WilsonCloverHasenbuschTwistPCNoClovInv<ArgType> wilson(arg, out, in, halo);
+      WilsonCloverHasenbuschTwistPCNoClovInv<ArgType> wilson(arg, out, in, halo, U);
 
       dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, out, in, halo, profile);
     }
@@ -144,18 +146,20 @@ namespace quda
     using Dslash::arg;
     using Dslash::halo;
     using Dslash::in;
+    const GaugeField &U;
 
   public:
     WilsonCloverHasenbuschTwistPCClovInv(Arg &arg, cvector_ref<ColorSpinorField> &out,
-                                         cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
+                                         cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo,
+                                         const GaugeField &U) :
+      Dslash(arg, out, in, halo), U(U)
     {
     }
 
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
 
       // specialize here to constrain the template instantiation
       if (arg.nParity != 1) errorQuda("Operator not defined nParity=%d", arg.nParity);
@@ -253,7 +257,7 @@ namespace quda
       auto halo = ColorSpinorField::create_comms_batch(in);
       using ArgType = WilsonCloverHasenbuschTwistPCArg<Float, nColor, nDim, DDArg, recon, true>;
       ArgType arg(out, in, halo, U, A, kappa, mu, x, parity, dagger, comm_override);
-      WilsonCloverHasenbuschTwistPCClovInv<ArgType> wilson(arg, out, in, halo);
+      WilsonCloverHasenbuschTwistPCClovInv<ArgType> wilson(arg, out, in, halo, U);
       dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, out, in, halo, profile);
     }
   };
diff --git a/lib/dslash_wilson_clover_preconditioned.hpp b/lib/dslash_wilson_clover_preconditioned.hpp
index 0ebd6646f2..99dcd43590 100644
--- a/lib/dslash_wilson_clover_preconditioned.hpp
+++ b/lib/dslash_wilson_clover_preconditioned.hpp
@@ -20,19 +20,20 @@ namespace quda
     using Dslash::arg;
     using Dslash::halo;
     using Dslash::in;
+    const GaugeField &U;
     const CloverField &A;
 
   public:
     WilsonCloverPreconditioned(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                               const ColorSpinorField &halo, const CloverField &A) :
-      Dslash(arg, out, in, halo), A(A)
+                               const ColorSpinorField &halo, const GaugeField &U, const CloverField &A) :
+      Dslash(arg, out, in, halo), U(U), A(A)
     {
     }
 
     void apply(const qudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
+      Dslash::setParam(tp, U);
       // specialize here to constrain the template instantiation
       if (arg.nParity == 1) {
         if (arg.xpay) {
@@ -126,7 +127,7 @@ namespace quda
       auto halo = ColorSpinorField::create_comms_batch(in);
       WilsonCloverArg<Float, nColor, nDim, DDArg, recon, distance_pc> arg(out, in, halo, U, A, a, x, parity, dagger,
                                                                           comm_override, alpha0, t0);
-      WilsonCloverPreconditioned<decltype(arg)> wilson(arg, out, in, halo, A);
+      WilsonCloverPreconditioned<decltype(arg)> wilson(arg, out, in, halo, U, A);
 
       dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, out, in, halo, profile);
     }

From dd77fc0bf6cd43941dd29b97239ebd6a8bcfcc24 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 4 Feb 2026 14:15:00 -0800
Subject: [PATCH 112/121] Fix typo with twisted mass

---
 lib/dslash_twisted_mass.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/dslash_twisted_mass.hpp b/lib/dslash_twisted_mass.hpp
index d5bfd95cfb..43eb8a8bcf 100644
--- a/lib/dslash_twisted_mass.hpp
+++ b/lib/dslash_twisted_mass.hpp
@@ -22,7 +22,7 @@ namespace quda
 
   public:
     TwistedMass(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                const ColorSpinorField &halo, const GaugeField(U)) :
+                const ColorSpinorField &halo, const GaugeField U) :
       Dslash(arg, out, in, halo), U(U)
     {
     }

From a26526991c81aba5f97e2448aead0d860d0b56a7 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 4 Feb 2026 14:55:51 -0800
Subject: [PATCH 113/121] Increase TuneKey::aux_n to prevent buffer overflow

---
 include/tune_key.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/tune_key.h b/include/tune_key.h
index 0cbad5b01a..4a074ae4da 100644
--- a/include/tune_key.h
+++ b/include/tune_key.h
@@ -9,7 +9,7 @@ namespace quda {
 
     static constexpr int volume_n = 32;
     static constexpr int name_n = 512;
-    static constexpr int aux_n = 256;
+    static constexpr int aux_n = 384;
     char volume[volume_n];
     char name[name_n];
     char aux[aux_n];

From f92570e4edd3b99ce0ee597f4a9e11b46356aba8 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 4 Feb 2026 15:22:14 -0800
Subject: [PATCH 114/121] value to reference - fixes clang compilation issue

---
 lib/dslash_twisted_mass.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/dslash_twisted_mass.hpp b/lib/dslash_twisted_mass.hpp
index 43eb8a8bcf..2d37fe4c5a 100644
--- a/lib/dslash_twisted_mass.hpp
+++ b/lib/dslash_twisted_mass.hpp
@@ -22,7 +22,7 @@ namespace quda
 
   public:
     TwistedMass(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                const ColorSpinorField &halo, const GaugeField U) :
+                const ColorSpinorField &halo, const GaugeField &U) :
       Dslash(arg, out, in, halo), U(U)
     {
     }

From 3ada421b82f6927c00325b052ab441f23de37fae Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 4 Feb 2026 21:26:49 -0800
Subject: [PATCH 115/121] Add git to docker file for CSCS

---
 ci/docker/Dockerfile.build | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ci/docker/Dockerfile.build b/ci/docker/Dockerfile.build
index ed21db930c..f8d6aa7ab4 100644
--- a/ci/docker/Dockerfile.build
+++ b/ci/docker/Dockerfile.build
@@ -8,7 +8,9 @@ RUN apt-get update -qq && apt-get install -qq -y --no-install-recommends \
     build-essential \
     cmake \
     wget \
-    ninja-build && \
+    ninja-build \
+    git \
+    ca-certificates && \
     rm -rf /var/lib/apt/lists/*
 
 ARG MPICH_VERSION=3.3.2

From 21255745a1024acbd64d4965ad874a28fa68c15b Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 6 Feb 2026 10:27:26 -0800
Subject: [PATCH 116/121] Fix deprecation warning with recent CUDA 13.1
 regarding NVML temperature monitoring

---
 lib/targets/cuda/device.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/lib/targets/cuda/device.cpp b/lib/targets/cuda/device.cpp
index c47e5eee35..fd9bde4f6c 100644
--- a/lib/targets/cuda/device.cpp
+++ b/lib/targets/cuda/device.cpp
@@ -160,7 +160,16 @@ namespace quda
     auto get_temperature()
     {
       unsigned int temp = 0;
+#if defined(NVML_API_VERSION) && NVML_API_VERSION >= 12
+      nvmlTemperature_t temperature;
+      temperature.version = nvmlTemperature_v1;
+      temperature.sensorType = NVML_TEMPERATURE_GPU;
+      temperature.temperature = 0;
+      NVML_CHECK(nvmlDeviceGetTemperatureV(monitor_device_id, &temperature));
+      temp = static_cast<unsigned int>(temperature.temperature);
+#else
       NVML_CHECK(nvmlDeviceGetTemperature(monitor_device_id, NVML_TEMPERATURE_GPU, &temp));
+#endif
       return temp;
     }
 

From 951a3ee932349f1e3571b9256564e1e44fcf1689 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Fri, 6 Feb 2026 15:37:30 -0800
Subject: [PATCH 117/121] Make the NVML temperature query more robust for the
 change in interface

---
 lib/targets/cuda/device.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/targets/cuda/device.cpp b/lib/targets/cuda/device.cpp
index fd9bde4f6c..78f6c80ca1 100644
--- a/lib/targets/cuda/device.cpp
+++ b/lib/targets/cuda/device.cpp
@@ -160,11 +160,10 @@ namespace quda
     auto get_temperature()
     {
       unsigned int temp = 0;
-#if defined(NVML_API_VERSION) && NVML_API_VERSION >= 12
+#if defined(nvmlTemperature_v1)
       nvmlTemperature_t temperature;
       temperature.version = nvmlTemperature_v1;
       temperature.sensorType = NVML_TEMPERATURE_GPU;
-      temperature.temperature = 0;
       NVML_CHECK(nvmlDeviceGetTemperatureV(monitor_device_id, &temperature));
       temp = static_cast<unsigned int>(temperature.temperature);
 #else

From 3c8ed1a4ea9136341a71bf9c28db406862218b47 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Tue, 10 Feb 2026 12:52:52 -0800
Subject: [PATCH 118/121] Fix CLI11 for modern compilers

---
 include/externals/CLI11.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/externals/CLI11.hpp b/include/externals/CLI11.hpp
index a426c5bae4..9174a58890 100644
--- a/include/externals/CLI11.hpp
+++ b/include/externals/CLI11.hpp
@@ -63,6 +63,7 @@
 #include <utility>
 #include <vector>
 #include <array>
+#include <cstdint>
 
 
 // Verbatim copy from CLI/Version.hpp:
@@ -2485,7 +2486,7 @@ class AsNumberWithUnit : public Validator {
 ///   "2 EiB" => 2^61 // Units up to exibyte are supported
 class AsSizeValue : public AsNumberWithUnit {
   public:
-    using result_t = uint64_t;
+    using result_t = std::uint64_t;
 
     /// If kb_is_1000 is true,
     /// interpret 'kb', 'k' as 1000 and 'kib', 'ki' as 1024

From 8c7ba4d1580a35372ec4370ee5595fb013f990fe Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Tue, 3 Mar 2026 11:56:57 -0800
Subject: [PATCH 119/121] Temporary change of default prefetch type on sm100
 while doing some bug hunting

---
 lib/targets/cuda/target_cuda.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake
index 8aa7c18936..0a3067636d 100644
--- a/lib/targets/cuda/target_cuda.cmake
+++ b/lib/targets/cuda/target_cuda.cmake
@@ -189,8 +189,8 @@ set(_dslash_prefetch_dist_s_default 0)
 
 # These are expected Blackwell+ defaults
 if(QUDA_COMPUTE_CAPABILITY GREATER_EQUAL 100)
-  set(_dslash_double_store_default ON)
-  set(_dslash_prefetch_type_default BULK)
+  set(_dslash_double_store_default OFF)
+  set(_dslash_prefetch_type_default THREAD)
   set(_dslash_prefetch_dist_w_default 2)
   set(_dslash_prefetch_dist_s_default 2)
 endif()

From a510234dc8914e09544d5041c4dd2c93e31571d3 Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 11 Mar 2026 23:17:20 -0700
Subject: [PATCH 120/121] Fix bug in gauge shift when writing its halo.  Add
 some sanity checks with shifting (can't shift a shifted field), and fix move
 constructor so that shift field is moved

---
 include/gauge_field.h           | 10 +++++++++-
 include/kernels/gauge_shift.cuh |  7 +++----
 include/kernels/laplace.cuh     |  2 --
 lib/gauge_field.cpp             |  9 +++++++++
 lib/gauge_shift.cu              |  4 +++-
 5 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/include/gauge_field.h b/include/gauge_field.h
index 4b2905a1a0..9332b5c1e8 100644
--- a/include/gauge_field.h
+++ b/include/gauge_field.h
@@ -147,6 +147,7 @@ namespace quda {
   class GaugeField : public LatticeField {
 
     friend std::ostream &operator<<(std::ostream &output, const GaugeField &param);
+    friend GaugeField shift(const GaugeField &in, int shift);
 
   private:
     /**
@@ -193,7 +194,9 @@ namespace quda {
     double tadpole = 0.0;
     double fat_link_max = 0.0;
 
-    mutable std::unique_ptr<GaugeField> shifted; // shifted copy of the gauge field, used for double-store enabled dslash
+    mutable std::unique_ptr<GaugeField> shifted
+      = nullptr;             // shifted copy of the gauge field, used for double-store enabled dslash
+    bool is_shifted = false; // whether this instance is a shifted one
 
     mutable array<quda_ptr, 2 *QUDA_MAX_DIM> ghost
       = {}; // stores the ghost zone of the gauge field (non-native fields only)
@@ -658,6 +661,11 @@ namespace quda {
     */
     GaugeField &shift(int shift = -1) const;
 
+    /**
+       @brief Releases the cached shifted copy of this field (if it exists); the next call to shift() rebuilds it.
+    */
+    void shift_reset() const;
+
     /**
      * @brief Print the site data
      * @param[in] parity Parity index
diff --git a/include/kernels/gauge_shift.cuh b/include/kernels/gauge_shift.cuh
index f64134d09b..4726258242 100644
--- a/include/kernels/gauge_shift.cuh
+++ b/include/kernels/gauge_shift.cuh
@@ -42,9 +42,8 @@ namespace quda
 
       if constexpr (!Arg::verify) {
         typename Arg::RawLink link;
-        if (x[dir] < arg.shift
-            && arg.comms_dim_partitioned[dir]) { // on the boundary so we need to fetch from the ghost zone
-          const int ghost_idx = ghostFaceIndexStaggered<0>(x, arg.X, dir, arg.shift);
+        if (x[dir] < arg.shift && arg.comms_dim_partitioned[dir]) { // on boundary so we fetch from ghost
+          const int ghost_idx = ghostFaceIndexStaggered<0>(x, arg.X, dir, 1);
           arg.in.raw_load(link, arg.volume_cb + ghost_idx, dir, 1 - parity);
           arg.out.raw_save(link, x_cb, dir, parity);
         } else { // simple shift
@@ -64,7 +63,7 @@ namespace quda
         // verify the shifting has worked
         using Link = typename Arg::Link;
         if (x[dir] < arg.shift && arg.comms_dim_partitioned[dir]) {
-          const int ghost_idx = ghostFaceIndexStaggered<0>(x, arg.X, dir, arg.shift);
+          const int ghost_idx = ghostFaceIndexStaggered<0>(x, arg.X, dir, 1);
           Link in = arg.in(dir, arg.volume_cb + ghost_idx, 1 - parity);
           Link out = arg.out(dir, x_cb, parity);
           assert(in.L1() == out.L1());
diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh
index bff5bc3c36..b1a45a5c85 100644
--- a/include/kernels/laplace.cuh
+++ b/include/kernels/laplace.cuh
@@ -90,7 +90,6 @@ namespace quda
 
           if (doHalo<kernel_type>(d) && ghost) {
 
-            // const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dc.X, d, 1);
             const int ghost_idx = ghostFaceIndex<1>(coord, arg.dc.X, d, arg.nFace);
             const Link U = arg.U(d, coord.x_cb, parity);
             const Vector in = arg.halo.Ghost(d, 1, ghost_idx + src_idx * arg.dc.ghostFaceCB[d], their_spinor_parity);
@@ -115,7 +114,6 @@ namespace quda
 
           if (doHalo<kernel_type>(d) && ghost) {
 
-            // const int ghost_idx = ghostFaceIndexStaggered<0>(coord, arg.dc.X, d, 1);
             const int ghost_idx = ghostFaceIndex<0>(coord, arg.dc.X, d, arg.nFace);
 
             const Link U = arg.U.Ghost(d, ghost_idx, 1 - parity);
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 3638c2a64a..2e4e0f7e34 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -282,6 +282,8 @@ namespace quda {
     anisotropy = std::exchange(src.anisotropy, 0.0);
     tadpole = std::exchange(src.tadpole, 0.0);
     fat_link_max = std::exchange(src.fat_link_max, 0.0);
+    shifted = std::exchange(src.shifted, nullptr);
+    is_shifted = std::exchange(src.is_shifted, false);
     for (auto i = 0; i < ghost.size(); i++) ghost[i].exchange(src.ghost[i], {});
     ghostFace = std::exchange(src.ghostFace, {});
     staggeredPhaseType = std::exchange(src.staggeredPhaseType, QUDA_STAGGERED_PHASE_INVALID);
@@ -1444,11 +1446,18 @@ namespace quda {
   GaugeField& GaugeField::shift(int shift_offset) const
   {
     if (shift_offset == -1) shift_offset = nFace;
+    if (shift_offset != 1 && shift_offset != 3) errorQuda("Invalid shift_offset = %d", shift_offset);
+    if (is_shifted) errorQuda("Cannot shift a shifted field");
     // If we don't yet have a cached shifted copy or the shift value changed
     if (!shifted) shifted = std::make_unique<GaugeField>(::quda::shift(*this, shift_offset));
     return *shifted;
   }
 
+  void GaugeField::shift_reset() const
+  {
+    shifted.reset(); // drop the cached shifted copy; this is a no-op when none is held
+  }
+
   void GaugeField::PrintMatrix(int dim, int parity, unsigned int x_cb, int rank) const
   {
     genericPrintMatrix(*this, dim, parity, x_cb, rank);
diff --git a/lib/gauge_shift.cu b/lib/gauge_shift.cu
index db96c23718..93ccd090d0 100644
--- a/lib/gauge_shift.cu
+++ b/lib/gauge_shift.cu
@@ -62,9 +62,11 @@ namespace quda
     if (in.GhostExchange() == QUDA_GHOST_EXCHANGE_NO && comm_partitioned())
       errorQuda("comm_dim_partition() == true requires we have GhostExchange = QUDA_GHOST_EXCHANGE_PAD");
     GaugeFieldParam param(in);
-    param.create = QUDA_ZERO_FIELD_CREATE;
+    param.create = QUDA_NULL_FIELD_CREATE;
     GaugeField out(param);
     const_cast<double&>(out.LinkMax()) = in.LinkMax();
+    out.is_shifted = true;
+
     instantiate<GaugeShifter>(out, in, shift, false);
 #if 0 // set to 1 to run verification
     instantiate<GaugeShifter>(out, in, shift, true);

From b0f2a8663fd3d56ebdce9daae4bb6a8c1819fafd Mon Sep 17 00:00:00 2001
From: maddyscientist <mclark@nvidia.com>
Date: Wed, 11 Mar 2026 23:18:42 -0700
Subject: [PATCH 121/121] Revert "Temporary change of default prefetch type on
 sm100 while doing some bug hunting"

This reverts commit 8c7ba4d1580a35372ec4370ee5595fb013f990fe.
---
 lib/targets/cuda/target_cuda.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/targets/cuda/target_cuda.cmake b/lib/targets/cuda/target_cuda.cmake
index 0a3067636d..8aa7c18936 100644
--- a/lib/targets/cuda/target_cuda.cmake
+++ b/lib/targets/cuda/target_cuda.cmake
@@ -189,8 +189,8 @@ set(_dslash_prefetch_dist_s_default 0)
 
 # These are expected Blackwell+ defaults
 if(QUDA_COMPUTE_CAPABILITY GREATER_EQUAL 100)
-  set(_dslash_double_store_default OFF)
-  set(_dslash_prefetch_type_default THREAD)
+  set(_dslash_double_store_default ON)
+  set(_dslash_prefetch_type_default BULK)
   set(_dslash_prefetch_dist_w_default 2)
   set(_dslash_prefetch_dist_s_default 2)
 endif()