Skip to content

Commit 54bed51

Browse files
[FPSAN] Fix fpsan crash with warp specialization + tmem (#9415)
FPSan replaces tensor memory with global scratch memory, but was missing correct handling of passing global memory pointers into the warp_specialize op's partition regions. Also, use the `-Ofc mid` PTX compilation mode for FPSan compilation.
1 parent 3e7c88c commit 54bed51

3 files changed

Lines changed: 87 additions & 5 deletions

File tree

lib/Dialect/TritonInstrument/Transforms/FpSanitizer.cpp

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,20 @@ namespace ttng = mlir::triton::nvidia_gpu;
2525

2626
namespace {
2727

28+
// Returns true if `value` is visible inside `scope`: i.e. its home region
// (the region owning its block argument, or the region of its defining op)
// is `scope` itself or a region nested under `scope`.
static bool isValueAvailableInScope(Value value, Region *scope) {
  if (scope == nullptr)
    return false;
  // Resolve the region the value "lives" in, for both kinds of values.
  Region *homeRegion = nullptr;
  if (auto blockArg = dyn_cast<BlockArgument>(value))
    homeRegion = blockArg.getOwner()->getParent();
  else if (Operation *definingOp = value.getDefiningOp())
    homeRegion = definingOp->getParentRegion();
  if (homeRegion == nullptr)
    return false;
  return homeRegion == scope || scope->isAncestor(homeRegion);
}
41+
2842
constexpr int64_t kTileM = 8;
2943
constexpr int64_t kTileN = 8;
3044

@@ -162,6 +176,8 @@ class TmemScratchManager {
162176
return std::nullopt;
163177
}
164178

179+
ptr = remapToScope(ptr, rewriter, scope, loc);
180+
165181
ScratchInfo info{ptr, tensorTy};
166182
scratchMap[memdesc][scope] = info;
167183
return info;
@@ -189,8 +205,9 @@ class TmemScratchManager {
189205
rewriter, loc, rewriter.getI32IntegerAttr(stride));
190206
auto offsetEls = arith::MulIOp::create(
191207
rewriter, loc, rewriter.getI32Type(), offsetVal, strideVal);
192-
auto ptr = tt::AddPtrOp::create(rewriter, loc, baseInfo->ptr.getType(),
193-
baseInfo->ptr, offsetEls);
208+
Value ptr = tt::AddPtrOp::create(rewriter, loc, baseInfo->ptr.getType(),
209+
baseInfo->ptr, offsetEls);
210+
ptr = remapToScope(ptr, rewriter, scope, loc);
194211
auto layout = getScratchEncoding(rewriter, memdesc, memTy);
195212
auto tensorTy = RankedTensorType::get(memTy.getShape(),
196213
memTy.getElementType(), layout);
@@ -218,8 +235,9 @@ class TmemScratchManager {
218235
rewriter, loc, rewriter.getI32IntegerAttr(stride));
219236
auto offset = arith::MulIOp::create(rewriter, loc, rewriter.getI32Type(),
220237
idx, strideVal);
221-
auto ptr = tt::AddPtrOp::create(rewriter, loc, baseInfo->ptr.getType(),
222-
baseInfo->ptr, offset);
238+
Value ptr = tt::AddPtrOp::create(rewriter, loc, baseInfo->ptr.getType(),
239+
baseInfo->ptr, offset);
240+
ptr = remapToScope(ptr, rewriter, scope, loc);
223241
auto layout = getScratchEncoding(rewriter, memdesc, memTy);
224242
auto tensorTy = RankedTensorType::get(memTy.getShape(),
225243
memTy.getElementType(), layout);
@@ -241,6 +259,7 @@ class TmemScratchManager {
241259
if (ptr.getType() != ptrTy) {
242260
ptr = tt::BitcastOp::create(rewriter, loc, ptrTy, ptr);
243261
}
262+
ptr = remapToScope(ptr, rewriter, scope, loc);
244263

245264
auto layout = getScratchEncoding(rewriter, memdesc, memTy);
246265
auto tensorTy = RankedTensorType::get(memTy.getShape(),
@@ -254,6 +273,36 @@ class TmemScratchManager {
254273
}
255274

256275
private:
276+
// Returns a value that is legal to use inside `scope`. If `value` is already
// visible there (defined in `scope` or an enclosing region), it is returned
// unchanged. When `scope` is a warp-specialize partition region, the value is
// routed in as an explicit capture of the partitions op and the matching
// region argument is returned. For any other kind of scope the value is
// returned as-is.
Value remapToScope(Value value, PatternRewriter &rewriter, Region *scope,
                   Location loc) {
  if (!scope || isValueAvailableInScope(value, scope))
    return value;

  // Only warp-specialize partition regions need explicit capture plumbing.
  // dyn_cast_or_null already tolerates a null parent op, so no extra guard
  // is needed around getParentOp().
  auto partitions =
      dyn_cast_or_null<ttg::WarpSpecializePartitionsOp>(scope->getParentOp());
  if (!partitions)
    return value;

  // Reuse an existing capture of the same value if one is already present.
  unsigned captureIdx = partitions.getNumOperands();
  for (auto [i, capture] :
       llvm::enumerate(partitions.getExplicitCaptures())) {
    if (capture == value) {
      captureIdx = i;
      break;
    }
  }

  // Not captured yet: append the value as a new explicit capture and add a
  // matching block argument to every partition region so that capture
  // operands and region arguments stay index-aligned.
  if (captureIdx == partitions.getNumOperands()) {
    partitions->insertOperands(captureIdx, value);
    for (Region &region : partitions.getPartitionRegions())
      region.addArgument(value.getType(), loc);
  }

  return scope->getArgument(captureIdx);
}
305+
257306
DenseMap<Value, DenseMap<Region *, ScratchInfo>> scratchMap;
258307
};
259308

test/TritonGPU/fpsan.mlir

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,3 +276,36 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
276276
tt.return
277277
}
278278
}
279+
280+
// -----
281+
282+
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 32}>
283+
#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
284+
#smem = #ttg.shared_memory
285+
#blocked = #ttg.blocked<{sizePerThread = [1, 64], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
286+
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>
287+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:90", ttg.tensor_memory_size = 0 : i32, "ttg.threads-per-warp" = 32 : i32, "ttg.total-num-warps" = 8 : i32} {
288+
// CHECK-LABEL: @ws_partition_tmem_load
289+
tt.func public @ws_partition_tmem_load() {
290+
// CHECK: %[[SCRATCH:.*]] = ttg.global_scratch_alloc
291+
// CHECK: ttg.warp_specialize(%{{.*}}, %{{.*}}, %{{.*}}, %[[SCRATCH]])
292+
// CHECK: partition0(%{{.*}}: !ttg.memdesc<1xi64, #{{[^,>]+}}, #smem, mutable>, %{{.*}}: !ttg.memdesc<128x128xf32, #{{[^,>]+}}, #smem, mutable>, %{{.*}}: !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, %[[SCRATCH_ARG:.*]]: !tt.ptr<f32>) num_warps(4)
293+
// CHECK: %[[PTRS:.*]] = tt.splat %[[SCRATCH_ARG]] : !tt.ptr<f32> -> tensor<128x128x!tt.ptr<f32>, #blocked>
294+
// CHECK: tt.load
295+
// CHECK: ttg.local_store
296+
// CHECK-NOT: ttng.tmem_load
297+
%bar = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<1xi64, #shared1, #smem, mutable>
298+
%smem = ttg.local_alloc {allocation.offset = 4096 : i32} : () -> !ttg.memdesc<128x128xf32, #shared, #smem, mutable>
299+
%buf = ttng.tmem_alloc {tensor_memory_col_offset = 0 : i32, tensor_memory_row_offset = 0 : i32} : () -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
300+
ttg.warp_specialize(%bar, %smem, %buf) attributes {actualRegisters = array<i32: 32, 32>, allocation.offset = 512 : i32, requestedRegisters = array<i32: 32>, warpGroupStartIds = array<i32: 4>}
301+
default {
302+
ttg.warp_yield
303+
}
304+
partition0(%arg0: !ttg.memdesc<1xi64, #shared1, #smem, mutable>, %arg1: !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, %arg2: !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>) num_warps(4) {
305+
%val = ttng.tmem_load %arg2 : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked>
306+
ttg.local_store %val, %arg1 : tensor<128x128xf32, #blocked> -> !ttg.memdesc<128x128xf32, #shared, #smem, mutable>
307+
ttg.warp_return
308+
} : (!ttg.memdesc<1xi64, #shared1, #smem, mutable>, !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>) -> ()
309+
tt.return
310+
}
311+
}

third_party/nvidia/backend/compiler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -499,7 +499,7 @@ def make_cubin(self, src, metadata, opt, capability):
499499
ptx_extra_options = opt.ptx_options.split(" ") if opt.ptx_options else []
500500

501501
# Use -Ofc mid to compile ConSan/FPSan code, if nothing else is specified.
502-
if "consan" in knobs.compilation.instrumentation_mode:
502+
if any(mode in knobs.compilation.instrumentation_mode for mode in ["consan", "fpsan"]):
503503
ptx_extra_options += ["-Ofc", "mid"]
504504

505505
# Add --regAllocOptLevel=2 to work around ptxas 13.x bug

0 commit comments

Comments
 (0)