Skip to content

Commit 746064c

Browse files
Authored by pawelszczerbuk, with co-authors root and Codex
[FPSAN] Fix crash on incorrect layout for tmem copy (#10046)
The TMEMCopy pattern was using a stale tmem encoding. This could cause a crash in the validator when the encodings mismatched.

---------

Co-authored-by: root <root@codex-gb200-0.brix.pawelszczerbuk.svc.cluster.local>
Co-authored-by: Codex <noreply@openai.com>
1 parent 6ea516a commit 746064c

File tree

2 files changed

+72
-3
lines changed

2 files changed

+72
-3
lines changed

lib/Dialect/TritonInstrument/Transforms/FpSanitizer.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1856,9 +1856,11 @@ struct TMEMCopyPattern : public OpRewritePattern<ttng::TMEMCopyOp> {
18561856

18571857
auto loc = op.getLoc();
18581858
auto srcMemTy = cast<ttg::MemDescType>(op.getSrc().getType());
1859-
auto srcRegTy =
1860-
RankedTensorType::get(srcMemTy.getShape(), srcMemTy.getElementType(),
1861-
info->tensorType.getEncoding());
1859+
auto dstMemTy = cast<ttg::MemDescType>(op.getDst().getType());
1860+
auto srcEncoding =
1861+
scratch->getScratchEncoding(rewriter, op.getDst(), dstMemTy);
1862+
auto srcRegTy = RankedTensorType::get(
1863+
srcMemTy.getShape(), srcMemTy.getElementType(), srcEncoding);
18621864
Value srcReg =
18631865
ttg::LocalLoadOp::create(rewriter, loc, srcRegTy, op.getSrc(), Value())
18641866
.getResult();

python/test/gluon/test_fpsan.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
TensorMemoryScalesLayout,
1515
allocate_tensor_memory,
1616
mbarrier,
17+
tcgen05_commit,
18+
tcgen05_copy,
1719
tcgen05_mma,
1820
tcgen05_mma_scaled,
1921
)
@@ -1723,6 +1725,71 @@ def kernel(x_ptr, out_ptr):
17231725
_assert_payload_equal(out, exp_bits)
17241726

17251727

1728+
@pytest.mark.skipif(not is_blackwell(), reason="Requires Blackwell")
1729+
def test_tmem_copy_scales_in_warp_specialize_partition(device, fresh_knobs):
1730+
_require_cuda_backend(device)
1731+
1732+
smem_h = 64
1733+
smem_w = 16
1734+
SMEM_H = gl.constexpr(smem_h)
1735+
SMEM_W = gl.constexpr(smem_w)
1736+
1737+
fresh_knobs.compilation.instrumentation_mode = "fpsan"
1738+
1739+
@gluon.jit
1740+
def copy_partition(smem, tmem, bar):
1741+
tcgen05_copy(smem, tmem)
1742+
tcgen05_commit(bar)
1743+
1744+
@gluon.jit
1745+
def default_partition():
1746+
pass
1747+
1748+
@gluon.jit
1749+
def kernel(in_ptr, out_ptr):
1750+
blocked: gl.constexpr = gl.BlockedLayout([1, 4], [32, 1], [gl.num_warps(), 1], [1, 0])
1751+
in_ptrs = (in_ptr + gl.arange(0, SMEM_H)[:, None] * SMEM_W + gl.arange(0, SMEM_W)[None, :])
1752+
value = gl.load(gl.set_auto_layout(in_ptrs, blocked))
1753+
1754+
smem_layout: gl.constexpr = gl.SharedLinearLayout(offset_bases=[
1755+
[0, 1],
1756+
[0, 2],
1757+
[32, 0],
1758+
[0, 4],
1759+
[1, 0],
1760+
[2, 0],
1761+
[4, 0],
1762+
[8, 0],
1763+
[16, 0],
1764+
[0, 8],
1765+
])
1766+
smem = gl.allocate_shared_memory(gl.int8, (SMEM_H, SMEM_W), layout=smem_layout)
1767+
smem.store(value)
1768+
1769+
tmem_layout: gl.constexpr = TensorMemoryScalesLayout()
1770+
tmem = allocate_tensor_memory(gl.int8, (SMEM_H, SMEM_W), layout=tmem_layout)
1771+
bar = gl.allocate_shared_memory(gl.int64, [1], gl.constexpr(mbarrier.MBarrierLayout()))
1772+
mbarrier.init(bar, count=1)
1773+
1774+
gl.warp_specialize(
1775+
[
1776+
(default_partition, ()),
1777+
(copy_partition, (smem, tmem, bar)),
1778+
],
1779+
[1],
1780+
[32],
1781+
)
1782+
1783+
mbarrier.wait(bar, phase=0)
1784+
mbarrier.invalidate(bar)
1785+
gl.store(out_ptr, 1)
1786+
1787+
x = torch.randint(size=(smem_h, smem_w), low=-100, high=100, dtype=torch.int8, device=device)
1788+
out = torch.empty((), device=device, dtype=torch.int32)
1789+
kernel[(1, )](x, out, num_warps=4)
1790+
torch.testing.assert_close(out, torch.ones_like(out))
1791+
1792+
17261793
@pytest.mark.skipif(not is_blackwell(), reason="Requires Blackwell")
17271794
def test_tmem_store_in_warp_specialize_partition_visible_to_parent(device, fresh_knobs):
17281795
_require_cuda_backend(device)

0 commit comments

Comments (0)