Skip to content

Commit 00d2e9f

Browse files
authored
[Backend] Fix more tt.scan layout issues (#9189)
Following #9185, I asked codex to find other issues with regression tests. It hacked around the issue, but this was enough for me to find the real issue and fix it properly. Great teamwork. We should generally audit other uses of `linearize`/`delinearize`, as the ones that use the legacy APIs will most likely be broken when used with broadcasted layouts.
1 parent 23e4085 commit 00d2e9f

4 files changed

Lines changed: 77 additions & 3 deletions

File tree

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,9 @@ Value linearize(RewriterBase &rewriter, Location loc, ArrayRef<Value> multiDim,
436436
Value linearize(RewriterBase &rewriter, Location loc, ArrayRef<Value> multiDim,
437437
ArrayRef<unsigned> shape);
438438

439+
Value linearize(RewriterBase &rewriter, Location loc, ArrayRef<Value> multiDim,
440+
triton::gpu::LinearEncodingAttr encoding, StringAttr dimName);
441+
439442
size_t linearize(ArrayRef<unsigned> multiDim, ArrayRef<unsigned> shape,
440443
ArrayRef<unsigned> order);
441444

lib/Conversion/TritonGPUToLLVM/ScanOpToLLVM.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -537,9 +537,9 @@ ScanOpConversion::emitFastScan(triton::ScanOp op, triton::ScanOpAdaptor adaptor,
537537
std::get<0>(getMultiDimLaneId(rewriter, helper, laneId));
538538
multiDimLaneId[helper.getAxis()] = b.i32_val(scanDim - 1);
539539
auto linearEncoding = helper.getEncoding();
540-
auto threadsPerWarp = linearEncoding.getThreadsPerWarp();
541-
auto laneIdLast = linearize(rewriter, loc, multiDimLaneId, threadsPerWarp,
542-
helper.getOrder());
540+
auto kLane = StringAttr::get(rewriter.getContext(), "lane");
541+
Value laneIdLast =
542+
linearize(rewriter, loc, multiDimLaneId, linearEncoding, kLane);
543543
AddPartialReduceOneWarp(srcValues, rewriter, targetInfo, helper, warpIdAxis,
544544
laneIdAxis, laneIdLast);
545545
} // else axisNumWarps == 1 and srcValues.size() == 1, nothing to do.

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1229,6 +1229,45 @@ Value pext_i32(RewriterBase &rewriter, Location loc, Value a, uint32_t mask) {
12291229
return result;
12301230
}
12311231

1232+
// Puts the bits of `a` that are set in `mask` into the bits of `result`
1233+
Value pdep_i32(RewriterBase &rewriter, Location loc, Value a, uint32_t mask) {
1234+
auto b = TritonLLVMOpBuilder(loc, rewriter);
1235+
assert(a.getType() == i32_ty && "a must be i32");
1236+
1237+
if (mask == 0)
1238+
return b.i32_val(0);
1239+
assert(mask < 64 && "mask must be less than 64");
1240+
1241+
// Blocked algorithm (same grouping trick as the pext example).
1242+
uint32_t mskConst = mask;
1243+
uint32_t depcnt = 0; // how many source bits from `a` we've consumed
1244+
Value result = b.i32_val(0);
1245+
1246+
while (mskConst) {
1247+
uint32_t oldmsk = mskConst;
1248+
1249+
// Isolate lsb set bit, then clear the lowest contiguous run of 1s.
1250+
uint32_t bitgrplsb = mskConst & (~mskConst + 1); // m & -m
1251+
mskConst &= (bitgrplsb + mskConst);
1252+
uint32_t bitgrp = mskConst ^ oldmsk; // the cleared run (contiguous 1s)
1253+
1254+
// Group start position and length.
1255+
uint32_t lsbpos = __builtin_ctz(bitgrplsb);
1256+
uint32_t grplen = __builtin_ctz(~(bitgrp >> lsbpos));
1257+
1258+
// Align the next grplen bits of `a` to the group's lsb, then mask to the
1259+
// group.
1260+
uint32_t shift =
1261+
lsbpos - depcnt; // non-negative invariant for this traversal order
1262+
depcnt += grplen;
1263+
1264+
Value deposited = b.and_(b.shl(a, b.i32_val(shift)), b.i32_val(bitgrp));
1265+
result = b.or_(result, deposited);
1266+
}
1267+
1268+
return result;
1269+
}
1270+
12321271
std::tuple<SmallVector<Value>, Value>
12331272
delinearize(RewriterBase &rewriter, Location loc,
12341273
triton::gpu::DistributedEncodingTrait layout,
@@ -1344,6 +1383,20 @@ Value linearize(RewriterBase &rewriter, Location loc, ArrayRef<Value> multiDim,
13441383
return linear;
13451384
}
13461385

1386+
// Linearizes `multiDim` indices for the input dimension `dimName` of a
// linear encoding. Unlike the plain shape/order overload, this handles
// broadcasted layouts: index bits of `dimName` that are free variables of
// the layout are skipped, and the packed index is deposited into the
// remaining (non-free) bit positions.
Value linearize(RewriterBase &rewriter, Location loc, ArrayRef<Value> multiDim,
                triton::gpu::LinearEncodingAttr encoding, StringAttr dimName) {
  auto order = encoding.orderPerDim(dimName, encoding.getOrder());
  auto sizes = encoding.basesPerDim(dimName);
  auto result = linearize(rewriter, loc, multiDim, sizes, order);

  auto layout = encoding.getLinearLayout();
  int32_t freeBits = layout.getFreeVariableMasks().lookup(dimName);
  if (freeBits != 0) {
    // Scatter the packed index onto the non-free bits of the dimension.
    int32_t usedBits = ~freeBits & (layout.getInDimSize(dimName) - 1);
    result = pdep_i32(rewriter, loc, result, usedBits);
  }
  return result;
}
1399+
13471400
size_t linearize(ArrayRef<unsigned> multiDim, ArrayRef<unsigned> shape,
13481401
ArrayRef<unsigned> order) {
13491402
size_t linear = 0;

python/test/gluon/test_lowerings.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,24 @@ def test_scan_blocked_broadcast_layout(device):
111111
torch.testing.assert_close(y, torch.cumsum(x, dim=0))
112112

113113

114+
def test_scan_blocked_broadcast_layout_multiblock(device):
    # Regression test: cumulative sum over a blocked layout that broadcasts
    # along the lane dimension for dim 1 while splitting axis 0 across
    # multiple scan blocks.
    if not is_cuda():
        pytest.skip("requires CUDA")
    if THREADS_PER_WARP != 32:
        pytest.skip("requires 32-thread warps")

    num_rows = 64
    # Broadcasting in lane for dim1 and multiple scan blocks along axis 0.
    layout = ttgl.BlockedLayout([2, 4], [16, 2], [1, 2], [1, 0])

    torch.manual_seed(0)
    x = torch.randn((num_rows, 1), dtype=torch.float32, device=device)
    out = torch.empty_like(x)
    scan_kernel[(1, )](x, out, num_rows, 1, layout, 0, num_warps=2)

    expected = torch.cumsum(x, dim=0)
    torch.testing.assert_close(out, expected)
130+
131+
114132
def _reduce_linear_layouts():
115133
if THREADS_PER_WARP == 32:
116134
return [

0 commit comments

Comments
 (0)