@@ -1080,3 +1080,57 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
10801080 tt.return %2 : tensor <64 x32 xf32 , #mma >
10811081 }
10821082}

// -----

#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 32, 16]}>
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 16}>
#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = true, elementBitWidth = 16}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
  // Test: the B operand of the warp_group_dot (%arg5) is an iter_arg of the
  // OUTER loop, forwarded unchanged each outer iteration. The pass must still
  // pipeline the dot in the inner loop (wait with pendings = 1 inside it) and
  // emit a final wait with pendings = 0 after the loops.
  // CHECK-LABEL: dot_outer_loop_arg
  // CHECK: scf.for
  // CHECK-NEXT: scf.for
  // CHECK-NEXT: ttng.warp_group_dot
  // CHECK-NEXT: ttng.warp_group_dot_wait {{.*}} {pendings = 1 : i32}
  // CHECK-NEXT: scf.yield
  // CHECK: ttng.warp_group_dot_wait {{.*}} {pendings = 0 : i32}
  tt.func public @dot_outer_loop_arg(%arg0: i32, %arg2: !ttg.memdesc<64x32xbf16, #shared, #smem, mutable>, %arg3: !ttg.memdesc<32x32xbf16, #shared1, #smem, mutable>) -> tensor<64x32xf32, #mma> {
    %c0_i32 = arith.constant 0 : i32
    %c32_i32 = arith.constant 32 : i32
    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma>
    %outer:2 = scf.for %arg4 = %c0_i32 to %arg0 step %c32_i32 iter_args(%arg5 = %arg3, %arg8 = %cst_0) -> (!ttg.memdesc<32x32xbf16, #shared1, #smem, mutable>, tensor<64x32xf32, #mma>) : i32 {
      %0 = scf.for %arg6 = %c0_i32 to %arg0 step %c32_i32 iter_args(%arg7 = %arg8) -> (tensor<64x32xf32, #mma>) : i32 {
        %1 = ttng.warp_group_dot %arg2, %arg5, %arg7 {inputPrecision = 0 : i32} : !ttg.memdesc<64x32xbf16, #shared, #smem, mutable> * !ttg.memdesc<32x32xbf16, #shared1, #smem, mutable> -> tensor<64x32xf32, #mma>
        scf.yield %1 : tensor<64x32xf32, #mma>
      }
      scf.yield %arg5, %0 : !ttg.memdesc<32x32xbf16, #shared1, #smem, mutable>, tensor<64x32xf32, #mma>
    }
    tt.return %outer#1 : tensor<64x32xf32, #mma>
  }
}

// -----

#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 32, 16]}>
#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 16}>
#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = true, elementBitWidth = 16}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
  // Test: the B operand (%arg5) forms a trivial iter_arg cycle — it is yielded
  // back to itself every iteration. The pipeliner must still handle this and
  // emit an in-loop wait with pendings = 1 plus a final wait with pendings = 0.
  // CHECK-LABEL: loop_arg_cycle
  // CHECK: scf.for
  // CHECK-NEXT: ttng.warp_group_dot
  // CHECK-NEXT: ttng.warp_group_dot_wait {{.*}} {pendings = 1 : i32}
  // CHECK-NEXT: scf.yield
  // CHECK: ttng.warp_group_dot_wait {{.*}} {pendings = 0 : i32}
  tt.func public @loop_arg_cycle(%arg0: i32, %arg2: !ttg.memdesc<64x32xbf16, #shared, #smem, mutable>, %arg3: !ttg.memdesc<32x32xbf16, #shared1, #smem, mutable>) -> tensor<64x32xf32, #mma> {
    %c0_i32 = arith.constant 0 : i32
    %c32_i32 = arith.constant 32 : i32
    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma>
    %0:2 = scf.for %arg4 = %c0_i32 to %arg0 step %c32_i32 iter_args(%arg5 = %arg3, %arg7 = %cst_0) -> (!ttg.memdesc<32x32xbf16, #shared1, #smem, mutable>, tensor<64x32xf32, #mma>) : i32 {
      %1 = ttng.warp_group_dot %arg2, %arg5, %arg7 {inputPrecision = 0 : i32} : !ttg.memdesc<64x32xbf16, #shared, #smem, mutable> * !ttg.memdesc<32x32xbf16, #shared1, #smem, mutable> -> tensor<64x32xf32, #mma>
      scf.yield %arg5, %1 : !ttg.memdesc<32x32xbf16, #shared1, #smem, mutable>, tensor<64x32xf32, #mma>
    }
    tt.return %0#1 : tensor<64x32xf32, #mma>
  }
}