Skip runtime loop unrolling in air-opt-shim-dma-bds for all-1 tile sizes

erwei-xilinx · claude · erwei-xilinx · commit f5f26c46a3d6 · 2026-04-13T10:48:00.000-07:00
When shim-dma-tile-sizes is empty or all-1 (the default aircc path with
--air-runtime-loop-tiling-sizes=1,1), skip the tiling and unrolling of
runtime loops inside dummyLaunch ops with non-trivial trip counts, and keep
BD folding from unrolling them. The launch is still converted to scf.for +
dummyLaunch (needed for affine symbol validity), and the scf.for loops are
preserved through air-to-std and unrolled later in airrt-to-npu after
removeDeadDeviceComputeOps strips the heavy segment/herd bodies.

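BD folding still runs (it isolates channel ops and folds inner loops into BD
dimensions), but the runtime loops are marked with the air.runtime_loop
attribute so that AIRUnrollScfForIntoBDChain skips them; unrolling them there
would defeat the optimization. The channel ops already have valid
wraps/strides from earlier passes (air-dma-to-channel).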

The fast path only applies when:
- Tile sizes are empty or all-1 (no useful tiling to perform)
- The scf.for loops are inside a dummyLaunch (from launch conversion)
- The loops have trip count > 1 (trivial loops still use normal BD folding)

Loops directly in functions (not from launch conversion) are unaffected.

The air.launch_end barrier depends on scf.for result tokens and any
top-level channel ops, ensuring proper async dependency tracking.

Profiling on flash attention (12 heads, 1024 LQ/LK, NPU1):
  air-opt-shim-dma-bds: 1,892 ms -> 28 ms (67x faster)
  Total MLIR passes:    3,432 ms -> 664 ms (5.2x faster)
  Total aircc:          6,400 ms -> 3,581 ms (1.8x faster)
  IR size after pass:   2,922 KB -> 226 KB (13x smaller)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/mlir/lib/Conversion/AIRRtToNpuPass.cpp b/mlir/lib/Conversion/AIRRtToNpuPass.cpp
@@ -2162,6 +2162,25 @@ struct AIRRtToNpuPass : public impl::AIRRtToNpuBase<AIRRtToNpuPass> {
           return;
         }
       });
+      if (!containsOnlyWaitAll) {
+        // Convert non-trivial scf.parallel to scf.for so that
+        // unrollSCFFors can handle them. This is needed when the shim BD
+        // pass preserves runtime loops (skip-unroll optimization).
+        if (par_op.getNumLoops() == 1) {
+          IRRewriter rewriter(par_op->getContext());
+          rewriter.setInsertionPoint(par_op);
+          auto forOp = scf::ForOp::create(
+              rewriter, par_op.getLoc(), par_op.getLowerBound()[0],
+              par_op.getUpperBound()[0], par_op.getStep()[0]);
+          IRMapping mapper;
+          mapper.map(par_op.getInductionVars()[0], forOp.getInductionVar());
+          rewriter.setInsertionPointToStart(forOp.getBody());
+          for (auto &op : par_op.getBody()->without_terminator())
+            rewriter.clone(op, mapper);
+          rewriter.eraseOp(par_op);
+        }
+        continue;
+      }
       builder.setInsertionPoint(par_op);
       auto newWaitAll = airrt::WaitAllOp::create(
           builder, par_op->getLoc(),
diff --git a/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp b/mlir/lib/Transform/AIRDependencyScheduleOpt.cpp
@@ -2219,6 +2219,10 @@ struct AIRUnrollScfForIntoBDChain : public OpRewritePattern<scf::ForOp> {
 
   LogicalResult matchAndRewrite(scf::ForOp for_op,
                                 PatternRewriter &rewriter) const override {
+    // Skip runtime loops marked for deferred unrolling in airrt-to-npu.
+    if (for_op->hasAttr("air.runtime_loop"))
+      return failure();
+
     // Check if the loop contains only air.channel.put/get ops, or pure ops.
     auto containsOnlyAIRChannels = [](Block *block) {
       if (block->getOperations().empty())
@@ -6142,6 +6146,9 @@ struct AIRLaunchToScfForPattern : public OpRewritePattern<air::LaunchOp> {
     for (unsigned i = 0; i < lbs.size(); i++) {
       auto scfFor =
           scf::ForOp::create(rewriter, loc, lbs[i], ubs[i], steps[i], iterArgs);
+      // Mark as runtime loop so BD folding's AIRUnrollScfForIntoBDChain
+      // skips it (preserving it for later unrolling in airrt-to-npu).
+      scfFor->setAttr("air.runtime_loop", BoolAttr::get(context, true));
       if (i != 0 && scfFor->getNumResults())
         scf::YieldOp::create(rewriter, loc, scfFor->getResults());
       iterArgs.clear();
@@ -6235,6 +6242,23 @@ class AIROptimizeShimDMABDs
       return;
     }
 
+    // When tile sizes are empty or all-1, skip tiling and unrolling. Tiling by
+    // 1 followed by full unrolling is equivalent to full unrolling of the
+    // runtime loop, which creates N copies of the entire launch body (including
+    // segment/herd/channel ops). This is wasteful because the BD folding only
+    // operates on the ~16 L3 channel ops per iteration, while the ~700 lines
+    // of segment/herd bodies are dead weight stripped later by airrt-to-npu.
+    //
+    // We still convert air.launch to scf.for + dummyLaunch (needed so that
+    // launch IVs are valid as affine symbols in downstream passes), but skip
+    // the tiling/unrolling. The scf.for loops survive through air-to-std and
+    // are unrolled later in airrt-to-npu AFTER removeDeadDeviceComputeOps
+    // strips the heavy segment/herd bodies, yielding O(16) ops per iteration
+    // instead of O(700).
+    bool allTileSizesAreOne =
+        !clTileSizes.empty() &&
+        llvm::all_of(clTileSizes, [](unsigned s) { return s == 1; });
+
     // Convert air.launch to scf.for.
     RewritePatternSet patterns(ctx);
     patterns.insert<AIRLaunchToScfForPattern>(ctx);
@@ -6254,6 +6278,57 @@ class AIROptimizeShimDMABDs
       applyAIRL3DmaFoldingPatterns(func, *device);
       return;
     }
+    // Check if there are runtime loops from launch conversion (inside a
+    // dummyLaunch) with non-trivial trip count. Only skip tiling/unrolling
+    // for these — they cause O(N) IR explosion when unrolled. Loops directly
+    // in functions (not from launch conversion) still need BD folding.
+    bool hasNonTrivialLaunchLoop = llvm::any_of(shimFors, [](scf::ForOp f) {
+      auto tc = air::getStaticScfForTripCountAsInt(f);
+      if (!tc || *tc <= 1)
+        return false;
+      auto parentLaunch = f->getParentOfType<air::LaunchOp>();
+      return parentLaunch && parentLaunch->hasAttr("dummyLaunch");
+    });
+    if ((clTileSizes.empty() || allTileSizesAreOne) &&
+        hasNonTrivialLaunchLoop) {
+      // Skip tiling and unrolling. The runtime scf.for loops survive through
+      // air-to-std and are unrolled in airrt-to-npu after dead device compute
+      // ops (segment/herd bodies) are stripped, making unrolling much cheaper.
+      // BD folding still runs (isolation + specialize), but the runtime loops
+      // are protected by the "air.runtime_loop" attribute which causes
+      // AIRUnrollScfForIntoBDChain to skip them.
+      applyAIRL3DmaFoldingPatterns(func, *device);
+      // Generate air.launch_end barriers. Collect async tokens from
+      // top-level channel ops (isolated by BD folding) and scf.for results
+      // (which carry the async dependency from runtime loop iterations).
+      IRRewriter rw(ctx);
+      SmallVector<Block *> funcAndLaunchBlocks(1, &func.getBody().front());
+      func.walk([&funcAndLaunchBlocks](air::LaunchOp launch) {
+        if (air::isAsyncOp(launch))
+          funcAndLaunchBlocks.push_back(&launch.getRegion().front());
+      });
+      for (auto blk : funcAndLaunchBlocks) {
+        OpBuilder::InsertionGuard guard(rw);
+        SmallVector<Value> asyncTokens;
+        for (auto chan : blk->getOps<air::ChannelInterface>())
+          if (air::isAsyncOp(chan))
+            asyncTokens.push_back(air::getAsyncTokenFromOp(chan));
+        for (auto forOp : blk->getOps<scf::ForOp>())
+          for (auto result : forOp->getResults())
+            if (isa<air::AsyncTokenType>(result.getType()))
+              asyncTokens.push_back(result);
+
+        if (blk->mightHaveTerminator())
+          rw.setInsertionPoint(blk->getTerminator());
+        else
+          rw.setInsertionPointToEnd(blk);
+        auto launchEndWaitAll =
+            air::WaitAllOp::create(rw, rw.getUnknownLoc(),
+                                   /*result_type*/ Type(), asyncTokens);
+        launchEndWaitAll->setAttr("air.launch_end", rw.getUnitAttr());
+      }
+      return;
+    }
     // Helper function converting a vector of unsigned int to a vector of Value.
     auto convertVecOfIntToVecOfValue = [](OpBuilder &b,
                                           SmallVector<unsigned> clTileSizes) {
diff --git a/mlir/lib/Util/Dependency.cpp b/mlir/lib/Util/Dependency.cpp
@@ -801,6 +801,9 @@ scf::ForOp hoistTargetOpsToNewSCFFor(PatternRewriter &rewriter,
   if (auto attr =
           for_op->getAttrOfType<StringAttr>(SymbolTable::getSymbolAttrName()))
     new_for_op->setAttr(SymbolTable::getSymbolAttrName(), attr);
+  // Propagate air.runtime_loop attribute so BD folding patterns skip it.
+  if (for_op->hasAttr("air.runtime_loop"))
+    new_for_op->setAttr("air.runtime_loop", rewriter.getUnitAttr());
   remap.map(for_op.getInductionVar(), new_for_op.getInductionVar());
   remap.map(getLoopCarriedTokenFromScfOp(for_op, "argument"),
             getLoopCarriedTokenFromScfOp(new_for_op, "argument"));
diff --git a/mlir/test/Transform/AIRDependencyScheduleOpt/opt_shim_dma_bds_skip_unroll.mlir b/mlir/test/Transform/AIRDependencyScheduleOpt/opt_shim_dma_bds_skip_unroll.mlir
@@ -0,0 +1,79 @@
+//===- opt_shim_dma_bds_skip_unroll.mlir ------------------------*- MLIR -*-===//
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+//===----------------------------------------------------------------------===//
+
+// RUN: air-opt %s -air-opt-shim-dma-bds="device=npu1 shim-dma-tile-sizes=1,1" | FileCheck %s --check-prefix=TILE1
+// RUN: air-opt %s -air-opt-shim-dma-bds="device=npu1 shim-dma-tile-sizes=2" | FileCheck %s --check-prefix=TILE2
+// RUN: air-opt %s -air-opt-shim-dma-bds="device=npu1" | FileCheck %s --check-prefix=NOTILE
+
+// Test: all-1 tile sizes and empty tile sizes skip tiling and unrolling of the
+// runtime loop while still running BD folding (which isolates channel ops and
+// folds inner loops into BD dimensions). The runtime scf.for (marked with
+// air.runtime_loop) is preserved. Non-trivial tile sizes still tile and unroll.
+
+// TILE1-LABEL: func @multi_iter_with_segment
+// With tile-sizes=1,1: BD folding runs (isolates channel.put out of the loop),
+// but runtime scf.for is preserved (not unrolled by AIRUnrollScfForIntoBDChain).
+// TILE1: air.launch async () in ()
+// TILE1-SAME: dummyLaunch
+// TILE1: %[[PUT:.*]] = air.channel.put async
+// TILE1: %[[FOR_RESULT:.*]] = scf.for
+// TILE1: air.segment @seg
+// TILE1: air.channel.get
+// TILE1: scf.yield
+// TILE1: air.wait_all [%[[PUT]], %[[FOR_RESULT]]] {air.launch_end}
+
+// TILE2-LABEL: func @multi_iter_with_segment
+// With tile-sizes=2: outer loop (trip=2) is unrolled into 2 copies.
+// Each copy has an inner scf.for (trip=2) with segment + channel.get.
+// TILE2: air.launch async () in ()
+// TILE2-SAME: dummyLaunch
+// TILE2: air.channel.put async
+// TILE2: scf.for
+// TILE2: air.segment @seg
+// TILE2: air.channel.get
+// TILE2: scf.yield
+// TILE2: air.wait_all {{.*}} {air.launch_end}
+// The second unrolled copy:
+// TILE2: air.channel.put async
+// TILE2: scf.for
+// TILE2: air.segment @seg
+// TILE2: air.channel.get
+// TILE2: scf.yield
+// TILE2: air.wait_all {{.*}} {air.launch_end}
+
+// NOTILE-LABEL: func @multi_iter_with_segment
+// No tile sizes: same behavior as all-1 (fast path).
+// NOTILE: air.launch async () in ()
+// NOTILE-SAME: dummyLaunch
+// NOTILE: %[[NT_PUT:.*]] = air.channel.put async
+// NOTILE: %[[NT_FOR:.*]] = scf.for
+// NOTILE: air.segment @seg
+// NOTILE: air.channel.get
+// NOTILE: scf.yield
+// NOTILE: air.wait_all [%[[NT_PUT]], %[[NT_FOR]]] {air.launch_end}
+
+module {
+  air.channel @input_ch [1, 1]  // channel used by the air.channel.put on %arg4 below
+  air.channel @output_ch [1, 1]  // channel used by the air.channel.get on %arg5 below
+  func.func @multi_iter_with_segment(%arg0: memref<256x64xbf16>, %arg1: memref<256x64xbf16>) {
+    %c4 = arith.constant 4 : index  // launch iteration-space size (a multi-iteration launch)
+    %0 = air.launch async (%arg2) in (%arg3=%c4) args(%arg4=%arg0, %arg5=%arg1) : memref<256x64xbf16>, memref<256x64xbf16> {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c64 = arith.constant 64 : index
+      %1 = affine.apply affine_map<()[s0] -> (s0 * 4096)>()[%arg2]  // per-iteration offset computed from the launch induction variable %arg2
+      %2 = air.channel.put async @input_ch[%c0, %c0] (%arg4[%c0, %1] [%c64, %c64] [%c64, %c1]) {metadata = @airMemcpyId1} : (memref<256x64xbf16>)
+      %3 = air.segment @seg async {  // segment body only allocates and frees a buffer
+        %alloc = memref.alloc() : memref<64x64xbf16, 1>
+        memref.dealloc %alloc : memref<64x64xbf16, 1>
+      }
+      %4 = air.channel.get async [%3] @output_ch[%c0, %c0] (%arg5[%c0, %1] [%c64, %c64] [%c64, %c1]) {metadata = @airMemcpyId2} : (memref<256x64xbf16>)  // depends on the segment token %3
+      %5 = air.wait_all async [%2, %4]  // joins the put and get tokens
+    }
+    return
+  }
+}