Skip to content

Commit 3932686

Browse files
AlexAUT and antiagainst
authored
[AMD] Schedule AsyncWait in front of AsyncCopy and LocalLoad (#6621)
Changes the scheduling to place `AsyncWait` in a separate cluster at the start of the schedule if `GlobalLoadStage != LocalLoadStage`. This allows us to reorder the `AsyncCopy` and `LocalLoads` more freely because the `AsyncWait` will always be before both `Ops`. --------- Co-authored-by: Lei Zhang <antiagainst@gmail.com>
1 parent 415af4b commit 3932686

2 files changed

Lines changed: 53 additions & 34 deletions

File tree

test/TritonGPU/loop-pipeline-hip.mlir

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,9 @@ module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.n
4646
// ASYNC: ttg.async_copy_global_to_local
4747
// ASYNC: scf.for
4848
// ASYNC: ttg.async_wait
49+
// ASYNC: ttg.async_copy_global_to_local
4950
// ASYNC: tt.dot
5051
// ASYNC: tt.dot
51-
// ASYNC: ttg.async_copy_global_to_local
5252
// ASYNC: scf.yield
5353
%17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 {
5454
%18 = tt.load %16 : tensor<64x16x!tt.ptr<f16>, #blocked>
@@ -500,9 +500,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
500500
// SYNC: scf.yield
501501
//
502502
// ASYNC: ttg.async_wait
503-
// ASYNC-COUNT-2: ttg.local_load
503+
// ASYNC: ttg.async_copy_global_to_local
504+
// ASYNC: ttg.local_load
505+
// ASYNC: ttg.async_copy_global_to_local
506+
// ASYNC: ttg.local_load
504507
// ASYNC: ttg.dot
505-
// ASYNC-COUNT-2: ttg.async_copy_global_to_local
506508

507509
// Epilogue
508510
// ASYNC: ttg.async_wait

third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp

Lines changed: 48 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -101,16 +101,19 @@ namespace {
101101
//
102102
class StreamPipeliner {
103103
// Define categories of scheduling details per Operation types.
104-
// The StreamPipeliner schedules 4 types of operations:
105-
// 1. GLOBAL_LOAD: tt.load
106-
// 2. LOCAL_STORE: ttg.local_store (created by the StreamPipeliner)
107-
// 3. LOCAL_LOAD: ttg.local_load (created by the StreamPipeliner)
104+
// The StreamPipeliner schedules 5 types of operations:
105+
// 1. GLOBAL_LOAD: tt.load / ttg.async_copy_global_to_local
106+
// 2. LOCAL_STORE: ttg.local_store
107+
// 3. LOCAL_LOAD: ttg.local_load
108108
// 4. COMPUTE: ops that use the loaded data
109+
// 5. ASYNC_WAIT: ttg.async_wait
110+
// Note that ttg ops mentioned in the above list are created in this pass.
109111
enum SchedType {
110112
SCHED_GLOBAL_LOAD,
111113
SCHED_LOCAL_STORE,
112114
SCHED_LOCAL_LOAD,
113115
SCHED_COMPUTE,
116+
SCHED_ASYNC_WAIT,
114117
SCHED_SIZE
115118
};
116119

@@ -125,6 +128,7 @@ class StreamPipeliner {
125128
stages[SCHED_LOCAL_STORE] = _globalPrefetch;
126129
stages[SCHED_LOCAL_LOAD] = lastStage - _localPrefetch;
127130
stages[SCHED_COMPUTE] = lastStage;
131+
stages[SCHED_ASYNC_WAIT] = stages[SCHED_LOCAL_LOAD];
128132

129133
options.supportDynamicLoops = true;
130134
options.peelEpilogue = true;
@@ -212,7 +216,6 @@ class StreamPipeliner {
212216
// WARNING: Changing the order of schedule.clusters.newAtBack() calls
213217
// can cause invalid schedules to be produced.
214218
LogicalResult StreamPipeliner::initSchedule(int maxIndirectionLevel) {
215-
216219
bool pairedGlobalLoadLocalStore = stages[SCHED_LOCAL_STORE] == 0;
217220
stages[SCHED_LOCAL_STORE] += maxIndirectionLevel;
218221

@@ -221,6 +224,7 @@ LogicalResult StreamPipeliner::initSchedule(int maxIndirectionLevel) {
221224
<< ", LOCAL_STORE stage = " << stages[SCHED_LOCAL_STORE]
222225
<< ", LOCAL_LOAD stage = " << stages[SCHED_LOCAL_LOAD]
223226
<< ", COMPUTE stage = " << stages[SCHED_COMPUTE]
227+
<< ", ASYNC_WAIT stage = " << stages[SCHED_ASYNC_WAIT]
224228
<< "; total = " << numStages);
225229

226230
if (stages[SCHED_LOCAL_STORE] >= numStages ||
@@ -241,15 +245,19 @@ LogicalResult StreamPipeliner::initSchedule(int maxIndirectionLevel) {
241245

242246
LDBG("deduced max shared memory buffer number = " << numBuffers);
243247

248+
// We place async wait as the first cluster because we want to have it being
249+
// the first in the main loop after pipelining.
250+
int asyncWaitCluster = 0;
251+
244252
// If tt.load and ttg.local_store are in the same stage
245253
// spread them apart to allow overlap with compute
246254
// else
247255
// Initiate ttg.local_store before tt.load
248-
int globalLoadCluster = 0;
249-
int localStoreCluster = 2;
256+
int globalLoadCluster = 1;
257+
int localStoreCluster = 3;
250258
if (!pairedGlobalLoadLocalStore) {
251-
globalLoadCluster = 2;
252-
localStoreCluster = 1;
259+
globalLoadCluster = 3;
260+
localStoreCluster = 2;
253261
}
254262

255263
// If ttg.local_load and ttg.local_store are in the same stage
@@ -260,33 +268,35 @@ LogicalResult StreamPipeliner::initSchedule(int maxIndirectionLevel) {
260268
// schedule ttg.local_load in the middle
261269
int localLoadCluster = globalLoadCluster;
262270
if (stages[SCHED_LOCAL_LOAD] == stages[SCHED_LOCAL_STORE]) {
263-
localLoadCluster = std::max(2, localStoreCluster + 1);
271+
localLoadCluster = std::max(3, localStoreCluster + 1);
264272
} else if (numBuffers == 1 && localLoadCluster >= localStoreCluster) {
265273
// For 1 buffer, ttg.local_load must occur before ttg.local_store
266274
localLoadCluster = localStoreCluster - 1;
267275
}
268276

269277
// Schedule compute with ttg.local_load if paired
270278
// otherwise, schedule in the middle
271-
int computeCluster = 1;
279+
int computeCluster = 2;
272280
if (stages[SCHED_LOCAL_LOAD] == stages[SCHED_COMPUTE]) {
273281
computeCluster = localLoadCluster;
274282
}
275283

276284
// Make assignments
277-
std::array<tt::CoarseSchedule::Cluster, SCHED_SIZE> clusterVec = {
278-
schedule.clusters.newAtBack(), schedule.clusters.newAtBack(),
279-
schedule.clusters.newAtBack(), schedule.clusters.newAtBack()};
285+
std::array<tt::CoarseSchedule::Cluster, SCHED_SIZE> clusterVec;
286+
std::generate(clusterVec.begin(), clusterVec.end(),
287+
[&]() { return schedule.clusters.newAtBack(); });
280288

281289
clusters[SCHED_GLOBAL_LOAD] = clusterVec[globalLoadCluster];
282290
clusters[SCHED_LOCAL_STORE] = clusterVec[localStoreCluster];
283291
clusters[SCHED_LOCAL_LOAD] = clusterVec[localLoadCluster];
284292
clusters[SCHED_COMPUTE] = clusterVec[computeCluster];
293+
clusters[SCHED_ASYNC_WAIT] = clusterVec[asyncWaitCluster];
285294

286295
LDBG("Cluster schedule:" << " GLOBAL_LOAD cluster = " << globalLoadCluster
287296
<< ", LOCAL_STORE cluster = " << localStoreCluster
288297
<< ", LOCAL_LOAD cluster = " << localLoadCluster
289298
<< ", COMPUTE cluster = " << computeCluster
299+
<< ", ASYNC_WAIT cluster = " << asyncWaitCluster
290300
<< "; total = " << SCHED_SIZE);
291301

292302
return success();
@@ -333,30 +343,37 @@ bool StreamPipeliner::createAsyncCopy(tt::LoadOp loadOp, Value alloc,
333343
for (auto alloc : allocsToErase)
334344
alloc.erase();
335345

336-
auto [stage, cluster] = schedule[loadOp];
337-
338-
auto newLoadOp = builder.create<ttg::AsyncCopyGlobalToLocalOp>(
346+
auto copyOp = builder.create<ttg::AsyncCopyGlobalToLocalOp>(
339347
loadOp.getLoc(), src, viewLoad, loadOp.getMask(), loadOp.getOther(),
340348
loadOp.getCache(), loadOp.getEvict(), loadOp.getIsVolatile());
341-
schedule.erase(loadOp);
342-
schedule.insert(newLoadOp, stage, cluster);
343349

344350
// Insert synchronization primitives to create barriers during lowering
345-
auto commit =
346-
builder.create<ttg::AsyncCommitGroupOp>(loc, newLoadOp->getResult(0));
347-
ttg::AsyncWaitOp wait =
348-
builder.create<ttg::AsyncWaitOp>(loc, commit->getResult(0), 0);
349-
// We need to place the prefetches (AsyncCopy) after the AsyncWaits which
350-
// create a barrier to ensure all warps are finished reading the shared buffer
351-
// we will write into. This is done by scheduling it as a local_store.
352-
scheduleOp(newLoadOp, SCHED_LOCAL_STORE);
353-
// Place ttg.async_commit_group op next to async load so the later
354-
// UpdateAsyncWaitCount pass can deduce better waitcnts
355-
scheduleOp(commit, SCHED_LOCAL_STORE);
351+
auto commitOp =
352+
builder.create<ttg::AsyncCommitGroupOp>(loc, copyOp->getResult(0));
353+
354+
ttg::AsyncWaitOp waitOp =
355+
builder.create<ttg::AsyncWaitOp>(loc, commitOp->getResult(0), 0);
356356

357357
// Create local load which consumes the async token from the AsyncWait
358358
auto sharedLoad =
359-
builder.create<ttg::LocalLoadOp>(loc, loadOp.getType(), viewLoad, wait);
359+
builder.create<ttg::LocalLoadOp>(loc, loadOp.getType(), viewLoad, waitOp);
360+
361+
auto [loadStage, loadCluster] = schedule[loadOp];
362+
schedule.erase(loadOp);
363+
// Schedule new ops
364+
schedule.insert(copyOp, loadStage, loadCluster);
365+
// Place ttg.async_commit_group op following AsyncCopyGlobalToLocal so the
366+
// later UpdateAsyncWaitCount pass can deduce better waitcnts
367+
schedule.insert(commitOp, loadStage, loadCluster);
368+
// If the LocalLoads are scheduled to a later stage than AsyncCopy we need to
369+
// place the AsyncCopy prefetches after the AsyncWaits which create a barrier
370+
// to ensure all warps are finished reading the shared buffer we will write
371+
// into. This is done by scheduling AsyncWait as the first cluster.
372+
// If AsyncCopy and LocalLoads are in the same stage we do not assign a
373+
// schedule so they are placed before the LocalLoads
374+
if (loadStage != stages[SCHED_LOCAL_LOAD])
375+
scheduleOp(waitOp, SCHED_ASYNC_WAIT);
376+
360377
if (stages[SCHED_LOCAL_LOAD] != stages[SCHED_COMPUTE])
361378
scheduleOp(sharedLoad, SCHED_LOCAL_LOAD);
362379

0 commit comments

Comments
 (0)