Skip to content

Commit 85efe1a

Browse files
[GPU] Add coalescing to reduction tiling
Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
1 parent a657d73 commit 85efe1a

4 files changed

Lines changed: 194 additions & 42 deletions

File tree

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir

Lines changed: 93 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -112,28 +112,42 @@ func.func @matmul_transpose_b(%5: tensor<64x64xf32>, %6: tensor<64x1280xf16>, %7
112112

113113
// -----
114114

115-
#config = #iree_gpu.lowering_config<{reduction = [0, 8]}>
116-
#map1 = affine_map<(d0, d1) -> (d0, d1)>
117-
#map2 = affine_map<(d0, d1) -> (d0)>
118-
func.func @reduction(%3: tensor<128x384xf32>) -> tensor<128xf32> {
115+
#config = #iree_gpu.lowering_config<{reduction = [0, 8, 4]}>
116+
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
117+
#map2 = affine_map<(d0, d1, d2) -> (d0)>
118+
func.func @reduction(%arg0: tensor<128x384x256xf32>) -> tensor<128xf32> {
119+
%c0 = arith.constant 0 : index
120+
%c1 = arith.constant 1 : index
121+
%c3 = arith.constant 3 : index
119122
%cst = arith.constant 0.000000e+00 : f32
120123
%empty = tensor.empty() : tensor<128xf32>
121-
%4 = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>
122-
%5 = linalg.generic {
123-
indexing_maps = [#map1, #map2],
124-
iterator_types = ["parallel", "reduction"]
125-
} ins(%3 : tensor<128x384xf32>) outs(%4 : tensor<128xf32>) attrs = {lowering_config = #config} {
126-
^bb0(%in: f32, %out: f32):
127-
%7 = arith.addf %in, %out : f32
128-
linalg.yield %7 : f32
129-
} -> tensor<128xf32>
130-
return %5 : tensor<128xf32>
124+
%init = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>
125+
126+
// Parent scf.for loop that will be coalesced with reduction tiling loops.
127+
%result = scf.for %iv = %c0 to %c3 step %c1 iter_args(%arg1 = %init) -> (tensor<128xf32>) {
128+
%slice = tensor.extract_slice %arg0[0, 0, 0] [128, 384, 256] [1, 1, 1] : tensor<128x384x256xf32> to tensor<128x384x256xf32>
129+
%reduced = linalg.generic {
130+
indexing_maps = [#map1, #map2],
131+
iterator_types = ["parallel", "reduction", "reduction"]
132+
} ins(%slice : tensor<128x384x256xf32>) outs(%arg1 : tensor<128xf32>) attrs = {lowering_config = #config} {
133+
^bb0(%in: f32, %out: f32):
134+
%add = arith.addf %in, %out : f32
135+
linalg.yield %add : f32
136+
} -> tensor<128xf32>
137+
scf.yield %reduced : tensor<128xf32>
138+
}
139+
return %result : tensor<128xf32>
131140
}
132141

133142
// CHECK-LABEL: func.func @reduction
134-
// CHECK: %[[FILL:.+]] = linalg.fill {{.*}} tensor<128xf32>
135-
// CHECK: scf.for %{{.*}} = %c0 to %c384 step %c8 iter_args(%{{.*}} = %[[FILL]])
136-
// CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<128x8xf32>)
143+
// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<128x384x256xf32>
144+
// CHECK-DAG: %[[C9216:.+]] = arith.constant 9216 : index
145+
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
146+
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
147+
// CHECK: %[[INIT:.+]] = linalg.fill {{.*}} tensor<128xf32>
148+
// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C9216]] step %[[C1]] iter_args(%[[ARG:.+]] = %[[INIT]])
149+
// CHECK-NOT: scf.for
150+
// CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<128x8x4xf32>) outs(%[[ARG]] : tensor<128xf32>)
137151
// CHECK: scf.yield
138152

139153
// Verify that no tiling happens in the thread case.
@@ -142,6 +156,68 @@ func.func @reduction(%3: tensor<128x384xf32>) -> tensor<128xf32> {
142156

143157
// -----
144158

159+
// Test coalescing when parent scf.for has iter_args but NOT chained with reduction.
160+
#config2 = #iree_gpu.lowering_config<{reduction = [0, 8, 4]}>
161+
#map3 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
162+
#map4 = affine_map<(d0, d1, d2) -> (d0)>
163+
#map5 = affine_map<(d0) -> (d0)>
164+
func.func @reduction_nochain_iter_args(%arg0: tensor<128x384x256xf32>) -> tensor<128xf32> {
165+
%c0 = arith.constant 0 : index
166+
%c1 = arith.constant 1 : index
167+
%c3 = arith.constant 3 : index
168+
%cst = arith.constant 0.000000e+00 : f32
169+
%empty = tensor.empty() : tensor<128xf32>
170+
%ew_init = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>
171+
172+
// Parent scf.for loop with iter_args but NOT chained with reduction.
173+
%result = scf.for %iv = %c0 to %c3 step %c1 iter_args(%ew = %ew_init) -> (tensor<128xf32>) {
174+
%empty2 = tensor.empty() : tensor<128xf32>
175+
%init = linalg.fill ins(%cst : f32) outs(%empty2 : tensor<128xf32>) -> tensor<128xf32>
176+
%slice = tensor.extract_slice %arg0[0, 0, 0] [128, 384, 256] [1, 1, 1] : tensor<128x384x256xf32> to tensor<128x384x256xf32>
177+
%reduced = linalg.generic {
178+
indexing_maps = [#map3, #map4],
179+
iterator_types = ["parallel", "reduction", "reduction"]
180+
} ins(%slice : tensor<128x384x256xf32>) outs(%init : tensor<128xf32>) attrs = {lowering_config = #config2} {
181+
^bb0(%in: f32, %out: f32):
182+
%add = arith.addf %in, %out : f32
183+
linalg.yield %add : f32
184+
} -> tensor<128xf32>
185+
186+
// elementwise that uses the parent scf.for iter arg.
187+
%empty3 = tensor.empty() : tensor<128xf32>
188+
%elementwise = linalg.generic {
189+
indexing_maps = [#map5, #map5, #map5],
190+
iterator_types = ["parallel"]
191+
} ins(%ew, %reduced : tensor<128xf32>, tensor<128xf32>) outs(%empty3 : tensor<128xf32>) {
192+
^bb0(%e: f32, %r: f32, %out: f32):
193+
%new = arith.addf %e, %r : f32
194+
linalg.yield %new : f32
195+
} -> tensor<128xf32>
196+
197+
scf.yield %elementwise : tensor<128xf32>
198+
}
199+
return %result : tensor<128xf32>
200+
}
201+
202+
// CHECK-LABEL: func.func @reduction_nochain_iter_args
203+
// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<128x384x256xf32>
204+
// CHECK-DAG: %[[C3072:.+]] = arith.constant 3072 : index
205+
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
206+
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
207+
// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
208+
// CHECK: %[[INIT:.+]] = linalg.fill {{.*}} tensor<128xf32>
209+
// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[EW_ARG:.+]] = %[[INIT]])
210+
// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C3072]] step %[[C1]] iter_args(%[[RED_ARG:.+]] = %[[INIT]])
211+
// CHECK-NOT: scf.for
212+
// CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<128x8x4xf32>) outs(%[[RED_ARG]] : tensor<128xf32>)
213+
// CHECK: linalg.generic {{.*}} ins(%[[EW_ARG]], %{{.*}} : tensor<128xf32>, tensor<128xf32>)
214+
// CHECK: scf.yield
215+
216+
// THREAD-LABEL: func.func @reduction_nochain_iter_args
217+
// THREAD-NOT: scf.forall
218+
219+
// -----
220+
145221
#config = #iree_gpu.lowering_config<{reduction = [0, 0, 8]}>
146222
#map = affine_map<(d0, d1) -> (d0, d1)>
147223
func.func @matmul_fuse(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>, %5: tensor<64x64xf32>) -> tensor<64x64xf32> {

compiler/src/iree/compiler/Codegen/Common/TileAndFuseUtils.cpp

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
1717
#include "mlir/Dialect/Linalg/IR/Linalg.h"
1818
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
19+
#include "mlir/Dialect/SCF/Utils/Utils.h"
1920
#include "mlir/Dialect/Tensor/IR/Tensor.h"
2021
#include "mlir/IR/Dominance.h"
2122

@@ -482,6 +483,80 @@ LogicalResult applyTileAndFuseToEachRoot(
482483
tileAndFuseOptions, tiledResults->loops);
483484
}
484485
}
486+
487+
// Coalesce scf.for loops created during reduction tiling.
488+
// This is done at the very end after all other transformations
489+
// to avoid invalidating dominance info or affecting fusion logic.
490+
if (tilingLevel == IREE::GPU::TilingLevel::Reduction &&
491+
!tiledResults->loops.empty()) {
492+
SmallVector<scf::ForOp> forLoops;
493+
494+
// Check if tiling happened inside an existing scf.for loop
495+
// If so, include that parent loop in the coalescing.
496+
Operation *parentOp =
497+
tiledResults->loops.front().getOperation()->getParentOp();
498+
scf::ForOp parentForOp = dyn_cast<scf::ForOp>(parentOp);
499+
500+
// Collect all the tiled loops first.
501+
for (LoopLikeOpInterface loop : tiledResults->loops) {
502+
if (auto forOp = dyn_cast<scf::ForOp>(loop.getOperation())) {
503+
forLoops.push_back(forOp);
504+
}
505+
}
506+
507+
// Only include parent if it forms a proper iter_args chain with the
508+
// tiled loops. This follows the same validation as upstream's
509+
// coalescePerfectlyNestedSCFForLoops.
510+
if (parentForOp && !forLoops.empty()) {
511+
// Check if parent and first child form a valid iter_args chain:
512+
// 1. Must have the same number of iter_args.
513+
// 2. Parent's iter_args must match child's init_args.
514+
// 3. Parent's terminator operands must match child's results.
515+
scf::ForOp firstChild = forLoops.front();
516+
bool formsChain = true;
517+
518+
if (parentForOp.getNumRegionIterArgs() !=
519+
firstChild.getNumRegionIterArgs()) {
520+
formsChain = false;
521+
LLVM_DEBUG(llvm::dbgs()
522+
<< "Skipping parent loop coalescing: different number of "
523+
"iter_args (parent: "
524+
<< parentForOp.getNumRegionIterArgs() << ", child: "
525+
<< firstChild.getNumRegionIterArgs() << ")\n");
526+
}
527+
528+
if (formsChain && !llvm::equal(parentForOp.getRegionIterArgs(),
529+
firstChild.getInitArgs())) {
530+
formsChain = false;
531+
LLVM_DEBUG(llvm::dbgs() << "Skipping parent loop coalescing: parent "
532+
"iter_args don't match child init_args\n");
533+
}
534+
535+
if (formsChain) {
536+
auto parentTerminator = parentForOp.getBody()->getTerminator();
537+
if (!llvm::equal(parentTerminator->getOperands(),
538+
firstChild.getResults())) {
539+
formsChain = false;
540+
LLVM_DEBUG(llvm::dbgs()
541+
<< "Skipping parent loop coalescing: parent yield "
542+
"doesn't match child results\n");
543+
}
544+
}
545+
546+
// If forms a valid chain, insert parent at the beginning.
547+
if (formsChain) {
548+
forLoops.insert(forLoops.begin(), parentForOp);
549+
}
550+
}
551+
552+
// Coalesce if we have multiple loops.
553+
if (forLoops.size() > 1) {
554+
if (failed(coalesceLoops(rewriter, forLoops))) {
555+
// Coalescing failure is not critical, just log and continue.
556+
LLVM_DEBUG(llvm::dbgs() << "Failed to coalesce reduction loops\n");
557+
}
558+
}
559+
}
485560
}
486561
return success();
487562
}

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_direct_conv_tile_and_fuse.mlir

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -62,21 +62,18 @@ hal.executable private @main {
6262
// CHECK-DAG: memref.alloc() : memref<1x1x32x68xf16, #gpu.address_space<workgroup>>
6363
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
6464
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
65-
// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
66-
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
67-
// CHECK-DAG: %[[C36:.+]] = arith.constant 36 : index
65+
// CHECK-DAG: %[[C81:.+]] = arith.constant 81 : index
6866
// CHECK: scf.forall ({{.*}}) in (16, 48, 9) {
69-
// CHECK: scf.for {{.+}} = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
70-
// CHECK: scf.for {{.+}} = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
71-
// CHECK: scf.for {{.+}} = %[[C0]] to %[[C36]] step %[[C4]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
72-
// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
73-
// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<4xf16>
74-
// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
75-
// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<8xf16>
76-
// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
77-
// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
78-
// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
79-
// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
80-
// CHECK-COUNT-4: amdgpu.mfma 16x16x16
67+
// CHECK: scf.for {{.+}} = %[[C0]] to %[[C81]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
68+
// CHECK-NOT: scf.for
69+
// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
70+
// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<4xf16>
71+
// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
72+
// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<8xf16>
73+
// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
74+
// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
75+
// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
76+
// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
77+
// CHECK-COUNT-4: amdgpu.mfma 16x16x16
8178
// CHECK: vector.transfer_write %{{.*}}, %[[BUF2]]
8279
// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,16 +1066,20 @@ hal.executable public @main {
10661066
}
10671067

10681068
// CHECK-LABEL: func @elemwise_reduction_elemwise
1069-
// CHECK: scf.for %{{.*}} = %{{.*}} to %c16 step %c1 {{.*}} -> (vector<1xf32>)
1070-
// CHECK: scf.for
1071-
// CHECK: scf.for
1072-
// CHECK: %[[REDUCE:.+]] = vector.multi_reduction
1073-
// CHECK: scf.yield %[[REDUCE]]
1074-
1075-
// CHECK: scf.for %{{.*}} = %{{.*}} to %c16 step %c1
1076-
// CHECK: scf.for
1077-
// CHECK-COUNT-4: arith.addf {{.*}} : vector<9xf32>
1078-
// CHECK: vector.transfer_write {{.*}} vector<9xi8>, memref<32x16x9x9xi8, #amdgpu.address_space<fat_raw_buffer>>
1069+
// CHECK-DAG: %[[C144:.+]] = arith.constant 144 : index
1070+
// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
1071+
// CHECK-DAG: %[[C9:.+]] = arith.constant 9 : index
1072+
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
1073+
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
1074+
// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C144]] step %[[C1]] {{.*}} -> (vector<1xf32>)
1075+
// CHECK-NOT: scf.for
1076+
// CHECK: %[[REDUCE:.+]] = vector.multi_reduction
1077+
// CHECK: scf.yield %[[REDUCE]]
1078+
1079+
// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C16]] step %[[C1]]
1080+
// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C9]] step %[[C1]]
1081+
// CHECK-COUNT-4: arith.addf {{.*}} : vector<9xf32>
1082+
// CHECK: vector.transfer_write {{.*}} vector<9xi8>, memref<32x16x9x9xi8, #amdgpu.address_space<fat_raw_buffer>>
10791083

10801084
// -----
10811085

0 commit comments

Comments
 (0)