@@ -112,28 +112,42 @@ func.func @matmul_transpose_b(%5: tensor<64x64xf32>, %6: tensor<64x1280xf16>, %7

// -----

#config = #iree_gpu.lowering_config<{reduction = [0, 8]}>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0)>
func.func @reduction(%3: tensor<128x384xf32>) -> tensor<128xf32> {
#config = #iree_gpu.lowering_config<{reduction = [0, 8, 4]}>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0)>
func.func @reduction(%arg0: tensor<128x384x256xf32>) -> tensor<128xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : f32
%empty = tensor.empty() : tensor<128xf32>
%4 = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>
%5 = linalg.generic {
indexing_maps = [#map1, #map2],
iterator_types = ["parallel", "reduction"]
} ins(%3 : tensor<128x384xf32>) outs(%4 : tensor<128xf32>) attrs = {lowering_config = #config} {
^bb0(%in: f32, %out: f32):
%7 = arith.addf %in, %out : f32
linalg.yield %7 : f32
} -> tensor<128xf32>
return %5 : tensor<128xf32>
%init = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>

// Parent scf.for loop that will be coalesced with reduction tiling loops.
%result = scf.for %iv = %c0 to %c3 step %c1 iter_args(%arg1 = %init) -> (tensor<128xf32>) {
%slice = tensor.extract_slice %arg0[0, 0, 0] [128, 384, 256] [1, 1, 1] : tensor<128x384x256xf32> to tensor<128x384x256xf32>
%reduced = linalg.generic {
indexing_maps = [#map1, #map2],
iterator_types = ["parallel", "reduction", "reduction"]
} ins(%slice : tensor<128x384x256xf32>) outs(%arg1 : tensor<128xf32>) attrs = {lowering_config = #config} {
^bb0(%in: f32, %out: f32):
%add = arith.addf %in, %out : f32
linalg.yield %add : f32
} -> tensor<128xf32>
scf.yield %reduced : tensor<128xf32>
}
return %result : tensor<128xf32>
}

// CHECK-LABEL: func.func @reduction
// CHECK: %[[FILL:.+]] = linalg.fill {{.*}} tensor<128xf32>
// CHECK: scf.for %{{.*}} = %c0 to %c384 step %c8 iter_args(%{{.*}} = %[[FILL]])
// CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<128x8xf32>)
// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<128x384x256xf32>
// CHECK-DAG: %[[C9216:.+]] = arith.constant 9216 : index
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK: %[[INIT:.+]] = linalg.fill {{.*}} tensor<128xf32>
// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C9216]] step %[[C1]] iter_args(%[[ARG:.+]] = %[[INIT]])
// CHECK-NOT: scf.for
// CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<128x8x4xf32>) outs(%[[ARG]] : tensor<128xf32>)
// CHECK: scf.yield
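// Trip-count note: the parent loop runs 3 times, and the reduction tile sizes
// [8, 4] over the 384 and 256 reduction dims give 384/8 = 48 and 256/4 = 64
// tiled iterations, so the coalesced loop has 3 * 48 * 64 = 9216 iterations,
// matching %[[C9216]] above.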

// Verify that no tiling happens in the thread case.
@@ -142,6 +156,47 @@ func.func @reduction(%3: tensor<128x384xf32>) -> tensor<128xf32> {

// -----

// Test that coalescing is skipped when loops have dynamic trip counts.
#config_dyn = #iree_gpu.lowering_config<{reduction = [0, 8, 4]}>
#map_dyn1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map_dyn2 = affine_map<(d0, d1, d2) -> (d0)>
func.func @reduction_dynamic_trip_count(%arg0: tensor<128x384x256xf32>, %dyn_ub: index) -> tensor<128xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f32
%empty = tensor.empty() : tensor<128xf32>
%init = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>

// Parent scf.for loop with dynamic upper bound.
// This should NOT be coalesced with reduction tiling loops.
%result = scf.for %iv = %c0 to %dyn_ub step %c1 iter_args(%arg1 = %init) -> (tensor<128xf32>) {
%slice = tensor.extract_slice %arg0[0, 0, 0] [128, 384, 256] [1, 1, 1] : tensor<128x384x256xf32> to tensor<128x384x256xf32>
%reduced = linalg.generic {
indexing_maps = [#map_dyn1, #map_dyn2],
iterator_types = ["parallel", "reduction", "reduction"]
} ins(%slice : tensor<128x384x256xf32>) outs(%arg1 : tensor<128xf32>) attrs = {lowering_config = #config_dyn} {
^bb0(%in: f32, %out: f32):
%add = arith.addf %in, %out : f32
linalg.yield %add : f32
} -> tensor<128xf32>
scf.yield %reduced : tensor<128xf32>
}
return %result : tensor<128xf32>
}

// CHECK-LABEL: func.func @reduction_dynamic_trip_count
// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<128x384x256xf32>
// CHECK-SAME: %[[DYN_UB:[A-Za-z0-9]+]]: index
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK: %[[INIT:.+]] = linalg.fill {{.*}} tensor<128xf32>
// CHECK: scf.for %{{.*}} = %[[C0]] to %[[DYN_UB]] step %[[C1]] iter_args(%[[ARG1:.+]] = %[[INIT]])
// CHECK: scf.for %{{.*}} = %[[C0]] to %{{.*}} step %{{.*}} iter_args(%[[ARG2:.+]] = %[[ARG1]])
// CHECK: scf.for %{{.*}} = %[[C0]] to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.+]] = %[[ARG2]])
// CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<128x8x4xf32>) outs(%[[ARG3]] : tensor<128xf32>)
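// Because %dyn_ub is not a compile-time constant, the parent loop keeps its
// dynamic trip count, and the two reduction tiling loops (384/8 = 48 and
// 256/4 = 64 iterations) remain nested inside it instead of being coalesced.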

// -----

#config = #iree_gpu.lowering_config<{reduction = [0, 0, 8]}>
#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @matmul_fuse(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>, %5: tensor<64x64xf32>) -> tensor<64x64xf32> {
@@ -19,10 +19,11 @@ func.func @conv_nhwc_generic(%a: tensor<1x3x66x8xf32>, %b: tensor<32x3x3x8xf32>,
}

// CHECK-LABEL: func.func @conv_nhwc_generic
// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d6)>
// CHECK-DAG: %[[C9:.+]] = arith.constant 9 : index
// CHECK: scf.for %{{.*}} = %c0 to %[[C9]] step %c1
// CHECK-NOT: scf.for
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d6)>
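// Trip-count note: the two size-3 filter window loops collapse into a single
// loop of 3 * 3 = 9 iterations, matching %[[C9]].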

// -----

@@ -35,10 +36,11 @@ func.func @conv_nhwc_named_dilated(%a: tensor<1x5x68x8xf32>, %b: tensor<32x3x3x8
}

// CHECK-LABEL: func.func @conv_nhwc_named_dilated
// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d6)>
// CHECK-DAG: %[[C9:.+]] = arith.constant 9 : index
// CHECK: scf.for %{{.*}} = %c0 to %[[C9]] step %c1
// CHECK-NOT: scf.for
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d6)>

// -----

@@ -51,10 +53,11 @@ func.func @conv_nchw_named(%arg0: tensor<2x16x130x130xf32>, %arg1: tensor<32x16x
}

// CHECK-LABEL: func.func @conv_nchw_named
// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d4, d2, d3)>
// CHECK-DAG: %[[C9:.+]] = arith.constant 9 : index
// CHECK: scf.for %{{.*}} = %c0 to %[[C9]] step %c1
// CHECK-NOT: scf.for
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d4, d2, d3)>

// -----

@@ -77,7 +80,8 @@ func.func @conv_chwn_generic(%a: tensor<16x24x16x16xf32>, %b: tensor<16x24x16x16
}

// CHECK-LABEL: func.func @conv_chwn_generic
// CHECK: scf.for %{{.*}} = %c0 to %c24 step %c1
// CHECK: scf.for %{{.*}} = %c0 to %c16 step %c1
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d1, d2, d3)>
// CHECK-DAG: %[[C384:.+]] = arith.constant 384 : index
// CHECK: scf.for %{{.*}} = %c0 to %[[C384]] step %c1
// CHECK-NOT: scf.for
// CHECK: linalg.generic
// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d1, d2, d3)>
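// Trip-count note: here the coalesced bound combines the former 24- and
// 16-iteration loops, 24 * 16 = 384, matching %[[C384]].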
compiler/src/iree/compiler/Codegen/Common/TileAndFuseUtils.cpp (50 additions, 0 deletions)
@@ -13,10 +13,13 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/Dominance.h"

#include <cassert>
@@ -482,6 +485,53 @@ LogicalResult applyTileAndFuseToEachRoot(
tileAndFuseOptions, tiledResults->loops);
}
}

// Coalesce scf.for loops created during reduction tiling.
// This is done at the very end after all other transformations
// to avoid invalidating dominance info or affecting fusion logic.
if (tilingLevel == IREE::GPU::TilingLevel::Reduction &&
!tiledResults->loops.empty()) {
SmallVector<scf::ForOp> forLoops;

// Check if tiling happened inside an existing scf.for loop.
// If so, include that parent loop in the coalescing.
Operation *parentOp =
tiledResults->loops.front().getOperation()->getParentOp();
scf::ForOp parentForOp = dyn_cast<scf::ForOp>(parentOp);
if (parentForOp) {
forLoops.push_back(parentForOp);
}

// Collect all the loops produced by reduction tiling.
for (LoopLikeOpInterface loop : tiledResults->loops) {
if (auto forOp = dyn_cast<scf::ForOp>(loop.getOperation())) {
forLoops.push_back(forOp);
}
}

// If loops have dynamic trip counts and we coalesce them, range
// analysis may no longer be able to find static bounds. This was
// mainly noticed as a problem in applyPaddingLevel, so to prevent a
// regression we don't coalesce such loops.
bool hasDynamicTripCount = false;
for (scf::ForOp forOp : forLoops) {
if (!getConstantIntValue(forOp.getLowerBound()) ||
!getConstantIntValue(forOp.getUpperBound()) ||
!getConstantIntValue(forOp.getStep())) {
hasDynamicTripCount = true;
LLVM_DEBUG(llvm::dbgs()
<< "Skipping coalescing: loop has dynamic trip count\n");
break;
}
}

if (forLoops.size() > 1 && !hasDynamicTripCount) {
if (failed(coalesceLoops(rewriter, forLoops))) {
// Coalescing failure is not critical; just log and continue.
LLVM_DEBUG(llvm::dbgs() << "Failed to coalesce reduction loops\n");
}
}
}
}
return success();
}
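
For reference, coalesceLoops (from the SCF utilities included above) folds a perfectly nested loop band into a single loop and recovers the original induction variables by delinearizing the fused one. Below is a minimal sketch of the effect, using hypothetical bounds rather than ones from the tests in this PR; the exact delinearization ops MLIR emits may differ (e.g. affine.delinearize_index):

// Before: a perfectly nested pair of loops with static trip counts 2 and 4.
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
scf.for %i = %c0 to %c2 step %c1 {
  scf.for %j = %c0 to %c4 step %c1 {
    "test.use"(%i, %j) : (index, index) -> ()
  }
}

// After coalescing: a single loop of 2 * 4 = 8 iterations; %i and %j are
// rebuilt from the fused induction variable by div/mod against the inner
// trip count.
%c8 = arith.constant 8 : index
scf.for %iv = %c0 to %c8 step %c1 {
  %i = arith.divui %iv, %c4 : index
  %j = arith.remui %iv, %c4 : index
  "test.use"(%i, %j) : (index, index) -> ()
}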
@@ -62,21 +62,18 @@ hal.executable private @main {
// CHECK-DAG: memref.alloc() : memref<1x1x32x68xf16, #gpu.address_space<workgroup>>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
// CHECK-DAG: %[[C36:.+]] = arith.constant 36 : index
// CHECK-DAG: %[[C81:.+]] = arith.constant 81 : index
// CHECK: scf.forall ({{.*}}) in (16, 48, 9) {
// CHECK: scf.for {{.+}} = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
// CHECK: scf.for {{.+}} = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
// CHECK: scf.for {{.+}} = %[[C0]] to %[[C36]] step %[[C4]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<4xf16>
// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<8xf16>
// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
// CHECK-COUNT-4: amdgpu.mfma 16x16x16
// CHECK: scf.for {{.+}} = %[[C0]] to %[[C81]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
// CHECK-NOT: scf.for
// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<4xf16>
// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<8xf16>
// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
// CHECK-COUNT-4: amdgpu.mfma 16x16x16
// CHECK: vector.transfer_write %{{.*}}, %[[BUF2]]
// CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
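// Trip-count note: the former 3 x 3 x 9 loop nest (the last loop ran from
// %[[C0]] to %[[C36]] step %[[C4]], i.e. 9 iterations) coalesces into a
// single loop of 3 * 3 * 9 = 81 iterations, matching %[[C81]].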
@@ -1066,16 +1066,20 @@
}

// CHECK-LABEL: func @elemwise_reduction_elemwise
// CHECK: scf.for %{{.*}} = %{{.*}} to %c16 step %c1 {{.*}} -> (vector<1xf32>)
// CHECK: scf.for
// CHECK: scf.for
// CHECK: %[[REDUCE:.+]] = vector.multi_reduction
// CHECK: scf.yield %[[REDUCE]]

// CHECK: scf.for %{{.*}} = %{{.*}} to %c16 step %c1
// CHECK: scf.for
// CHECK-COUNT-4: arith.addf {{.*}} : vector<9xf32>
// CHECK: vector.transfer_write {{.*}} vector<9xi8>, memref<32x16x9x9xi8, #amdgpu.address_space<fat_raw_buffer>>
// CHECK-DAG: %[[C144:.+]] = arith.constant 144 : index
// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
// CHECK-DAG: %[[C9:.+]] = arith.constant 9 : index
// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C144]] step %[[C1]] {{.*}} -> (vector<1xf32>)
// CHECK-NOT: scf.for
// CHECK: %[[REDUCE:.+]] = vector.multi_reduction
// CHECK: scf.yield %[[REDUCE]]

// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C16]] step %[[C1]]
// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C9]] step %[[C1]]
// CHECK-COUNT-4: arith.addf {{.*}} : vector<9xf32>
// CHECK: vector.transfer_write {{.*}} vector<9xi8>, memref<32x16x9x9xi8, #amdgpu.address_space<fat_raw_buffer>>
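// Trip-count note: the reduction loops coalesce into a single loop of 144
// iterations (%[[C144]], presumably 16 * 9 given the other constants), while
// the trailing elementwise loop nest over %[[C16]] and %[[C9]] was not
// created by reduction tiling and is left nested.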

// -----
