
Commit ffb0b44

[GPU] Add coalescing to reduction tiling
Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
1 parent 85dd208 commit ffb0b44

5 files changed

Lines changed: 271 additions & 58 deletions


compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir

Lines changed: 134 additions & 17 deletions
@@ -112,28 +112,42 @@ func.func @matmul_transpose_b(%5: tensor<64x64xf32>, %6: tensor<64x1280xf16>, %7
 
 // -----
 
-#config = #iree_gpu.lowering_config<{reduction = [0, 8]}>
-#map1 = affine_map<(d0, d1) -> (d0, d1)>
-#map2 = affine_map<(d0, d1) -> (d0)>
-func.func @reduction(%3: tensor<128x384xf32>) -> tensor<128xf32> {
+#config = #iree_gpu.lowering_config<{reduction = [0, 8, 4]}>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d0)>
+func.func @reduction(%arg0: tensor<128x384x256xf32>) -> tensor<128xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
   %cst = arith.constant 0.000000e+00 : f32
   %empty = tensor.empty() : tensor<128xf32>
-  %4 = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>
-  %5 = linalg.generic {
-    indexing_maps = [#map1, #map2],
-    iterator_types = ["parallel", "reduction"]
-  } ins(%3 : tensor<128x384xf32>) outs(%4 : tensor<128xf32>) attrs = {lowering_config = #config} {
-  ^bb0(%in: f32, %out: f32):
-    %7 = arith.addf %in, %out : f32
-    linalg.yield %7 : f32
-  } -> tensor<128xf32>
-  return %5 : tensor<128xf32>
+  %init = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>
+
+  // Parent scf.for loop that will be coalesced with reduction tiling loops.
+  %result = scf.for %iv = %c0 to %c3 step %c1 iter_args(%arg1 = %init) -> (tensor<128xf32>) {
+    %slice = tensor.extract_slice %arg0[0, 0, 0] [128, 384, 256] [1, 1, 1] : tensor<128x384x256xf32> to tensor<128x384x256xf32>
+    %reduced = linalg.generic {
+      indexing_maps = [#map1, #map2],
+      iterator_types = ["parallel", "reduction", "reduction"]
+    } ins(%slice : tensor<128x384x256xf32>) outs(%arg1 : tensor<128xf32>) attrs = {lowering_config = #config} {
+    ^bb0(%in: f32, %out: f32):
+      %add = arith.addf %in, %out : f32
+      linalg.yield %add : f32
+    } -> tensor<128xf32>
+    scf.yield %reduced : tensor<128xf32>
+  }
+  return %result : tensor<128xf32>
 }
 
 // CHECK-LABEL: func.func @reduction
-// CHECK: %[[FILL:.+]] = linalg.fill {{.*}} tensor<128xf32>
-// CHECK: scf.for %{{.*}} = %c0 to %c384 step %c8 iter_args(%{{.*}} = %[[FILL]])
-// CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<128x8xf32>)
+// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<128x384x256xf32>
+// CHECK-DAG: %[[C9216:.+]] = arith.constant 9216 : index
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK: %[[INIT:.+]] = linalg.fill {{.*}} tensor<128xf32>
+// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C9216]] step %[[C1]] iter_args(%[[ARG:.+]] = %[[INIT]])
+// CHECK-NOT: scf.for
+// CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<128x8x4xf32>) outs(%[[ARG]] : tensor<128xf32>)
 // CHECK: scf.yield
 
 // Verify that no tiling happens in the thread case.
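The single loop checked above replaces three nested loops: the chained parent (trip count 3) and the two reduction tiling loops (384/8 = 48 and 256/4 = 64 iterations), for 3 * 48 * 64 = 9216 total iterations. A rough sketch of the coalesced form, with hypothetical SSA names; the pass may materialize the index recovery with affine.delinearize_index or with div/rem arithmetic:

  %result = scf.for %iv = %c0 to %c9216 step %c1 iter_args(%acc = %init) -> (tensor<128xf32>) {
    // Recover the three original induction variables from the linear IV.
    %ivs:3 = affine.delinearize_index %iv into (3, 48, 64) : index, index, index
    // ... extract the tensor<128x8x4xf32> slice at offsets (%ivs#1 * 8, %ivs#2 * 4), reduce into %acc, yield ...
  }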
@@ -142,6 +156,109 @@ func.func @reduction(%3: tensor<128x384xf32>) -> tensor<128xf32> {
 
 // -----
 
+// Test coalescing when parent scf.for has iter_args but NOT chained with reduction.
+#config2 = #iree_gpu.lowering_config<{reduction = [0, 8, 4]}>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map4 = affine_map<(d0, d1, d2) -> (d0)>
+#map5 = affine_map<(d0) -> (d0)>
+func.func @reduction_nochain_iter_args(%arg0: tensor<128x384x256xf32>) -> tensor<128xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %empty = tensor.empty() : tensor<128xf32>
+  %ew_init = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>
+
+  // Parent scf.for loop with iter_args but NOT chained with reduction.
+  %result = scf.for %iv = %c0 to %c3 step %c1 iter_args(%ew = %ew_init) -> (tensor<128xf32>) {
+    %empty2 = tensor.empty() : tensor<128xf32>
+    %init = linalg.fill ins(%cst : f32) outs(%empty2 : tensor<128xf32>) -> tensor<128xf32>
+    %slice = tensor.extract_slice %arg0[0, 0, 0] [128, 384, 256] [1, 1, 1] : tensor<128x384x256xf32> to tensor<128x384x256xf32>
+    %reduced = linalg.generic {
+      indexing_maps = [#map3, #map4],
+      iterator_types = ["parallel", "reduction", "reduction"]
+    } ins(%slice : tensor<128x384x256xf32>) outs(%init : tensor<128xf32>) attrs = {lowering_config = #config2} {
+    ^bb0(%in: f32, %out: f32):
+      %add = arith.addf %in, %out : f32
+      linalg.yield %add : f32
+    } -> tensor<128xf32>
+
+    // elementwise that uses the parent scf.for iter arg.
+    %empty3 = tensor.empty() : tensor<128xf32>
+    %elementwise = linalg.generic {
+      indexing_maps = [#map5, #map5, #map5],
+      iterator_types = ["parallel"]
+    } ins(%ew, %reduced : tensor<128xf32>, tensor<128xf32>) outs(%empty3 : tensor<128xf32>) {
+    ^bb0(%e: f32, %r: f32, %out: f32):
+      %new = arith.addf %e, %r : f32
+      linalg.yield %new : f32
+    } -> tensor<128xf32>
+
+    scf.yield %elementwise : tensor<128xf32>
+  }
+  return %result : tensor<128xf32>
+}
+
+// CHECK-LABEL: func.func @reduction_nochain_iter_args
+// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<128x384x256xf32>
+// CHECK-DAG: %[[C3072:.+]] = arith.constant 3072 : index
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
+// CHECK: %[[INIT:.+]] = linalg.fill {{.*}} tensor<128xf32>
+// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[EW_ARG:.+]] = %[[INIT]])
+// CHECK: scf.for %{{.*}} = %[[C0]] to %[[C3072]] step %[[C1]] iter_args(%[[RED_ARG:.+]] = %[[INIT]])
+// CHECK-NOT: scf.for
+// CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<128x8x4xf32>) outs(%[[RED_ARG]] : tensor<128xf32>)
+// CHECK: linalg.generic {{.*}} ins(%[[EW_ARG]], %{{.*}} : tensor<128xf32>, tensor<128xf32>)
+// CHECK: scf.yield
+
+// THREAD-LABEL: func.func @reduction_nochain_iter_args
+// THREAD-NOT: scf.forall
+
+// -----
+
+// Test that coalescing is skipped when loops have dynamic trip counts.
+#config_dyn = #iree_gpu.lowering_config<{reduction = [0, 8, 4]}>
+#map_dyn1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map_dyn2 = affine_map<(d0, d1, d2) -> (d0)>
+func.func @reduction_dynamic_trip_count(%arg0: tensor<128x384x256xf32>, %dyn_ub: index) -> tensor<128xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %empty = tensor.empty() : tensor<128xf32>
+  %init = linalg.fill ins(%cst : f32) outs(%empty : tensor<128xf32>) -> tensor<128xf32>
+
+  // Parent scf.for loop with dynamic upper bound.
+  // This should NOT be coalesced with reduction tiling loops.
+  %result = scf.for %iv = %c0 to %dyn_ub step %c1 iter_args(%arg1 = %init) -> (tensor<128xf32>) {
+    %slice = tensor.extract_slice %arg0[0, 0, 0] [128, 384, 256] [1, 1, 1] : tensor<128x384x256xf32> to tensor<128x384x256xf32>
+    %reduced = linalg.generic {
+      indexing_maps = [#map_dyn1, #map_dyn2],
+      iterator_types = ["parallel", "reduction", "reduction"]
+    } ins(%slice : tensor<128x384x256xf32>) outs(%arg1 : tensor<128xf32>) attrs = {lowering_config = #config_dyn} {
+    ^bb0(%in: f32, %out: f32):
+      %add = arith.addf %in, %out : f32
+      linalg.yield %add : f32
+    } -> tensor<128xf32>
+    scf.yield %reduced : tensor<128xf32>
+  }
+  return %result : tensor<128xf32>
+}
+
+// CHECK-LABEL: func.func @reduction_dynamic_trip_count
+// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<128x384x256xf32>
+// CHECK-SAME: %[[DYN_UB:[A-Za-z0-9]+]]: index
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK: %[[INIT:.+]] = linalg.fill {{.*}} tensor<128xf32>
+// CHECK: scf.for %{{.*}} = %[[C0]] to %[[DYN_UB]] step %[[C1]] iter_args(%[[ARG1:.+]] = %[[INIT]])
+// CHECK: scf.for %{{.*}} = %[[C0]] to %{{.*}} step %{{.*}} iter_args(%[[ARG2:.+]] = %[[ARG1]])
+// CHECK: scf.for %{{.*}} = %[[C0]] to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.+]] = %[[ARG2]])
+// CHECK: linalg.generic {{.*}} ins(%{{.*}} : tensor<128x8x4xf32>) outs(%[[ARG3]] : tensor<128xf32>)
+
+// -----
+
 #config = #iree_gpu.lowering_config<{reduction = [0, 0, 8]}>
 #map = affine_map<(d0, d1) -> (d0, d1)>
 func.func @matmul_fuse(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>, %5: tensor<64x64xf32>) -> tensor<64x64xf32> {
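In @reduction_nochain_iter_args the reduction accumulator is re-initialized by a fill inside the loop body rather than flowing through the parent's iter_args, so the parent (trip count 3) stays separate and only the two tiling loops fold together, into 48 * 64 = 3072 iterations. A minimal sketch of the expected nesting, with hypothetical names:

  scf.for %i = %c0 to %c3 step %c1 iter_args(%ew = %ew_init) -> (tensor<128xf32>) {        // parent kept
    %red = scf.for %j = %c0 to %c3072 step %c1 iter_args(%acc = %init) -> (tensor<128xf32>) {  // coalesced tiling loops
      // ... reduce a tensor<128x8x4xf32> slice into %acc ...
    }
    // ... elementwise combine of %ew and %red ...
  }

In @reduction_dynamic_trip_count no coalescing happens at all: the parent's upper bound is a function argument, and folding it in would hide the static bounds that later range analysis (e.g. in applyPaddingLevel) relies on.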

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_tile_and_convert_conv_to_matmul.mlir

Lines changed: 20 additions & 16 deletions
@@ -19,10 +19,11 @@ func.func @conv_nhwc_generic(%a: tensor<1x3x66x8xf32>, %b: tensor<32x3x3x8xf32>,
 }
 
 // CHECK-LABEL: func.func @conv_nhwc_generic
-// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
-// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
-// CHECK: linalg.generic
-// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d6)>
+// CHECK-DAG: %[[C9:.+]] = arith.constant 9 : index
+// CHECK: scf.for %{{.*}} = %c0 to %[[C9]] step %c1
+// CHECK-NOT: scf.for
+// CHECK: linalg.generic
+// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d6)>
 
 // -----
 
@@ -35,10 +36,11 @@ func.func @conv_nhwc_named_dilated(%a: tensor<1x5x68x8xf32>, %b: tensor<32x3x3x8
 }
 
 // CHECK-LABEL: func.func @conv_nhwc_named_dilated
-// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
-// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
-// CHECK: linalg.generic
-// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d6)>
+// CHECK-DAG: %[[C9:.+]] = arith.constant 9 : index
+// CHECK: scf.for %{{.*}} = %c0 to %[[C9]] step %c1
+// CHECK-NOT: scf.for
+// CHECK: linalg.generic
+// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d6)>
 
 // -----
 
@@ -51,10 +53,11 @@ func.func @conv_nchw_named(%arg0: tensor<2x16x130x130xf32>, %arg1: tensor<32x16x
 }
 
 // CHECK-LABEL: func.func @conv_nchw_named
-// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
-// CHECK: scf.for %{{.*}} = %c0 to %c3 step %c1
-// CHECK: linalg.generic
-// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d4, d2, d3)>
+// CHECK-DAG: %[[C9:.+]] = arith.constant 9 : index
+// CHECK: scf.for %{{.*}} = %c0 to %[[C9]] step %c1
+// CHECK-NOT: scf.for
+// CHECK: linalg.generic
+// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d4, d2, d3)>
 
 // -----
 
@@ -77,7 +80,8 @@ func.func @conv_chwn_generic(%a: tensor<16x24x16x16xf32>, %b: tensor<16x24x16x16
 }
 
 // CHECK-LABEL: func.func @conv_chwn_generic
-// CHECK: scf.for %{{.*}} = %c0 to %c24 step %c1
-// CHECK: scf.for %{{.*}} = %c0 to %c16 step %c1
-// CHECK: linalg.generic
-// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d1, d2, d3)>
+// CHECK-DAG: %[[C384:.+]] = arith.constant 384 : index
+// CHECK: scf.for %{{.*}} = %c0 to %[[C384]] step %c1
+// CHECK-NOT: scf.for
+// CHECK: linalg.generic
+// CHECK-SAME: affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d1, d2, d3)>
compiler/src/iree/compiler/Codegen/Common/TileAndFuseUtils.cpp

Lines changed: 91 additions & 0 deletions
@@ -13,10 +13,13 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Analysis/TopologicalSortUtils.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/Utils/Utils.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/Dominance.h"
 
 #include <cassert>
@@ -482,6 +485,94 @@ LogicalResult applyTileAndFuseToEachRoot(
                                          tileAndFuseOptions, tiledResults->loops);
       }
     }
+
+    // Coalesce scf.for loops created during reduction tiling.
+    // This is done at the very end, after all other transformations,
+    // to avoid invalidating dominance info or affecting fusion logic.
+    if (tilingLevel == IREE::GPU::TilingLevel::Reduction &&
+        !tiledResults->loops.empty()) {
+      SmallVector<scf::ForOp> forLoops;
+
+      // Check if tiling happened inside an existing scf.for loop.
+      // If so, include that parent loop in the coalescing.
+      Operation *parentOp =
+          tiledResults->loops.front().getOperation()->getParentOp();
+      scf::ForOp parentForOp = dyn_cast<scf::ForOp>(parentOp);
+
+      // Collect all the tiled loops first.
+      for (LoopLikeOpInterface loop : tiledResults->loops) {
+        if (auto forOp = dyn_cast<scf::ForOp>(loop.getOperation())) {
+          forLoops.push_back(forOp);
+        }
+      }
+
+      // Only include the parent if it forms a proper iter_args chain with the
+      // tiled loops. This follows the same validation as upstream's
+      // coalescePerfectlyNestedSCFForLoops.
+      if (parentForOp && !forLoops.empty()) {
+        // Check if parent and first child form a valid iter_args chain:
+        // 1. Must have the same number of iter_args.
+        // 2. Parent's iter_args must match child's init_args.
+        // 3. Parent's terminator operands must match child's results.
+        scf::ForOp firstChild = forLoops.front();
+        bool formsChain = true;
+
+        if (parentForOp.getNumRegionIterArgs() !=
+            firstChild.getNumRegionIterArgs()) {
+          formsChain = false;
+          LLVM_DEBUG(llvm::dbgs()
+                     << "Skipping parent loop coalescing: different number of "
+                        "iter_args (parent: "
+                     << parentForOp.getNumRegionIterArgs() << ", child: "
+                     << firstChild.getNumRegionIterArgs() << ")\n");
+        }
+
+        if (formsChain && !llvm::equal(parentForOp.getRegionIterArgs(),
+                                       firstChild.getInitArgs())) {
+          formsChain = false;
+          LLVM_DEBUG(llvm::dbgs() << "Skipping parent loop coalescing: parent "
+                                     "iter_args don't match child init_args\n");
+        }
+
+        if (formsChain) {
+          auto parentTerminator = parentForOp.getBody()->getTerminator();
+          if (!llvm::equal(parentTerminator->getOperands(),
+                           firstChild.getResults())) {
+            formsChain = false;
+            LLVM_DEBUG(llvm::dbgs()
+                       << "Skipping parent loop coalescing: parent yield "
+                          "doesn't match child results\n");
+          }
+        }
+
+        if (formsChain) {
+          forLoops.insert(forLoops.begin(), parentForOp);
+        }
+      }
+
+      // If loops have dynamic trip counts and we coalesce them, it can
+      // cause range analysis to not find static bounds. This was mainly
+      // noticed as a problem in applyPaddingLevel, so to prevent a
+      // regression we don't coalesce such loops.
+      bool hasDynamicTripCount = false;
+      for (scf::ForOp forOp : forLoops) {
+        if (!getConstantIntValue(forOp.getLowerBound()) ||
+            !getConstantIntValue(forOp.getUpperBound()) ||
+            !getConstantIntValue(forOp.getStep())) {
+          hasDynamicTripCount = true;
+          LLVM_DEBUG(llvm::dbgs()
+                     << "Skipping coalescing: loop has dynamic trip count\n");
+          break;
+        }
+      }
+
+      if (forLoops.size() > 1 && !hasDynamicTripCount) {
+        if (failed(coalesceLoops(rewriter, forLoops))) {
+          // Coalescing failure is not critical, just log and continue.
+          LLVM_DEBUG(llvm::dbgs() << "Failed to coalesce reduction loops\n");
+        }
+      }
+    }
   }
   return success();
 }
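The iter_args chain that this validation accepts looks as follows in the IR: the parent's region iter_args feed the child's inits, and the parent yields exactly the child's results. A sketch with hypothetical names:

  %r = scf.for %i = %c0 to %c3 step %c1 iter_args(%outer = %init) -> (tensor<128xf32>) {
    %inner = scf.for %j = %c0 to %c48 step %c1 iter_args(%acc = %outer) -> (tensor<128xf32>) {
      // ... partial reduction into %acc ...
      scf.yield %partial : tensor<128xf32>
    }
    scf.yield %inner : tensor<128xf32>  // nothing may intervene between the loops
  }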

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_direct_conv_tile_and_fuse.mlir

Lines changed: 12 additions & 15 deletions
@@ -62,21 +62,18 @@ hal.executable private @main {
 // CHECK-DAG: memref.alloc() : memref<1x1x32x68xf16, #gpu.address_space<workgroup>>
 // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
-// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
-// CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
-// CHECK-DAG: %[[C36:.+]] = arith.constant 36 : index
+// CHECK-DAG: %[[C81:.+]] = arith.constant 81 : index
 // CHECK: scf.forall ({{.*}}) in (16, 48, 9) {
-// CHECK: scf.for {{.+}} = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
-// CHECK: scf.for {{.+}} = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
-// CHECK: scf.for {{.+}} = %[[C0]] to %[[C36]] step %[[C4]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
-// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
-// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<4xf16>
-// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
-// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<8xf16>
-// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
-// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
-// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
-// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
-// CHECK-COUNT-4: amdgpu.mfma 16x16x16
+// CHECK: scf.for {{.+}} = %[[C0]] to %[[C81]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
+// CHECK-NOT: scf.for
+// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
+// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<4xf16>
+// CHECK-DAG: vector.transfer_write %[[LHS_RD]]
+// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<8xf16>
+// CHECK-DAG: vector.transfer_write %[[RHS_RD]]
+// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
+// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
+// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
+// CHECK-COUNT-4: amdgpu.mfma 16x16x16
 // CHECK: vector.transfer_write %{{.*}}, %[[BUF2]]
 // CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
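In this end-to-end pipeline test the three reduction loops of the convolution (3, 3, and 36/4 = 9 iterations) become the single %[[C81]] loop, since 3 * 3 * 9 = 81. A minimal sketch of the checked loop, with a hypothetical accumulator name:

  scf.for %iv = %c0 to %c81 step %c1 iter_args(%acc = %cst) -> (vector<1x1x1x1x4x1xf32>) {
    // ... shared-memory copies, barrier, reads, 4x amdgpu.mfma, barrier ...
  }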
