iree-org
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/BUILD.bazel‎
Lines changed: 2 additions & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/BUILD.bazel‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/HoistInnerTiledAccReshapes.cpp‎
Lines changed: 155 additions & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/HoistInnerTiledAccReshapes.cpp‎
Lines changed: 155 additions & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/Passes.td‎
Lines changed: 17 additions & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/Passes.td‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel‎
Lines changed: 1 addition & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/test/hoist_inner_tiled_acc_reshapes.mlir‎
Lines changed: 69 additions & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/test/hoist_inner_tiled_acc_reshapes.mlir‎
Lines changed: 69 additions & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/BUILD.bazel‎
Lines changed: 1 addition & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/BUILD.bazel‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
@@ -114,6 +114,7 @@ iree_compiler_cc_library(
         "ForallToFor.cpp",
         "FuseTensorPadWithConsumer.cpp",
         "GenericVectorization.cpp",
+        "HoistInnerTiledAccReshapes.cpp",
         "HoistStaticallyBoundAllocations.cpp",
         "HoistUnrolledVectorExtractInsertSlice.cpp",
         "IREECodegenCanonicalizer.cpp",
@@ -227,6 +228,7 @@ iree_compiler_cc_library(
         "//compiler/src/iree/compiler/Dialect/TensorExt/Transforms",
         "//compiler/src/iree/compiler/Dialect/Util/Analysis",
         "//compiler/src/iree/compiler/Dialect/Util/IR",
+        "//compiler/src/iree/compiler/Dialect/Util/Transforms",
         "//compiler/src/iree/compiler/Utils",
         "//llvm-external-projects/iree-dialects:IREELinalgTransformDialect",
         "@llvm-project//llvm:Support",
 
@@ -107,6 +107,7 @@ iree_cc_library(
     "ForallToFor.cpp"
     "FuseTensorPadWithConsumer.cpp"
     "GenericVectorization.cpp"
+    "HoistInnerTiledAccReshapes.cpp"
     "HoistStaticallyBoundAllocations.cpp"
     "HoistUnrolledVectorExtractInsertSlice.cpp"
     "IREECodegenCanonicalizer.cpp"
@@ -264,6 +265,7 @@ iree_cc_library(
     iree::compiler::Dialect::TensorExt::Transforms
     iree::compiler::Dialect::Util::Analysis
     iree::compiler::Dialect::Util::IR
+    iree::compiler::Dialect::Util::Transforms
     iree::compiler::Utils
   PUBLIC
 )
 
@@ -0,0 +1,155 @@
+// Copyright 2026 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/Common/Passes.h"
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h"
+#include "iree/compiler/Dialect/Util/IR/UtilDialect.h"
+#include "iree/compiler/Dialect/Util/IR/UtilOps.h"
+#include "iree/compiler/Dialect/Util/Transforms/Passes.h"
+#include "mlir/Dialect/UB/IR/UBOps.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir::iree_compiler {
+
+#define GEN_PASS_DEF_HOISTINNERTILEDACCRESHAPESPASS
+#include "iree/compiler/Codegen/Common/Passes.h.inc"
+
+// Returns true when `op` is one of the vector ops this pass treats as a
+// reshape: `vector.transpose`, `vector.shape_cast`, or `vector.broadcast`.
+// These are the ops that move vectors to or from the form needed by
+// intrinsics, and which are hard to hoist from loops up in vector distribute
+// as currently architected.
+static bool isReshapeOp(Operation *op) {
+  if (isa<vector::TransposeOp>(op)) {
+    return true;
+  }
+  if (isa<vector::ShapeCastOp>(op)) {
+    return true;
+  }
+  return isa<vector::BroadcastOp>(op);
+}
+
+// Tags attached to the `util.hoistable_conversion` ops created in this file:
+// one for the conversion into the accumulator form consumed by the
+// `inner_tiled` op, and one for the conversion back out of it. Each created
+// conversion names the other tag as its inverse.
+static constexpr llvm::StringLiteral kAccReshapeTo("acc_reshape_to_intrinsic");
+static constexpr llvm::StringLiteral
+    kAccReshapeFrom("acc_reshape_from_intrinsic");
+
+namespace {
+
+// Wraps the reshape chains that bracket an `inner_tiled` accumulator in a pair
+// of mutually inverse `util.hoistable_conversion` ops.
+//
+// For every output of an `iree_codegen.inner_tiled` op whose direct parent is
+// a loop with iter_args, this looks for:
+//   * a "prefix" chain of single-use reshape ops (see `isReshapeOp`) leading
+//     from one of that loop's region iter_args to the accumulator operand;
+//   * a "suffix" chain of reshape ops hanging off the matching result, where
+//     every link is the sole user of the value before it.
+// When both chains are non-empty, the prefix is cloned into a conversion
+// tagged `kAccReshapeTo` and the suffix into one tagged `kAccReshapeFrom`
+// (each naming the other as its inverse), and the original reshape ops are
+// erased. Outputs that do not match are left untouched; the pattern succeeds
+// if at least one output was rewritten.
+//
+// After a rewrite the accumulator operand is defined by a
+// `hoistable_conversion`, which `isReshapeOp` does not accept, so the pattern
+// does not match the same output again.
+struct WrapAccReshapesPattern final
+    : OpRewritePattern<IREE::Codegen::InnerTiledOp> {
+  using Base::Base;
+
+  LogicalResult matchAndRewrite(IREE::Codegen::InnerTiledOp tiledOp,
+                                PatternRewriter &rewriter) const override {
+    auto loopLike = dyn_cast<LoopLikeOpInterface>(tiledOp->getParentOp());
+    if (!loopLike || loopLike.getRegionIterArgs().empty()) {
+      return rewriter.notifyMatchFailure(tiledOp,
+                                         "not inside a loop with iter_args");
+    }
+
+    bool anyOutputMatched = false;
+    for (size_t outputIdx = 0, numOutputs = tiledOp.getOutputs().size();
+         outputIdx < numOutputs; ++outputIdx) {
+      Value accOperand = tiledOp.getOutputs()[outputIdx];
+
+      // Walk up from the accumulator operand through single-use reshapes.
+      // `prefixOps` is ordered from the op closest to the inner_tiled op
+      // (front) to the op closest to the chain's root (back).
+      SmallVector<Operation *> prefixOps;
+      Value accRoot = accOperand;
+      while (auto *defOp = accRoot.getDefiningOp()) {
+        if (!defOp->hasOneUse() || !isReshapeOp(defOp)) {
+          break;
+        }
+        prefixOps.push_back(defOp);
+        accRoot = defOp->getOperand(0);
+      }
+
+      // Walk down from the result through reshapes that are the only user of
+      // the value they consume. `suffixOps` is ordered from the op closest to
+      // the inner_tiled op (front) to the last reshape in the chain (back).
+      SmallVector<Operation *> suffixOps;
+      Value suffixEnd = tiledOp.getResult(outputIdx);
+      while (suffixEnd.hasOneUse()) {
+        Operation *user = *suffixEnd.getUsers().begin();
+        if (!isReshapeOp(user)) {
+          break;
+        }
+        suffixOps.push_back(user);
+        suffixEnd = user->getResult(0);
+      }
+
+      // Only a bracketed accumulator (reshapes on both sides) is of interest.
+      if (prefixOps.empty() || suffixOps.empty()) {
+        continue;
+      }
+      // The chain must start at an iter_arg of the enclosing loop itself. Any
+      // other block argument (a function argument, an iter_arg of an outer
+      // loop, ...) is not a value carried by this loop, so wrapping its
+      // reshapes would not describe a loop-carried conversion.
+      auto rootArg = dyn_cast<BlockArgument>(accRoot);
+      if (!rootArg ||
+          !llvm::is_contained(loopLike.getRegionIterArgs(), rootArg)) {
+        continue;
+      }
+
+      rewriter.setInsertionPoint(prefixOps.back());
+      // Wrap the prefix reshapes (iter_arg -> inner_tiled accumulator shape)
+      // in a hoistable_conversion so the pair can be hoisted out of the loop.
+      auto prefixHoist = IREE::Util::HoistableConversionOp::create(
+          rewriter, tiledOp.getLoc(), /*tag=*/kAccReshapeTo,
+          /*inverseTag=*/kAccReshapeFrom, accRoot,
+          [&](OpBuilder &b, Location loc, ValueRange args) {
+            // Re-create the chain root-first; each clone takes the previous
+            // clone's result as its (only remapped) source operand.
+            Value v = args[0];
+            for (auto *op : llvm::reverse(prefixOps)) {
+              IRMapping mapping;
+              mapping.map(op->getOperand(0), v);
+              v = b.clone(*op, mapping)->getResult(0);
+            }
+            return SmallVector<Value>{v};
+          });
+      rewriter.replaceAllUsesWith(accOperand, prefixHoist.getResult(0));
+      // Erase users before producers: the front of `prefixOps` is the op that
+      // just lost its use, and erasing it frees the next op in the chain.
+      for (auto *op : prefixOps) {
+        if (op->use_empty()) {
+          rewriter.eraseOp(op);
+        }
+      }
+
+      Value suffixInput = tiledOp.getResult(outputIdx);
+      rewriter.setInsertionPointAfter(suffixOps.back());
+      // Wrap the suffix reshapes (inner_tiled result -> iter_arg shape)
+      // as the inverse conversion.
+      auto suffixHoist = IREE::Util::HoistableConversionOp::create(
+          rewriter, tiledOp.getLoc(), /*tag=*/kAccReshapeFrom,
+          /*inverseTag=*/kAccReshapeTo, TypeRange{suffixEnd.getType()},
+          suffixInput, [&](OpBuilder &b, Location loc, ValueRange args) {
+            // `suffixOps` is already in execution order, so clone it forward.
+            Value v = args[0];
+            for (auto *op : suffixOps) {
+              IRMapping mapping;
+              mapping.map(op->getOperand(0), v);
+              v = b.clone(*op, mapping)->getResult(0);
+            }
+            return SmallVector<Value>{v};
+          });
+      rewriter.replaceAllUsesWith(suffixEnd, suffixHoist.getResult(0));
+      // Here the last reshape is the one that lost its uses, so erase from
+      // the back of the chain towards the inner_tiled op.
+      for (auto *op : llvm::reverse(suffixOps)) {
+        if (op->use_empty()) {
+          rewriter.eraseOp(op);
+        }
+      }
+
+      anyOutputMatched = true;
+    }
+
+    return success(anyOutputMatched);
+  }
+};
+
+// Pass driver. First applies `WrapAccReshapesPattern` greedily to wrap
+// matching accumulator reshape chains in `util.hoistable_conversion` ops;
+// then, only if that rewrite changed the IR, calls
+// `IREE::Util::eliminateHoistableConversions` on the same operation. A failure
+// in either step fails the pass.
+struct HoistInnerTiledAccReshapesPass final
+    : impl::HoistInnerTiledAccReshapesPassBase<HoistInnerTiledAccReshapesPass> {
+  void runOnOperation() override {
+    MLIRContext *ctx = &getContext();
+    auto funcOp = getOperation();
+
+    RewritePatternSet wrapPatterns(ctx);
+    wrapPatterns.add<WrapAccReshapesPattern>(ctx);
+
+    bool irChanged = false;
+    if (failed(applyPatternsGreedily(funcOp, std::move(wrapPatterns),
+                                     GreedyRewriteConfig(), &irChanged))) {
+      signalPassFailure();
+      return;
+    }
+    // The greedy rewrite left the IR untouched, so there is nothing further
+    // to do.
+    if (!irChanged) {
+      return;
+    }
+
+    if (failed(IREE::Util::eliminateHoistableConversions(funcOp))) {
+      signalPassFailure();
+    }
+  }
+};
+
+} // namespace
+} // namespace mlir::iree_compiler
@@ -656,6 +656,23 @@ def OptimizeTensorInsertExtractSlicesPass
   ];
 }
 
+// Declares the `iree-codegen-hoist-inner-tiled-acc-reshapes` pass, an
+// interface pass that runs on ops implementing `mlir::FunctionOpInterface`.
+def HoistInnerTiledAccReshapesPass :
+    InterfacePass<"iree-codegen-hoist-inner-tiled-acc-reshapes", "mlir::FunctionOpInterface"> {
+  let summary = "Hoist vector reshapes surrounding inner_tiled ops out of loops";
+  let description = [{
+    This pass, mainly intended for use in vector distribution-based pipelines,
+    searches for chains of vector reshapes (currently transpose, shape_cast, and
+    broadcast, but it could be extended to other operations) that bracket an
+    `iree_codegen.inner_tiled`'s accumulator(s) on the way from and to the
+    argument of the reduction loop it's in, and adds in `util.hoistable_conversion`
+    markers to move these operations out of the loop to enable further optimizations.
+  }];
+  // NOTE(review): presumably listed because the pass can create ops from
+  // these dialects (e.g. `util.hoistable_conversion`) - confirm.
+  let dependentDialects = [
+    "::mlir::iree_compiler::IREE::Util::UtilDialect",
+    "::mlir::ub::UBDialect",
+  ];
+}
+
 def HoistUnrolledVectorExtractInsertSlicePass :
     InterfacePass<"iree-codegen-hoist-vector-extract-insert-slice", "mlir::FunctionOpInterface"> {
   let summary = "Hoist unrolled vector (extract, insert) pairs out of scf.for op";
 
@@ -69,6 +69,7 @@ iree_lit_test_suite(
             "generic_vectorization_masked_inferred.mlir",
             "generic_vectorization_unmasked.mlir",
             "generic_vectorization_using_transfer_gather.mlir",
+            "hoist_inner_tiled_acc_reshapes.mlir",
             "hoist_statically_bound_allocations.mlir",
             "hoist_unrolled_vector_extract_insert_slice.mlir",
             "insert_batch_dim_for_batchless_conv.mlir",
 
@@ -64,6 +64,7 @@ iree_lit_test_suite(
     "generic_vectorization_masked_inferred.mlir"
     "generic_vectorization_unmasked.mlir"
     "generic_vectorization_using_transfer_gather.mlir"
+    "hoist_inner_tiled_acc_reshapes.mlir"
     "hoist_statically_bound_allocations.mlir"
     "hoist_unrolled_vector_extract_insert_slice.mlir"
     "insert_batch_dim_for_batchless_conv.mlir"
 
@@ -0,0 +1,69 @@
+// RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-hoist-inner-tiled-acc-reshapes))" %s | FileCheck %s
+
+#contraction_accesses = [
+ affine_map<(i, j, k) -> (i, k)>,
+ affine_map<(i, j, k) -> (k, j)>,
+ affine_map<(i, j, k) -> (i, j)>
+]
+
+// The accumulator iter_arg is shape_cast to the inner_tiled accumulator type
+// and the inner_tiled result is shape_cast back before being yielded. The
+// expected output has both casts outside the loop: one applied to the init
+// value that feeds a new iter_arg, and one applied to the loop result. The
+// original iter_arg is expected to remain, initialized from `ub.poison` and
+// yielded unchanged.
+// CHECK-LABEL: @hoist_shape_cast_chain
+// CHECK-SAME: %[[INIT:[a-zA-Z0-9]+]]: vector<2x2x1x1x4x1xf32>
+// CHECK-DAG: %[[POISON:.+]] = ub.poison : vector<2x2x1x1x4x1xf32>
+// CHECK-DAG: %[[SC0:.+]] = vector.shape_cast %[[INIT]]
+// CHECK: %[[LOOP:.+]]:2 = scf.for {{.*}} iter_args(%[[DEADACC:.*]] = %[[POISON]], %[[ACC:.*]] = %[[SC0]])
+// CHECK:   %[[OUT:.+]] = iree_codegen.inner_tiled {{.*}} outs(%[[ACC]])
+// CHECK:   scf.yield %[[DEADACC]], %[[OUT]]
+// CHECK: vector.shape_cast %[[LOOP]]#1
+// CHECK-NOT: util.hoistable_conversion
+func.func @hoist_shape_cast_chain(
+    %lhs: vector<2x2x4xf16>, %rhs: vector<2x2x4xf16>,
+    %init: vector<2x2x1x1x4x1xf32>) -> vector<2x2x1x1x4x1xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %result = scf.for %iv = %c0 to %c10 step %c1 iter_args(%acc = %init) -> vector<2x2x1x1x4x1xf32> {
+    %inner_acc = vector.shape_cast %acc : vector<2x2x1x1x4x1xf32> to vector<2x2x4x1xf32>
+    %mma = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%inner_acc) {
+      indexing_maps = #contraction_accesses,
+      iterator_types = [#linalg.iterator_type<parallel>,
+                        #linalg.iterator_type<parallel>,
+                        #linalg.iterator_type<reduction>],
+      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
+      semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
+    } : vector<2x2x4xf16>, vector<2x2x4xf16> into vector<2x2x4x1xf32>
+    %back = vector.shape_cast %mma : vector<2x2x4x1xf32> to vector<2x2x1x1x4x1xf32>
+    scf.yield %back : vector<2x2x1x1x4x1xf32>
+  }
+  return %result : vector<2x2x1x1x4x1xf32>
+}
+
+// -----
+
+#contraction_accesses2 = [
+ affine_map<(i, j, k) -> (i, k)>,
+ affine_map<(i, j, k) -> (k, j)>,
+ affine_map<(i, j, k) -> (i, j)>
+]
+
+// Negative test: the accumulator iter_arg feeds inner_tiled directly and the
+// result is yielded directly, with no reshapes on either side. The output is
+// expected to contain neither hoistable conversions nor shape_casts.
+// CHECK-LABEL: @no_reshape
+// CHECK-NOT: util.hoistable_conversion
+// CHECK-NOT: vector.shape_cast
+func.func @no_reshape(
+    %lhs: vector<2x2x4xf16>, %rhs: vector<2x2x4xf16>,
+    %init: vector<2x2x4x1xf32>) -> vector<2x2x4x1xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %result = scf.for %iv = %c0 to %c10 step %c1 iter_args(%acc = %init) -> vector<2x2x4x1xf32> {
+    %mma = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) {
+      indexing_maps = #contraction_accesses2,
+      iterator_types = [#linalg.iterator_type<parallel>,
+                        #linalg.iterator_type<parallel>,
+                        #linalg.iterator_type<reduction>],
+      kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
+      semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
+    } : vector<2x2x4xf16>, vector<2x2x4xf16> into vector<2x2x4x1xf32>
+    scf.yield %mma : vector<2x2x4x1xf32>
+  }
+  return %result : vector<2x2x4x1xf32>
+}
@@ -97,6 +97,7 @@ iree_compiler_cc_library(
         "//compiler/src/iree/compiler/Dialect/LinalgExt/IR",
         "//compiler/src/iree/compiler/Dialect/LinalgExt/Utils",
         "//compiler/src/iree/compiler/Dialect/TensorExt/IR",
+        "//compiler/src/iree/compiler/Dialect/Util/IR",
         "//compiler/src/iree/compiler/Utils",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AMDGPUDialect",
 
@@ -79,6 +79,7 @@ iree_cc_library(
     iree::compiler::Dialect::LinalgExt::IR
     iree::compiler::Dialect::LinalgExt::Utils
     iree::compiler::Dialect::TensorExt::IR
+    iree::compiler::Dialect::Util::IR
     iree::compiler::Utils
     iree::compiler::bindings::c::headers
   PUBLIC