iree-org · bjacob · Jun 5, 2026 · May 20, 2026
@@ -930,6 +930,29 @@ getCombineRelayoutOpsControlFn(IREE::Codegen::RelayoutCombinationScope scope) {
       return isa<IREE::Codegen::StoreToBufferOp>(*leaf.getUsers().begin());
     };
     break;
+  // Control function for DispatchReshape scope. Like Dispatch, but additionally
+  // requires the backward relayout slice to contain a `tensor.expand_shape` or
+  // `tensor.collapse_shape` — the non-tileable reshape whose presence is what
+  // makes folding into a `map_store` necessary (see the enum doc comment).
+  case IREE::Codegen::RelayoutCombinationScope::DispatchReshape:
+    controlFn = [](OpResult leaf) {
+      if (leaf.getNumUses() != 1) {
+        return false;
+      }
+      if (!isa<IREE::Codegen::StoreToBufferOp>(*leaf.getUsers().begin())) {
+        return false;
+      }
+      llvm::SetVector<Operation *> slice;
+      BackwardSliceOptions options;
+      options.filter = isSupportedSingleInputRelayoutOpForResult;
+      options.inclusive = true;
+      if (failed(getBackwardSlice(leaf, &slice, options))) {
+        return false;
+      }
+      return llvm::any_of(
+          slice, llvm::IsaPred<tensor::CollapseShapeOp, tensor::ExpandShapeOp>);
+    };
+    break;
   // Control function for Workgroup scope. Filters to only relayout ops with
   // a single tensor.parallel_insert_slice user inside of a workgroup
   // scf.forall op. Relayout chains of only reshapes are also filtered out,

@@ -44,10 +44,18 @@ namespace IREE::Codegen {
 /// Enum defining the scope of the CombineResultLayoutTransformationPass.
 ///  - The `Dispatch` scope will combine layout transformation chains that are
 ///    consumed by an `iree_codegen.store_to_buffer` op.
+///  - The `DispatchReshape` scope is like `Dispatch`, but additionally
+///    restricted to chains whose backward slice contains a
+///    `tensor.expand_shape` or `tensor.collapse_shape`. Such a reshape does not
+///    implement `TilingInterface`, so when it sits between two tileable
+///    relayout ops it blocks producer fusion and leaves an untiled,
+///    whole-tensor intermediate (iree-org/iree#24483). Pure
+///    pack/unpack/transpose/pad chains tile fine via tile-and-fuse and do not
+///    need a `map_store`.
 ///  - The `Workgroup` scope will combine layout transformation chains that are
 ///    consumed by a `tensor.parallel_insert_slice` op at the end of an
 ///    scf.forall with an `iree_codegen.workgroup_mapping` attribute.
-enum class RelayoutCombinationScope { Dispatch, Workgroup };
+enum class RelayoutCombinationScope { Dispatch, DispatchReshape, Workgroup };
 } // namespace IREE::Codegen
 
 /// Get the corresponding control function for the given scope. The control

@@ -199,6 +199,8 @@ def CombineResultLayoutTransformationPass :
            ::llvm::cl::values(
              clEnumValN(IREE::Codegen::RelayoutCombinationScope::Dispatch, "dispatch",
                "Combine relayout ops starting from iree_codegen.store_to_buffer"),
+             clEnumValN(IREE::Codegen::RelayoutCombinationScope::DispatchReshape, "dispatch-reshape",
+               "Like dispatch, but only for chains containing an expand/collapse_shape"),
              clEnumValN(IREE::Codegen::RelayoutCombinationScope::Workgroup, "workgroup",
                "Combine relayout ops starting from workgroup forall terminator ops"))
            }]>

@@ -2,6 +2,8 @@
 // RUN:   -split-input-file %s | FileCheck %s --check-prefixes=DISPATCH-SCOPE
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-combine-result-layout-transformation{scope=workgroup},canonicalize,cse))" \
 // RUN:   -split-input-file %s | FileCheck %s --check-prefixes=WORKGROUP-SCOPE
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-combine-result-layout-transformation{scope=dispatch-reshape},canonicalize,cse))" \
+// RUN:   -split-input-file %s | FileCheck %s --check-prefixes=RESHAPE-SCOPE
 
 func.func @fold_collapse_shape_op(%source : tensor<2x4x16xf32>, %result : memref<8x16xf32>) {
   %collapse = tensor.collapse_shape %source [[0, 1], [2]] : tensor<2x4x16xf32> into tensor<8x16xf32>
@@ -555,3 +557,46 @@ func.func @fold_pack_op_dynamic_inner_tiles(%source : tensor<250x250xf32>, %resu
 // Verify no second padding loop (inner tile size 1 needs no padding)
 //   DISPATCH-SCOPE-NOT:   scf.forall
 //       DISPATCH-SCOPE:   return
+
+// -----
+
+// The `dispatch-reshape` scope folds a relayout chain only when its backward
+// slice contains a non-tileable `expand_shape`/`collapse_shape`. Here the
+// chain is `expand_shape` -> `transpose`, so it folds into a single map_store.
+func.func @reshape_chain_folds_under_dispatch_reshape(
+    %source : tensor<8x16xf32>, %result : memref<4x16x2xf32>) {
+  %expand = tensor.expand_shape %source [[0, 1], [2]] output_shape [2, 4, 16]
+      : tensor<8x16xf32> into tensor<2x4x16xf32>
+  %init = tensor.empty() : tensor<4x16x2xf32>
+  %transposed = linalg.transpose ins(%expand : tensor<2x4x16xf32>)
+      outs(%init : tensor<4x16x2xf32>) permutation = [1, 2, 0]
+  iree_codegen.store_to_buffer %transposed, %result
+      : tensor<4x16x2xf32> into memref<4x16x2xf32>
+  return
+}
+// RESHAPE-SCOPE-LABEL: @reshape_chain_folds_under_dispatch_reshape
+//   RESHAPE-SCOPE-NOT:   tensor.expand_shape
+//   RESHAPE-SCOPE-NOT:   linalg.transpose
+//       RESHAPE-SCOPE:   iree_linalg_ext.map_store
+
+// -----
+
+// A pure transpose chain carries no reshape, so the `dispatch-reshape` scope
+// leaves it untouched for tile-and-fuse. The plain `dispatch` scope, which is
+// not gated on a reshape, still folds it into a map_store -- this contrast is
+// the whole point of the `dispatch-reshape` scope.
+func.func @pure_transpose_skipped_under_dispatch_reshape(
+    %source : tensor<2x4x16xf32>, %result : memref<4x16x2xf32>) {
+  %init = tensor.empty() : tensor<4x16x2xf32>
+  %transposed = linalg.transpose ins(%source : tensor<2x4x16xf32>)
+      outs(%init : tensor<4x16x2xf32>) permutation = [1, 2, 0]
+  iree_codegen.store_to_buffer %transposed, %result
+      : tensor<4x16x2xf32> into memref<4x16x2xf32>
+  return
+}
+// DISPATCH-SCOPE-LABEL: @pure_transpose_skipped_under_dispatch_reshape
+//       DISPATCH-SCOPE:   iree_linalg_ext.map_store
+//
+// RESHAPE-SCOPE-LABEL: @pure_transpose_skipped_under_dispatch_reshape
+//       RESHAPE-SCOPE:   linalg.transpose
+//   RESHAPE-SCOPE-NOT:   iree_linalg_ext.map_store
@@ -652,7 +652,25 @@ void buildLLVMCPUCodegenConfigurationPassPipelineImpl(
       .addPass(createConvertAccGEMMToGEMMPass)
       // TODO: Remove the following pass and plumb support for
       // #hal.descriptor_type memory space through the stack.
-      .addPass(createEraseHALDescriptorTypeFromMemRefPass);
+      .addPass(createEraseHALDescriptorTypeFromMemRefPass)
+      // Fold reshape-containing relayout chains (`pack` -> `expand_shape` ->
+      // `transpose`) emitted by encoding materialization for non-row-major
+      // swizzles into a single `iree_linalg_ext.map_store`, before the
+      // dispatch is tiled. This mirrors the GPU configuration pipeline.
+      // Without it, the intervening `tensor.expand_shape` (not a
+      // `TilingInterface` op) blocks producer fusion and leaves an untiled,
+      // whole-tensor `pack` intermediate whose dynamic `tensor.empty`
+      // bufferizes to a bogus unbounded allocation (iree-org/iree#24483);
+      // `map_store` is a scatter with no intermediate buffer. The
+      // `DispatchReshape` scope restricts this to reshape-containing chains so
+      // plain `pack` encodings (which tile fine) are left untouched.
+      .addPass(createBufferizeDispatchTensorLoadStorePass)
+      .addPass([] {
+        CombineResultLayoutTransformationPassOptions options;
+        options.scope =
+            IREE::Codegen::RelayoutCombinationScope::DispatchReshape;
+        return createCombineResultLayoutTransformationPass(options);
+      });
   modulePassManager.addPass(createLLVMCPUSelectLoweringStrategyPass());
   LLVM_DEBUG({
     llvm::dbgs() << "LLVMCPU codegen configuration pass pipeline:\n";

@@ -6,6 +6,7 @@
 
 #include "iree/compiler/Codegen/Utils/CPUUtils.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
 
 #include <numeric>
 
@@ -56,7 +57,8 @@ FailureOr<Operation *> getRootOperation(ArrayRef<Operation *> computeOps) {
     }
 
     if (isa<TilingInterface>(op) &&
-        !isa<tensor::PadOp, linalg::PackOp, linalg::UnPackOp>(op)) {
+        !isa<tensor::PadOp, linalg::PackOp, linalg::UnPackOp,
+             IREE::LinalgExt::MapLoadOp, IREE::LinalgExt::MapStoreOp>(op)) {
       // All other operations that implement this interface are root ops.
       rootOperation = op;
       break;
@@ -74,9 +76,14 @@ FailureOr<Operation *> getRootOperation(ArrayRef<Operation *> computeOps) {
   }
 
   if (!rootOperation) {
-    // Check for pad/pack/unpack ops by themselves.
+    // Check for relayout ops (pad/pack/unpack and the map_load/map_store
+    // gather/scatter ops that encoding materialization folds into) by
+    // themselves. These are excluded from the sweeps above so that a real
+    // compute op in the same dispatch wins; a pure-relayout dispatch (e.g. a
+    // `set_encoding` dispatch) still picks one of them here.
     for (auto op : llvm::reverse(computeOps)) {
-      if (isa<tensor::PadOp, linalg::PackOp, linalg::UnPackOp>(op)) {
+      if (isa<tensor::PadOp, linalg::PackOp, linalg::UnPackOp,
+              IREE::LinalgExt::MapLoadOp, IREE::LinalgExt::MapStoreOp>(op)) {
         rootOperation = op;
         break;
       }