Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -930,6 +930,29 @@ getCombineRelayoutOpsControlFn(IREE::Codegen::RelayoutCombinationScope scope) {
return isa<IREE::Codegen::StoreToBufferOp>(*leaf.getUsers().begin());
};
break;
// DispatchReshape scope. The leaf must satisfy the same requirements as in the
// Dispatch scope (exactly one use, and that use is an
// `iree_codegen.store_to_buffer`), and in addition the backward relayout slice
// rooted at the leaf must contain a `tensor.expand_shape` or
// `tensor.collapse_shape`. That non-tileable reshape is what makes folding
// into a `map_store` necessary (see the enum doc comment).
case IREE::Codegen::RelayoutCombinationScope::DispatchReshape:
  controlFn = [](OpResult leaf) {
    // Reject leaves that do not feed exactly one store_to_buffer op. The
    // user is only inspected once the single-use check has passed.
    const bool feedsSingleStore =
        leaf.getNumUses() == 1 &&
        isa<IREE::Codegen::StoreToBufferOp>(*leaf.getUsers().begin());
    if (!feedsSingleStore) {
      return false;
    }
    // Collect the backward slice of the leaf, restricted by the
    // single-input relayout filter and requested as inclusive.
    BackwardSliceOptions sliceOptions;
    sliceOptions.inclusive = true;
    sliceOptions.filter = isSupportedSingleInputRelayoutOpForResult;
    llvm::SetVector<Operation *> relayoutChain;
    if (failed(getBackwardSlice(leaf, &relayoutChain, sliceOptions))) {
      return false;
    }
    // Accept only when at least one op in the slice is a reshape.
    for (Operation *chainOp : relayoutChain) {
      if (isa<tensor::CollapseShapeOp, tensor::ExpandShapeOp>(chainOp)) {
        return true;
      }
    }
    return false;
  };
  break;
// Control function for Workgroup scope. Filters to only relayout ops with
// a single tensor.parallel_insert_slice user inside of a workgroup
// scf.forall op. Relayout chains of only reshapes are also filtered out,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,18 @@ namespace IREE::Codegen {
/// Enum defining the scope of the CombineResultLayoutTransformationPass.
/// - The `Dispatch` scope will combine layout transformation chains that are
/// consumed by an `iree_codegen.store_to_buffer` op.
/// - The `DispatchReshape` scope is like `Dispatch`, but additionally
/// restricted to chains whose backward slice contains a
/// `tensor.expand_shape` or `tensor.collapse_shape`. Such a reshape does not
/// implement `TilingInterface`, so when it sits between two tileable
/// relayout ops it blocks producer fusion and leaves an untiled,
/// whole-tensor intermediate (iree-org/iree#24483). Pure
/// pack/unpack/transpose/pad chains tile fine via tile-and-fuse and do not
/// need a `map_store`.
/// - The `Workgroup` scope will combine layout transformation chains that are
/// consumed by a `tensor.parallel_insert_slice` op at the end of an
/// scf.forall with an `iree_codegen.workgroup_mapping` attribute.
enum class RelayoutCombinationScope { Dispatch, Workgroup };
enum class RelayoutCombinationScope { Dispatch, DispatchReshape, Workgroup };
} // namespace IREE::Codegen

/// Get the corresponding control function for the given scope. The control
Expand Down
2 changes: 2 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ def CombineResultLayoutTransformationPass :
::llvm::cl::values(
clEnumValN(IREE::Codegen::RelayoutCombinationScope::Dispatch, "dispatch",
"Combine relayout ops starting from iree_codegen.store_to_buffer"),
clEnumValN(IREE::Codegen::RelayoutCombinationScope::DispatchReshape, "dispatch-reshape",
"Like dispatch, but only for chains containing an expand/collapse_shape"),
clEnumValN(IREE::Codegen::RelayoutCombinationScope::Workgroup, "workgroup",
"Combine relayout ops starting from workgroup forall terminator ops"))
}]>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// RUN: -split-input-file %s | FileCheck %s --check-prefixes=DISPATCH-SCOPE
// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-combine-result-layout-transformation{scope=workgroup},canonicalize,cse))" \
// RUN: -split-input-file %s | FileCheck %s --check-prefixes=WORKGROUP-SCOPE
// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-combine-result-layout-transformation{scope=dispatch-reshape},canonicalize,cse))" \
// RUN: -split-input-file %s | FileCheck %s --check-prefixes=RESHAPE-SCOPE

func.func @fold_collapse_shape_op(%source : tensor<2x4x16xf32>, %result : memref<8x16xf32>) {
%collapse = tensor.collapse_shape %source [[0, 1], [2]] : tensor<2x4x16xf32> into tensor<8x16xf32>
Expand Down Expand Up @@ -555,3 +557,46 @@ func.func @fold_pack_op_dynamic_inner_tiles(%source : tensor<250x250xf32>, %resu
// Verify no second padding loop (inner tile size 1 needs no padding)
// DISPATCH-SCOPE-NOT: scf.forall
// DISPATCH-SCOPE: return

// -----

// Under the `dispatch-reshape` scope a relayout chain is folded only if its
// backward slice holds a non-tileable `expand_shape`/`collapse_shape`. The
// chain below is `expand_shape` -> `transpose`, so the checks expect both ops
// to be replaced by a single map_store.
func.func @reshape_chain_folds_under_dispatch_reshape(
    %source : tensor<8x16xf32>, %result : memref<4x16x2xf32>) {
  %expanded = tensor.expand_shape %source [[0, 1], [2]] output_shape [2, 4, 16]
      : tensor<8x16xf32> into tensor<2x4x16xf32>
  %empty = tensor.empty() : tensor<4x16x2xf32>
  %permuted = linalg.transpose
      ins(%expanded : tensor<2x4x16xf32>)
      outs(%empty : tensor<4x16x2xf32>)
      permutation = [1, 2, 0]
  iree_codegen.store_to_buffer %permuted, %result
      : tensor<4x16x2xf32> into memref<4x16x2xf32>
  return
}
// RESHAPE-SCOPE-LABEL: @reshape_chain_folds_under_dispatch_reshape
// RESHAPE-SCOPE-NOT: tensor.expand_shape
// RESHAPE-SCOPE-NOT: linalg.transpose
// RESHAPE-SCOPE: iree_linalg_ext.map_store

// -----

// This chain is a lone transpose with no `expand_shape`/`collapse_shape`, so
// the `dispatch-reshape` scope is expected to leave it alone for tile-and-fuse
// (the transpose stays and no map_store appears). The plain `dispatch` scope
// is not gated on a reshape and is expected to fold the same chain into a
// map_store; that difference is the reason the `dispatch-reshape` scope exists.
func.func @pure_transpose_skipped_under_dispatch_reshape(
    %source : tensor<2x4x16xf32>, %result : memref<4x16x2xf32>) {
  %empty = tensor.empty() : tensor<4x16x2xf32>
  %permuted = linalg.transpose
      ins(%source : tensor<2x4x16xf32>)
      outs(%empty : tensor<4x16x2xf32>)
      permutation = [1, 2, 0]
  iree_codegen.store_to_buffer %permuted, %result
      : tensor<4x16x2xf32> into memref<4x16x2xf32>
  return
}
// DISPATCH-SCOPE-LABEL: @pure_transpose_skipped_under_dispatch_reshape
// DISPATCH-SCOPE: iree_linalg_ext.map_store
//
// RESHAPE-SCOPE-LABEL: @pure_transpose_skipped_under_dispatch_reshape
// RESHAPE-SCOPE: linalg.transpose
// RESHAPE-SCOPE-NOT: iree_linalg_ext.map_store
20 changes: 19 additions & 1 deletion compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -652,7 +652,25 @@ void buildLLVMCPUCodegenConfigurationPassPipelineImpl(
.addPass(createConvertAccGEMMToGEMMPass)
// TODO: Remove the following pass and plumb support for
// #hal.descriptor_type memory space through the stack.
.addPass(createEraseHALDescriptorTypeFromMemRefPass);
.addPass(createEraseHALDescriptorTypeFromMemRefPass)
// Fold reshape-containing relayout chains (`pack` -> `expand_shape` ->
// `transpose`) emitted by encoding materialization for non-row-major
// swizzles into a single `iree_linalg_ext.map_store`, before the
// dispatch is tiled. This mirrors the GPU configuration pipeline.
// Without it, the intervening `tensor.expand_shape` (not a
// `TilingInterface` op) blocks producer fusion and leaves an untiled,
// whole-tensor `pack` intermediate whose dynamic `tensor.empty`
// bufferizes to a bogus unbounded allocation (iree-org/iree#24483);
// `map_store` is a scatter with no intermediate buffer. The
// `DispatchReshape` scope restricts this to reshape-containing chains so
// plain `pack` encodings (which tile fine) are left untouched.
.addPass(createBufferizeDispatchTensorLoadStorePass)
.addPass([] {
CombineResultLayoutTransformationPassOptions options;
options.scope =
IREE::Codegen::RelayoutCombinationScope::DispatchReshape;
return createCombineResultLayoutTransformationPass(options);
Comment thread
egebeysel marked this conversation as resolved.
});
modulePassManager.addPass(createLLVMCPUSelectLoweringStrategyPass());
LLVM_DEBUG({
llvm::dbgs() << "LLVMCPU codegen configuration pass pipeline:\n";
Expand Down
13 changes: 10 additions & 3 deletions compiler/src/iree/compiler/Codegen/Utils/CPUUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "iree/compiler/Codegen/Utils/CPUUtils.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"

#include <numeric>

Expand Down Expand Up @@ -56,7 +57,8 @@ FailureOr<Operation *> getRootOperation(ArrayRef<Operation *> computeOps) {
}

if (isa<TilingInterface>(op) &&
!isa<tensor::PadOp, linalg::PackOp, linalg::UnPackOp>(op)) {
!isa<tensor::PadOp, linalg::PackOp, linalg::UnPackOp,
IREE::LinalgExt::MapLoadOp, IREE::LinalgExt::MapStoreOp>(op)) {
Comment thread
egebeysel marked this conversation as resolved.
// All other operations that implement this interface are root ops.
rootOperation = op;
break;
Expand All @@ -74,9 +76,14 @@ FailureOr<Operation *> getRootOperation(ArrayRef<Operation *> computeOps) {
}

if (!rootOperation) {
// Check for pad/pack/unpack ops by themselves.
// Check for relayout ops (pad/pack/unpack and the map_load/map_store
// scatter/gather ops that encoding materialization folds into) by
// themselves. These are excluded from the sweeps above so that a real
// compute op in the same dispatch wins; a pure-relayout dispatch (e.g. a
// `set_encoding` dispatch) still picks one of them here.
for (auto op : llvm::reverse(computeOps)) {
if (isa<tensor::PadOp, linalg::PackOp, linalg::UnPackOp>(op)) {
if (isa<tensor::PadOp, linalg::PackOp, linalg::UnPackOp,
IREE::LinalgExt::MapLoadOp, IREE::LinalgExt::MapStoreOp>(op)) {
rootOperation = op;
break;
}
Expand Down
Loading