Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3238,6 +3238,27 @@ bool AsyncDispatchOp::preferCloneToConsumers() {
return !consumesAny;
}

/// Reports the memory effects of this dispatch to MLIR's side-effect
/// machinery.
///
/// Appends one MemoryEffects::Allocate instance to `effects` for every result
/// that is a stream resource and is not tied to an operand. When every
/// resource result is tied, nothing is appended and the op is treated as
/// effect-free.
void AsyncDispatchOp::getEffects(
    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
  // A non-tied resource result is a fresh allocation: the dispatch creates a
  // new buffer that didn't exist before. Reporting Allocate prevents CSE from
  // merging two identical dispatch calls that each produce their own buffer,
  // which would cause both consumers to alias the same buffer and corrupt
  // each other's in-place mutations (e.g. two ScatterElements sharing one
  // zero-initialized buffer).
  //
  // When all results are tied, the dispatch transforms existing buffers
  // in-place and no allocation occurs — CSE may legitimately merge identical
  // calls.
  for (unsigned i = 0, e = getNumResults(); i < e; ++i) {
    OpResult result = getOperation()->getResult(i);
    if (!isa<IREE::Stream::ResourceType>(result.getType()) ||
        getTiedResultOperandIndex(i).has_value()) {
      continue;
    }
    // Bind the effect to the result it allocates. An Allocate effect with no
    // associated value makes the op look non-erasable to dead-code
    // elimination, whereas an allocation of the op's own unused result can be
    // dropped. CSE is still blocked either way because the op is no longer
    // memory-effect free.
    effects.emplace_back(MemoryEffects::Allocate::get(), result);
  }
}

//===----------------------------------------------------------------------===//
// stream.async.func
//===----------------------------------------------------------------------===//
Expand Down
3 changes: 2 additions & 1 deletion compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -2645,7 +2645,7 @@ def Stream_AsyncStoreOp : Stream_PureOp<"async.store", [
let hasCanonicalizer = 1;
}

def Stream_AsyncDispatchOp : Stream_PureOp<"async.dispatch", [
def Stream_AsyncDispatchOp : Stream_Op<"async.dispatch", [
AttrSizedOperandSegments,
DeclareOpInterfaceMethods<SymbolUserOpInterface>,
Stream_AffinityOp,
Expand All @@ -2656,6 +2656,7 @@ def Stream_AsyncDispatchOp : Stream_PureOp<"async.dispatch", [
DeclareOpInterfaceMethods<Stream_AsyncAccessOp, [
"getAsyncAccessRanges",
]>,
DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
Util_HoistableOpInterface,
Util_SizeAwareOp,
DeclareOpInterfaceMethods<Util_TiedOpInterface, [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ iree_lit_test_suite(
"layout_slices.mlir",
"materialize_builtins.mlir",
"materialize_copy_on_write.mlir",
"materialize_cow_dispatch_cse_aliasing.mlir",
"materialize_encodings.mlir",
"materialize_transient_size_queries.mlir",
"pack_constants.mlir",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ iree_lit_test_suite(
"layout_slices.mlir"
"materialize_builtins.mlir"
"materialize_copy_on_write.mlir"
"materialize_cow_dispatch_cse_aliasing.mlir"
"materialize_encodings.mlir"
"materialize_transient_size_queries.mlir"
"pack_constants.mlir"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -405,16 +405,19 @@ util.func public @constantCloneMultiUse(%size: index) -> (!stream.resource<const
%clone1 = stream.async.clone %const : !stream.resource<constant>{%size} -> !stream.resource<constant>{%size}

// Both dispatches read from the same constant (no mutation).
// CSE will deduplicate these identical dispatches into one.
// CSE cannot merge them because they produce non-tied resource results
// (fresh allocations) and getEffects() reports Allocate.
// CHECK: %[[RESULT0:.+]] = stream.async.dispatch @constantDispatch::@entry
// CHECK-SAME: %[[CONST]]
%result0 = stream.async.dispatch @constantDispatch::@entry(%clone0[%c0 to %size for %size]) :
(!stream.resource<constant>{%size}) -> !stream.resource<constant>{%size}

// CHECK: %[[RESULT1:.+]] = stream.async.dispatch @constantDispatch::@entry
// CHECK-SAME: %[[CONST]]
%result1 = stream.async.dispatch @constantDispatch::@entry(%clone1[%c0 to %size for %size]) :
(!stream.resource<constant>{%size}) -> !stream.resource<constant>{%size}

// CHECK: util.return %[[RESULT0]], %[[RESULT0]]
// CHECK: util.return %[[RESULT0]], %[[RESULT1]]
util.return %result0, %result1 : !stream.resource<constant>, !stream.resource<constant>
}

Expand Down Expand Up @@ -1349,7 +1352,8 @@ util.func public @multiCloneImmutableSource(%size: index) -> (!stream.resource<*
%source = stream.async.splat %c123_i32 : i32 -> !stream.resource<*>{%size}
// Multiple clones of same source, all used by read-only dispatches.
// All clones can be elided because neither source nor results are mutated.
// After elision, CSE folds the identical dispatches into one.
// CSE cannot fold the dispatches because they produce non-tied resource
// results (getEffects() reports Allocate).
// CHECK-NOT: stream.async.clone
%clone0 = stream.async.clone %source : !stream.resource<*>{%size} -> !stream.resource<*>{%size}
%d0 = stream.async.dispatch @dispatch_ex::@dispatch(%clone0[%c0 to %size for %size])
Expand All @@ -1360,8 +1364,10 @@ util.func public @multiCloneImmutableSource(%size: index) -> (!stream.resource<*
%clone2 = stream.async.clone %source : !stream.resource<*>{%size} -> !stream.resource<*>{%size}
%d2 = stream.async.dispatch @dispatch_ex::@dispatch(%clone2[%c0 to %size for %size])
: (!stream.resource<*>{%size}) -> !stream.resource<*>{%size}
// CHECK: %[[DISPATCH:.+]] = stream.async.dispatch @dispatch_ex::@dispatch(%[[SOURCE]]
// CHECK: util.return %[[DISPATCH]], %[[DISPATCH]], %[[DISPATCH]]
// CHECK: %[[D0:.+]] = stream.async.dispatch @dispatch_ex::@dispatch(%[[SOURCE]]
// CHECK: %[[D1:.+]] = stream.async.dispatch @dispatch_ex::@dispatch(%[[SOURCE]]
// CHECK: %[[D2:.+]] = stream.async.dispatch @dispatch_ex::@dispatch(%[[SOURCE]]
// CHECK: util.return %[[D0]], %[[D1]], %[[D2]]
util.return %d0, %d1, %d2 : !stream.resource<*>, !stream.resource<*>, !stream.resource<*>
}

Expand Down Expand Up @@ -1432,7 +1438,8 @@ util.func public @multiCloneImmutableSource(%size: index) -> (!stream.resource<*
%source = stream.async.splat %c123_i32 : i32 -> !stream.resource<*>{%size}
// Multiple clones of same source, all used by read-only dispatches.
// All clones can be elided because neither source nor results are mutated.
// After elision, CSE folds the identical dispatches into one.
// CSE cannot fold the dispatches because they produce non-tied resource
// results (getEffects() reports Allocate).
// CHECK-NOT: stream.async.clone
%clone0 = stream.async.clone %source : !stream.resource<*>{%size} -> !stream.resource<*>{%size}
%d0 = stream.async.dispatch @dispatch_ex::@dispatch(%clone0[%c0 to %size for %size])
Expand All @@ -1443,8 +1450,10 @@ util.func public @multiCloneImmutableSource(%size: index) -> (!stream.resource<*
%clone2 = stream.async.clone %source : !stream.resource<*>{%size} -> !stream.resource<*>{%size}
%d2 = stream.async.dispatch @dispatch_ex::@dispatch(%clone2[%c0 to %size for %size])
: (!stream.resource<*>{%size}) -> !stream.resource<*>{%size}
// CHECK: %[[DISPATCH:.+]] = stream.async.dispatch @dispatch_ex::@dispatch(%[[SOURCE]]
// CHECK: util.return %[[DISPATCH]], %[[DISPATCH]], %[[DISPATCH]]
// CHECK: %[[D0:.+]] = stream.async.dispatch @dispatch_ex::@dispatch(%[[SOURCE]]
// CHECK: %[[D1:.+]] = stream.async.dispatch @dispatch_ex::@dispatch(%[[SOURCE]]
// CHECK: %[[D2:.+]] = stream.async.dispatch @dispatch_ex::@dispatch(%[[SOURCE]]
// CHECK: util.return %[[D0]], %[[D1]], %[[D2]]
util.return %d0, %d1, %d2 : !stream.resource<*>, !stream.resource<*>, !stream.resource<*>
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Tests the dual-scatter buffer aliasing bug caused by CSE.
//
// Pattern:
// A dispatch with non-tied resource results (zeros_kernel) is used as the
// tied destination for two independent scatter dispatches. The flow:
//
// 1. MaterializeCopyOnWrite inserts clone(zeros_dispatch) for each scatter.
// 2. Canonicalize (PropagateCloneableOps) replaces clone(X) with fresh X
// at each use site because preferCloneToConsumers() == true for
// dispatches with no resource inputs.
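// 3. After step 2 there are two independent zeros dispatches.
// 4. Without the fix below, CSE merges those two identical zeros dispatches
// back into one, so both scatters mutate the same buffer in place.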
//
// FIX: AsyncDispatchOp::getEffects() reports MemoryEffects::Allocate when the
// dispatch has any non-tied resource result (a fresh allocation), preventing
// CSE from merging the two freshly rematerialized zeros dispatches.
//
// RUN: iree-opt --split-input-file \
// RUN: --pass-pipeline='builtin.module(util.func(iree-stream-materialize-copy-on-write,canonicalize,cse))' \
// RUN: %s | FileCheck %s
// RUN: iree-opt --split-input-file --cse %s \
// RUN: | FileCheck %s --check-prefix=CSE

// After COW + canonicalize each scatter should have its own independent zeros
// buffer. CSE must NOT merge them. The output order is interleaved:
// zeros0, scatter0, zeros1, scatter1
//
// CHECK-LABEL: @moe_scatter_aliasing
// CHECK: %[[ZEROS0:.+]] = stream.async.dispatch @ex::@zeros_kernel
// CHECK: stream.async.dispatch @ex::@scatter{{.*}}(%[[ZEROS0]]
// CHECK-SAME: -> %[[ZEROS0]]
// CHECK: %[[ZEROS1:.+]] = stream.async.dispatch @ex::@zeros_kernel
// CHECK: stream.async.dispatch @ex::@scatter{{.*}}(%[[ZEROS1]]
// CHECK-SAME: -> %[[ZEROS1]]
// Regression input: one zeros dispatch feeds two scatter dispatches, each of
// which ties its result to the zeros operand. After the pipeline each scatter
// is expected to own a separate zeros buffer.
util.func private @moe_scatter_aliasing(
%data0: !stream.resource<*>, %data1: !stream.resource<*>
) -> (!stream.resource<*>, !stream.resource<*>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c16 = arith.constant 16 : index
%c64 = arith.constant 64 : index

// zeros_kernel produces a non-tied resource result (fresh allocation).
// COW inserts clone(zeros) for each tied-operand consumer, then
// canonicalize replaces clone(X) with a fresh X. Before the getEffects()
// fix, CSE merged the two fresh (identical) dispatches back into one, which
// made both scatters alias the same buffer. The FileCheck expectations above
// require the two zeros dispatches to stay separate.
%zeros = stream.async.dispatch @ex::@zeros_kernel[%c1, %c1, %c1](%c64) :
(index) -> !stream.resource<*>{%c64}

// First consumer: result is tied to %zeros, with %data0 as the second operand.
%scatter0 = stream.async.dispatch @ex::@scatter[%c1, %c1, %c1](
%zeros[%c0 to %c64 for %c64], %data0[%c0 to %c16 for %c16]) :
(!stream.resource<*>{%c64}, !stream.resource<*>{%c16}) -> %zeros{%c64}

// Second consumer: same tied destination %zeros, but with %data1.
%scatter1 = stream.async.dispatch @ex::@scatter[%c1, %c1, %c1](
%zeros[%c0 to %c64 for %c64], %data1[%c0 to %c16 for %c16]) :
(!stream.resource<*>{%c64}, !stream.resource<*>{%c16}) -> %zeros{%c64}

util.return %scatter0, %scatter1 : !stream.resource<*>, !stream.resource<*>
}

// -----

// Verify that CSE still merges identical dispatches when all results are tied
// (pure in-place transformation, no fresh allocation).
// This test uses the CSE-only RUN line (no COW/canonicalize).

// CSE-LABEL: @tied_dispatches_cse_allowed
// CSE: %[[D:.+]] = stream.async.dispatch @ex::@transform
// CSE-NOT: stream.async.dispatch @ex::@transform
// CSE: util.return %[[D]], %[[D]]
// Counterpart to the aliasing test: two textually identical dispatches whose
// only result is tied to %input. Neither has a non-tied resource result, so
// they are expected to be deduplicated.
util.func private @tied_dispatches_cse_allowed(
%input: !stream.resource<*>
) -> (!stream.resource<*>, !stream.resource<*>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index

// Both dispatches are identical and produce tied results (in-place).
// getEffects() returns no effects -> CSE merges them into one.
%d0 = stream.async.dispatch @ex::@transform[%c1, %c1, %c1](
%input[%c0 to %c64 for %c64]) :
(!stream.resource<*>{%c64}) -> %input{%c64}

// Same callee, workload, operand range and tied result as %d0.
%d1 = stream.async.dispatch @ex::@transform[%c1, %c1, %c1](
%input[%c0 to %c64 for %c64]) :
(!stream.resource<*>{%c64}) -> %input{%c64}

// Returning both values lets the expectations above verify that a single
// SSA value is used for both results once the duplicate is removed.
util.return %d0, %d1 : !stream.resource<*>, !stream.resource<*>
}
3 changes: 3 additions & 0 deletions tests/e2e/linalg_ext_ops/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ ALL_SRCS = enforce_glob(
"map_store.mlir",
"scan.mlir",
"scatter.mlir",
"scatter_multi_consumer.mlir",
"sort.mlir",
"top-k.mlir",
"winograd_input.mlir",
Expand Down Expand Up @@ -56,6 +57,7 @@ VMVX_SRCS = enforce_glob(
"map_store.mlir",
"scan.mlir",
"scatter.mlir",
"scatter_multi_consumer.mlir",
"sort.mlir",
"top-k.mlir",
"winograd_input.mlir",
Expand Down Expand Up @@ -122,6 +124,7 @@ ROCM_HIP_SRCS = enforce_glob(
"map_store.mlir",
"scan.mlir",
"scatter.mlir",
"scatter_multi_consumer.mlir",
"sort.mlir",
"winograd_input.mlir",
"winograd_output.mlir",
Expand Down
3 changes: 3 additions & 0 deletions tests/e2e/linalg_ext_ops/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ iree_check_single_backend_test_suite(
"map_store.mlir"
"scan.mlir"
"scatter.mlir"
"scatter_multi_consumer.mlir"
"sort.mlir"
"top-k.mlir"
"winograd_input.mlir"
Expand All @@ -49,6 +50,7 @@ iree_check_single_backend_test_suite(
"map_store.mlir"
"scan.mlir"
"scatter.mlir"
"scatter_multi_consumer.mlir"
"sort.mlir"
"top-k.mlir"
"winograd_input.mlir"
Expand Down Expand Up @@ -94,6 +96,7 @@ iree_check_single_backend_test_suite(
"map_store.mlir"
"scan.mlir"
"scatter.mlir"
"scatter_multi_consumer.mlir"
"sort.mlir"
"winograd_input.mlir"
"winograd_output.mlir"
Expand Down
Loading
Loading