Skip to content

Commit 9801b28

Browse files
committed
[Stream] Preserve borrowed imports during copy elision
A stream.tensor.import without consume leaves the imported storage owned by the caller. Local SSA last-use information only proves that the stream program has no later use of the value; it does not prove that mutating the underlying storage is legal. Teach ElideAsyncCopies to track whether a clone source may contain non-consuming imported storage, including through call boundaries, tied pass-through ops, and same-lifetime transfers. A clone protecting borrowed storage is kept when the clone result is mutated, while read-only clones and clones from consuming imports can still be elided. Keep the ABI/storage contracts aligned with that safety rule. Tensor arguments marked as output storage and hal.tensor.alias destinations are imported as consumed because those contracts explicitly provide storage for the program to overwrite and return. Ordinary non-consuming imports remain borrowed and keep their protective copies. Update the materialized-copy tests that intentionally mutate imported storage to model that ownership transfer explicitly with consume.
1 parent 7009ee1 commit 9801b28

7 files changed

Lines changed: 244 additions & 24 deletions

File tree

compiler/src/iree/compiler/Bindings/Native/Transforms/WrapEntryPoints.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -662,13 +662,15 @@ createExportWrapperFunc(IREE::ABI::InvocationModel invocationModel,
662662
exportOp.getArgAttrOfType<TypeAttr>(argIndex, "iree.abi.encoding");
663663
auto consumeAttr =
664664
exportOp.getArgAttrOfType<UnitAttr>(argIndex, "iree.abi.consume");
665+
auto outputAttr =
666+
exportOp.getArgAttrOfType<IntegerAttr>(argIndex, "iree.abi.output");
665667
auto affinityAttr = exportOp.getArgAttr(argIndex, "iree.abi.affinity");
666668
auto argName = inferArgumentName(entryBuilder.getContext(), argIndex,
667669
exportOp.getArgAttrDict(argIndex));
668670
auto tensorImportOp = IREE::HAL::TensorImportOp::create(
669671
entryBuilder, arg.getLoc(), oldType, arg,
670672
fallback(encodingAttr, TypeAttr::get(oldType)),
671-
/*consume=*/consumeAttr ? true : false, waitFence, argName,
673+
/*consume=*/consumeAttr || outputAttr, waitFence, argName,
672674
fallback(affinityAttr, defaultAffinityAttr));
673675
arguments.push_back(tensorImportOp.getTarget());
674676
} else {

compiler/src/iree/compiler/Bindings/Native/Transforms/test/wrap_entry_points.mlir

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,29 @@ util.func public @outputStorage(%arg0: tensor<?x8x8x3xf32>, %ret1: !hal.buffer {
127127

128128
// -----
129129

130+
// Tests that tensor-typed output storage arguments are imported as consumed.
131+
// The caller is explicitly providing storage to overwrite and return, so the
132+
// stream program may update that storage in-place.
133+
134+
// CHECK-LABEL: util.func public @outputTensorStorage
135+
// CHECK-SAME: (%[[ARG0:[a-z0-9]+]]: !hal.buffer_view, %[[STORAGE:[a-z0-9]+]]: !hal.buffer_view) -> !hal.buffer_view
136+
// CHECK-NEXT: %[[ARG0_TENSOR:.+]] = hal.tensor.import %[[ARG0]] "input0" : !hal.buffer_view -> tensor<4xf32>
137+
// CHECK-NEXT: %[[STORAGE_TENSOR:.+]] = hal.tensor.import consume %[[STORAGE]] "input1" : !hal.buffer_view -> tensor<4xf32>
138+
// CHECK-NEXT: %[[RET_TENSOR:.+]] = util.call @_outputTensorStorage(%[[ARG0_TENSOR]], %[[STORAGE_TENSOR]])
139+
// CHECK-NEXT: %[[RET_ALIAS:.+]] = hal.tensor.alias %[[RET_TENSOR]] : tensor<4xf32> to %[[STORAGE]] : !hal.buffer_view
140+
// CHECK-NEXT: %[[RET_VIEW:.+]] = hal.tensor.export %[[RET_ALIAS]] "output0" : tensor<4xf32> -> !hal.buffer_view
141+
// CHECK-NEXT: util.return %[[RET_VIEW]]
142+
// CHECK-NEXT: }
143+
144+
// CHECK-LABEL: util.func private @_outputTensorStorage(
145+
// CHECK-SAME: hal.abi.convention = #hal.abi.convention<synchronous>
146+
util.func public @outputTensorStorage(%arg0: tensor<4xf32>, %storage: tensor<4xf32> {iree.abi.output = 0 : index}) -> tensor<4xf32> {
147+
%0 = arith.addf %arg0, %arg0 : tensor<4xf32>
148+
util.return %0 : tensor<4xf32>
149+
}
150+
151+
// -----
152+
130153
// Tests that functions already wrapped (iree.abi.stub present) are ignored.
131154

132155
// CHECK-LABEL: util.func public @wrappedAlready

compiler/src/iree/compiler/Dialect/Stream/Conversion/HALToStream/Patterns.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,8 @@ struct ConvertTensorAliasOp
246246
auto importOp = IREE::Stream::TensorImportOp::create(
247247
rewriter, op.getLoc(), externalType, adaptor.getStorage().front(),
248248
TypeAttr::get(sourceType), convertedSourceDims, storageSize,
249-
/*consume=*/UnitAttr{}, executionAffinityAttr);
249+
/*consume=*/UnitAttr::get(rewriter.getContext()),
250+
executionAffinityAttr);
250251

251252
// Await the fence, if needed. When not specified the storage is assumed to
252253
// be immediately available.

compiler/src/iree/compiler/Dialect/Stream/Conversion/HALToStream/test/abi_ops.mlir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ util.func public @exportBufferView(%tensor: tensor<?x?x4xf32>, %dim0: index, %di
114114
// CHECK-SAME: (%[[TENSOR:.+]]: !stream.resource<*>, %[[SIZE:.+]]: index, %[[DIM0:.+]]: index, %[[STORAGE:.+]]: !hal.buffer)
115115
util.func public @aliasStorage(%tensor: tensor<?x4xf32>, %dim0: index, %storage: !hal.buffer) -> tensor<?x4xf32> {
116116
// CHECK: %[[MIN_STORAGE_SIZE:.+]] = stream.tensor.sizeof tensor<?x4xf32>{%[[DIM0]]}
117-
// CHECK: %[[STORAGE_RESOURCE:.+]] = stream.tensor.import %[[STORAGE]] : !hal.buffer -> tensor<?x4xf32>{%[[DIM0]]} in !stream.resource<external>{%[[MIN_STORAGE_SIZE]]}
117+
// CHECK: %[[STORAGE_RESOURCE:.+]] = stream.tensor.import consume %[[STORAGE]] : !hal.buffer -> tensor<?x4xf32>{%[[DIM0]]} in !stream.resource<external>{%[[MIN_STORAGE_SIZE]]}
118118
// CHECK: %[[UPDATE:.+]] = stream.async.update %[[TENSOR]], %[[STORAGE_RESOURCE]][%c0 to %[[SIZE]]] : !stream.resource<*>{%[[SIZE]]} -> %[[STORAGE_RESOURCE]] as !stream.resource<external>{%[[MIN_STORAGE_SIZE]]}
119119
// CHECK: %[[SLICE:.+]] = stream.async.slice %[[UPDATE]][%c0 to %[[SIZE]]] : !stream.resource<external>{%[[MIN_STORAGE_SIZE]]} -> !stream.resource<external>{%[[SIZE]]}
120120
// CHECK: %[[RESULT:.+]] = stream.async.clone %[[SLICE]] : !stream.resource<external>{%[[SIZE]]} -> !stream.resource<*>{%[[SIZE]]}
@@ -129,7 +129,7 @@ util.func public @aliasStorage(%tensor: tensor<?x4xf32>, %dim0: index, %storage:
129129
// CHECK-SAME: (%[[TENSOR:.+]]: !stream.resource<*>, %[[SIZE:.+]]: index, %[[DIM0:.+]]: index, %[[STORAGE:.+]]: !hal.buffer, %[[FENCE:.+]]: !hal.fence)
130130
util.func public @aliasStorageAsync(%tensor: tensor<?x4xf32>, %dim0: index, %storage: !hal.buffer, %fence: !hal.fence) -> tensor<?x4xf32> {
131131
// CHECK-DAG: %[[MIN_STORAGE_SIZE:.+]] = stream.tensor.sizeof tensor<?x4xf32>{%[[DIM0]]}
132-
// CHECK-DAG: %[[UNREADY_STORAGE:.+]] = stream.tensor.import %[[STORAGE]] : !hal.buffer -> tensor<?x4xf32>{%[[DIM0]]} in !stream.resource<external>{%[[MIN_STORAGE_SIZE]]}
132+
// CHECK-DAG: %[[UNREADY_STORAGE:.+]] = stream.tensor.import consume %[[STORAGE]] : !hal.buffer -> tensor<?x4xf32>{%[[DIM0]]} in !stream.resource<external>{%[[MIN_STORAGE_SIZE]]}
133133
// CHECK-DAG: %[[TIMEPOINT:.+]] = stream.timepoint.import %[[FENCE]]
134134
// CHECK-DAG: %[[READY_STORAGE:.+]] = stream.timepoint.await %[[TIMEPOINT]] => %[[UNREADY_STORAGE]] : !stream.resource<external>{%[[MIN_STORAGE_SIZE]]}
135135
// CHECK: %[[UPDATE:.+]] = stream.async.update %[[TENSOR]], %[[READY_STORAGE]][%c0 to %[[SIZE]]] : !stream.resource<*>{%[[SIZE]]} -> %[[READY_STORAGE]] as !stream.resource<external>{%[[MIN_STORAGE_SIZE]]}
@@ -209,7 +209,7 @@ util.func public @aliasStorageCrossDevice(%tensor: tensor<4xf32>, %storage: !hal
209209
// CHECK: %[[TRANSFER:.+]] = stream.async.transfer %[[TENSOR]] : !stream.resource<*>{%[[SIZE]]}
210210
// CHECK-SAME: -> to(#hal.device.promise<@dev_b>) !stream.resource<*>{%[[SIZE]]}
211211
// CHECK-DAG: %[[STORAGE_SIZE:.+]] = stream.tensor.sizeof on(#hal.device.promise<@dev_b>) tensor<4xf32>
212-
// CHECK: %[[STORAGE_RESOURCE:.+]] = stream.tensor.import on(#hal.device.promise<@dev_b>) %[[STORAGE]] :
212+
// CHECK: %[[STORAGE_RESOURCE:.+]] = stream.tensor.import on(#hal.device.promise<@dev_b>) consume %[[STORAGE]] :
213213
// CHECK-SAME: !hal.buffer -> tensor<4xf32> in !stream.resource<external>{%[[STORAGE_SIZE]]}
214214
// CHECK: %[[UPDATE:.+]] = stream.async.update on(#hal.device.promise<@dev_b>) %[[TRANSFER]], %[[STORAGE_RESOURCE]][%c0 to %[[SIZE]]] :
215215
// CHECK-SAME: !stream.resource<*>{%[[SIZE]]} -> %[[STORAGE_RESOURCE]] as !stream.resource<external>{%[[STORAGE_SIZE]]}

compiler/src/iree/compiler/Dialect/Stream/Transforms/ElideAsyncCopies.cpp

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,15 @@ class ElisionAnalysis {
714714
return mutationSemantics.isAssumedNotMutated();
715715
}
716716

717+
// Returns true if |value| may alias storage imported without consuming it.
718+
// Such storage is caller-owned even when this SSA value is its last local
719+
// use, so eliding a protective copy is only valid when the copy result is
720+
// read-only.
721+
bool containsBorrowedStorage(Value value) {
722+
llvm::SmallPtrSet<Value, 8> visited;
723+
return containsBorrowedStorage(value, visited);
724+
}
725+
717726
// Attempts to infer the affinity of |value| by walking up its defining ops
718727
// and checking for explicit affinity annotations or default affinities.
719728
// Returns nullptr if no affinity can be determined.
@@ -770,6 +779,55 @@ class ElisionAnalysis {
770779
}
771780

772781
private:
782+
bool containsBorrowedStorage(Value value,
783+
llvm::SmallPtrSetImpl<Value> &visited) {
784+
if (!visited.insert(value).second) {
785+
return true;
786+
}
787+
788+
bool foundBorrowedStorage = false;
789+
auto traversalResult = explorer.walkDefiningOps(
790+
value,
791+
[&](OpResult result) {
792+
Operation *op = result.getOwner();
793+
794+
// A non-consuming import borrows externally-owned storage. A
795+
// consuming import transfers ownership to the stream program.
796+
if (auto importOp = dyn_cast<IREE::Stream::TensorImportOp>(op)) {
797+
foundBorrowedStorage = !importOp.getConsume();
798+
return WalkResult::interrupt();
799+
}
800+
801+
// Same-lifetime transfers are usage/affinity refinements and may be
802+
// elided later, so preserve the source storage ownership. A
803+
// lifetime-changing transfer materializes storage with distinct
804+
// ownership semantics and stops the borrowed-storage chain.
805+
if (auto transferOp = dyn_cast<IREE::Stream::AsyncTransferOp>(op)) {
806+
auto sourceType = cast<IREE::Stream::ResourceType>(
807+
transferOp.getSource().getType());
808+
auto resultType =
809+
cast<IREE::Stream::ResourceType>(result.getType());
810+
if (sourceType.getLifetime() == resultType.getLifetime()) {
811+
foundBorrowedStorage =
812+
containsBorrowedStorage(transferOp.getSource(), visited);
813+
}
814+
return foundBorrowedStorage ? WalkResult::interrupt()
815+
: WalkResult::skip();
816+
}
817+
818+
// Clones produce independent storage. If this clone remains, uses of
819+
// its result do not mutate the original borrowed storage.
820+
if (isa<IREE::Stream::AsyncCloneOp>(op)) {
821+
return WalkResult::skip();
822+
}
823+
824+
return WalkResult::advance();
825+
},
826+
TraversalBehavior::DEFAULT);
827+
return foundBorrowedStorage ||
828+
traversalResult == TraversalResult::INCOMPLETE;
829+
}
830+
773831
Explorer explorer;
774832
llvm::BumpPtrAllocator allocator;
775833
DFX::Solver solver;
@@ -866,12 +924,23 @@ static bool isSafeToElideCloneOp(IREE::Stream::AsyncCloneOp cloneOp,
866924
<< " ? clone source is a by-value arg; may elide\n");
867925
}
868926

927+
// A non-consuming import borrows caller-owned storage. If a clone of that
928+
// storage is later mutated then the clone is semantically required even when
929+
// the imported SSA value has no remaining local uses: without the clone, the
930+
// mutation would be performed in-place on the caller's buffer.
931+
bool resultSafe = analysis.isNeverMutated(cloneOp.getResult());
932+
if (analysis.containsBorrowedStorage(cloneOp.getSource()) && !resultSafe) {
933+
LLVM_DEBUG(llvm::dbgs()
934+
<< " - clone protects borrowed storage from mutation; cannot "
935+
"elide\n");
936+
return false;
937+
}
938+
869939
// If neither source nor result is ever mutated anywhere in the program,
870940
// we can safely elide the clone because sharing the underlying buffer has no
871941
// observable effect. This extends the constant-to-constant analysis above to
872942
// all lifetime types - if no writes occur, reads can safely share buffers.
873943
bool sourceSafe = analysis.isNeverMutated(cloneOp.getSource());
874-
bool resultSafe = analysis.isNeverMutated(cloneOp.getResult());
875944
if (sourceSafe && resultSafe) {
876945
LLVM_DEBUG(llvm::dbgs()
877946
<< " + source and result never mutated; can elide\n");

compiler/src/iree/compiler/Dialect/Stream/Transforms/test/elide_async_copies_tensor_import_consume.mlir

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,3 +266,119 @@ util.func public @mixed_consume_no_consume(%arg0: !util.buffer, %arg1: !util.buf
266266
// CHECK: util.return %[[FILL0]], %[[FILL1]], %[[IMPORT1]]
267267
util.return %fill0, %fill1, %import1 : !stream.resource<external>, !stream.resource<external>, !stream.resource<external>
268268
}
269+
270+
// -----
271+
272+
// Tests that a non-consuming import remains borrowed even when the imported SSA
273+
// value has no uses after the clone. The clone protects caller-owned storage
274+
// from the tied fill mutation and must not be elided by one-use reasoning.
275+
276+
// CHECK-LABEL: @borrowed_import_single_use_preserves_mutating_clone
277+
// CHECK-SAME: (%[[BUFFER:[^:]+]]: !util.buffer)
278+
util.func public @borrowed_import_single_use_preserves_mutating_clone(%arg0: !util.buffer) -> !stream.resource<external> {
279+
%c0 = arith.constant 0 : index
280+
%c100 = arith.constant 100 : index
281+
%c123_i32 = arith.constant 123 : i32
282+
// CHECK: %[[IMPORT:.+]] = stream.tensor.import %[[BUFFER]]
283+
%import = stream.tensor.import %arg0 : !util.buffer -> tensor<f32> in !stream.resource<external>{%c100}
284+
// CHECK: %[[CLONE:.+]] = stream.async.clone %[[IMPORT]]
285+
%clone = stream.async.clone %import : !stream.resource<external>{%c100} -> !stream.resource<external>{%c100}
286+
// CHECK: %[[FILL:.+]] = stream.async.fill %c123_i32, %[[CLONE]]
287+
%fill = stream.async.fill %c123_i32, %clone[%c0 to %c100 for %c100] : i32 -> %clone as !stream.resource<external>{%c100}
288+
// CHECK: util.return %[[FILL]]
289+
util.return %fill : !stream.resource<external>
290+
}
291+
292+
// -----
293+
294+
// Tests that borrowed storage is preserved across call boundaries. Even though
295+
// the imported SSA value is a last-use call operand, the callee argument still
296+
// aliases non-consuming imported storage and needs its protective clone.
297+
298+
// CHECK-LABEL: util.func private @borrowed_import_last_use_callee
299+
// CHECK-SAME: (%[[ARG:.+]]: !stream.resource<external>)
300+
util.func private @borrowed_import_last_use_callee(%arg: !stream.resource<external>) -> !stream.resource<external> {
301+
%c0 = arith.constant 0 : index
302+
%c100 = arith.constant 100 : index
303+
%c123_i32 = arith.constant 123 : i32
304+
// CHECK: %[[CLONE:.+]] = stream.async.clone %[[ARG]]
305+
%clone = stream.async.clone %arg : !stream.resource<external>{%c100} -> !stream.resource<external>{%c100}
306+
// CHECK: %[[FILL:.+]] = stream.async.fill %c123_i32, %[[CLONE]]
307+
%fill = stream.async.fill %c123_i32, %clone[%c0 to %c100 for %c100] : i32 -> %clone as !stream.resource<external>{%c100}
308+
// CHECK: util.return %[[FILL]]
309+
util.return %fill : !stream.resource<external>
310+
}
311+
312+
// CHECK-LABEL: @borrowed_import_last_use_caller
313+
// CHECK-SAME: (%[[BUFFER:[^:]+]]: !util.buffer)
314+
util.func public @borrowed_import_last_use_caller(%arg0: !util.buffer) -> !stream.resource<external> {
315+
%c100 = arith.constant 100 : index
316+
// CHECK: %[[IMPORT:.+]] = stream.tensor.import %[[BUFFER]]
317+
%import = stream.tensor.import %arg0 : !util.buffer -> tensor<f32> in !stream.resource<external>{%c100}
318+
// CHECK: %[[RESULT:.+]] = util.call @borrowed_import_last_use_callee(%[[IMPORT]])
319+
%result = util.call @borrowed_import_last_use_callee(%import) : (!stream.resource<external>) -> !stream.resource<external>
320+
// CHECK: util.return %[[RESULT]]
321+
util.return %result : !stream.resource<external>
322+
}
323+
324+
// -----
325+
326+
// Tests that borrowed imports still permit clone elision when the clone result
327+
// is read-only. Non-consuming imports only require protective clones when the
328+
// clone would otherwise shield caller-owned storage from mutation.
329+
330+
stream.executable private @ex_borrowed_readonly {
331+
stream.executable.export public @read_only workgroups() -> (index, index, index) {
332+
%c1 = arith.constant 1 : index
333+
stream.return %c1, %c1, %c1 : index, index, index
334+
}
335+
}
336+
337+
// CHECK-LABEL: @borrowed_import_readonly_elides_clone
338+
// CHECK-SAME: (%[[BUFFER:[^:]+]]: !util.buffer)
339+
util.func public @borrowed_import_readonly_elides_clone(%arg0: !util.buffer) -> !stream.resource<external> {
340+
%c0 = arith.constant 0 : index
341+
%c100 = arith.constant 100 : index
342+
// CHECK: %[[IMPORT:.+]] = stream.tensor.import %[[BUFFER]]
343+
%import = stream.tensor.import %arg0 : !util.buffer -> tensor<f32> in !stream.resource<external>{%c100}
344+
// CHECK-NOT: stream.async.clone
345+
%clone = stream.async.clone %import : !stream.resource<external>{%c100} -> !stream.resource<external>{%c100}
346+
// CHECK: %[[RESULT:.+]] = stream.async.dispatch @ex_borrowed_readonly::@read_only(%[[IMPORT]]
347+
%result = stream.async.dispatch @ex_borrowed_readonly::@read_only(%clone[%c0 to %c100 for %c100]) : (!stream.resource<external>{%c100}) -> !stream.resource<external>{%c100}
348+
// CHECK: util.return %[[RESULT]]
349+
util.return %result : !stream.resource<external>
350+
}
351+
352+
// -----
353+
354+
// Tests that borrowed storage remains borrowed through a same-lifetime transfer
355+
// that topology analysis cannot remove. The transfer may move the storage
356+
// between devices, but it does not create an owned copy with a new lifetime.
357+
358+
module @test attributes {
359+
stream.topology = #hal.device.topology<links = [
360+
(@dev_a -> @dev_c = {})
361+
]>
362+
} {
363+
364+
// CHECK-LABEL: @borrowed_import_transfer_preserves_mutating_clone
365+
// CHECK-SAME: (%[[BUFFER:[^:]+]]: !util.buffer)
366+
util.func public @borrowed_import_transfer_preserves_mutating_clone(%arg0: !util.buffer) -> !stream.resource<external> {
367+
%c0 = arith.constant 0 : index
368+
%c100 = arith.constant 100 : index
369+
%c123_i32 = arith.constant 123 : i32
370+
// CHECK: %[[IMPORT:.+]] = stream.tensor.import on(#hal.device.promise<@dev_a>) %[[BUFFER]]
371+
%import = stream.tensor.import on(#hal.device.promise<@dev_a>) %arg0 : !util.buffer -> tensor<f32> in !stream.resource<external>{%c100}
372+
// CHECK: %[[TRANSFER:.+]] = stream.async.transfer %[[IMPORT]]
373+
// CHECK-SAME: from(#hal.device.promise<@dev_a>) -> to(#hal.device.promise<@dev_c>)
374+
%transfer = stream.async.transfer %import : !stream.resource<external>{%c100}
375+
from(#hal.device.promise<@dev_a>) -> to(#hal.device.promise<@dev_c>) !stream.resource<external>{%c100}
376+
// CHECK: %[[CLONE:.+]] = stream.async.clone %[[TRANSFER]]
377+
%clone = stream.async.clone %transfer : !stream.resource<external>{%c100} -> !stream.resource<external>{%c100}
378+
// CHECK: %[[FILL:.+]] = stream.async.fill %c123_i32, %[[CLONE]]
379+
%fill = stream.async.fill %c123_i32, %clone[%c0 to %c100 for %c100] : i32 -> %clone as !stream.resource<external>{%c100}
380+
// CHECK: util.return %[[FILL]]
381+
util.return %fill : !stream.resource<external>
382+
}
383+
384+
} // module

0 commit comments

Comments
 (0)