Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -860,7 +860,18 @@ struct AIEObjectFifoStatefulTransformPass
int acqMode = 1;
int relMode = 1;
auto acqLockAction = LockAction::Acquire;
if (state.locksPerFifo[op].size() > 0) {
// Static-init producer cycled via iter_count > 1 with no upstream link:
// skip source-side locks. The BD chain restarts via the channel's
// task_count, but the per-BD lock state never gets replenished (no
// upstream S2MM refills the buffers) so the chain would deadlock on
// the second pass. Back-pressure to the downstream consumer is handled
// by the DMA stream's flow control; source-side locking is unnecessary
// for correctness in this configuration.
bool isCycledStaticInitProducer =
channelDir == DMAChannelDir::MM2S && op.getInitValues().has_value() &&
op.getIterCount().has_value() && op.getIterCount().value() > 1 &&
!getOptionalLinkOp(op).has_value();
if (state.locksPerFifo[op].size() > 0 && !isCycledStaticInitProducer) {
auto dev = op->getParentOfType<DeviceOp>();
if (auto &target = dev.getTargetModel();
target.getTargetArch() == AIEArch::AIE1) {
Expand Down
2 changes: 1 addition & 1 deletion programming_examples/vision/color_detect/color_detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def color_detect(dev, width, height):
# AIE-array data movement with object fifos
# Input
inOF_L3L2 = ObjectFifo(line_bytes_ty, name="inOF_L3L2")
inOF_L2L1 = inOF_L3L2.cons(6).forward(depth=6, name="inOF_L2L1")
inOF_L2L1 = inOF_L3L2.cons(6).forward(depth=2, name="inOF_L2L1")

# Output
outOF_L1L2 = ObjectFifo(line_bytes_ty, name="outOF_L1L2")
Expand Down
2 changes: 1 addition & 1 deletion programming_examples/vision/edge_detect/edge_detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def edge_detect(dev, width, height):
# AIE-array data movement with object fifos
# Input
inOF_L3L2 = ObjectFifo(line_bytes_ty, name="inOF_L3L2")
inOF_L2L1 = inOF_L3L2.cons(7).forward(depth=7, name="inOF_L2L1")
inOF_L2L1 = inOF_L3L2.cons(7).forward(depth=2, name="inOF_L2L1")

# Output
outOF_L1L2 = ObjectFifo(line_bytes_ty, name="outOF_L1L2")
Expand Down
32 changes: 17 additions & 15 deletions python/iron/dataflow/objectfifo.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,20 +253,6 @@ def _cons_tiles_ops(self) -> list[Tile]:
)
return [cons.endpoint.tile.op for cons in self._cons]

def _get_depths(self) -> int | list[int]:
if not self._prod:
raise ValueError(
"Cannot return depths since prod ObjectFifoHandle is not created."
)
if len(self._cons) == 0:
raise ValueError(
"Cannot return depths since no cons ObjectFifoHandles are created."
)
depths = [self._prod.depth] + [con.depth for con in self._cons]
if len(set(depths)) == 1:
return depths[0]
return depths

def _get_endpoint(self, is_prod: bool) -> list[ObjectFifoEndpoint]:
if is_prod:
if self._prod:
Expand All @@ -290,11 +276,27 @@ def resolve(
for con in self._cons
]

if not self._prod:
raise ValueError(
f"ObjectFifo {self.name}: producer handle not created."
)
if len(self._cons) == 0:
raise ValueError(
f"ObjectFifo {self.name}: no consumer handles created."
)
# Always emit the per-handle ArrayAttr [prod_depth, *cons_depths].
# Collapsing to a single int when all are equal triggers the
# stateful-transform's auto-minimize path, which sizes each
# consumer's ping-pong from max-acquire instead of honoring the
# declared depth — silently deadlocking multi-consumer fanout
# designs where one consumer must buffer ahead of the others.
depths = [self._prod.depth] + [con.depth for con in self._cons]

self._op = object_fifo(
self.name,
self._prod_tile_op(),
self._cons_tiles_ops(),
self._get_depths(),
depths,
np_ndarray_type_to_memref_type(self._obj_type),
dimensionsToStream=self._dims_to_stream,
dimensionsFromStreamPerConsumer=dims_from_stream_per_cons,
Expand Down
89 changes: 89 additions & 0 deletions test/python/objFifo_iron_multi_cons_depth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2026 AMD Inc.

# RUN: %python %s | FileCheck %s

import numpy as np

from aie.iron import ObjectFifo, Program, Runtime, Worker
from aie.iron.controlflow import range_
from aie.iron.device import NPU1Col1, Tile

# Regression for: IRON ObjectFifo collapsed all-equal per-handle depths to a
# single int when emitting `aie.objectfifo`. The
# `aie-objectFifo-stateful-transform` lowering interprets a single-int
# `elemNumber` as "producer depth only" and auto-sizes each consumer-side
# buffer from max-acquire, silently dropping below the user's declared depth.
# For multi-consumer fanout with uneven acquire patterns (one consumer must
# buffer ahead of a peer that's waiting on upstream data), this deadlocks at
# runtime.
#
# Fix: always emit the per-handle ArrayAttr `[prod_depth, *cons_depths]`
# so the lowering uses each declared depth directly, even when all values
# match. Applies uniformly to every ObjectFifo (1-cons and N-cons).


# CHECK-DAG: aie.objectfifo @of_multi({{.*}}, [4 : i32, 4 : i32, 4 : i32]) : !aie.objectfifo<memref<16xi32>>
# CHECK-DAG: aie.objectfifo @of_a_out({{.*}}, [2 : i32, 2 : i32]) : !aie.objectfifo<memref<16xi32>>
# CHECK-DAG: aie.objectfifo @of_b_out({{.*}}, [2 : i32, 2 : i32]) : !aie.objectfifo<memref<16xi32>>
def test_objectfifo_multi_consumer_depth_array():
    """Print a fan-out design so FileCheck can verify per-handle depth arrays.

    One producer feeds two consumers through ``of_multi`` (depth 4); each
    consumer forwards into its own depth-2 ObjectFifo (``of_a_out`` and
    ``of_b_out``) that the runtime sequence drains. The resolved module is
    printed to stdout, where the CHECK-DAG directives preceding this function
    expect an explicit depth array on all three ObjectFifos, including the
    single-consumer ones.
    """
    iterations = 4

    dev = NPU1Col1()
    elem_ty = np.ndarray[(16,), np.dtype[np.int32]]

    of_multi = ObjectFifo(elem_ty, depth=4, name="of_multi")
    of_a_out = ObjectFifo(elem_ty, depth=2, name="of_a_out")
    of_b_out = ObjectFifo(elem_ty, depth=2, name="of_b_out")

    def prod_body(out_handle):
        for _ in range_(iterations):
            out_handle.acquire(1)
            out_handle.release(1)

    def cons_a_body(in_handle, out_handle):
        # Only one element is ever held at a time, although the fifo was
        # declared with depth 4; the declared depth must still be emitted.
        for _ in range_(iterations):
            in_handle.acquire(1)
            out_handle.acquire(1)
            in_handle.release(1)
            out_handle.release(1)

    def cons_b_body(in_handle, out_handle):
        for _ in range_(iterations):
            in_handle.acquire(1)
            out_handle.acquire(1)
            in_handle.release(1)
            out_handle.release(1)

    # Handles are created in the same order as before: the producer first,
    # then consumer A, then consumer B.
    prod_args = [of_multi.prod()]
    w_prod = Worker(prod_body, fn_args=prod_args, tile=Tile(0, 2))

    cons_a_args = [of_multi.cons(), of_a_out.prod()]
    w_cons_a = Worker(cons_a_body, fn_args=cons_a_args, tile=Tile(0, 3))

    cons_b_args = [of_multi.cons(), of_b_out.prod()]
    w_cons_b = Worker(cons_b_body, fn_args=cons_b_args, tile=Tile(0, 4))

    rt = Runtime()
    host_ty = np.ndarray[(16,), np.dtype[np.int32]]
    with rt.sequence(host_ty, host_ty) as (out_a, out_b):
        rt.start(w_prod, w_cons_a, w_cons_b)
        rt.drain(of_a_out.cons(), out_a, wait=True)
        rt.drain(of_b_out.cons(), out_b, wait=True)

    # FileCheck reads the textual module from stdout.
    print(Program(dev, rt).resolve_program())


# Script entry point: the lit RUN line executes this file with %python and
# pipes its stdout into FileCheck, so the test must run when invoked directly.
if __name__ == "__main__":
    test_objectfifo_multi_consumer_depth_array()
Loading