Skip to content

Commit fc6772e

Browse files
committed
[IRON] ObjectFifo: always emit ArrayAttr per-handle depths
`_get_depths()` collapsed all-equal per-handle depths to a single int. The stateful-transform lowering interprets a single-int elemNumber as "producer depth only" and auto-sizes each consumer-side ping-pong buffer from its max-acquire count (via findObjectFifoSize), so consumer buffers can silently end up smaller than the user's declared depth. In a multi-consumer fanout with uneven acquire patterns — one consumer needs to buffer ahead while its peers wait on upstream data — this back-pressures the producer DMA and deadlocks at runtime.

Fix: delete `_get_depths()` and build the per-handle list [prod_depth, *cons_depths] inline at the resolve() call site. Single-consumer and multi-consumer ObjectFifos are handled uniformly; the lowering always goes through the ArrayAttr branch, which honors each declared depth directly.

Regression test: test/python/objFifo_iron_multi_cons_depth.py — a multi-consumer fanout with all depths equal must emit [4 : i32, 4 : i32, 4 : i32], not a single 4 : i32.
1 parent f446ca5 commit fc6772e

2 files changed

Lines changed: 106 additions & 15 deletions

File tree

python/iron/dataflow/objectfifo.py

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -253,20 +253,6 @@ def _cons_tiles_ops(self) -> list[Tile]:
253253
)
254254
return [cons.endpoint.tile.op for cons in self._cons]
255255

256-
def _get_depths(self) -> int | list[int]:
257-
if not self._prod:
258-
raise ValueError(
259-
"Cannot return depths since prod ObjectFifoHandle is not created."
260-
)
261-
if len(self._cons) == 0:
262-
raise ValueError(
263-
"Cannot return depths since no cons ObjectFifoHandles are created."
264-
)
265-
depths = [self._prod.depth] + [con.depth for con in self._cons]
266-
if len(set(depths)) == 1:
267-
return depths[0]
268-
return depths
269-
270256
def _get_endpoint(self, is_prod: bool) -> list[ObjectFifoEndpoint]:
271257
if is_prod:
272258
if self._prod:
@@ -290,11 +276,27 @@ def resolve(
290276
for con in self._cons
291277
]
292278

279+
if not self._prod:
280+
raise ValueError(
281+
f"ObjectFifo {self.name}: producer handle not created."
282+
)
283+
if len(self._cons) == 0:
284+
raise ValueError(
285+
f"ObjectFifo {self.name}: no consumer handles created."
286+
)
287+
# Always emit the per-handle ArrayAttr [prod_depth, *cons_depths].
288+
# Collapsing to a single int when all are equal triggers the
289+
# stateful-transform's auto-minimize path, which sizes each
290+
# consumer's ping-pong from max-acquire instead of honoring the
291+
# declared depth — silently deadlocking multi-consumer fanout
292+
# designs where one consumer must buffer ahead of the others.
293+
depths = [self._prod.depth] + [con.depth for con in self._cons]
294+
293295
self._op = object_fifo(
294296
self.name,
295297
self._prod_tile_op(),
296298
self._cons_tiles_ops(),
297-
self._get_depths(),
299+
depths,
298300
np_ndarray_type_to_memref_type(self._obj_type),
299301
dimensionsToStream=self._dims_to_stream,
300302
dimensionsFromStreamPerConsumer=dims_from_stream_per_cons,
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
2+
# See https://llvm.org/LICENSE.txt for license information.
3+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4+
#
5+
# (c) Copyright 2026 AMD Inc.
6+
7+
# RUN: %python %s | FileCheck %s
8+
9+
import numpy as np
10+
11+
from aie.iron import ObjectFifo, Program, Runtime, Worker
12+
from aie.iron.controlflow import range_
13+
from aie.iron.device import NPU1Col1, Tile
14+
15+
# Regression for: IRON ObjectFifo collapsed all-equal per-handle depths to a
16+
# single int when emitting `aie.objectfifo`. The `aie-objectFifo-stateful-
17+
# transform` lowering interprets a single-int `elemNumber` as "producer
18+
# depth only" and auto-sizes each consumer-side buffer from max-acquire,
19+
# silently dropping below the user's declared depth. For multi-consumer
20+
# fanout with uneven acquire patterns (one consumer must buffer ahead of a
21+
# peer that's waiting on upstream data), this deadlocks at runtime.
22+
#
23+
# Fix: always emit the per-handle ArrayAttr `[prod_depth, *cons_depths]`
24+
# so the lowering uses each declared depth directly, even when all values
25+
# match. Applies uniformly to every ObjectFifo (1-cons and N-cons).
26+
27+
28+
# CHECK-DAG: aie.objectfifo @of_multi({{.*}}, [4 : i32, 4 : i32, 4 : i32]) : !aie.objectfifo<memref<16xi32>>
29+
# CHECK-DAG: aie.objectfifo @of_a_out({{.*}}, [2 : i32, 2 : i32]) : !aie.objectfifo<memref<16xi32>>
30+
# CHECK-DAG: aie.objectfifo @of_b_out({{.*}}, [2 : i32, 2 : i32]) : !aie.objectfifo<memref<16xi32>>
31+
def test_objectfifo_multi_consumer_depth_array():
    """Build a one-producer, two-consumer fanout and print the resolved module.

    ``of_multi`` is declared with depth 4 and read by two consumer workers;
    each consumer forwards into its own depth-2 ObjectFifo (``of_a_out`` and
    ``of_b_out``). The printed module is matched by the FileCheck directives
    preceding this function, which expect a per-handle depth array on every
    ``aie.objectfifo`` — for the fanout fifo and for the two
    single-consumer fifos alike.
    """

    device = NPU1Col1()
    elem_ty = np.ndarray[(16,), np.dtype[np.int32]]

    of_multi = ObjectFifo(elem_ty, depth=4, name="of_multi")
    of_a_out = ObjectFifo(elem_ty, depth=2, name="of_a_out")
    of_b_out = ObjectFifo(elem_ty, depth=2, name="of_b_out")

    def prod_body(fifo_out):
        for _ in range_(4):
            fifo_out.acquire(1)
            fifo_out.release(1)

    def cons_a_body(fifo_in, fifo_out):
        # Only one element is ever acquired at a time, although the fifo was
        # declared with depth 4; the pre-fix lowering would have shrunk this
        # consumer's buffering to a ping-pong of 2.
        for _ in range_(4):
            fifo_in.acquire(1)
            fifo_out.acquire(1)
            fifo_in.release(1)
            fifo_out.release(1)

    def cons_b_body(fifo_in, fifo_out):
        for _ in range_(4):
            fifo_in.acquire(1)
            fifo_out.acquire(1)
            fifo_in.release(1)
            fifo_out.release(1)

    # Handles are requested immediately before the worker that uses them so
    # the prod()/cons() calls happen in the same sequence as before.
    multi_prod = of_multi.prod()
    producer = Worker(prod_body, fn_args=[multi_prod], tile=Tile(0, 2))

    multi_cons_a = of_multi.cons()
    a_out_prod = of_a_out.prod()
    consumer_a = Worker(
        cons_a_body,
        fn_args=[multi_cons_a, a_out_prod],
        tile=Tile(0, 3),
    )

    multi_cons_b = of_multi.cons()
    b_out_prod = of_b_out.prod()
    consumer_b = Worker(
        cons_b_body,
        fn_args=[multi_cons_b, b_out_prod],
        tile=Tile(0, 4),
    )

    rt = Runtime()
    host_ty = np.ndarray[(16,), np.dtype[np.int32]]
    with rt.sequence(host_ty, host_ty) as (out_a, out_b):
        rt.start(producer, consumer_a, consumer_b)
        rt.drain(of_a_out.cons(), out_a, wait=True)
        rt.drain(of_b_out.cons(), out_b, wait=True)

    program = Program(device, rt)
    module = program.resolve_program()
    # The printed IR goes to stdout, where the RUN line pipes it to FileCheck.
    print(module)
86+
87+
88+
if __name__ == "__main__":
    # The lit RUN line executes this file as a script and pipes stdout to
    # FileCheck, so the test has to be invoked explicitly here.
    test_objectfifo_multi_consumer_depth_array()

0 commit comments

Comments
 (0)