Skip to content

Commit 603e81d

Browse files
authored
Merge branch 'main' into update-llvm-version
2 parents fa55d83 + e368f3e commit 603e81d

5 files changed

Lines changed: 226 additions & 46 deletions

File tree

docs/Presentations.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
</tr>
99
</thead>
1010
<tbody>
11+
<tr>
12+
<td style="vertical-align: middle;">ASPLOS 2026 Workshop - IRON AI Engine API for Ryzen AI NPU</td>
13+
<td style="vertical-align: middle;"><a href="https://www.amd.com/content/dam/amd/en/documents/solutions/ai/iron-aie-api-for-ryzen-ai-npu-tutorial-asplos-2026.pdf"><img src="https://xilinx.github.io/xup_aie_training/images/pdf.png" alt="alt text" /></a></td>
14+
</tr>
1115
<tr>
1216
<td style="vertical-align: middle;">ISCA 2025 Workshop - Leveraging the IRON AI Engine API to Program the Ryzen AI NPU</td>
1317
<td style="vertical-align: middle;"><a href="https://www.amd.com/content/dam/amd/en/documents/products/processors/ryzen/ai/iron-for-ryzen-ai-tutorial-isca-2025.pdf"><img src="https://xilinx.github.io/xup_aie_training/images/pdf.png" alt="alt text" /></a></td>

programming_examples/ml/resnet/layers_conv2_x/resnet.py

Lines changed: 37 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -204,19 +204,6 @@
204204
conv2dk1_skip_ui8,
205205
]
206206

207-
# runtime parameters
208-
rtp = []
209-
for i in range(3):
210-
rtp.append([])
211-
for j in range(2, 6):
212-
rtp[i].append(
213-
Buffer(
214-
np.ndarray[(16,), np.dtype[np.int32]],
215-
name=f"rtpComputeTile{i}{j}",
216-
use_write_rtp=True,
217-
)
218-
)
219-
220207
# Cores - we move in a snake-like pattern, that depends on
221208
# shared memory between neighbors, so we'll explicitly place all cores
222209
cores = [
@@ -225,6 +212,23 @@
225212
[Tile(2, 2), Tile(2, 3), Tile(2, 4), Tile(2, 5)],
226213
]
227214

215+
216+
# Runtime parameters: one RTP buffer per worker that reads RTPs.
217+
# Only conv1_fn (cores[i][0]) and conv1_skip_fn (cores[i][2]) use RTPs;
218+
# conv2_fn workers hard-code scale=1 and need no buffer.
219+
def make_rtp(col, row):
220+
return Buffer(
221+
np.ndarray[(16,), np.dtype[np.int32]],
222+
name=f"rtpComputeTile{col}{row}",
223+
use_write_rtp=True,
224+
)
225+
226+
227+
# rtp_conv1[i] -> buffer for conv1_fn worker in column i
228+
# rtp_conv1_skip[i] -> buffer for conv1_skip_fn worker in column i
229+
rtp_conv1 = [make_rtp(cores[i][0].col, cores[i][0].row) for i in range(n_cols)]
230+
rtp_conv1_skip = [make_rtp(cores[i][2].col, cores[i][2].row) for i in range(n_cols)]
231+
228232
# input tensor (with broadcast for skip connection)
229233
act1_fifo_names = ["act1_00_02_01", "act1_04_15_11", "act1_13_22_21"]
230234
act1_fifos = []
@@ -467,18 +471,17 @@ def conv1_skip_fn(
467471
# Create workers and place each one on a particular compute core
468472
workers = []
469473
for i in range(n_cols):
470-
placement = cores[i][0]
471474
w = Worker(
472475
conv1_fn,
473476
[
474477
wts_sub_fifos[i][0].cons(),
475478
act1_fifos[i].cons(),
476479
act2_fifos[i].prod(),
477480
conv1_kernels_call[i],
478-
rtp[placement.col][placement.row - 2],
481+
rtp_conv1[i],
479482
i,
480483
],
481-
placement=placement,
484+
placement=cores[i][0],
482485
)
483486
workers.append(w)
484487
w = Worker(
@@ -493,11 +496,6 @@ def conv1_skip_fn(
493496
placement=cores[i][1],
494497
)
495498
workers.append(w)
496-
placement = cores[i][2]
497-
if i == 0:
498-
skip_rtp = rtp[0][3]
499-
else:
500-
skip_rtp = rtp[placement.col][placement.row - 2]
501499
w = Worker(
502500
conv1_skip_fn,
503501
[
@@ -507,10 +505,10 @@ def conv1_skip_fn(
507505
conv3_out_fifos[i].prod(),
508506
skip_fifos[i].cons(),
509507
conv3_kernels_call[i],
510-
skip_rtp,
508+
rtp_conv1_skip[i],
511509
i,
512510
],
513-
placement=placement,
511+
placement=cores[i][2],
514512
stack_size=0xA00,
515513
)
516514
workers.append(w)
@@ -535,29 +533,22 @@ def conv1_skip_fn(
535533
outputToL3,
536534
):
537535

538-
# Set runtime parameters
539-
def set_rtps(rtp):
540-
# Only set RTPs for tiles that actually read them (conv1_fn and conv1_skip_fn
541-
# workers). conv2_fn workers use a hardcoded scale=1 and have no RTP arg,
542-
# so their corresponding buffers are never placed/resolved.
543-
544-
# col 0: conv1_fn at Tile(0,2) → rtp[0][0]; conv1_skip_fn at Tile(0,4) → rtp[0][3]
545-
rtp[0][0][0] = 1
546-
rtp[0][3][0] = 1
547-
rtp[0][3][1] = 0
548-
rtp[0][3][2] = 1
549-
550-
# col 1: conv1_fn at Tile(1,5) → rtp[1][3]; conv1_skip_fn at Tile(1,3) → rtp[1][1]
551-
rtp[1][3][0] = 1
552-
rtp[1][1][0] = 1
553-
rtp[1][1][1] = 0
554-
555-
# col 2: conv1_fn at Tile(2,2) → rtp[2][0]; conv1_skip_fn at Tile(2,4) → rtp[2][2]
556-
rtp[2][0][0] = 1
557-
rtp[2][2][0] = 1
558-
rtp[2][2][1] = 0
559-
560-
rt.inline_ops(set_rtps, [rtp])
536+
# Set runtime parameters for conv1_fn workers (scale)
537+
# and conv1_skip_fn workers (scale, skipScale, [skipConvScale for col 0])
538+
def set_rtps(rtp_conv1, rtp_conv1_skip):
539+
rtp_conv1[0][0] = 1 # col 0 conv1 scale
540+
rtp_conv1[1][0] = 1 # col 1 conv1 scale
541+
rtp_conv1[2][0] = 1 # col 2 conv1 scale
542+
543+
rtp_conv1_skip[0][0] = 1 # col 0 skip scale
544+
rtp_conv1_skip[0][1] = 0 # col 0 skipScale
545+
rtp_conv1_skip[0][2] = 1 # col 0 skipConvScale (init only)
546+
rtp_conv1_skip[1][0] = 1 # col 1 skip scale
547+
rtp_conv1_skip[1][1] = 0 # col 1 skipScale
548+
rtp_conv1_skip[2][0] = 1 # col 2 skip scale
549+
rtp_conv1_skip[2][1] = 0 # col 2 skipScale
550+
551+
rt.inline_ops(set_rtps, [rtp_conv1, rtp_conv1_skip])
561552

562553
# Start workers
563554
rt.start(*workers)

python/iron/runtime/task.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from ... import ir # type: ignore
1111

12+
from ..buffer import Buffer
1213
from ..resolvable import Resolvable
1314
from ..worker import Worker
1415
from .taskgroup import RuntimeTaskGroup
@@ -77,9 +78,20 @@ def __init__(
7778
self._args = args
7879
RuntimeTask.__init__(self, task_group)
7980

81+
@staticmethod
82+
def _resolve_buffers(obj, loc, ip):
83+
"""Recursively resolve any Buffer instances found in obj (handles nested lists/tuples)."""
84+
if isinstance(obj, Buffer):
85+
obj.resolve(loc=loc, ip=ip)
86+
elif isinstance(obj, (list, tuple)):
87+
for item in obj:
88+
InlineOpRuntimeTask._resolve_buffers(item, loc, ip)
89+
8090
def resolve(
8191
self,
8292
loc: ir.Location | None = None,
8393
ip: ir.InsertionPoint | None = None,
8494
) -> None:
95+
for arg in self._args:
96+
InlineOpRuntimeTask._resolve_buffers(arg, loc, ip)
8597
self._fn(*self._args)

test/python/buffer_resolution.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
# Copyright (C) 2025, Advanced Micro Devices, Inc.
2+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3+
4+
# RUN: %python %s | FileCheck %s
5+
6+
"""Regression tests for Buffer placement and resolution behaviour.
7+
8+
Covers the bug reported in https://github.com/Xilinx/mlir-aie/issues/3011:
9+
- A Buffer passed to a Worker is placed automatically by the placer and
10+
resolved before inline_ops callbacks fire, so indexing inside the callback
11+
works correctly.
12+
- A Buffer that is created but never given to any Worker has no tile and
13+
therefore cannot be resolved; InlineOpRuntimeTask must raise a clear
14+
ValueError rather than a confusing AttributeError from __setitem__.
15+
- Multiple RTP buffers (one per worker) in a list can all be written inside a
16+
single inline_ops callback, reflecting the common RTP-initialisation pattern
17+
seen in ML examples such as resnet layers_conv2_x.
18+
"""
19+
20+
import numpy as np
21+
22+
from aie.iron import Buffer, ObjectFifo, Program, Runtime, Worker
23+
from aie.iron.placers import SequentialPlacer
24+
from aie.iron.device import NPU1Col1, NPU2
25+
26+
rtp_ty = np.ndarray[(16,), np.dtype[np.int32]]
27+
data_ty = np.ndarray[(64,), np.dtype[np.int32]]
28+
29+
30+
# ---------------------------------------------------------------------------
31+
# Test 1: Buffer given to a Worker is resolved before inline_ops fires,
32+
# so element writes inside the callback produce correct rtp_write ops.
33+
# CHECK-LABEL: TEST: rtp_buffer_written_in_inline_ops
34+
# CHECK: aiex.npu.rtp_write(@my_rtp, 0, 7)
35+
# CHECK: aiex.npu.rtp_write(@my_rtp, 1, 3)
36+
# ---------------------------------------------------------------------------
37+
print("\nTEST: rtp_buffer_written_in_inline_ops")
38+
39+
of_in = ObjectFifo(data_ty, name="in")
40+
of_out = ObjectFifo(data_ty, name="out")
41+
rtp_buf = Buffer(rtp_ty, name="my_rtp", use_write_rtp=True)
42+
43+
44+
def core_fn(of_in, of_out, rtp):
45+
scale = rtp[0]
46+
elem_in = of_in.acquire(1)
47+
elem_out = of_out.acquire(1)
48+
of_in.release(1)
49+
of_out.release(1)
50+
51+
52+
worker = Worker(core_fn, [of_in.cons(), of_out.prod(), rtp_buf])
53+
54+
rt = Runtime()
55+
with rt.sequence(data_ty, data_ty) as (inp, out):
56+
57+
def set_rtp(buf):
58+
buf[0] = 7
59+
buf[1] = 3
60+
61+
rt.inline_ops(set_rtp, [rtp_buf])
62+
rt.start(worker)
63+
rt.fill(of_in.prod(), inp)
64+
rt.drain(of_out.cons(), out, wait=True)
65+
66+
module = Program(NPU1Col1(), rt).resolve_program(SequentialPlacer())
67+
print(module)
68+
69+
70+
# ---------------------------------------------------------------------------
71+
# Test 2: Multiple RTP buffers (one per worker) in a list, all written in one
72+
# inline_ops callback — mirrors the resnet layers_conv2_x pattern.
73+
# CHECK-LABEL: TEST: multiple_rtp_buffers_in_inline_ops
74+
# CHECK: aiex.npu.rtp_write(@rtp_w0, 0, 1)
75+
# CHECK: aiex.npu.rtp_write(@rtp_w1, 0, 2)
76+
# CHECK: aiex.npu.rtp_write(@rtp_w2, 0, 3)
77+
# ---------------------------------------------------------------------------
78+
print("\nTEST: multiple_rtp_buffers_in_inline_ops")
79+
80+
n_workers = 3
81+
of_ins = [ObjectFifo(data_ty, name=f"in{i}") for i in range(n_workers)]
82+
of_outs = [ObjectFifo(data_ty, name=f"out{i}") for i in range(n_workers)]
83+
rtps = [Buffer(rtp_ty, name=f"rtp_w{i}", use_write_rtp=True) for i in range(n_workers)]
84+
85+
86+
def core_fn_rtp(of_in, of_out, rtp):
87+
scale = rtp[0]
88+
elem_in = of_in.acquire(1)
89+
elem_out = of_out.acquire(1)
90+
of_in.release(1)
91+
of_out.release(1)
92+
93+
94+
workers = [
95+
Worker(core_fn_rtp, [of_ins[i].cons(), of_outs[i].prod(), rtps[i]])
96+
for i in range(n_workers)
97+
]
98+
99+
rt2 = Runtime()
100+
with rt2.sequence(data_ty, data_ty, data_ty, data_ty, data_ty, data_ty) as (
101+
i0,
102+
i1,
103+
i2,
104+
o0,
105+
o1,
106+
o2,
107+
):
108+
109+
def set_rtps(rtps):
110+
rtps[0][0] = 1
111+
rtps[1][0] = 2
112+
rtps[2][0] = 3
113+
114+
rt2.inline_ops(set_rtps, [rtps])
115+
rt2.start(*workers)
116+
rt2.fill(of_ins[0].prod(), i0)
117+
rt2.fill(of_ins[1].prod(), i1)
118+
rt2.fill(of_ins[2].prod(), i2)
119+
rt2.drain(of_outs[0].cons(), o0, wait=True)
120+
rt2.drain(of_outs[1].cons(), o1, wait=True)
121+
rt2.drain(of_outs[2].cons(), o2, wait=True)
122+
123+
module2 = Program(NPU2(), rt2).resolve_program(SequentialPlacer())
124+
print(module2)
125+
126+
127+
# ---------------------------------------------------------------------------
128+
# Test 3: A Buffer never given to any Worker raises ValueError (not the
129+
# confusing AttributeError from __setitem__) when inline_ops fires.
130+
# This is the exact failure mode of GitHub issue #3011.
131+
# CHECK-LABEL: TEST: unplaced_buffer_in_inline_ops_raises
132+
# CHECK: PASSED
133+
# ---------------------------------------------------------------------------
134+
print("\nTEST: unplaced_buffer_in_inline_ops_raises")
135+
136+
of_in3 = ObjectFifo(data_ty, name="in3")
137+
of_out3 = ObjectFifo(data_ty, name="out3")
138+
placed_rtp = Buffer(rtp_ty, name="placed_rtp", use_write_rtp=True)
139+
orphan_rtp = Buffer(
140+
rtp_ty, name="orphan_rtp", use_write_rtp=True
141+
) # never given to a Worker
142+
143+
144+
def core_fn3(of_in, of_out, rtp):
145+
scale = rtp[0]
146+
elem_in = of_in.acquire(1)
147+
elem_out = of_out.acquire(1)
148+
of_in.release(1)
149+
of_out.release(1)
150+
151+
152+
worker3 = Worker(core_fn3, [of_in3.cons(), of_out3.prod(), placed_rtp])
153+
154+
rt3 = Runtime()
155+
with rt3.sequence(data_ty, data_ty) as (inp3, out3):
156+
157+
def write_both(placed, orphan):
158+
placed[0] = 1
159+
orphan[0] = 1 # orphan has no tile → resolving the inline_ops args should raise ValueError before this callback body runs
160+
161+
rt3.inline_ops(write_both, [placed_rtp, orphan_rtp])
162+
rt3.start(worker3)
163+
rt3.fill(of_in3.prod(), inp3)
164+
rt3.drain(of_out3.cons(), out3, wait=True)
165+
166+
try:
167+
Program(NPU1Col1(), rt3).resolve_program(SequentialPlacer())
168+
print("FAILED: expected ValueError but no exception was raised")
169+
except ValueError as e:
170+
assert "placed" in str(e).lower(), f"unexpected message: {e}"
171+
print("PASSED")
172+
except Exception as e:
173+
print(f"FAILED: expected ValueError, got {type(e).__name__}: {e}")

0 commit comments

Comments
 (0)