Skip to content
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
43c7c1c
[IRON] Reserve subclass slots for specialized FIFO primitives
Apr 25, 2026
0ff6514
[IRON] CascadeFifo — first-class cascade-stream primitive (Apache-2.0…
Apr 25, 2026
7da47f1
[IRON] AccumFifo — FP32 inter-tile accumulator state passing (AM020 C…
Apr 25, 2026
9d57bbc
[IRON] Worker.fn_args FifoHandle registry -- extensible type dispatch…
Apr 25, 2026
be7e7a3
[IRON] PacketFifo -- variable-rate packet-switched stream primitive
Apr 25, 2026
2ee45cc
[IRON] MemtileAggregator -- memtile-mediated 4-into-1 fan-in helper (…
Apr 25, 2026
96156d8
[IRON] SparseFifo — on-the-fly weight decompression
Apr 25, 2026
afc2980
[IRON] normalize *FifoHandle subclass contract
Apr 25, 2026
263bbd4
[IRON] VariableRateFifo — producer-side conditional-forward FIFO
Apr 27, 2026
779baa0
[IRON] Strip project-internal task references from __init__.py
Apr 27, 2026
0ecb976
[AIE] plumb SparseFifo discardable attrs to BD Enable_Compression bit
Apr 25, 2026
ad7b968
[AIE] ObjectFifoStatefulTransform: VariableRateFifo unroll-skip + att…
Apr 27, 2026
e25baa8
[IRON] Address review issues 4, 9, 10
Apr 27, 2026
1cd1060
[IRON] VariableRateFifo example: actually exercise discard(1)
Apr 27, 2026
2594626
[IRON] Address smaller review issues (registry brittleness, _Registry…
Apr 27, 2026
c14c5e2
[IRON] Document parent-constructor bypass in AccumFifoHandle / Packet…
Apr 27, 2026
e036b85
[IRON] PacketFifo: add end-to-end lowering tests against an aie.devic…
Apr 27, 2026
91789d1
[IRON] Address Copilot review feedback (strip-residue cleanup)
Apr 27, 2026
7d2a761
Update python/iron/sparse.py
matteius Apr 27, 2026
51decc8
[example] Add dispatch_overhead_bisector AIE2P diagnostic example
Apr 27, 2026
995685a
[IRON] Address Copilot round-2 review feedback
Apr 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 137 additions & 17 deletions lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -810,32 +810,41 @@ struct AIEObjectFifoStatefulTransformPass
}

/// Function used to create a Bd block.
///
/// Returns the newly created DMABDOp so the caller can decorate it with
/// dataflow-source-specific discardable attributes (e.g. the
/// SparseFifo (de)compression bit propagated from the originating
/// ObjectFifoCreateOp).
template <typename MyOp>
void createBd(OpBuilder &builder, LockOp acqLock, int acqMode,
LockAction acqLockAction, LockOp relLock, int relMode,
MyOp buff, int offset, int len, Block *succ,
BDDimLayoutArrayAttr dims, BDPadLayoutArrayAttr padDimensions,
std::optional<PacketInfoAttr> bdPacket) {
DMABDOp createBd(OpBuilder &builder, LockOp acqLock, int acqMode,
LockAction acqLockAction, LockOp relLock, int relMode,
MyOp buff, int offset, int len, Block *succ,
BDDimLayoutArrayAttr dims,
BDPadLayoutArrayAttr padDimensions,
std::optional<PacketInfoAttr> bdPacket) {
if (acqLock)
UseLockOp::create(builder, builder.getUnknownLoc(), acqLock,
acqLockAction, acqMode);
if (bdPacket) {
DMABDPACKETOp::create(builder, builder.getUnknownLoc(),
bdPacket->getPktType(), bdPacket->getPktId());
}
if (!dims.getValue().empty() && padDimensions) {
DMABDOp::create(builder, builder.getUnknownLoc(), buff, offset, len, dims,
padDimensions);
} else if (!dims.getValue().empty()) {
DMABDOp::create(builder, builder.getUnknownLoc(), buff, offset, len,
dims);
} else {
DMABDOp::create(builder, builder.getUnknownLoc(), buff, offset, len);
}
DMABDOp bdOp = [&]() {
if (!dims.getValue().empty() && padDimensions) {
return DMABDOp::create(builder, builder.getUnknownLoc(), buff, offset,
len, dims, padDimensions);
}
if (!dims.getValue().empty()) {
return DMABDOp::create(builder, builder.getUnknownLoc(), buff, offset,
len, dims);
}
return DMABDOp::create(builder, builder.getUnknownLoc(), buff, offset,
len);
}();
if (acqLock)
UseLockOp::create(builder, builder.getUnknownLoc(), relLock,
LockAction::Release, relMode);
NextBDOp::create(builder, builder.getUnknownLoc(), succ);
return bdOp;
}

/// Function used to create a Bd block.
Expand Down Expand Up @@ -880,8 +889,82 @@ struct AIEObjectFifoStatefulTransformPass
: state.locksPerFifo[op][prodLockIndex];
}
}
createBd(builder, acqLock, acqMode, acqLockAction, relLock, relMode, buff,
offset, len, succ, dims, padDimensions, bdPacket);
DMABDOp bdOp = createBd(builder, acqLock, acqMode, acqLockAction, relLock,
relMode, buff, offset, len, succ, dims,
padDimensions, bdPacket);

// Propagate the SparseFifo (de)compression intent from the
// originating ObjectFifoCreateOp onto the DMABDOp we just created so
// downstream BD-emit (AIEDMATasksToNPU -> AIEDmaToNpu) can flip the
// per-channel ``Enable_Compression`` bit on the AIE2/AIE2P tile DMA BD
// config word. See ``python/iron/sparse.py`` for the discardable-attr
// contract; see AM020 Ch. 2 p. 27 for the hardware bit. The default
// (no attrs on the ObjectFifoCreateOp) is the pre-existing behaviour:
// no discardable attr on the DMABDOp -> no compression bit set.
propagateSparseCompressionAttr(bdOp.getOperation(), op.getOperation(),
channelDir);
}

/// Forwards the SparseFifo (de)compression request from an
/// ObjectFifoCreateOp to a DMABDOp that was created on its behalf.
///
/// Reads the discardable BoolAttr on `fifoOp` that matches `channelDir`
/// (``aie.compress_mm2s`` for MM2S, ``aie.decompress_s2mm`` for S2MM; set
/// by ``aie.iron.sparse.SparseFifo.resolve``). If that attribute is present
/// and true AND the BD lives on a compute (AIE) tile, a discardable boolean
/// ``aie.enable_compression = true`` attribute is attached to `bdOp`.
/// In every other case `bdOp` is left untouched. Apart from the
/// compute-tile guard this is a plain hand-off: the SparseFifo lowering
/// already attaches the intent to the ObjectFifoCreateOp, and it is copied
/// to the DMABDOp here because the ObjectFifoCreateOp itself is about to
/// be erased.
///
/// Cross-module footgun (AM029 / aie_registers_aie2.json):
///   * Compute-tile MEMORY_MODULE DMA_BD0_1 bit 31 = Enable_Compression
///   * Memory-tile MEMORY_TILE_MODULE DMA_BD0_1 bits 31:26 = D0_Pad_Before
///   * Shim DMA: AM029 documents Enable_Compression for "AIE-ML
///     memory and AIE-ML tile DMA" only — not for shim.
/// Setting aie.enable_compression on a memtile or shim BD would
/// either silently corrupt an unrelated field (memtile) or set an
/// undocumented bit (shim). So we walk up to the BD's owning tile
/// and bail out unless it's a compute tile.
///
/// \param bdOp       The freshly created DMABDOp; a null pointer is a no-op.
/// \param fifoOp     The originating ObjectFifoCreateOp; a null pointer is a
///                   no-op.
/// \param channelDir Direction of the DMA channel the BD belongs to.
static void propagateSparseCompressionAttr(Operation *bdOp, Operation *fifoOp,
                                           DMAChannelDir channelDir) {
  if (!bdOp || !fifoOp)
    return;

  // Pick the half of the (compress_mm2s, decompress_s2mm) pair that applies
  // to this channel direction; any other direction value is ignored.
  const bool isMM2S = channelDir == DMAChannelDir::MM2S;
  if (!isMM2S && channelDir != DMAChannelDir::S2MM)
    return;
  StringRef requestAttrName =
      isMM2S ? "aie.compress_mm2s" : "aie.decompress_s2mm";

  BoolAttr request = fifoOp->getAttrOfType<BoolAttr>(requestAttrName);
  if (!request || !request.getValue())
    return;

  // Cross-module footgun guard: only a BD nested in a MemOp is considered.
  // BDs under a MemTileDMAOp or ShimDMAOp (where the bit means something
  // else or is undocumented), or under any other parent, are skipped.
  auto memOp = bdOp->getParentOfType<MemOp>();
  if (!memOp)
    return;
  TileOp owningTile = memOp.getTileOp();
  if (!owningTile)
    return;

  // Double-check the tile kind against the device's target model before
  // tagging the BD.
  auto device = owningTile->getParentOfType<DeviceOp>();
  if (!device)
    return;
  const auto &model = device.getTargetModel();
  const bool onComputeTile =
      !owningTile.isShimTile() &&
      !model.isMemTile(owningTile.getCol(), owningTile.getRow());
  if (!onComputeTile)
    return;

  bdOp->setAttr("aie.enable_compression",
                BoolAttr::get(bdOp->getContext(), true));
}

/// Function that either calls createAIETileDMA(), createShimDMA() or
Expand Down Expand Up @@ -1315,8 +1398,19 @@ struct AIEObjectFifoStatefulTransformPass
remainderMap[forLoop.getOperation()] = 0;
for (auto acqOp : body->getOps<ObjectFifoAcquireOp>()) {
if (acqOp.getOperation()->getParentOp() == forLoop) {
foundMap[forLoop.getOperation()] = true;
ObjectFifoCreateOp op = acqOp.getObjectFifo();
// VariableRateFifo opts out of LCM-based loop unrolling.
// The producer's loop body contains a conditional
// acquire/release that the LCM-unroll math cannot model;
// the runtime-counter machinery handles asymmetric rates
// correctly without unrolling. If the loop has only
// variable-rate accesses, foundMap stays false and the
// loop is left alone.
auto vrAttr = op->getAttrOfType<BoolAttr>("aie.variable_rate");
if (vrAttr && vrAttr.getValue()) {
continue;
}
foundMap[forLoop.getOperation()] = true;
objFifoSizes.insert(op.size());
}
}
Expand Down Expand Up @@ -1924,6 +2018,32 @@ struct AIEObjectFifoStatefulTransformPass
builder.getI32IntegerAttr(*bdChainIterCount));
}
replaceSplitFifo(createOp, consumerFifo, consumerTileOp);

// Copy the SparseFifo / VariableRateFifo discardable attrs from
// the original createOp to the new consumerFifo. Without this,
// the consumer-side ObjectFifoCreateOp is attr-less and the
// downstream propagateSparseCompressionAttr (called from
// createBdBlock for each consumer-side BD) finds no
// ``aie.decompress_s2mm`` to read and silently emits no
// ``aie.enable_compression`` on consumer-side S2MM BDs. Earlier
// testing did not catch the absence of this propagation; the lit
// test verified only the BD-emit pass's final hop given a
// hand-constructed NpuWriteBdOp{aie.enable_compression = true} and
// never drove the full pipeline through this split-fifo path. See
// tests/aie2p_microtests/dma_compression_loopback/ for the
// microtest that surfaced this gap.
for (StringRef attrName : {"aie.compress_mm2s",
"aie.decompress_s2mm",
"aie.sparsity_pattern",
"aie.sparsity_n",
"aie.sparsity_m",
// VariableRateFifo marker; read by
// unrollForLoops to exclude the fifo
// from LCM-based loop unrolling.
"aie.variable_rate"}) {
if (auto attr = createOp->getAttr(attrName))
consumerFifo->setAttr(attrName, attr);
}

if (createOp.getAieStream()) {
int streamEnd = createOp.getAieStream().value();
if (streamEnd > 0) {
Expand Down
18 changes: 17 additions & 1 deletion lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -522,7 +522,7 @@ struct AIEDMATasksToNPUPass
}
}

NpuWriteBdOp::create(
NpuWriteBdOp newBdOp = NpuWriteBdOp::create(
builder, bd_op.getLoc(), tile.getCol(), bd_id, len_addr_granularity,
offset,
/*enable_packet=*/enable_packet,
Expand All @@ -546,6 +546,22 @@ struct AIEDMATasksToNPUPass
/*d0_zero_after=*/padAfter[0], /*d1_zero_after=*/padAfter[1],
/*d2_zero_after=*/padAfter[2],
/*burst_length=*/bd_op.getBurstLength());

// The ObjectFifo lowering pass (AIEObjectFifoStatefulTransform) tags the
// source DMABDOp with ``aie.enable_compression = true`` when the
// originating ObjectFifoCreateOp carried the SparseFifo discardable
// attrs (``aie.compress_mm2s`` for MM2S, ``aie.decompress_s2mm`` for
// S2MM). Forward that tag onto the new NpuWriteBdOp: the BD-emit pass
// (AIEDmaToNpu) reads it back from the NpuWriteBdOp to flip the
// per-channel ``Enable_Compression`` bit on the AIE2/AIE2P tile DMA BD
// config word (AM020 Ch. 2 p. 27 + ``aie_registers_aie2.json``).
if (auto compAttr = bd_op->getAttrOfType<BoolAttr>(
"aie.enable_compression");
compAttr && compAttr.getValue()) {
newBdOp->setAttr("aie.enable_compression",
BoolAttr::get(newBdOp.getContext(), true));
}

return setAddressForSingleBD(builder, bd_op, tile);
}

Expand Down
20 changes: 19 additions & 1 deletion lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -652,7 +652,25 @@ struct WriteBdToBlockWritePattern : OpConversionPattern<NpuWriteBdOp> {
// DMA_BDX_1
// Enable_Compression [31], Enable_Packet [30], Out_Of_Order_BD_ID
// [29:24], Packet_ID [23:19], Packet_Type [18:16]
words[1] = 0; // Enable_Compression
//
// ``aie.enable_compression`` is set on the NpuWriteBdOp by
// ``AIEDMATasksToNPU::rewriteSingleBD``, which forwarded it from the
// source ``aie.dma_bd`` op, which in turn was tagged by
// ``AIEObjectFifoStatefulTransform::createBdBlock`` based on the
// originating ObjectFifoCreateOp's SparseFifo attrs
// (``aie.compress_mm2s`` on MM2S, ``aie.decompress_s2mm`` on S2MM;
// see ``python/iron/sparse.py``). Default (no attr) preserves the
// pre-existing behaviour: ``Enable_Compression = 0``. The bit is
// documented in ``aie_registers_aie2.json`` as "Enable Compression
// (MM2S), decompression (S2MM). Only effective if channel has
// (de)compression enabled" (AM020 Ch. 2 p. 27, compute-tile DMA).
uint32_t enableCompression = 0;
if (auto compAttr = op->getAttrOfType<BoolAttr>(
"aie.enable_compression");
compAttr && compAttr.getValue()) {
enableCompression = 1;
}
words[1] = (enableCompression & 0x1) << 31; // Enable_Compression
words[1] |= (op.getEnablePacket() & 0x1) << 30;
words[1] |= (op.getOutOfOrderId() & 0x3f) << 24;
words[1] |= (op.getPacketId() & 0x1f) << 19;
Expand Down
52 changes: 52 additions & 0 deletions programming_examples/basic/variable_rate_filter/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
##===- Makefile -----------------------------------------------------------===##
#
# This file licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# Copyright (C) 2026, Advanced Micro Devices, Inc.
#
# Mirrors the passthrough_kernel Makefile shape; the only
# difference is the kernel object filename and the topology .py.
#
##===----------------------------------------------------------------------===##

srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))

include ${srcdir}/../../makefile-common

devicename ?= $(if $(filter 1,$(NPU2)),npu2,npu)
targetname = variable_rate_filter

# Keep the size comments on their own lines: GNU make keeps the whitespace
# between a value and a trailing `#` comment as part of the value, which
# would split every target name built from ${in_size} into two words.

# Input size in bytes (multiple of 64; >= 512).
in_size = 4096
# Output size in bytes (>= in_size for worst-case 100%-forward).
out_size = 4096
CHESS ?= false

aie_py_src=${targetname}.py

.PHONY: all clean

all: build/final_${in_size}.xclbin

# Compile the filter kernel with Peano for the selected device. The source is
# referenced through ${srcdir} so that $< is still a valid path after the
# recipe changes into the build directory.
build/filter_first_byte_even.cc.o: ${srcdir}/filter_first_byte_even.cc
	mkdir -p ${@D}
ifeq ($(devicename),npu)
	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -c $< -o ${@F}
else ifeq ($(devicename),npu2)
	cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2P_FLAGS} -c $< -o ${@F}
else
	echo "Device type not supported" && exit 1
endif

# Generate the MLIR design from the Python topology script.
build/aie_${in_size}.mlir: ${srcdir}/${aie_py_src}
	mkdir -p ${@D}
	python3 $< -d ${devicename} -i1s ${in_size} -os ${out_size} > $@

# Build the xclbin and the NPU instruction stream. The recipe runs inside the
# build directory, hence the ../ prefix applied to the first prerequisite.
build/final_${in_size}.xclbin: build/aie_${in_size}.mlir build/filter_first_byte_even.cc.o
	mkdir -p ${@D}
	cd ${@D} && aiecc --aie-generate-xclbin --aie-generate-npu-insts \
		--no-xchesscc --no-xbridge \
		--xclbin-name=${@F} --npu-insts-name=insts_${in_size}.bin \
		$(<:%=../%)

clean:
	rm -rf build _build ${targetname}*.exe
Loading
Loading