From 7bb6059b876c4482df2d5b2d66a9e14b54da6d15 Mon Sep 17 00:00:00 2001 From: andrej Date: Tue, 12 May 2026 15:51:18 -0600 Subject: [PATCH 1/7] add higher-level scratchpad parameter abstraction --- .gitignore | 1 + include/aie/Dialect/AIE/IR/AIEDialect.h | 11 + include/aie/Dialect/AIE/IR/AIEOps.td | 2 + include/aie/Dialect/AIEX/AIEUtils.h | 14 + include/aie/Dialect/AIEX/IR/AIEX.td | 99 +++++- include/aie/Dialect/AIEX/IR/AIEXAttrs.td | 9 + .../aie/Dialect/AIEX/Transforms/AIEXPasses.h | 4 + .../aie/Dialect/AIEX/Transforms/AIEXPasses.td | 29 ++ .../AIEToConfiguration/AIEToConfiguration.cpp | 8 +- lib/Dialect/AIE/IR/AIEDialect.cpp | 17 +- lib/Dialect/AIEX/IR/AIEXDialect.cpp | 24 +- .../AIEX/Transforms/AIECtrlPacketToDma.cpp | 4 +- .../AIEX/Transforms/AIEDMATasksToNPU.cpp | 8 + lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp | 9 + .../AIEX/Transforms/AIELowerParameters.cpp | 301 ++++++++++++++++++ .../AIEX/Transforms/AIEXToStandard.cpp | 3 + lib/Dialect/AIEX/Transforms/CMakeLists.txt | 1 + lib/Dialect/AIEX/Utils/AIEUtils.cpp | 45 +++ python/CMakeLists.txt | 22 ++ python/ParameterScratchpadModule.cpp | 37 +++ python/dialects/aiex.py | 28 ++ python/iron/__init__.py | 1 + python/iron/parameter.py | 91 ++++++ python/iron/program.py | 11 + python/iron/runtime/dmatask.py | 4 + python/iron/runtime/runtime.py | 50 ++- python/iron/worker.py | 3 + python/utils/parameter_scratchpad.py | 73 +++++ runtime_lib/test_lib/CMakeLists.txt | 6 +- runtime_lib/test_lib/parameter_scratchpad.h | 175 ++++++++++ test/Dialect/AIEX/invalid_parameters.mlir | 38 +++ test/npu-xrt/scratchpad_addr_offset/aie.mlir | 92 ++++++ test/npu-xrt/scratchpad_addr_offset/test.py | 89 ++++++ .../aie_design.py | 94 ++++++ .../scratchpad_addr_offset_python/test.py | 89 ++++++ test/npu-xrt/scratchpad_params/aie.mlir | 74 +++++ test/npu-xrt/scratchpad_params/run.lit | 9 + test/npu-xrt/scratchpad_params/test.cpp | 88 +++++ .../scratchpad_params_python/aie_design.py | 85 +++++ test/npu-xrt/scratchpad_params_python/test.py | 
94 ++++++ tools/aiecc/aiecc.cpp | 25 +- 41 files changed, 1849 insertions(+), 18 deletions(-) create mode 100644 lib/Dialect/AIEX/Transforms/AIELowerParameters.cpp create mode 100644 python/ParameterScratchpadModule.cpp create mode 100644 python/iron/parameter.py create mode 100644 python/utils/parameter_scratchpad.py create mode 100644 runtime_lib/test_lib/parameter_scratchpad.h create mode 100644 test/Dialect/AIEX/invalid_parameters.mlir create mode 100644 test/npu-xrt/scratchpad_addr_offset/aie.mlir create mode 100644 test/npu-xrt/scratchpad_addr_offset/test.py create mode 100644 test/npu-xrt/scratchpad_addr_offset_python/aie_design.py create mode 100644 test/npu-xrt/scratchpad_addr_offset_python/test.py create mode 100644 test/npu-xrt/scratchpad_params/aie.mlir create mode 100644 test/npu-xrt/scratchpad_params/run.lit create mode 100644 test/npu-xrt/scratchpad_params/test.cpp create mode 100644 test/npu-xrt/scratchpad_params_python/aie_design.py create mode 100644 test/npu-xrt/scratchpad_params_python/test.py diff --git a/.gitignore b/.gitignore index af53dc4e784..6d6dd699dc2 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ __pycache__ **.cmake include/**/Makefile lib/**/Makefile +**/*.prj/** CLAUDE.md /platforms/vck190_bare/petalinux/build diff --git a/include/aie/Dialect/AIE/IR/AIEDialect.h b/include/aie/Dialect/AIE/IR/AIEDialect.h index 3634165838b..9c1e3f78c1f 100644 --- a/include/aie/Dialect/AIE/IR/AIEDialect.h +++ b/include/aie/Dialect/AIE/IR/AIEDialect.h @@ -59,6 +59,17 @@ uint32_t getShimBurstLengthBytes(const AIE::AIETargetModel &tm, uint32_t getShimBurstLengthEncoding(const AIE::AIETargetModel &tm, uint32_t burstLength); +// Generate a symbol name guaranteed to be unique within the symbol table of +// `symbolTableOp`. Names are formed as "" for increasing n; the +// counter is advanced past the chosen value so repeated calls with the same +// counter remain efficient and produce distinct names. 
The returned name is +// not inserted into the symbol table; the caller is responsible for creating +// a symbol with that name before the next call (otherwise the same name will +// be returned again). +std::string generateUniqueSymbolName(mlir::Operation *symbolTableOp, + llvm::StringRef prefix, + unsigned &counter); + mlir::LogicalResult verifyOffsetSizeAndStrideOp(mlir::OffsetSizeAndStrideOpInterface op); diff --git a/include/aie/Dialect/AIE/IR/AIEOps.td b/include/aie/Dialect/AIE/IR/AIEOps.td index 1e41135d5b0..ea1307f53d6 100644 --- a/include/aie/Dialect/AIE/IR/AIEOps.td +++ b/include/aie/Dialect/AIE/IR/AIEOps.td @@ -1011,6 +1011,8 @@ def AIE_DMABDOp: AIE_Op<"dma_bd", []> { OptionalAttr:$bd_id, OptionalAttr:$packet, DefaultValuedOptionalAttr:$burst_length, + // if set, the aiex.parameter that will override the BD's address + OptionalAttr:$offset_parameter, // should never be assigned by user... OptionalAttr:$next_bd_id ); diff --git a/include/aie/Dialect/AIEX/AIEUtils.h b/include/aie/Dialect/AIEX/AIEUtils.h index 97d98586e09..7a38a344e54 100644 --- a/include/aie/Dialect/AIEX/AIEUtils.h +++ b/include/aie/Dialect/AIEX/AIEUtils.h @@ -36,5 +36,19 @@ struct SubviewTraceResult { // // This function checks that all subviews remain static and contiguous. std::optional traceSubviewToBlockArgument(Value value); + +// Emit an `aiex.npu.update_from_scratchpad` op that adds the runtime offset +// (held in the scratchpad slot referenced by `bdOp`'s `offset_parameter` / +// `offset_state_table_idx` attributes, multiplied by the element size of +// `bufType`) into the BD address register at `registerAddr`. +// +// `bdOp` must carry both the `offset_parameter` (FlatSymbolRefAttr pointing at +// an `aiex.parameter`) and `offset_state_table_idx` (IntegerAttr, set by +// `--aie-lower-parameters`) attributes. The referenced parameter must have +// type `i32`. 
+LogicalResult +emitUpdateBdAddressFromOffsetParameter(OpBuilder &builder, Operation *bdOp, + BaseMemRefType bufType, + uint64_t registerAddr); } } // namespace xilinx \ No newline at end of file diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td index 476bfacb0ce..56fd72d33f8 100644 --- a/include/aie/Dialect/AIEX/IR/AIEX.td +++ b/include/aie/Dialect/AIEX/IR/AIEX.td @@ -690,7 +690,9 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [ DefaultValuedOptionalAttr:$d0_zero_after, DefaultValuedOptionalAttr:$d1_zero_after, DefaultValuedOptionalAttr:$d2_zero_after, - DefaultValuedOptionalAttr:$burst_length + DefaultValuedOptionalAttr:$burst_length, + // if set, the aiex.parameter whose value is added as an offset to the BD's address + OptionalAttr:$offset_parameter ); let assemblyFormat = [{ @@ -1390,6 +1392,101 @@ def AIEX_SetLockOp: AIEX_Op<"set_lock", [HasParent<"AIE::RuntimeSequenceOp">, Sk }]; } +//===----------------------------------------------------------------------===// +// Parameter ops +//===----------------------------------------------------------------------===// + +def AIEX_ParameterOp: AIEX_Op<"parameter", [Symbol]> { + let summary = "Declare a scratchpad runtime parameter"; + let description = [{ + Declares a named runtime parameter that can be set from the host by writing to DDR and read by AIE cores using `aiex.read_parameter`. + Parameters are communicated via the scratchpad memory mechanism (CREATE_SCRATCHPAD + UPDATE_FROM_SCRATCHPAD firmware opcodes). + + `aiex.parameter` ops are declared at **module scope** (outside any `aie.device`). + The scratchpad is a single hardware resource shared by all PDIs loaded by a + runtime sequence, so parameters are global to the whole module and may be + referenced from any device. + + Alternatively, parameters can be used to offset BD addresses by referencing them in the `offset_parameter` attribute of `aie.dma_bd` and `aiex.npu.dma_memcpy_nd`. + The two kinds of use are exclusive.
If used this way, they cannot also be read from the cores. + If used as an address offset on a BD, the parameter value is a count of elements: it is multiplied by the BD's element size to obtain the byte offset. + + Each parameter occupies one StateTable entry (4 bytes in the scratchpad) in DDR. + The `--aie-lower-parameters` pass assigns the `state_table_idx` and the `kind` attribute (derived from the parameter's usage: + `core` if read by a core via `aiex.read_parameter`, `addr` if used as a DMA offset via the `offset_parameter` attribute on a DMA op). + Indices are unique across the entire module. + + The `type` attribute specifies the data type of the parameter (bf16, f32, or an integer type up to i32). + For `kind == addr`, the `type` must be `i32`. + The actual encoding uses a 30-bit value range due to the firmware's 2-bit masking. + + Example (at module scope): + ```mlir + aiex.parameter @foo : i32 + aiex.parameter @bar : bf16 + aie.device(npu2) { ... } + ``` + }]; + let arguments = (ins + SymbolNameAttr:$sym_name, + TypeAttr:$type, + // assigned by `--aie-lower-parameters` pass: + OptionalAttr:$state_table_idx, + OptionalAttr:$kind + ); + let results = (outs); + let assemblyFormat = [{ $sym_name `:` $type attr-dict }]; +} + +def AIEX_ReadParameterOp: AIEX_Op<"read_parameter", []> { + let summary = "Read a scratchpad runtime parameter value on an AIE core"; + let description = [{ + Reads, from within an `aie.core`, a runtime parameter previously declared with `aiex.parameter`. + You must first synchronize the scratchpad to the core buffers from the runtime sequence using `aiex.sync_parameters_from_host`. + + The `--aie-lower-parameters` pass creates an `aie.buffer` on the core for each unique parameter read, and then replaces each instance of this op with: + 1. A `memref.load` from that buffer + 2.
Arithmetic to decode the value (right-shift by 2 to undo firmware masking) + + Example: + ```mlir + %val = aiex.read_parameter @foo : i32 + ``` + }]; + let arguments = (ins + FlatSymbolRefAttr:$parameter, + // assigned by `--aie-lower-parameters` pass: + OptionalAttr:$buffer + ); + let results = (outs AnyType:$result); + let assemblyFormat = [{ $parameter `:` type($result) attr-dict }]; + let hasVerifier = 1; +} + +def AIEX_SyncParametersFromHostOp: AIEX_Op<"sync_parameters_from_host", + [HasParent<"AIE::RuntimeSequenceOp">]> { + let summary = "Sync all parameters from host scratchpad to core buffers"; + let description = [{ + Lowers to: + 1. `aiex.npu.create_scratchpad` with size = 4 * num_parameters + 2. For each parameter of kind `core` with a destination aie.buffer on a core: + a. `aiex.npu.write32` to zero-out the target buffer + b. `aiex.npu.update_from_scratchpad` with func=incr, func_arg=0 + 3. For each parameter of kind `addr`: + a. `aiex.npu.update_from_scratchpad` that adds the parameter value as an offset to the DMA BD's address + + Can only be used inside `aie.runtime_sequence`.
+ + Example: + ```mlir + aiex.sync_parameters_from_host() + ``` + }]; + let arguments = (ins); + let results = (outs); + let assemblyFormat = [{ attr-dict }]; +} + // Include CERT operations include "aie/Dialect/AIEX/IR/CERTOps.td" diff --git a/include/aie/Dialect/AIEX/IR/AIEXAttrs.td b/include/aie/Dialect/AIEX/IR/AIEXAttrs.td index 437d0bb29cc..7b18030e9d2 100644 --- a/include/aie/Dialect/AIEX/IR/AIEXAttrs.td +++ b/include/aie/Dialect/AIEX/IR/AIEXAttrs.td @@ -23,4 +23,13 @@ def StateTableFunc : I32EnumAttr<"StateTableFunc", let cppNamespace = "::xilinx::AIEX"; } +def ParameterKindCore : I32EnumAttrCase<"Core", 0, "core">; +def ParameterKindAddr : I32EnumAttrCase<"Addr", 1, "addr">; + +def ParameterKind : I32EnumAttr<"ParameterKind", + "Usage kind of a scratchpad runtime parameter", + [ParameterKindCore, ParameterKindAddr]> { + let cppNamespace = "::xilinx::AIEX"; +} + #endif // AIEX_ATTRS diff --git a/include/aie/Dialect/AIEX/Transforms/AIEXPasses.h b/include/aie/Dialect/AIEX/Transforms/AIEXPasses.h index 9de3bfd59e6..0d6d5da8cd0 100644 --- a/include/aie/Dialect/AIEX/Transforms/AIEXPasses.h +++ b/include/aie/Dialect/AIEX/Transforms/AIEXPasses.h @@ -58,6 +58,10 @@ std::unique_ptr> createAIEExpandLoadPdiPass(); std::unique_ptr> createAIEXInlineTraceConfigPass(); +std::unique_ptr> +createAIELowerParametersPass(); +std::unique_ptr> +createAIELowerParametersPass(AIELowerParametersOptions options); /// Generate the code for registering passes. 
#define GEN_PASS_REGISTRATION diff --git a/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td b/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td index 30de3c62e6e..a51ed0a9e38 100644 --- a/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td +++ b/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td @@ -270,6 +270,35 @@ def AIELowerSetLock : Pass<"aie-lower-set-lock", "AIE::DeviceOp"> { ]; } +def AIELowerParameters : Pass<"aie-lower-parameters", "mlir::ModuleOp"> { + let summary = "Lower parameter ops to scratchpad + update_from_scratchpad"; + let description = [{ + Assigns globally-unique state table indices to every `aiex.parameter` in the module, + creates per-core buffers for each `aiex.read_parameter` usage on cores, + lowers `aiex.sync_parameters_from_host` to create_scratchpad + update_from_scratchpad sequences, + and emits a single parameter layout file. + + The scratchpad is a single hardware resource shared by all PDIs loaded by a runtime sequence, so all parameters in the module must occupy distinct indices. + The total parameter count is capped at 32. + + The pass also sets a `kind` attribute on each `aiex.parameter` based on how it is used: `core` if read by `aiex.read_parameter`, `addr` if used as a DMA `offset_parameter`. + A parameter may not be used as both. 
+ }]; + + let constructor = "xilinx::AIEX::createAIELowerParametersPass()"; + let dependentDialects = [ + "xilinx::AIE::AIEDialect", + "xilinx::AIEX::AIEXDialect", + "mlir::arith::ArithDialect", + "mlir::memref::MemRefDialect", + ]; + let options = [ + Option<"outputParamsFile", "output-params-file", "std::string", + /*default=*/"\"\"", + "Path to write the parameter layout file to."> + ]; +} + def AIETransformBfpTypes : Pass<"aie-transform-bfp-types", "AIE::DeviceOp"> { let summary = "Transform bfp types to standard builtin types"; diff --git a/lib/Conversion/AIEToConfiguration/AIEToConfiguration.cpp b/lib/Conversion/AIEToConfiguration/AIEToConfiguration.cpp index 4cf93cd63b2..737937587d2 100644 --- a/lib/Conversion/AIEToConfiguration/AIEToConfiguration.cpp +++ b/lib/Conversion/AIEToConfiguration/AIEToConfiguration.cpp @@ -649,7 +649,7 @@ static LogicalResult convertTransactionOpsToMLIR( } OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToStart(device.getBody()); - int id = 0; + unsigned id = 0; for (auto &op : operations) { if (op.cmd.Opcode != XAIE_IO_BLOCKWRITE) { global_data.push_back(nullptr); @@ -659,10 +659,8 @@ static LogicalResult convertTransactionOpsToMLIR( const uint32_t *d = reinterpret_cast(op.cmd.DataPtr); std::vector data32(d, d + size); - std::string name = blockwrite_prefix; - do { - name = blockwrite_prefix + std::to_string(id++); - } while (device.lookupSymbol(name)); + std::string name = + AIE::generateUniqueSymbolName(device, blockwrite_prefix, id); MemRefType memrefType = MemRefType::get({size}, builder.getI32Type()); TensorType tensorType = diff --git a/lib/Dialect/AIE/IR/AIEDialect.cpp b/lib/Dialect/AIE/IR/AIEDialect.cpp index b9a6f92c52c..f5b8b5454e0 100644 --- a/lib/Dialect/AIE/IR/AIEDialect.cpp +++ b/lib/Dialect/AIE/IR/AIEDialect.cpp @@ -127,6 +127,17 @@ uint32_t xilinx::AIE::getShimBurstLengthEncoding(const AIE::AIETargetModel &tm, return getShimBurstLength(tm, burstLength).first; } +std::string 
+xilinx::AIE::generateUniqueSymbolName(mlir::Operation *symbolTableOp, + llvm::StringRef prefix, + unsigned &counter) { + std::string name; + do { + name = (prefix + llvm::Twine(counter++)).str(); + } while (mlir::SymbolTable::lookupSymbolIn(symbolTableOp, name)); + return name; +} + LogicalResult xilinx::AIE::myVerifyOffsetSizeAndStrideOp(OffsetSizeAndStrideOpInterface op) { std::array maxRanks = op.getArrayAttrMaxRanks(); @@ -3026,12 +3037,14 @@ LogicalResult RuntimeSequenceOp::verifyBeforeMaterialization() { !llvm::isa(symbolDefOp) && !llvm::isa(symbolDefOp) && !llvm::isa(symbolDefOp) && - !llvm::isa(symbolDefOp)) { + !llvm::isa(symbolDefOp) && + symbolDefOp->getName().getStringRef() != "aiex.parameter") { op->emitOpError() << "references symbol '" << symbolRef.getRootReference().getValue() << "' which must be either a ShimDMAAllocationOp, DeviceOp, " - "RuntimeSequenceOp, BufferOp or GlobalOp, but got: " + "RuntimeSequenceOp, BufferOp, GlobalOp or ParameterOp, but " + "got: " << symbolDefOp->getName().getStringRef(); return WalkResult::interrupt(); } diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index aeda8495d3b..3af105a0d0c 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -482,7 +482,8 @@ struct LinearizeContiguousTransfer op.getIssueTokenAttr(), op.getD0ZeroBeforeAttr(), op.getD1ZeroBeforeAttr(), op.getD2ZeroBeforeAttr(), op.getD0ZeroAfterAttr(), op.getD1ZeroAfterAttr(), - op.getD2ZeroAfterAttr(), op.getBurstLengthAttr()); + op.getD2ZeroAfterAttr(), op.getBurstLengthAttr(), + op.getOffsetParameterAttr()); return mlir::success(); } }; @@ -1117,6 +1118,27 @@ AIE::DeviceOp AIEX::ConfigureOp::getReferencedDeviceOp() { return referencedDevice; } +//===----------------------------------------------------------------------===// +// ReadParameterOp +//===----------------------------------------------------------------------===// + +LogicalResult AIEX::ReadParameterOp::verify() { + 
auto device = (*this)->getParentOfType(); + if (!device) { + return emitOpError("must be inside an aie.device"); + } + if (!(*this)->getParentOfType()) { + return emitOpError("must be inside an aie.core"); + } + auto module = (*this)->getParentOfType(); + if (!module || !module.lookupSymbol(getParameter())) { + return emitOpError("references unknown parameter '") + << getParameter() + << "' (aiex.parameter ops are declared at module scope)"; + } + return success(); +} + LogicalResult AIEX::ConfigureOp::verify() { AIE::DeviceOp parentDev = getOperation()->getParentOfType(); AIE::DeviceOp referencedDev = getReferencedDeviceOp(); diff --git a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp index bbe5d6fd31a..0914b2fc8ed 100644 --- a/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp +++ b/lib/Dialect/AIEX/Transforms/AIECtrlPacketToDma.cpp @@ -196,7 +196,9 @@ struct AIECtrlPacketToDmaPass SmallVector{}, SmallVector{}, SmallVector{}, ArrayRef(staticOffsets), ArrayRef(staticSizes), ArrayRef(staticStrides), - nullptr, metadata, 0, true, 0, 0, 0, 0, 0, 0); + nullptr, metadata, 0, true, 0, 0, 0, 0, 0, 0, + /*burst_length=*/0, + /*offset_parameter=*/FlatSymbolRefAttr()); auto shimRow = builder.getI32IntegerAttr(0); auto shimCol = builder.getI32IntegerAttr(col); diff --git a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp index 1bef5b29fa1..d72ded02dd3 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp @@ -242,6 +242,14 @@ struct AIEDMATasksToNPUPass /*addr*/ register_addr, /*arg_idx*/ arg_idx, /*arg_plus*/ offset); + // If this BD has an offset_parameter, emit update_from_scratchpad to add + // the runtime offset to the BD address register. 
+ if (bd_op.getOffsetParameterAttr()) { + auto bufType = llvm::cast(bd_op.getBuffer().getType()); + if (failed(emitUpdateBdAddressFromOffsetParameter( + builder, bd_op, bufType, register_addr))) + return failure(); + } } else if (AIE::BufferOp buffer = llvm::dyn_cast(buf.getDefiningOp())) { uint64_t buf_addr; diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index 691d53e844e..5f3be2d7920 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -461,6 +461,15 @@ struct DmaToNpuPattern : OpConversionPattern { targetModel.getDmaBdAddressOffset(tileCol, tileRow); NpuAddressPatchOp::create(rewriter, op->getLoc(), addr, arg_idx, offset); + // If this DMA op has an offset_parameter, emit an update_from_scratchpad + // to add the runtime offset to the BD address register. + if (op.getOffsetParameterAttr()) { + auto bufType = cast(op.getMemref().getType()); + if (failed(emitUpdateBdAddressFromOffsetParameter(rewriter, op, bufType, + addr))) + return failure(); + } + // push the patched bd onto the dma task queue NpuPushQueueOp::create( rewriter, op->getLoc(), column, row, infoOp.getChannelDirAttr(), diff --git a/lib/Dialect/AIEX/Transforms/AIELowerParameters.cpp b/lib/Dialect/AIEX/Transforms/AIELowerParameters.cpp new file mode 100644 index 00000000000..03dd6a02d4a --- /dev/null +++ b/lib/Dialect/AIEX/Transforms/AIELowerParameters.cpp @@ -0,0 +1,301 @@ +//===- AIELowerParameters.cpp - Lower parameter ops to scratchpad ---------===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// +// +// This pass lowers aiex.parameter, aiex.read_parameter, and +// aiex.sync_parameters_from_host ops to the lower-level scratchpad ops +// (create_scratchpad, write32, update_from_scratchpad). +// +// State table indices are assigned globally across the entire module: every +// aiex.parameter in any aie.device gets a unique index in [0, 32). This is +// necessary because the scratchpad is a single hardware resource shared by all +// PDIs loaded by a runtime sequence. +// +// A single params.txt file is emitted describing every parameter in the module +// (name, global state_table_idx, type, and kind: "core" or "addr"). +// +//===----------------------------------------------------------------------===// + +#include "aie/Dialect/AIE/IR/AIEDialect.h" +#include "aie/Dialect/AIEX/IR/AIEXDialect.h" +#include "aie/Dialect/AIEX/Transforms/AIEXPasses.h" + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/Builders.h" +#include "mlir/Pass/Pass.h" + +#include "llvm/Support/raw_ostream.h" + +using namespace mlir; +using namespace xilinx; +using namespace xilinx::AIE; +using namespace xilinx::AIEX; + +namespace xilinx::AIEX { +#define GEN_PASS_DEF_AIELOWERPARAMETERS +#include "aie/Dialect/AIEX/Transforms/AIEXPasses.h.inc" +} // namespace xilinx::AIEX + +namespace { + +struct AIELowerParametersPass + : public xilinx::AIEX::impl::AIELowerParametersBase< + AIELowerParametersPass> { + using AIELowerParametersBase::AIELowerParametersBase; + + // For each read_parameter of a unique parameter, create a 2xi32 buffer and + // store a reference to it on the ReadParameterOp as the `buffer` attribute. 
+ void allocateBuffers(DeviceOp device, OpBuilder &builder) { + MLIRContext *ctx = device.getContext(); + unsigned uniquingCounter = 0; + + DenseMap, BufferOp> seen; + + device.walk([&](ReadParameterOp readOp) { + auto coreOp = readOp->getParentOfType(); + TileOp tile = coreOp.getTileOp(); + StringRef paramName = readOp.getParameter(); + auto key = std::make_pair(paramName, tile.getOperation()); + + if (!seen.count(key)) { + builder.setInsertionPointAfter(tile); + // Buffer must be 8 bytes: update_from_scratchpad always writes a 48-bit + // value across two 32-bit registers at [RegOff] and [RegOff+4]. The + // firmware masks Reg[0] with 0xFFFFFFFC (lower 2 bits forced to 0) + // because it was designed for 4-byte-aligned DMA BD addresses. The host + // library's `ParameterScratchpad::write` left-shifts by 2 before + // writing to the scratchpad, and the core right-shifts by 2 after + // loading. Note that this limits effective parameter values to 30 + // bits. + auto bufType = MemRefType::get({2}, builder.getI32Type()); + std::string prefix = + ("__param_" + paramName + "_" + std::to_string(tile.getCol()) + + "_" + std::to_string(tile.getRow()) + "_") + .str(); + std::string bufName = + AIE::generateUniqueSymbolName(device, prefix, uniquingCounter); + auto buf = BufferOp::create( + builder, readOp.getLoc(), bufType, tile, + builder.getStringAttr(bufName), /*address=*/nullptr, + /*initial_value=*/nullptr, /*mem_bank=*/nullptr); + seen[key] = buf; + } + + readOp.setBufferAttr( + FlatSymbolRefAttr::get(ctx, *seen[key].getSymName())); + }); + } + + // Lower each read_parameter to: load from buffer[0], shift right by 2, and + // cast to the result type. The buffer to use comes from the `buffer` + // attribute set by allocateBuffers(). 
+ void lowerReadParameters(DeviceOp device, OpBuilder &builder) { + SmallVector readOps; + device.walk([&](ReadParameterOp op) { readOps.push_back(op); }); + + for (auto readOp : readOps) { + FlatSymbolRefAttr bufRef = readOp.getBufferAttr(); + auto buf = device.lookupSymbol(bufRef.getAttr()); + + builder.setInsertionPoint(readOp); + Value c0 = builder.create(readOp.getLoc(), 0); + Value raw = builder.create(readOp.getLoc(), buf, c0); + Value c2 = builder.create( + readOp.getLoc(), builder.getI32IntegerAttr(2)); + Value decoded = builder.create(readOp.getLoc(), raw, c2); + + Type resultType = readOp.getResult().getType(); + Value result = decoded; + if (resultType != builder.getI32Type()) { + if (resultType.isInteger()) { + result = builder.create(readOp.getLoc(), resultType, + decoded); + } else if (resultType.isBF16()) { + Value masked = builder.create( + readOp.getLoc(), builder.getI16Type(), decoded); + result = builder.create(readOp.getLoc(), resultType, + masked); + } else if (resultType.isF32()) { + result = builder.create(readOp.getLoc(), resultType, + decoded); + } + } + + readOp.getResult().replaceAllUsesWith(result); + readOp.erase(); + } + } + + // Lower sync_parameters_from_host to create_scratchpad + + // update_from_scratchpad sequences. `scratchpadSlots` is the total number of + // parameters in the whole module (the scratchpad is a single hardware + // resource shared across all PDIs loaded by a runtime sequence). + void lowerSyncOps(DeviceOp device, OpBuilder &builder, + unsigned scratchpadSlots) { + auto module = device->getParentOfType(); + // Collect unique (stateIdx, bufferRef) pairs from ReadParameterOp attrs. 
+ SmallVector> paramEntries; + DenseSet seenBufs; + device.walk([&](ReadParameterOp readOp) { + FlatSymbolRefAttr bufRef = readOp.getBufferAttr(); + if (!seenBufs.insert(bufRef.getValue()).second) + return; + auto paramOp = module.lookupSymbol(readOp.getParameter()); + uint8_t stateIdx = + static_cast(paramOp.getStateTableIdx().value()); + paramEntries.push_back({stateIdx, bufRef}); + }); + + device.walk([&](SyncParametersFromHostOp syncOp) { + builder.setInsertionPoint(syncOp); + Location loc = syncOp.getLoc(); + + NpuCreateScratchpadOp::create( + builder, loc, static_cast(scratchpadSlots * 4)); + + for (auto &[stateIdx, bufRef] : paramEntries) { + NpuUpdateFromScratchpadOp::create( + builder, loc, stateIdx, StateTableFunc::Incr, + /*func_arg=*/static_cast(0), + /*address=*/static_cast(0), bufRef, /*column=*/nullptr, + /*row=*/nullptr); + } + + syncOp.erase(); + }); + } + + // Emit a single params.txt for the whole module. + // + // Format (one entry per line, easily parsed with std::ifstream >>): + // + // + // ... + // where kind is "core" (shift-2 encoded, for read_parameter) or "addr" + // (raw, for offset_parameter on DMA ops). + LogicalResult emitParamsFile(ArrayRef allParams) { + if (outputParamsFile.empty()) + return success(); + + std::error_code ec; + llvm::raw_fd_ostream out(outputParamsFile, ec); + if (ec) + return emitError(UnknownLoc::get(&getContext()), + "failed to open params output file '") + << outputParamsFile << "': " << ec.message(); + + out << allParams.size() << "\n"; + for (auto p : allParams) { + std::string typeStr; + llvm::raw_string_ostream os(typeStr); + p.getType().print(os); + StringRef kindStr = + p.getKind().value() == ParameterKind::Addr ? 
"addr" : "core"; + out << p.getSymName() << " " + << static_cast(p.getStateTableIdx().value()) << " " + << typeStr << " " << kindStr << "\n"; + } + return success(); + } + + void runOnOperation() override { + ModuleOp module = getOperation(); + OpBuilder builder(&getContext()); + + // Step 1: collect every parameter in the module. + SmallVector allParams; + module.walk([&](ParameterOp p) { allParams.push_back(p); }); + + if (allParams.size() > 32) { + InFlightDiagnostic diag = + module.emitError("Module declares ") + << allParams.size() + << " parameters but the scratchpad supports at most 32. The " + "scratchpad is a single hardware resource shared by all PDIs " + "loaded by a runtime sequence."; + for (auto p : allParams) + diag.attachNote(p.getLoc()) << "parameter '" << p.getSymName() << "'"; + return signalPassFailure(); + } + + // Step 2: determine each parameter's kind from its usage, erroring on + // mixed use. A parameter is "core" if any aiex.read_parameter references + // it; "addr" if any DMA op references it via offset_parameter. If both, + // emit an error. + DenseMap usedAsCore; + DenseMap usedAsAddr; + module.walk([&](ReadParameterOp op) { + usedAsCore[op.getParameter()] = true; + }); + auto markAddr = [&](Operation *op, FlatSymbolRefAttr ref) { + if (ref) + usedAsAddr[ref.getValue()] = true; + }; + module.walk([&](NpuDmaMemcpyNdOp op) { + markAddr(op, op.getOffsetParameterAttr()); + }); + module.walk( + [&](AIE::DMABDOp op) { markAddr(op, op.getOffsetParameterAttr()); }); + + for (auto p : allParams) { + StringRef name = p.getSymName(); + bool core = usedAsCore.lookup(name); + bool addr = usedAsAddr.lookup(name); + if (core && addr) { + p.emitError("parameter '") + << name + << "' is used both as an aiex.read_parameter source (core) and " + "as a DMA offset_parameter (addr); a parameter must have a " + "single kind"; + return signalPassFailure(); + } + p.setKindAttr(ParameterKindAttr::get( + &getContext(), addr ? 
ParameterKind::Addr : ParameterKind::Core)); + } + + // Step 3: assign global state_table_idx in walk order, 0..N-1. + for (auto [i, p] : llvm::enumerate(allParams)) { + p.setStateTableIdxAttr(builder.getIntegerAttr( + builder.getIntegerType(8, /*isSigned=*/false), i)); + } + + unsigned totalParams = allParams.size(); + + // Step 4: per-device lowering. + SmallVector devices; + module.walk([&](DeviceOp d) { devices.push_back(d); }); + for (auto d : devices) { + allocateBuffers(d, builder); + lowerSyncOps(d, builder, totalParams); + lowerReadParameters(d, builder); + } + + // Step 5: emit the single params.txt for the module. + if (failed(emitParamsFile(allParams))) + return signalPassFailure(); + + // ParameterOps are kept around so that later passes (e.g. DMA lowering) + // can resolve `offset_parameter` symbol references back to their + // state_table_idx / kind / type. `--aiex-standard-lowering` will remove + // them at the end of the pipeline. + } +}; + +} // namespace + +std::unique_ptr> +xilinx::AIEX::createAIELowerParametersPass() { + return std::make_unique(); +} + +std::unique_ptr> +xilinx::AIEX::createAIELowerParametersPass(AIELowerParametersOptions options) { + return std::make_unique(std::move(options)); +} diff --git a/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp b/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp index 0ad346e0f84..fc003204b5f 100644 --- a/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp @@ -65,6 +65,9 @@ struct AIEXToStandardPass removepatterns.add>(m.getContext(), m); removepatterns.add>(m.getContext(), m); + removepatterns.add>(m.getContext(), m); + removepatterns.add>(m.getContext(), + m); if (failed(applyPartialConversion(m, target, std::move(removepatterns)))) signalPassFailure(); diff --git a/lib/Dialect/AIEX/Transforms/CMakeLists.txt b/lib/Dialect/AIEX/Transforms/CMakeLists.txt index fb205cb4048..20231184975 100644 --- a/lib/Dialect/AIEX/Transforms/CMakeLists.txt +++ 
b/lib/Dialect/AIEX/Transforms/CMakeLists.txt @@ -22,6 +22,7 @@ add_mlir_dialect_library(AIEXTransforms AIESubstituteShimDMAAllocations.cpp AIECtrlPacketToDma.cpp AIELowerSetLock.cpp + AIELowerParameters.cpp AIETransformBfpTypes.cpp AIETxnToControlPacket.cpp AIEExpandLoadPdi.cpp diff --git a/lib/Dialect/AIEX/Utils/AIEUtils.cpp b/lib/Dialect/AIEX/Utils/AIEUtils.cpp index 1a2676ec468..8e5a1bf40fd 100644 --- a/lib/Dialect/AIEX/Utils/AIEUtils.cpp +++ b/lib/Dialect/AIEX/Utils/AIEUtils.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "aie/Dialect/AIEX/AIEUtils.h" +#include "aie/Dialect/AIEX/IR/AIEXDialect.h" using namespace mlir; using namespace xilinx; @@ -143,3 +144,47 @@ memref::GlobalOp AIEX::getOrCreateDataMemref(OpBuilder &builder, } return global; } + +LogicalResult AIEX::emitUpdateBdAddressFromOffsetParameter( + OpBuilder &builder, Operation *bdOp, BaseMemRefType bufType, + uint64_t registerAddr) { + auto paramRef = bdOp->getAttrOfType("offset_parameter"); + if (!paramRef) + return bdOp->emitOpError( + "emitUpdateBdAddressFromOffsetParameter called without " + "offset_parameter attribute."); + + auto module = bdOp->getParentOfType(); + if (!module) + return bdOp->emitOpError("not contained in a module."); + auto paramOp = module.lookupSymbol(paramRef.getAttr()); + if (!paramOp) + return bdOp->emitOpError("offset_parameter '") + << paramRef.getValue() + << "' not found. Declare it at module scope with aiex.parameter."; + if (!paramOp.getStateTableIdx().has_value()) + return bdOp->emitOpError("offset_parameter '") + << paramRef.getValue() + << "' has no state_table_idx. 
Run --aie-lower-parameters first."; + if (!paramOp.getType().isInteger(32)) { + auto err = bdOp->emitOpError("offset_parameter '") + << paramRef.getValue() << "' must have type i32, got " + << paramOp.getType() << "."; + err.attachNote(paramOp.getLoc()) << "Parameter declared here."; + return err; + } + + uint8_t stateIdx = + static_cast(paramOp.getStateTableIdx().value()); + uint32_t elemBytes = bufType.getElementTypeBitWidth() / 8; + // Use func=mul with func_arg=elemBytes so the firmware computes + // StateTable[idx] * elemBytes = byte offset, added into the BD address + // register. UpdateReg only reads from the state table (it never writes + // back), so this is safe to repeat across runtime sequence invocations. + AIEX::NpuUpdateFromScratchpadOp::create( + builder, bdOp->getLoc(), stateIdx, AIEX::StateTableFunc::Mul, + /*func_arg=*/elemBytes, + /*address=*/static_cast(registerAddr), + /*buffer=*/nullptr, /*column=*/nullptr, /*row=*/nullptr); + return success(); +} diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index be25a56a4b8..8c6a0628a6c 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -41,6 +41,7 @@ declare_mlir_python_sources(AIEPythonSources.Utils utils/jit.py utils/ml.py utils/npukernel.py + utils/parameter_scratchpad.py utils/regdb.py utils/hostruntime/__init__.py utils/hostruntime/hostruntime.py @@ -395,6 +396,27 @@ else () endif () +################################################################################ +# ParameterScratchpad pybind11 module (no MLIR/XRT deps — uses buffer pointer) +################################################################################ +find_package(pybind11 CONFIG QUIET) +if(pybind11_FOUND) + pybind11_add_module(_parameter_scratchpad + ${CMAKE_CURRENT_SOURCE_DIR}/ParameterScratchpadModule.cpp + ) + target_include_directories(_parameter_scratchpad PRIVATE + ${AIE_SOURCE_DIR}/runtime_lib/test_lib + ) + set_target_properties(_parameter_scratchpad PROPERTIES + LIBRARY_OUTPUT_DIRECTORY 
"${AIE_PYTHON_PACKAGES_DIR}/aie/_mlir_libs" + ) + add_dependencies(AIEPythonModules _parameter_scratchpad) + install(TARGETS _parameter_scratchpad + LIBRARY DESTINATION "${AIE_PYTHON_INSTALL_DIR}/aie/_mlir_libs" + COMPONENT aie-python + ) +endif() + target_include_directories(AIEPythonModules.extension._aie.dso PUBLIC ${VITIS_AIETOOLS_DIR}/include) # Copy the runtime libs into the _mlir_libs directory for convenience. diff --git a/python/ParameterScratchpadModule.cpp b/python/ParameterScratchpadModule.cpp new file mode 100644 index 00000000000..aa49542735b --- /dev/null +++ b/python/ParameterScratchpadModule.cpp @@ -0,0 +1,37 @@ +//===- ParameterScratchpadModule.cpp - Python bindings ----------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "parameter_scratchpad.h" + +namespace py = pybind11; + +PYBIND11_MODULE(_parameter_scratchpad, m) { + m.doc() = "Python bindings for test_utils::ParameterScratchpad"; + + py::class_(m, "ParameterScratchpad") + .def(py::init([](py::buffer buf, const std::string ¶msPath) { + py::buffer_info info = buf.request(/*writable=*/true); + auto *ptr = static_cast(info.ptr); + return new test_utils::ParameterScratchpad(ptr, paramsPath); + }), + py::arg("buffer"), py::arg("params_path"), + py::keep_alive<1, 2>()) // prevent GC of buffer while alive + .def("write_bytes", + [](test_utils::ParameterScratchpad &self, const std::string &name, + py::bytes data) { + std::string s = data; + self.writeBytes(name, s.data(), s.size()); + }, + py::arg("name"), py::arg("data")) + .def("read", &test_utils::ParameterScratchpad::read, py::arg("name")); +} diff --git a/python/dialects/aiex.py 
b/python/dialects/aiex.py index 409a135273d..1ef7b49d21b 100644 --- a/python/dialects/aiex.py +++ b/python/dialects/aiex.py @@ -97,6 +97,7 @@ def __init__( issue_token: bool | None = None, burst_length: int = 0, packet: tuple[int] | None = None, + offset_parameter: str | None = None, ): if tap and not (offsets is None and sizes is None and strides is None): raise ValueError( @@ -137,6 +138,7 @@ def __init__( issue_token=issue_token, burst_length=burst_length, packet=packet, + offset_parameter=offset_parameter, ) @@ -206,6 +208,7 @@ def shim_dma_bd( transfer_len: int | None = None, burst_length: int = 0, packet: tuple[int] | None = None, + offset_parameter: str | None = None, ): if tap and not (offset is None and sizes is None and strides is None): raise ValueError( @@ -237,6 +240,7 @@ def shim_dma_bd( dimensions=dimensions, burst_length=burst_length, packet=packet, + offset_parameter=offset_parameter, ) @@ -251,6 +255,7 @@ def shim_dma_single_bd_task( issue_token: bool = False, burst_length: int = 0, packet: tuple[int] | None = None, + offset_parameter: str | None = None, ): """_summary_ Enables data transfers between the AIE Engine array and external memory. @@ -302,6 +307,7 @@ def shim_dma_single_bd_task( transfer_len=transfer_len, burst_length=burst_length, packet=packet, + offset_parameter=offset_parameter, ) EndOp() return task @@ -345,3 +351,25 @@ def dma_start_task(*args: DMAConfigureTaskForOp): def set_lock_value(lock: aie.LockOp, value: int): return set_lock(lock, value) + + +# Parameter ops + +_orig_read_parameter = read_parameter + + +def read_parameter(name: str, result_type: Type) -> _orig_read_parameter: + """Read a runtime parameter inside an ``aie.core`` body. + + Args: + name: The ``@sym_name`` of the ``aiex.parameter`` declaration. + result_type: The MLIR scalar type of the result (e.g. ``T.bf16()``, ``T.i32()``). + + Returns: + An SSA value of the given type. 
+ + Example:: + + val = aiex.read_parameter("foo", T.bf16()) + """ + return _orig_read_parameter(result_type, name) diff --git a/python/iron/__init__.py b/python/iron/__init__.py index 23f0280a82e..ba81a2dca72 100644 --- a/python/iron/__init__.py +++ b/python/iron/__init__.py @@ -15,6 +15,7 @@ from .buffer import Buffer from .kernel import ExternalFunction, Kernel +from .parameter import Parameter from .program import Program from .worker import Worker, WorkerRuntimeBarrier from .runtime import Runtime diff --git a/python/iron/parameter.py b/python/iron/parameter.py new file mode 100644 index 00000000000..8bd9c289ce3 --- /dev/null +++ b/python/iron/parameter.py @@ -0,0 +1,91 @@ +# parameter.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. +"""Parameter: a named runtime value set from the host and read by Workers.""" + +import numpy as np + +from .. import ir # type: ignore +from ..dialects import aiex +from ..helpers.util import np_dtype_to_mlir_type, NpuDType +from .resolvable import Resolvable + + +class Parameter(Resolvable): + """A named runtime parameter communicated from host to AIE cores via the + scratchpad mechanism. + + Declare a ``Parameter`` at design time. Pass it to a :class:`Worker` via + ``fn_args`` and call :meth:`read` inside the ``core_fn`` to obtain its + current value. The :class:`Runtime` emits the necessary + ``aiex.sync_parameters_from_host`` when :meth:`Runtime.sync_parameters` is + called. + + Example:: + + import numpy as np + from aie.iron import Parameter, Worker, Runtime, Program + + seq_len = Parameter("seq_len", np.int32) + + def core_body(p): + v = p.read() + ... + + worker = Worker(core_body, [seq_len]) + + rt = Runtime() + with rt.sequence(output_type) as out: + rt.sync_parameters() + ... 
+ """ + + def __init__(self, name: str, dtype: NpuDType): + """Create a Parameter. + + Args: + name: Symbol name for the parameter (must be unique within the + device). + dtype: The numpy scalar type (e.g. ``np.int32``, ``np.float32``, + ``bfloat16``). + """ + self._name = name + self._dtype = dtype + self._resolved = False + + @property + def name(self) -> str: + """The symbol name of this parameter.""" + return self._name + + @property + def dtype(self) -> NpuDType: + """The numpy scalar type of this parameter.""" + return self._dtype + + def read(self): + """Emit ``aiex.read_parameter`` inside a core body. + + Must be called within an active MLIR insertion point (i.e. inside a + Worker's ``core_fn``). + + Returns: + An MLIR SSA value of the parameter's type. + """ + mlir_type = np_dtype_to_mlir_type(self._dtype) + return aiex.read_parameter(self._name, mlir_type) + + def resolve( + self, + loc: ir.Location | None = None, + ip: ir.InsertionPoint | None = None, + ) -> None: + """Emit ``aiex.parameter @name : type`` at module scope.""" + if not self._resolved: + mlir_type = np_dtype_to_mlir_type(self._dtype) + aiex.parameter(self._name, mlir_type, loc=loc, ip=ip) + self._resolved = True diff --git a/python/iron/program.py b/python/iron/program.py index bc0075b6614..8dbe1e47e65 100644 --- a/python/iron/program.py +++ b/python/iron/program.py @@ -16,6 +16,7 @@ from .device import Device from .runtime import Runtime +from .parameter import Parameter from .resolvable import Resolvable from ..utils import trace as trace_utils @@ -54,6 +55,16 @@ def resolve_program(self, device_name="main"): # For dynamically created device classes, the constructor takes no arguments self._device = device_type() + # Resolve parameters at module scope (before the aie.device). + # aiex.parameter ops are global across all devices because the + # scratchpad is a single hardware resource shared by all PDIs. 
+ for w in self._rt.workers: + for arg in w.fn_args: + if isinstance(arg, Parameter): + arg.resolve() + for p in self._rt._parameters: + p.resolve() + @device(self._device.resolve(), sym_name=device_name) def device_body(): # Collect all fifos diff --git a/python/iron/runtime/dmatask.py b/python/iron/runtime/dmatask.py index 958f6a04df2..b508b9a76fa 100644 --- a/python/iron/runtime/dmatask.py +++ b/python/iron/runtime/dmatask.py @@ -26,6 +26,7 @@ def __init__( tap: TensorAccessPattern, task_group: RuntimeTaskGroup | None = None, wait: bool = False, + offset_parameter: str | None = None, ): """A RuntimeTask that will resolve to a DMA Operation. @@ -35,11 +36,13 @@ def __init__( tap (TensorAccessPattern): The access pattern associated with the operation. task_group (RuntimeTaskGroup | None, optional): The task group associated with the operation. Defaults to None. wait (bool, optional): Whether this task should conclude with a call to await or a call to free. Defaults to False. + offset_parameter (str | None, optional): Name of a Parameter whose value is used as the element offset (in units of the buffer's element type) for this DMA transfer. Defaults to None.
""" self._object_fifo = object_fifo self._rt_data = rt_data self._tap = tap self._wait = wait + self._offset_parameter = offset_parameter self._task = None RuntimeTask.__init__(self, task_group) @@ -70,5 +73,6 @@ def resolve( self._rt_data.op, tap=self._tap, issue_token=self._wait, + offset_parameter=self._offset_parameter, ) dma_start_task(self._task) diff --git a/python/iron/runtime/runtime.py b/python/iron/runtime/runtime.py index a204f9e478c..403e99f4afc 100644 --- a/python/iron/runtime/runtime.py +++ b/python/iron/runtime/runtime.py @@ -27,6 +27,7 @@ from ..dataflow import ObjectFifoHandle from ..device import Tile, AnyShimTile from ..resolvable import Resolvable +from ..parameter import Parameter from ..worker import Worker, WorkerRuntimeBarrier, _BarrierSetOp from .dmatask import DMATask from .data import RuntimeData @@ -63,6 +64,7 @@ def __init__( self._tasks: list[RuntimeTask] = [] self._fifos = set() self._workers = [] + self._parameters: list[Parameter] = [] self._open_task_groups = [] self._trace_size = None self._trace_workers = None @@ -146,6 +148,7 @@ def fill( task_group: RuntimeTaskGroup | None = None, wait: bool = False, tile: Tile = AnyShimTile, + offset_parameter: "Parameter | str | None" = None, ) -> None: """Conceptually fill an ObjectFifoHandle (of type producer) with data from a runtime buffer. This should be called within a Runtime.sequence() context. @@ -158,6 +161,7 @@ def fill( task_group (RuntimeTaskGroup | None, optional): A TaskGroup to associate this task with. Defaults to None. wait (bool, optional): Whether this Task should be awaited on or not. If not, it will be freed when the task group is finished. Defaults to False. tile (Tile | None, optional): The Shim tile to associate the data transfer with. Defaults to AnyShimTile. + offset_parameter (Parameter | str | None, optional): A Parameter (or its name) whose value is used as the element offset (in units of the buffer's element type) for this DMA transfer. Defaults to None. Raises: ValueError: Arguments are validated.
@@ -171,9 +175,18 @@ def fill( if tap is None: tap = source.default_tap() + offset_param_name = None + if offset_parameter is not None: + if isinstance(offset_parameter, Parameter): + offset_param_name = offset_parameter.name + if offset_parameter not in self._parameters: + self._parameters.append(offset_parameter) + else: + offset_param_name = offset_parameter + in_fifo.endpoint = rt_endpoint self._fifos.add(in_fifo) - self._tasks.append(DMATask(in_fifo, source, tap, task_group, wait)) + self._tasks.append(DMATask(in_fifo, source, tap, task_group, wait, offset_param_name)) def drain( self, @@ -183,6 +196,7 @@ def drain( task_group: RuntimeTaskGroup | None = None, wait: bool = False, tile: Tile = AnyShimTile, + offset_parameter: "Parameter | str | None" = None, ) -> None: """Conceptually fill an ObjectFifoHandle (of type consumer) of data and write that data to a runtime buffer. This should be called within a Runtime.sequence() context. @@ -195,6 +209,7 @@ def drain( task_group (RuntimeTaskGroup | None, optional): A TaskGroup to associate this task with. Defaults to None. wait (bool, optional): Whether this Task should be awaited on or not. If not, it will be freed when the task group is finished. Defaults to False. tile (Tile | None, optional): The Shim tile to associate the data transfer with. Defaults to AnyShimTile. + offset_parameter (Parameter | str | None, optional): A Parameter (or its name) whose value is used as the element offset (in units of the buffer's element type) for this DMA transfer. Defaults to None. Raises: ValueError: Arguments are validated.
@@ -208,9 +223,18 @@ def drain( if tap is None: tap = dest.default_tap() + offset_param_name = None + if offset_parameter is not None: + if isinstance(offset_parameter, Parameter): + offset_param_name = offset_parameter.name + if offset_parameter not in self._parameters: + self._parameters.append(offset_parameter) + else: + offset_param_name = offset_parameter + out_fifo.endpoint = rt_endpoint self._fifos.add(out_fifo) - self._tasks.append(DMATask(out_fifo, dest, tap, task_group, wait)) + self._tasks.append(DMATask(out_fifo, dest, tap, task_group, wait, offset_param_name)) def start(self, *args: Worker): """A placeholder operation to indicate that one or more Worker should be started on the device. @@ -291,6 +315,15 @@ def set_barrier(self, barrier: WorkerRuntimeBarrier, value: int): """ self._tasks.append(_BarrierSetOp(barrier, value)) + def sync_parameters(self): + """Emit ``aiex.sync_parameters_from_host`` in the runtime sequence. + + Call this within a :meth:`sequence` context after all parameters have + been written on the host side and before starting workers that read + them. 
+ """ + self._tasks.append(_SyncParametersTask()) + @property def workers(self) -> list[Worker]: """The workers associated with the Runtime by calls to start()""" @@ -391,3 +424,16 @@ def finish_task_group(tg, task_group_actions): if task_group_actions[default_task_group]: finish_task_group(default_task_group, task_group_actions) + + +class _SyncParametersTask(Resolvable): + """Emits ``aiex.sync_parameters_from_host`` during runtime sequence resolution.""" + + def resolve( + self, + loc: ir.Location | None = None, + ip: ir.InsertionPoint | None = None, + ) -> None: + from ...dialects.aiex import sync_parameters_from_host + + sync_parameters_from_host(loc=loc, ip=ip) diff --git a/python/iron/worker.py b/python/iron/worker.py index c946b9eb81c..673df2b4b82 100644 --- a/python/iron/worker.py +++ b/python/iron/worker.py @@ -19,6 +19,7 @@ from .dataflow.objectfifo import ObjectFifoHandle, ObjectFifo from .dataflow.endpoint import ObjectFifoEndpoint from .buffer import Buffer +from .parameter import Parameter from .resolvable import Resolvable @@ -101,6 +102,8 @@ def do_nothing_core_fun(*args) -> None: f"cannot reassign to {self._tile}" ) arg._tile = self._tile + elif isinstance(arg, Parameter): + pass # Parameters are device-level symbols; no tile placement needed elif isinstance(arg, ObjectFifo): # This is an easy error to make, so we catch it early raise ValueError( diff --git a/python/utils/parameter_scratchpad.py b/python/utils/parameter_scratchpad.py new file mode 100644 index 00000000000..c8556b45b8e --- /dev/null +++ b/python/utils/parameter_scratchpad.py @@ -0,0 +1,73 @@ +# parameter_scratchpad.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. +"""Host-side runtime for writing named parameters to AIE cores via the +scratchpad mechanism. 
+ +Thin Python wrapper around the C++ ``test_utils::ParameterScratchpad`` +class (exposed via pybind11). The encoding logic (shift-2, delta) lives +entirely in C++; this layer only manages the ``pyxrt.bo`` handle for +``sync()``. + +Usage:: + + import pyxrt + from aie.utils.parameter_scratchpad import ParameterScratchpad + + run = pyxrt.run(kernel) + params = ParameterScratchpad(run, "params.txt") + params.write("seq_len", 42) + params.sync() + run.start() +""" + +import struct +from pathlib import Path + +import pyxrt + +from aie._mlir_libs._parameter_scratchpad import ( + ParameterScratchpad as _ParameterScratchpadImpl, +) + + +def _to_bytes(value) -> bytes: + """Convert any scalar to its little-endian in-memory bytes.""" + if isinstance(value, int): + return struct.pack(" None: + """Write a parameter value to the scratchpad. + + Args: + name: The parameter name (must match a name in the params file). + value: A scalar value — ``int``, ``float``, or any type with + a ``tobytes()`` method (``np.float32``, ``bfloat16``, etc.). 
+ """ + self._impl.write_bytes(name, _to_bytes(value)) + + def sync(self) -> None: + """Sync the scratchpad buffer to device.""" + self._bo.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + + def read(self, name: str) -> int: + """Read back a parameter's current decoded value (for debugging).""" + return self._impl.read(name) diff --git a/runtime_lib/test_lib/CMakeLists.txt b/runtime_lib/test_lib/CMakeLists.txt index 0d1bdd3ff7d..df0e499225e 100644 --- a/runtime_lib/test_lib/CMakeLists.txt +++ b/runtime_lib/test_lib/CMakeLists.txt @@ -65,7 +65,7 @@ endif() # test_utils library if (BUILD_TEST_UTILS_LIBRARY) add_library(test_utils STATIC test_utils.cpp) - set_target_properties(test_utils PROPERTIES PUBLIC_HEADER "test_utils.h;xrt_test_wrapper.h;cxxopts.hpp") + set_target_properties(test_utils PROPERTIES PUBLIC_HEADER "test_utils.h;xrt_test_wrapper.h;cxxopts.hpp;parameter_scratchpad.h") add_runtime_pic_flag(test_utils) # XRT headers on Windows pull in boost/any.hpp which is not typically @@ -86,7 +86,7 @@ if (BUILD_TEST_LIBRARY) list(APPEND headers target.h test_library.h memory_allocator.h hsa_ext_air.h) endif() if (STAGE_TEST_UTILS_PAYLOAD) - list(APPEND headers test_utils.h xrt_test_wrapper.h) + list(APPEND headers test_utils.h xrt_test_wrapper.h parameter_scratchpad.h) endif() foreach(basefile ${headers}) set(dest ${CMAKE_CURRENT_BINARY_DIR}/../include/${basefile}) @@ -127,7 +127,7 @@ if (BUILD_TEST_UTILS_LIBRARY) PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_PREFIX}/runtime_lib/${AIE_RUNTIME_TARGET}/test_lib/include ) elseif (STAGE_TEST_UTILS_PAYLOAD) - install(FILES test_utils.h xrt_test_wrapper.h cxxopts.hpp + install(FILES test_utils.h xrt_test_wrapper.h cxxopts.hpp parameter_scratchpad.h DESTINATION ${CMAKE_INSTALL_PREFIX}/runtime_lib/${AIE_RUNTIME_TARGET}/test_lib/include ) endif() diff --git a/runtime_lib/test_lib/parameter_scratchpad.h b/runtime_lib/test_lib/parameter_scratchpad.h new file mode 100644 index 00000000000..93df1954093 --- /dev/null +++ 
b/runtime_lib/test_lib/parameter_scratchpad.h @@ -0,0 +1,175 @@ +//===- parameter_scratchpad.h - Host-side parameter runtime ------*- C++-*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// ParameterScratchpad: Host-side runtime class for writing named parameters +// to AIE cores via the scratchpad mechanism. +// +// Usage: +// auto params = test_utils::ParameterScratchpad(run, "params.txt"); +// params.write("foo", 42u); +// params.write("bar", std::bfloat16_t(3.14f)); +// params.sync(); +// +//===----------------------------------------------------------------------===// + +#ifndef AIE_RUNTIME_TEST_LIB_PARAMETER_SCRATCHPAD_H +#define AIE_RUNTIME_TEST_LIB_PARAMETER_SCRATCHPAD_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef TEST_UTILS_USE_XRT +#include +#include +#endif + +namespace test_utils { + +class ParameterScratchpad { +public: +#ifdef TEST_UTILS_USE_XRT + /// Construct from an XRT run handle (C++ usage). + ParameterScratchpad(xrt::run &run, const std::string &paramsPath) { + parseParams(paramsPath); + scratchpadBo = run.get_ctrl_scratchpad_bo(); + boMap = scratchpadBo.map(); + init(); + } +#endif + + /// Construct from a raw buffer pointer (Python bindings / testing). + ParameterScratchpad(uint32_t *buffer, const std::string &paramsPath) + : boMap(buffer) { + parseParams(paramsPath); + init(); + } + + /// Write raw bytes (up to 4) by name, interpreted as a little-endian + /// uint32. The bits are left-shifted by 2 (firmware requirement) and + /// delta-encoded against the previous write. + /// For addr-kind parameters, the value is written raw (no shift, no delta).
+ void writeBytes(const std::string &name, const void *data, size_t len) { + uint32_t bits = 0; + std::memcpy(&bits, data, std::min(len, size_t(4))); + writeBits(name, bits); + } + + /// Write a raw 32-bit value by name. For core-kind parameters, the bits + /// are left-shifted by 2 (firmware requirement) and delta-encoded against + /// the previous write. For addr-kind parameters, the value is written + /// directly (no shift-2, no delta encoding). + void writeBits(const std::string &name, uint32_t bits) { + auto it = paramMap.find(name); + if (it == paramMap.end()) { + throw std::runtime_error("ParameterScratchpad: unknown parameter '" + + name + "'"); + } + uint8_t idx = it->second; + if (addrParams.count(name)) { + // addr-kind: raw absolute write, no shift-2, no delta encoding. + // The firmware multiplies by element_size and adds to BD address. + boMap[idx] = bits; + } else { + // core-kind: shift-2 + delta encoding. + uint32_t encoded = bits << 2; + boMap[idx] = encoded - prevEncoded[idx]; + prevEncoded[idx] = encoded; + } + } + + /// Write a typed parameter value. The raw bits of the value are + /// left-shifted by 2 as required by the firmware's UPDATE_REG Incr mode. + /// Supports any type up to 32 bits (uint32_t, int16_t, std::bfloat16_t, + /// float, etc.). + template + void write(const std::string &name, T value) { + static_assert(sizeof(T) <= 4, "Parameter values must be at most 32 bits"); + uint32_t bits = 0; + std::memcpy(&bits, &value, sizeof(T)); + writeBits(name, bits); + } + +#ifdef TEST_UTILS_USE_XRT + /// Sync the scratchpad buffer to device. Call after all writes for this run. + void sync() { scratchpadBo.sync(XCL_BO_SYNC_BO_TO_DEVICE); } +#endif + + /// Read back a parameter's current decoded value (for debugging).
+ uint32_t read(const std::string &name) const { + auto it = paramMap.find(name); + if (it == paramMap.end()) { + throw std::runtime_error("ParameterScratchpad: unknown parameter '" + + name + "'"); + } + if (addrParams.count(name)) { + return boMap[it->second]; // addr-kind: raw value + } + return prevEncoded[it->second] >> 2; + } + +private: +#ifdef TEST_UTILS_USE_XRT + xrt::bo scratchpadBo; +#endif + uint32_t *boMap = nullptr; + size_t scratchpadSizeBytes = 0; + std::unordered_map paramMap; + std::unordered_set addrParams; // params with kind="addr" + std::vector prevEncoded; + + void init() { + prevEncoded.resize(scratchpadSizeBytes / 4, 0); + for (size_t i = 0; i < scratchpadSizeBytes / 4; i++) { + boMap[i] = 0; + } + } + + void parseParams(const std::string &path) { + std::ifstream file(path); + if (!file.is_open()) { + throw std::runtime_error("ParameterScratchpad: cannot open '" + path + + "'"); + } + + // Format: + // + // + // ... + // where kind is "core" or "addr". + unsigned numParams = 0; + file >> numParams; + scratchpadSizeBytes = numParams * 4; + + for (unsigned i = 0; i < numParams; i++) { + std::string name, type, kind; + unsigned idx; + file >> name >> idx >> type; + // kind column is optional for backward compatibility + if (file.peek() != '\n' && file.peek() != EOF) { + file >> kind; + } else { + kind = "core"; + } + paramMap[name] = static_cast(idx); + if (kind == "addr") + addrParams.insert(name); + } + } +}; + +} // namespace test_utils + +#endif // AIE_RUNTIME_TEST_LIB_PARAMETER_SCRATCHPAD_H diff --git a/test/Dialect/AIEX/invalid_parameters.mlir b/test/Dialect/AIEX/invalid_parameters.mlir new file mode 100644 index 00000000000..83a84b50b2c --- /dev/null +++ b/test/Dialect/AIEX/invalid_parameters.mlir @@ -0,0 +1,38 @@ +// RUN: aie-opt %s -split-input-file -verify-diagnostics + +// Verify that read_parameter must be inside aie.core. 
+ +aiex.parameter @foo : i32 +aie.device(npu2) { + aie.runtime_sequence() { + // expected-error @+1 {{'aiex.read_parameter' op expects parent op 'aie.core'}} + %x = aiex.read_parameter @foo : i32 + aie.end + } +} + +// ----- + +// Verify that read_parameter rejects unknown parameter references. + +aie.device(npu2) { + %t = aie.tile(0, 2) + aie.core(%t) { + // expected-error @+1 {{'aiex.read_parameter' op references unknown parameter 'nonexistent'}} + %x = aiex.read_parameter @nonexistent : i32 + aie.end + } +} + +// ----- + +// Verify that sync_parameters_from_host must be inside runtime_sequence. + +aie.device(npu2) { + %t = aie.tile(0, 2) + aie.core(%t) { + // expected-error @+1 {{'aiex.sync_parameters_from_host' op expects parent op 'aie.runtime_sequence'}} + aiex.sync_parameters_from_host + aie.end + } +} diff --git a/test/npu-xrt/scratchpad_addr_offset/aie.mlir b/test/npu-xrt/scratchpad_addr_offset/aie.mlir new file mode 100644 index 00000000000..3264bb9022c --- /dev/null +++ b/test/npu-xrt/scratchpad_addr_offset/aie.mlir @@ -0,0 +1,92 @@ +// (c) Copyright 2025 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Test: DMA address offset patching via offset_parameter. +// +// The host prepares an input buffer with monotonically increasing i32 values +// [0, 1, 2, ..., 31]. The core does a simple passthrough of 8 elements. +// The offset_parameter @input_offset controls where in the input buffer +// the DMA begins reading. +// +// Run 1: input_offset = 0 → output = [0, 1, 2, 3, 4, 5, 6, 7] +// Run 2: input_offset = 8 → output = [8, 9, 10, 11, 12, 13, 14, 15] +// Run 3: input_offset = 16 → output = [16, 17, 18, 19, 20, 21, 22, 23] +// +module { + // Runtime parameter: element offset into the input buffer. + // aiex.parameter ops are declared at module scope (global across devices). 
+ aiex.parameter @input_offset : i32 + + aie.device(npu2) @empty { } + + aie.device(npu2) @test { + + %t00 = aie.tile(0, 0) + %t02 = aie.tile(0, 2) + + // Lock to gate the core until parameters are loaded + %sync_lock = aie.lock(%t02, 0) {init = 0 : i32, sym_name = "sync_lock"} + + // ObjectFIFOs + aie.objectfifo @objfifo_in (%t00, {%t02}, 1 : i32) : !aie.objectfifo> + aie.objectfifo @objfifo_out (%t02, {%t00}, 1 : i32) : !aie.objectfifo> + + // Core: passthrough — copy input to output + aie.core(%t02) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + + // Wait for parameters + DMA to be ready + aie.use_lock(%sync_lock, Acquire, 1) + aie.use_lock(%sync_lock, Release, 0) + + %in_view = aie.objectfifo.acquire @objfifo_in (Consume, 1) : !aie.objectfifosubview> + %in_buf = aie.objectfifo.subview.access %in_view[0] : !aie.objectfifosubview> -> memref<8xi32> + + %out_view = aie.objectfifo.acquire @objfifo_out (Produce, 1) : !aie.objectfifosubview> + %out_buf = aie.objectfifo.subview.access %out_view[0] : !aie.objectfifosubview> -> memref<8xi32> + + scf.for %i = %c0 to %c8 step %c1 { + %v = memref.load %in_buf[%i] : memref<8xi32> + memref.store %v, %out_buf[%i] : memref<8xi32> + } + + aie.objectfifo.release @objfifo_in (Consume, 1) + aie.objectfifo.release @objfifo_out (Produce, 1) + + aie.end + } + + // Runtime sequence + aie.runtime_sequence @sequence(%in : memref<32xi32>, %out : memref<8xi32>) { + + aiex.npu.load_pdi { device_ref = @empty } + aiex.npu.load_pdi { device_ref = @test } + + // Load scratchpad parameters from host + aiex.sync_parameters_from_host + + // Unblock core + aiex.set_lock(%sync_lock, 1) + + // Input DMA — offset_parameter patches the BD address at runtime + %t_in = aiex.dma_configure_task_for @objfifo_in { + aie.dma_bd(%in : memref<32xi32>, 0, 8) {offset_parameter = @input_offset} + aie.end + } + + // Output DMA + %t_out = aiex.dma_configure_task_for @objfifo_out { + aie.dma_bd(%out : 
memref<8xi32>, 0, 8) + aie.end + } {issue_token = true} + + aiex.dma_start_task(%t_in) + aiex.dma_start_task(%t_out) + aiex.dma_await_task(%t_out) + + aiex.set_lock(%sync_lock, 0) + } + } +} diff --git a/test/npu-xrt/scratchpad_addr_offset/test.py b/test/npu-xrt/scratchpad_addr_offset/test.py new file mode 100644 index 00000000000..adad6590e51 --- /dev/null +++ b/test/npu-xrt/scratchpad_addr_offset/test.py @@ -0,0 +1,89 @@ +# (c) Copyright 2025 Advanced Micro Devices, Inc. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Test for DMA address offset patching via offset_parameter. +# +# REQUIRES: ryzen_ai_npu2, peano, xrt_python_bindings +# +# RUN: aiecc.py -v --generate-full-elf --no-xchesscc --no-xbridge --dynamic-objFifos %S/aie.mlir +# RUN: cp aie.mlir.prj/params.txt . +# RUN: %run_on_npu2% %python %s +# +# Setup: +# - Input buffer: 32 i32 values [0, 1, 2, ..., 31] +# - Core: passthrough of 8 elements +# - offset_parameter @input_offset controls the DMA read start position +# +# We run three times with different offsets and verify the output each time. 
+ +import struct +import sys + +import numpy as np +import pyxrt + +from aie.utils.parameter_scratchpad import ParameterScratchpad + + +def main(): + N_INPUT = 32 + N_OUTPUT = 8 + + device = pyxrt.device(0) + elf = pyxrt.elf("aie.elf") + context = pyxrt.hw_context(device, elf) + kernel = pyxrt.ext.kernel(context, "test:sequence") + + # Input buffer: [0, 1, 2, ..., 31] as i32 + input_data = np.arange(N_INPUT, dtype=np.int32) + bo_in = pyxrt.ext.bo(device, N_INPUT * 4) + bo_in.write(input_data.tobytes(), 0) + bo_in.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + + # Output buffer: 8 x i32 + bo_out = pyxrt.ext.bo(device, N_OUTPUT * 4) + + run = pyxrt.run(kernel) + run.set_arg(0, bo_in) + run.set_arg(1, bo_out) + + params = ParameterScratchpad(run, "params.txt") + + test_cases = [ + (0, list(range(0, 8))), + (8, list(range(8, 16))), + (16, list(range(16, 24))), + ] + + all_pass = True + for run_idx, (offset, expected) in enumerate(test_cases, 1): + # Clear output + bo_out.write(bytes(N_OUTPUT * 4), 0) + bo_out.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + + # Write offset parameter (in elements) + params.write("input_offset", np.int32(offset)) + params.sync() + + run.start() + run.wait2() + + bo_out.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) + mv = bo_out.map() + result = np.frombuffer(bytes(mv[:N_OUTPUT * 4]), dtype=np.int32).tolist() + + status = "PASS" if result == expected else "FAIL" + if result != expected: + all_pass = False + print(f"Run {run_idx} — offset={offset:2d} expected={expected} got={result} {status}") + + if all_pass: + print("PASS!") + return 0 + else: + print("FAIL.") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/npu-xrt/scratchpad_addr_offset_python/aie_design.py b/test/npu-xrt/scratchpad_addr_offset_python/aie_design.py new file mode 100644 index 00000000000..b637408fe72 --- /dev/null +++ b/test/npu-xrt/scratchpad_addr_offset_python/aie_design.py @@ -0,0 +1,94 @@ +# (c) 
Copyright 2025 Advanced Micro Devices, Inc. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# IRON design: DMA address offset patching via offset_parameter. +# +# REQUIRES: dont_run +# RUN: echo +# +# The host prepares an input buffer with monotonically increasing i32 values +# [0, 1, 2, ..., 31]. The core does a simple passthrough of 8 elements. +# The offset_parameter @input_offset controls where in the input buffer +# the DMA begins reading. +# +# Run 1: input_offset = 0 -> output = [0, 1, 2, 3, 4, 5, 6, 7] +# Run 2: input_offset = 8 -> output = [8, 9, 10, 11, 12, 13, 14, 15] +# Run 3: input_offset = 16 -> output = [16, 17, 18, 19, 20, 21, 22, 23] +# +# Usage: +# python3 aie_design.py > aie.mlir + +import numpy as np + +from aie.iron import ObjectFifo, Program, Runtime, Worker, WorkerRuntimeBarrier +from aie.iron.device import NPU2Col1 +from aie.iron.parameter import Parameter +from aie.dialects.aiex import npu_load_pdi +from aie.helpers.taplib import TensorAccessPattern + + +def design(): + device_name = "test" + + # Types: input is 32 x i32 (full buffer), but each transfer is 8 elements + in_ty = np.ndarray[(32,), np.dtype[np.int32]] + out_ty = np.ndarray[(8,), np.dtype[np.int32]] + tile_ty = np.ndarray[(8,), np.dtype[np.int32]] + + # Parameter: element offset into the input buffer + input_offset = Parameter("input_offset", np.int32) + + # ObjectFIFOs + of_in = ObjectFifo(tile_ty, name="objfifo_in") + of_out = ObjectFifo(tile_ty, name="objfifo_out") + + # Barrier to gate the core until parameters are loaded + barrier = WorkerRuntimeBarrier() + + # Core function: passthrough — copy input to output + def core_fn(of_in, of_out, barrier): + barrier.wait_for_value(1) + barrier.release_with_value(0) + + in_elem = of_in.acquire(1) + out_elem = of_out.acquire(1) + for i in range(8): + out_elem[i] = in_elem[i] + of_in.release(1) + of_out.release(1) + + worker = Worker( + core_fn, + [of_in.cons(), of_out.prod(), barrier], + while_true=False, + ) + + # 
Runtime sequence + rt = Runtime() + with rt.sequence(in_ty, out_ty) as (in_tensor, out_tensor): + rt.inline_ops(lambda: npu_load_pdi(device_ref="empty"), []) + rt.inline_ops(lambda: npu_load_pdi(device_ref=device_name), []) + rt.sync_parameters() + rt.set_barrier(barrier, 1) + rt.start(worker) + + # Input DMA — offset_parameter patches the BD address at runtime + in_tap = TensorAccessPattern((32,), offset=0, sizes=[1, 1, 1, 8], strides=[0, 0, 0, 1]) + rt.fill(of_in.prod(), in_tensor, tap=in_tap, offset_parameter=input_offset) + + # Output DMA + rt.drain(of_out.cons(), out_tensor, wait=True) + + rt.set_barrier(barrier, 0) + + module = Program(NPU2Col1(), rt).resolve_program(device_name=device_name) + + # Insert empty device to force PDI reload + mlir_text = str(module) + empty_device = ' aie.device(npu2) @empty { }\n' + mlir_text = mlir_text.replace('module {\n', 'module {\n' + empty_device, 1) + return mlir_text + + +mlir_text = design() +print(mlir_text) diff --git a/test/npu-xrt/scratchpad_addr_offset_python/test.py b/test/npu-xrt/scratchpad_addr_offset_python/test.py new file mode 100644 index 00000000000..44a927cbd61 --- /dev/null +++ b/test/npu-xrt/scratchpad_addr_offset_python/test.py @@ -0,0 +1,89 @@ +# (c) Copyright 2025 Advanced Micro Devices, Inc. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Test for DMA address offset patching via offset_parameter (IRON flow). +# +# REQUIRES: ryzen_ai_npu2, peano, xrt_python_bindings +# +# RUN: %python %S/aie_design.py > aie.mlir +# RUN: aiecc.py -v --generate-full-elf --no-xchesscc --no-xbridge --dynamic-objFifos aie.mlir +# RUN: cp aie.mlir.prj/params.txt . +# RUN: %run_on_npu2% %python %s +# +# Setup: +# - Input buffer: 32 i32 values [0, 1, 2, ..., 31] +# - Core: passthrough of 8 elements +# - offset_parameter @input_offset controls the DMA read start position +# +# We run three times with different offsets and verify the output each time. 
+ +import sys + +import numpy as np +import pyxrt + +from aie.utils.parameter_scratchpad import ParameterScratchpad + + +def main(): + N_INPUT = 32 + N_OUTPUT = 8 + + device = pyxrt.device(0) + elf = pyxrt.elf("aie.elf") + context = pyxrt.hw_context(device, elf) + kernel = pyxrt.ext.kernel(context, "test:sequence") + + # Input buffer: [0, 1, 2, ..., 31] as i32 + input_data = np.arange(N_INPUT, dtype=np.int32) + bo_in = pyxrt.ext.bo(device, N_INPUT * 4) + bo_in.write(input_data.tobytes(), 0) + bo_in.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + + # Output buffer: 8 x i32 + bo_out = pyxrt.ext.bo(device, N_OUTPUT * 4) + + run = pyxrt.run(kernel) + run.set_arg(0, bo_in) + run.set_arg(1, bo_out) + + params = ParameterScratchpad(run, "params.txt") + + test_cases = [ + (0, list(range(0, 8))), + (8, list(range(8, 16))), + (16, list(range(16, 24))), + ] + + all_pass = True + for run_idx, (offset, expected) in enumerate(test_cases, 1): + # Clear output + bo_out.write(bytes(N_OUTPUT * 4), 0) + bo_out.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + + # Write offset parameter (in elements) + params.write("input_offset", np.int32(offset)) + params.sync() + + run.start() + run.wait2() + + bo_out.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) + mv = bo_out.map() + result = np.frombuffer(bytes(mv[:N_OUTPUT * 4]), dtype=np.int32).tolist() + + status = "PASS" if result == expected else "FAIL" + if result != expected: + all_pass = False + print(f"Run {run_idx} — offset={offset:2d} expected={expected} got={result} {status}") + + if all_pass: + print("PASS!") + return 0 + else: + print("FAIL.") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/test/npu-xrt/scratchpad_params/aie.mlir b/test/npu-xrt/scratchpad_params/aie.mlir new file mode 100644 index 00000000000..5d66852ec9f --- /dev/null +++ b/test/npu-xrt/scratchpad_params/aie.mlir @@ -0,0 +1,74 @@ +// (c) Copyright 2025 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +module { + // Parameters (declared at module scope; global across all devices) + aiex.parameter @foo : bf16 + aiex.parameter @bar : bf16 + + // Empty device needed to force load_pdi reconfiguration + aie.device(npu2) @empty { } + + // Actual test configuration PDI + aie.device(npu2) @test { + + %t00 = aie.tile(0, 0) + %t02 = aie.tile(0, 2) + + // Parameter sync + %sync_lock = aie.lock(%t02, 0) {init = 0 : i32, sym_name = "sync_lock"} + + // Output ObjectFIFO + aie.objectfifo @objfifo_out (%t02, {%t00}, 1 : i32) : !aie.objectfifo> + + // Core + aie.core(%t02) { + %c0 = arith.constant 0 : index + + // Block until run-time parameters are ready + aie.use_lock(%sync_lock, Acquire, 1) + + // Read the bf16 parameters written by UPDATE_REG + %foo = aiex.read_parameter @foo : bf16 + %bar = aiex.read_parameter @bar : bf16 + + // Release lock + aie.use_lock(%sync_lock, Release, 0) + + // Calculate result: foo * bar in bf16 + %val_bf16 = arith.mulf %foo, %bar : bf16 + + %out_view = aie.objectfifo.acquire @objfifo_out (Produce, 1) : !aie.objectfifosubview> + %out_buf = aie.objectfifo.subview.access %out_view[0] : !aie.objectfifosubview> -> memref<2xbf16> + memref.store %val_bf16, %out_buf[%c0] : memref<2xbf16> + aie.objectfifo.release @objfifo_out (Produce, 1) + + aie.end + } + + // Runtime sequence: + aie.runtime_sequence @sequence(%out : memref<2xbf16>) { + + aiex.npu.load_pdi { device_ref = @empty } + aiex.npu.load_pdi { device_ref = @test } + + // Load values of parameters from host DDR and write to each core's parameter buffers + aiex.sync_parameters_from_host + + // Unblock the core (lock was init=0, now set to 1) + aiex.set_lock(%sync_lock, 1) + + // Configure output DMA + %t_out = aiex.dma_configure_task_for @objfifo_out { + aie.dma_bd(%out : memref<2xbf16>, 0, 2) + aie.end + } {issue_token = true} + + aiex.dma_start_task(%t_out) + aiex.dma_await_task(%t_out) + + aiex.set_lock(%sync_lock, 0) + } + + } +} 
diff --git a/test/npu-xrt/scratchpad_params/run.lit b/test/npu-xrt/scratchpad_params/run.lit new file mode 100644 index 00000000000..d89a073d39c --- /dev/null +++ b/test/npu-xrt/scratchpad_params/run.lit @@ -0,0 +1,9 @@ +// (c) Copyright 2025 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai_npu2, peano +// +// RUN: aiecc -v --generate-full-elf --no-xchesscc --no-xbridge --dynamic-objFifos %S/aie.mlir +// RUN: cp aie.mlir.prj/params.txt . +// RUN: g++-13 -o test %S/test.cpp -std=c++23 -lstdc++ %xrt_flags %test_utils_flags +// RUN: %run_on_npu2% ./test diff --git a/test/npu-xrt/scratchpad_params/test.cpp b/test/npu-xrt/scratchpad_params/test.cpp new file mode 100644 index 00000000000..2f2dd0a064e --- /dev/null +++ b/test/npu-xrt/scratchpad_params/test.cpp @@ -0,0 +1,88 @@ +// (c) Copyright 2025 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Test host application for scratchpad register-write use case. +// +// Demonstrates passing an arbitrary runtime parameter to an AIE core via: +// Host scratchpad → UPDATE_REG → core local buffer → ObjectFIFO → DDR output +// +// Synchronization uses a lock: the runtime sequence sets the lock after +// UPDATE_REG completes, and the core blocks on lock acquire until then. 
+// + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +constexpr std::bfloat16_t FOO_1 = (std::bfloat16_t)3.0; +constexpr std::bfloat16_t BAR_1 = (std::bfloat16_t)4.0; +constexpr std::bfloat16_t FOO_2 = (std::bfloat16_t)2.0; +constexpr std::bfloat16_t BAR_2 = (std::bfloat16_t)5.0; + +int main(int argc, const char *argv[]) { + auto device = xrt::device(0); + + std::string kernelName = "test:sequence"; + xrt::elf ctx_elf{"aie.elf"}; + xrt::hw_context context = xrt::hw_context(device, ctx_elf); + auto kernel = xrt::ext::kernel(context, kernelName); + + xrt::bo bo_out = xrt::ext::bo{device, 2 * sizeof(std::bfloat16_t)}; + auto *buf_out = bo_out.map(); + memset(buf_out, 0, 2 * sizeof(std::bfloat16_t)); + bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + auto run = xrt::run(kernel); + run.set_arg(0, bo_out); + auto params = test_utils::ParameterScratchpad(run, "params.txt"); + + // Run 1: 3.0 * 4.0 = 12.0 + params.write("foo", FOO_1); + params.write("bar", BAR_1); + params.sync(); + + run.start(); + run.wait2(); + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + std::bfloat16_t result1 = buf_out[0]; + std::bfloat16_t expected1 = FOO_1 * BAR_1; + + std::cout << "Run 1 — Expected: " << expected1 << ", Got: " << result1 + << std::endl; + + // Run 2: 2.0 * 5.0 = 10.0 + params.write("foo", FOO_2); + params.write("bar", BAR_2); + params.sync(); + memset(buf_out, 0, 2 * sizeof(std::bfloat16_t)); + bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + run.start(); + run.wait2(); + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + std::bfloat16_t result2 = buf_out[0]; + std::bfloat16_t expected2 = FOO_2 * BAR_2; + + std::cout << "Run 2 — Expected: " << expected2 << ", Got: " << result2 + << std::endl; + + if (result1 == expected1 && result2 == expected2) { + std::cout << "PASS!" << std::endl; + return 0; + } else { + std::cout << "FAIL." 
<< std::endl; + return 1; + } +} diff --git a/test/npu-xrt/scratchpad_params_python/aie_design.py b/test/npu-xrt/scratchpad_params_python/aie_design.py new file mode 100644 index 00000000000..7ce7b0e58ae --- /dev/null +++ b/test/npu-xrt/scratchpad_params_python/aie_design.py @@ -0,0 +1,85 @@ +# (c) Copyright 2025 Advanced Micro Devices, Inc. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# IRON design: generates the MLIR for the scratchpad parameter test. +# +# REQUIRES: dont_run +# RUN: echo +# +# Core computes: output = foo * bar (bf16 parameters set at runtime). +# +# Usage: +# python3 aie_design.py > aie.mlir + +import numpy as np +from ml_dtypes import bfloat16 + +from aie.iron import ObjectFifo, Program, Runtime, Worker, WorkerRuntimeBarrier +from aie.iron.device import NPU2Col1 +from aie.iron.parameter import Parameter +from aie.dialects.aiex import npu_load_pdi +from aie.dialects.arith import ConstantOp, mulf +from aie.dialects.memref import store +from aie.ir import IndexType, IntegerAttr + + +def design(): + device_name = "test" + + # Output type: 2 x bf16 + # Note: We only calculate 1 bf16 value, but DMAs operate at 4-byte granularity so this is the smallest possible size + out_ty = np.ndarray[(2,), np.dtype[bfloat16]] + + # Parameters + foo = Parameter("foo", bfloat16) + bar = Parameter("bar", bfloat16) + + # ObjectFIFO for output + of_out = ObjectFifo(out_ty, name="objfifo_out") + + # Barrier to gate the core until parameters are loaded + barrier = WorkerRuntimeBarrier() + + # Core function: read parameters, multiply, write to output + def core_fn(of_out, foo, bar, barrier): + barrier.wait_for_value(1) + val_foo = foo.read() + val_bar = bar.read() + barrier.release_with_value(0) + + elem = of_out.acquire(1) + result = val_foo * val_bar + elem[0] = result + of_out.release(1) + + worker = Worker( + core_fn, + [of_out.prod(), foo, bar, barrier], + while_true=False, + ) + + # Runtime sequence: load empty device first to force PDI 
reconfiguration + rt = Runtime() + with rt.sequence(out_ty) as out_tensor: + rt.inline_ops(lambda: npu_load_pdi(device_ref="empty"), []) + rt.inline_ops(lambda: npu_load_pdi(device_ref=device_name), []) + rt.sync_parameters() + rt.set_barrier(barrier, 1) + rt.start(worker) + rt.drain(of_out.cons(), out_tensor, wait=True) + rt.set_barrier(barrier, 0) + + module = Program(NPU2Col1(), rt).resolve_program(device_name=device_name) + + # Insert empty device at the beginning of the module to force PDI reload. + # The firmware skips reloading a PDI if it's the same as the last one loaded, + # so we force a different (empty) PDI first. + # FIXME: Replace this with the proper IRON abstraction for multiple devices when it becomes available. + mlir_text = str(module) + empty_device = ' aie.device(npu2) @empty { }\n' + mlir_text = mlir_text.replace('module {\n', 'module {\n' + empty_device, 1) + return mlir_text + + +mlir_text = design() +print(mlir_text) diff --git a/test/npu-xrt/scratchpad_params_python/test.py b/test/npu-xrt/scratchpad_params_python/test.py new file mode 100644 index 00000000000..78c45f9aab2 --- /dev/null +++ b/test/npu-xrt/scratchpad_params_python/test.py @@ -0,0 +1,94 @@ +# (c) Copyright 2025 Advanced Micro Devices, Inc. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Python test for scratchpad parameter passing using ParameterScratchpad. +# +# REQUIRES: ryzen_ai_npu2, peano, xrt_python_bindings +# +# RUN: %python %S/aie_design.py > aie.mlir +# RUN: aiecc.py -v --generate-full-elf --no-xchesscc --no-xbridge --dynamic-objFifos aie.mlir +# RUN: cp aie.mlir.prj/params.txt . +# RUN: %run_on_npu2% %python %s +# +# This is the Python equivalent of the C++ test in ../scratchpad_params/. +# It exercises the full flow: +# 1. aiecc.py compiles aie.mlir → aie.elf + params.txt +# 2. This script loads the ELF, creates a ParameterScratchpad from params.txt, +# writes bf16 parameters, and verifies the core computes foo * bar. +# 3. 
A second run with different values tests delta encoding. + +import struct +import sys + +import pyxrt +from ml_dtypes import bfloat16 + +from aie.utils.parameter_scratchpad import ParameterScratchpad + + +def read_bf16(bo, offset): + """Read a bfloat16 value from a buffer object at the given byte offset.""" + mv = bo.map() + raw = bytes(mv[offset : offset + 2]) + # Reconstruct float32 from bfloat16 (upper 16 bits of float32) + f32_bytes = b"\x00\x00" + raw + return struct.unpack(" paramsPath(tmpDirName); + sys::path::append(paramsPath, "params.txt"); + paramOpts.outputParamsFile = paramsPath.str().str(); + } + pm.addPass( + xilinx::AIEX::createAIELowerParametersPass(std::move(paramOpts))); + } + + // Resume device-level passes after the module-level parameter lowering. + OpPassManager &devicePm2 = pm.nest(); + // Create buffer address assignment pass with alloc-scheme option xilinx::AIE::AIEAssignBufferAddressesOptions bufferOpts; bufferOpts.clAllocScheme = allocScheme.getValue(); - devicePm.addPass(xilinx::AIE::createAIEAssignBufferAddressesPass(bufferOpts)); + devicePm2.addPass( + xilinx::AIE::createAIEAssignBufferAddressesPass(bufferOpts)); // Infer per-core link_files from func-level link_with attributes - devicePm.addPass(xilinx::AIE::createAIEAssignCoreLinkFilesPass()); + devicePm2.addPass(xilinx::AIE::createAIEAssignCoreLinkFilesPass()); - devicePm.addPass(xilinx::AIE::createAIEVectorTransferLoweringPass()); + devicePm2.addPass(xilinx::AIE::createAIEVectorTransferLoweringPass()); // Step 5: Convert SCF to CF (module-level pass) pm.addPass(createSCFToControlFlowPass()); From 4957b4e125ef3167997a0247d306784028d45621 Mon Sep 17 00:00:00 2001 From: andrej Date: Tue, 12 May 2026 16:06:45 -0600 Subject: [PATCH 2/7] format --- include/aie/Dialect/AIE/IR/AIEDialect.h | 3 +-- include/aie/Dialect/AIEX/AIEUtils.h | 8 ++++---- lib/Dialect/AIE/IR/AIEDialect.cpp | 6 ++---- .../AIEX/Transforms/AIELowerParameters.cpp | 9 ++++----- lib/Dialect/AIEX/Utils/AIEUtils.cpp | 3 +-- 
python/ParameterScratchpadModule.cpp | 15 ++++++++------- test/npu-xrt/scratchpad_addr_offset/test.py | 6 ++++-- .../scratchpad_addr_offset_python/aie_design.py | 8 +++++--- .../npu-xrt/scratchpad_addr_offset_python/test.py | 6 ++++-- .../scratchpad_params_python/aie_design.py | 4 ++-- 10 files changed, 35 insertions(+), 33 deletions(-) diff --git a/include/aie/Dialect/AIE/IR/AIEDialect.h b/include/aie/Dialect/AIE/IR/AIEDialect.h index 9c1e3f78c1f..b0a9e038caf 100644 --- a/include/aie/Dialect/AIE/IR/AIEDialect.h +++ b/include/aie/Dialect/AIE/IR/AIEDialect.h @@ -67,8 +67,7 @@ uint32_t getShimBurstLengthEncoding(const AIE::AIETargetModel &tm, // a symbol with that name before the next call (otherwise the same name will // be returned again). std::string generateUniqueSymbolName(mlir::Operation *symbolTableOp, - llvm::StringRef prefix, - unsigned &counter); + llvm::StringRef prefix, unsigned &counter); mlir::LogicalResult verifyOffsetSizeAndStrideOp(mlir::OffsetSizeAndStrideOpInterface op); diff --git a/include/aie/Dialect/AIEX/AIEUtils.h b/include/aie/Dialect/AIEX/AIEUtils.h index 7a38a344e54..a4c8e7ff53f 100644 --- a/include/aie/Dialect/AIEX/AIEUtils.h +++ b/include/aie/Dialect/AIEX/AIEUtils.h @@ -46,9 +46,9 @@ std::optional traceSubviewToBlockArgument(Value value); // an `aiex.parameter`) and `offset_state_table_idx` (IntegerAttr, set by // `--aie-lower-parameters`) attributes. The referenced parameter must have // type `i32`. 
-LogicalResult -emitUpdateBdAddressFromOffsetParameter(OpBuilder &builder, Operation *bdOp, - BaseMemRefType bufType, - uint64_t registerAddr); +LogicalResult emitUpdateBdAddressFromOffsetParameter(OpBuilder &builder, + Operation *bdOp, + BaseMemRefType bufType, + uint64_t registerAddr); } } // namespace xilinx \ No newline at end of file diff --git a/lib/Dialect/AIE/IR/AIEDialect.cpp b/lib/Dialect/AIE/IR/AIEDialect.cpp index f5b8b5454e0..ca2add20055 100644 --- a/lib/Dialect/AIE/IR/AIEDialect.cpp +++ b/lib/Dialect/AIE/IR/AIEDialect.cpp @@ -127,10 +127,8 @@ uint32_t xilinx::AIE::getShimBurstLengthEncoding(const AIE::AIETargetModel &tm, return getShimBurstLength(tm, burstLength).first; } -std::string -xilinx::AIE::generateUniqueSymbolName(mlir::Operation *symbolTableOp, - llvm::StringRef prefix, - unsigned &counter) { +std::string xilinx::AIE::generateUniqueSymbolName( + mlir::Operation *symbolTableOp, llvm::StringRef prefix, unsigned &counter) { std::string name; do { name = (prefix + llvm::Twine(counter++)).str(); diff --git a/lib/Dialect/AIEX/Transforms/AIELowerParameters.cpp b/lib/Dialect/AIEX/Transforms/AIELowerParameters.cpp index 03dd6a02d4a..6327091b809 100644 --- a/lib/Dialect/AIEX/Transforms/AIELowerParameters.cpp +++ b/lib/Dialect/AIEX/Transforms/AIELowerParameters.cpp @@ -157,8 +157,8 @@ struct AIELowerParametersPass builder.setInsertionPoint(syncOp); Location loc = syncOp.getLoc(); - NpuCreateScratchpadOp::create( - builder, loc, static_cast(scratchpadSlots * 4)); + NpuCreateScratchpadOp::create(builder, loc, + static_cast(scratchpadSlots * 4)); for (auto &[stateIdx, bufRef] : paramEntries) { NpuUpdateFromScratchpadOp::create( @@ -231,9 +231,8 @@ struct AIELowerParametersPass // emit an error. 
DenseMap usedAsCore; DenseMap usedAsAddr; - module.walk([&](ReadParameterOp op) { - usedAsCore[op.getParameter()] = true; - }); + module.walk( + [&](ReadParameterOp op) { usedAsCore[op.getParameter()] = true; }); auto markAddr = [&](Operation *op, FlatSymbolRefAttr ref) { if (ref) usedAsAddr[ref.getValue()] = true; diff --git a/lib/Dialect/AIEX/Utils/AIEUtils.cpp b/lib/Dialect/AIEX/Utils/AIEUtils.cpp index 8e5a1bf40fd..fd513ca2bdb 100644 --- a/lib/Dialect/AIEX/Utils/AIEUtils.cpp +++ b/lib/Dialect/AIEX/Utils/AIEUtils.cpp @@ -174,8 +174,7 @@ LogicalResult AIEX::emitUpdateBdAddressFromOffsetParameter( return err; } - uint8_t stateIdx = - static_cast(paramOp.getStateTableIdx().value()); + uint8_t stateIdx = static_cast(paramOp.getStateTableIdx().value()); uint32_t elemBytes = bufType.getElementTypeBitWidth() / 8; // Use func=mul with func_arg=elemBytes so the firmware computes // StateTable[idx] * elemBytes = byte offset, added into the BD address diff --git a/python/ParameterScratchpadModule.cpp b/python/ParameterScratchpadModule.cpp index aa49542735b..0500177b1ed 100644 --- a/python/ParameterScratchpadModule.cpp +++ b/python/ParameterScratchpadModule.cpp @@ -26,12 +26,13 @@ PYBIND11_MODULE(_parameter_scratchpad, m) { }), py::arg("buffer"), py::arg("params_path"), py::keep_alive<1, 2>()) // prevent GC of buffer while alive - .def("write_bytes", - [](test_utils::ParameterScratchpad &self, const std::string &name, - py::bytes data) { - std::string s = data; - self.writeBytes(name, s.data(), s.size()); - }, - py::arg("name"), py::arg("data")) + .def( + "write_bytes", + [](test_utils::ParameterScratchpad &self, const std::string &name, + py::bytes data) { + std::string s = data; + self.writeBytes(name, s.data(), s.size()); + }, + py::arg("name"), py::arg("data")) .def("read", &test_utils::ParameterScratchpad::read, py::arg("name")); } diff --git a/test/npu-xrt/scratchpad_addr_offset/test.py b/test/npu-xrt/scratchpad_addr_offset/test.py index adad6590e51..6f89a2e9c71 
100644 --- a/test/npu-xrt/scratchpad_addr_offset/test.py +++ b/test/npu-xrt/scratchpad_addr_offset/test.py @@ -70,12 +70,14 @@ def main(): bo_out.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) mv = bo_out.map() - result = np.frombuffer(bytes(mv[:N_OUTPUT * 4]), dtype=np.int32).tolist() + result = np.frombuffer(bytes(mv[: N_OUTPUT * 4]), dtype=np.int32).tolist() status = "PASS" if result == expected else "FAIL" if result != expected: all_pass = False - print(f"Run {run_idx} — offset={offset:2d} expected={expected} got={result} {status}") + print( + f"Run {run_idx} — offset={offset:2d} expected={expected} got={result} {status}" + ) if all_pass: print("PASS!") diff --git a/test/npu-xrt/scratchpad_addr_offset_python/aie_design.py b/test/npu-xrt/scratchpad_addr_offset_python/aie_design.py index b637408fe72..69c1cee84e1 100644 --- a/test/npu-xrt/scratchpad_addr_offset_python/aie_design.py +++ b/test/npu-xrt/scratchpad_addr_offset_python/aie_design.py @@ -73,7 +73,9 @@ def core_fn(of_in, of_out, barrier): rt.start(worker) # Input DMA — offset_parameter patches the BD address at runtime - in_tap = TensorAccessPattern((32,), offset=0, sizes=[1, 1, 1, 8], strides=[0, 0, 0, 1]) + in_tap = TensorAccessPattern( + (32,), offset=0, sizes=[1, 1, 1, 8], strides=[0, 0, 0, 1] + ) rt.fill(of_in.prod(), in_tensor, tap=in_tap, offset_parameter=input_offset) # Output DMA @@ -85,8 +87,8 @@ def core_fn(of_in, of_out, barrier): # Insert empty device to force PDI reload mlir_text = str(module) - empty_device = ' aie.device(npu2) @empty { }\n' - mlir_text = mlir_text.replace('module {\n', 'module {\n' + empty_device, 1) + empty_device = " aie.device(npu2) @empty { }\n" + mlir_text = mlir_text.replace("module {\n", "module {\n" + empty_device, 1) return mlir_text diff --git a/test/npu-xrt/scratchpad_addr_offset_python/test.py b/test/npu-xrt/scratchpad_addr_offset_python/test.py index 44a927cbd61..cf8cdaaa6c8 100644 --- a/test/npu-xrt/scratchpad_addr_offset_python/test.py +++ 
b/test/npu-xrt/scratchpad_addr_offset_python/test.py @@ -70,12 +70,14 @@ def main(): bo_out.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) mv = bo_out.map() - result = np.frombuffer(bytes(mv[:N_OUTPUT * 4]), dtype=np.int32).tolist() + result = np.frombuffer(bytes(mv[: N_OUTPUT * 4]), dtype=np.int32).tolist() status = "PASS" if result == expected else "FAIL" if result != expected: all_pass = False - print(f"Run {run_idx} — offset={offset:2d} expected={expected} got={result} {status}") + print( + f"Run {run_idx} — offset={offset:2d} expected={expected} got={result} {status}" + ) if all_pass: print("PASS!") diff --git a/test/npu-xrt/scratchpad_params_python/aie_design.py b/test/npu-xrt/scratchpad_params_python/aie_design.py index 7ce7b0e58ae..832f2c49787 100644 --- a/test/npu-xrt/scratchpad_params_python/aie_design.py +++ b/test/npu-xrt/scratchpad_params_python/aie_design.py @@ -76,8 +76,8 @@ def core_fn(of_out, foo, bar, barrier): # so we force a different (empty) PDI first. # FIXME: Replace this with the proper IRON abstraction for multiple devices when it becomes available. 
mlir_text = str(module) - empty_device = ' aie.device(npu2) @empty { }\n' - mlir_text = mlir_text.replace('module {\n', 'module {\n' + empty_device, 1) + empty_device = " aie.device(npu2) @empty { }\n" + mlir_text = mlir_text.replace("module {\n", "module {\n" + empty_device, 1) return mlir_text From 5c73936697431d166342cffd6205a4d2d6eecabd Mon Sep 17 00:00:00 2001 From: andrej Date: Wed, 13 May 2026 14:33:41 -0600 Subject: [PATCH 3/7] fix test --- test/{Dialect => dialect}/AIEX/invalid_parameters.mlir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename test/{Dialect => dialect}/AIEX/invalid_parameters.mlir (91%) diff --git a/test/Dialect/AIEX/invalid_parameters.mlir b/test/dialect/AIEX/invalid_parameters.mlir similarity index 91% rename from test/Dialect/AIEX/invalid_parameters.mlir rename to test/dialect/AIEX/invalid_parameters.mlir index 83a84b50b2c..330540e1478 100644 --- a/test/Dialect/AIEX/invalid_parameters.mlir +++ b/test/dialect/AIEX/invalid_parameters.mlir @@ -5,7 +5,7 @@ aiex.parameter @foo : i32 aie.device(npu2) { aie.runtime_sequence() { - // expected-error @+1 {{'aiex.read_parameter' op expects parent op 'aie.core'}} + // expected-error @+1 {{'aiex.read_parameter' op must be inside an aie.core}} %x = aiex.read_parameter @foo : i32 aie.end } From 7c6f79c463702761f5d500ecbd7184a1d0b96dec Mon Sep 17 00:00:00 2001 From: andrej Date: Wed, 13 May 2026 14:39:42 -0600 Subject: [PATCH 4/7] format --- python/iron/runtime/runtime.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/iron/runtime/runtime.py b/python/iron/runtime/runtime.py index 403e99f4afc..7829f2c1780 100644 --- a/python/iron/runtime/runtime.py +++ b/python/iron/runtime/runtime.py @@ -186,7 +186,9 @@ def fill( in_fifo.endpoint = rt_endpoint self._fifos.add(in_fifo) - self._tasks.append(DMATask(in_fifo, source, tap, task_group, wait, offset_param_name)) + self._tasks.append( + DMATask(in_fifo, source, tap, task_group, wait, offset_param_name) + ) def drain( 
self, @@ -234,7 +236,9 @@ def drain( out_fifo.endpoint = rt_endpoint self._fifos.add(out_fifo) - self._tasks.append(DMATask(out_fifo, dest, tap, task_group, wait, offset_param_name)) + self._tasks.append( + DMATask(out_fifo, dest, tap, task_group, wait, offset_param_name) + ) def start(self, *args: Worker): """A placeholder operation to indicate that one or more Worker should be started on the device. From 8ce8576ea2fc236b14eb5bb9628bf730040b4067 Mon Sep 17 00:00:00 2001 From: andrej Date: Thu, 14 May 2026 09:45:13 -0600 Subject: [PATCH 5/7] Workaround if `TEST_UTILS_USE_XRT` flag is missing --- runtime_lib/test_lib/parameter_scratchpad.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/runtime_lib/test_lib/parameter_scratchpad.h b/runtime_lib/test_lib/parameter_scratchpad.h index 93df1954093..b06880f66b6 100644 --- a/runtime_lib/test_lib/parameter_scratchpad.h +++ b/runtime_lib/test_lib/parameter_scratchpad.h @@ -31,6 +31,12 @@ #include #include +#if !defined(TEST_UTILS_USE_XRT) && defined(__has_include) +#if __has_include() && __has_include() +#define TEST_UTILS_USE_XRT 1 +#endif +#endif + #ifdef TEST_UTILS_USE_XRT #include #include From bd7cab78653799adb5abd2a8bba763291e502123 Mon Sep 17 00:00:00 2001 From: andrej Date: Thu, 14 May 2026 11:34:42 -0600 Subject: [PATCH 6/7] make addr_offset test use C++ host code --- test/npu-xrt/scratchpad_addr_offset/run.lit | 9 ++ test/npu-xrt/scratchpad_addr_offset/test.cpp | 107 +++++++++++++++++++ test/npu-xrt/scratchpad_addr_offset/test.py | 91 ---------------- 3 files changed, 116 insertions(+), 91 deletions(-) create mode 100644 test/npu-xrt/scratchpad_addr_offset/run.lit create mode 100644 test/npu-xrt/scratchpad_addr_offset/test.cpp delete mode 100644 test/npu-xrt/scratchpad_addr_offset/test.py diff --git a/test/npu-xrt/scratchpad_addr_offset/run.lit b/test/npu-xrt/scratchpad_addr_offset/run.lit new file mode 100644 index 00000000000..d89a073d39c --- /dev/null +++ b/test/npu-xrt/scratchpad_addr_offset/run.lit @@ 
-0,0 +1,9 @@ +// (c) Copyright 2025 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai_npu2, peano +// +// RUN: aiecc -v --generate-full-elf --no-xchesscc --no-xbridge --dynamic-objFifos %S/aie.mlir +// RUN: cp aie.mlir.prj/params.txt . +// RUN: g++-13 -o test %S/test.cpp -std=c++23 -lstdc++ %xrt_flags %test_utils_flags +// RUN: %run_on_npu2% ./test diff --git a/test/npu-xrt/scratchpad_addr_offset/test.cpp b/test/npu-xrt/scratchpad_addr_offset/test.cpp new file mode 100644 index 00000000000..7c8abe023e2 --- /dev/null +++ b/test/npu-xrt/scratchpad_addr_offset/test.cpp @@ -0,0 +1,107 @@ +// (c) Copyright 2025 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Test for DMA address offset patching via offset_parameter. +// +// Setup: +// - Input buffer: 32 i32 values [0, 1, 2, ..., 31] +// - Core: passthrough of 8 elements +// - offset_parameter @input_offset controls the DMA read start position +// +// We run three times with different offsets and verify the output each time. 
+// + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +int main(int argc, const char *argv[]) { + constexpr int N_INPUT = 32; + constexpr int N_OUTPUT = 8; + + auto device = xrt::device(0); + + std::string kernelName = "test:sequence"; + xrt::elf ctx_elf{"aie.elf"}; + xrt::hw_context context = xrt::hw_context(device, ctx_elf); + auto kernel = xrt::ext::kernel(context, kernelName); + + // Input buffer: [0, 1, 2, ..., 31] as i32 + xrt::bo bo_in = xrt::ext::bo{device, N_INPUT * sizeof(int32_t)}; + auto *buf_in = bo_in.map(); + for (int i = 0; i < N_INPUT; ++i) + buf_in[i] = i; + bo_in.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // Output buffer: 8 x i32 + xrt::bo bo_out = xrt::ext::bo{device, N_OUTPUT * sizeof(int32_t)}; + auto *buf_out = bo_out.map(); + + auto run = xrt::run(kernel); + run.set_arg(0, bo_in); + run.set_arg(1, bo_out); + + auto params = test_utils::ParameterScratchpad(run, "params.txt"); + + struct TestCase { + int32_t offset; + std::vector expected; + }; + std::vector test_cases = { + {0, {0, 1, 2, 3, 4, 5, 6, 7}}, + {8, {8, 9, 10, 11, 12, 13, 14, 15}}, + {16, {16, 17, 18, 19, 20, 21, 22, 23}}, + }; + + bool all_pass = true; + int run_idx = 0; + for (auto &tc : test_cases) { + ++run_idx; + + // Clear output + memset(buf_out, 0, N_OUTPUT * sizeof(int32_t)); + bo_out.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // Write offset parameter (in elements) + params.write("input_offset", tc.offset); + params.sync(); + + run.start(); + run.wait2(); + + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + std::vector result(buf_out, buf_out + N_OUTPUT); + bool pass = (result == tc.expected); + if (!pass) + all_pass = false; + + std::cout << "Run " << run_idx << " — offset=" << tc.offset + << " expected=["; + for (size_t i = 0; i < tc.expected.size(); ++i) + std::cout << tc.expected[i] << (i + 1 < tc.expected.size() ? 
", " : ""); + std::cout << "] got=["; + for (size_t i = 0; i < result.size(); ++i) + std::cout << result[i] << (i + 1 < result.size() ? ", " : ""); + std::cout << "] " << (pass ? "PASS" : "FAIL") << std::endl; + } + + if (all_pass) { + std::cout << "PASS!" << std::endl; + return 0; + } else { + std::cout << "FAIL." << std::endl; + return 1; + } +} diff --git a/test/npu-xrt/scratchpad_addr_offset/test.py b/test/npu-xrt/scratchpad_addr_offset/test.py deleted file mode 100644 index 6f89a2e9c71..00000000000 --- a/test/npu-xrt/scratchpad_addr_offset/test.py +++ /dev/null @@ -1,91 +0,0 @@ -# (c) Copyright 2025 Advanced Micro Devices, Inc. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# Test for DMA address offset patching via offset_parameter. -# -# REQUIRES: ryzen_ai_npu2, peano, xrt_python_bindings -# -# RUN: aiecc.py -v --generate-full-elf --no-xchesscc --no-xbridge --dynamic-objFifos %S/aie.mlir -# RUN: cp aie.mlir.prj/params.txt . -# RUN: %run_on_npu2% %python %s -# -# Setup: -# - Input buffer: 32 i32 values [0, 1, 2, ..., 31] -# - Core: passthrough of 8 elements -# - offset_parameter @input_offset controls the DMA read start position -# -# We run three times with different offsets and verify the output each time. 
- -import struct -import sys - -import numpy as np -import pyxrt - -from aie.utils.parameter_scratchpad import ParameterScratchpad - - -def main(): - N_INPUT = 32 - N_OUTPUT = 8 - - device = pyxrt.device(0) - elf = pyxrt.elf("aie.elf") - context = pyxrt.hw_context(device, elf) - kernel = pyxrt.ext.kernel(context, "test:sequence") - - # Input buffer: [0, 1, 2, ..., 31] as i32 - input_data = np.arange(N_INPUT, dtype=np.int32) - bo_in = pyxrt.ext.bo(device, N_INPUT * 4) - bo_in.write(input_data.tobytes(), 0) - bo_in.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - - # Output buffer: 8 x i32 - bo_out = pyxrt.ext.bo(device, N_OUTPUT * 4) - - run = pyxrt.run(kernel) - run.set_arg(0, bo_in) - run.set_arg(1, bo_out) - - params = ParameterScratchpad(run, "params.txt") - - test_cases = [ - (0, list(range(0, 8))), - (8, list(range(8, 16))), - (16, list(range(16, 24))), - ] - - all_pass = True - for run_idx, (offset, expected) in enumerate(test_cases, 1): - # Clear output - bo_out.write(bytes(N_OUTPUT * 4), 0) - bo_out.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - - # Write offset parameter (in elements) - params.write("input_offset", np.int32(offset)) - params.sync() - - run.start() - run.wait2() - - bo_out.sync(pyxrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) - mv = bo_out.map() - result = np.frombuffer(bytes(mv[: N_OUTPUT * 4]), dtype=np.int32).tolist() - - status = "PASS" if result == expected else "FAIL" - if result != expected: - all_pass = False - print( - f"Run {run_idx} — offset={offset:2d} expected={expected} got={result} {status}" - ) - - if all_pass: - print("PASS!") - return 0 - else: - print("FAIL.") - return 1 - - -if __name__ == "__main__": - sys.exit(main()) From ecd0e909a48113747071bddea42fad109d1c5f65 Mon Sep 17 00:00:00 2001 From: andrej Date: Mon, 18 May 2026 09:32:06 -0600 Subject: [PATCH 7/7] update XDNA tag --- utils/build_drivers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/build_drivers.sh 
b/utils/build_drivers.sh index 8e4433d6437..5a054b8f1e6 100755 --- a/utils/build_drivers.sh +++ b/utils/build_drivers.sh @@ -93,7 +93,7 @@ fi echo "Setting up XDNA driver repository..." # Clone or update the XDNA driver repository and initialize submodules -XDNA_TAG=beb9e450fe123ecdf395453971576179cedcf1dd +XDNA_TAG=849907e938d0ab3baeca9f6637a69a14217329d2 # (1.7 tag as of 2026/02/17) if [ -d "xdna-driver" ]; then echo "xdna-driver directory already exists. Removing and re-cloning to ensure clean state..."