Skip to content

Commit 1718579

Browse files
committed
Support firmware scratchpad feature
1 parent 9796296 commit 1718579

12 files changed

Lines changed: 485 additions & 1 deletion

File tree

cmake/modulesXilinx

include/aie/Dialect/AIEX/IR/AIEX.td

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
include "aie/Dialect/AIE/IR/AIEAttrs.td"
1515
include "aie/Dialect/AIE/IR/AIEInterfaces.td"
1616

17+
1718
include "mlir/IR/OpBase.td"
1819
include "mlir/IR/AttrTypeBase.td"
1920
include "mlir/IR/EnumAttr.td"
@@ -40,6 +41,13 @@ def AIEX_Dialect : Dialect {
4041
let useDefaultTypePrinterParser = 1;
4142
}
4243

44+
//===----------------------------------------------------------------------===//
45+
// AIEX Attributes
46+
//===----------------------------------------------------------------------===//
47+
48+
include "aie/Dialect/AIEX/IR/AIEXAttrs.td"
49+
50+
4351
//===----------------------------------------------------------------------===//
4452
// AIEX Types
4553
//===----------------------------------------------------------------------===//
@@ -941,6 +949,125 @@ def AIE_NpuPreemptOp: AIEX_Op<"npu.preempt", []> {
941949
}];
942950
}
943951

952+
// XAIE_IO_CREATE_SCRATCHPAD (opcode 10)
//
// Declares the control-code scratchpad region used by a runtime sequence.
// The NPU binary translator (appendCreateScratchpad in
// lib/Targets/AIETargetNPU.cpp) encodes this op as four 32-bit words:
// `usage_type` goes into byte 1 of word 0, `size` into word 1, and words 2-3
// are emitted as zero placeholders for the host address.
//
// NOTE(review): this definition sets no `hasVerifier`, so the limits listed at
// the end of the description (128 bytes total, `usage_type` == 0) are not
// checked by this definition -- confirm whether they are enforced elsewhere.
def AIE_NpuCreateScratchpadOp: AIEX_Op<"npu.create_scratchpad", []> {
  let summary = "Create a control code scratchpad memory region";
  let arguments = (
    ins UI32Attr:$size,
        DefaultValuedOptionalAttr<UI8Attr, "0">:$usage_type
  );
  let results = (outs );
  let assemblyFormat = [{
    attr-dict
  }];
  let description = [{
    Create a scratchpad memory for data exchange between the host's main memory and the NPU command processor and copy its contents to the command processor's memory.

    When the runtime (XRT) observes that this instruction is present in the runtime sequence, it will allocate a scratchpad memory of the specified size on the host.
    When the command processor firmware executes this instruction, it copies the data in the runtime-allocated scratchpad region from the host's main memory to the NPU command processor's memory.
    From there, values from the copy of the scratchpad memory in the command processor can be added to the contents of arbitrary locations in the NPU (with restrictions) via the `npu.update_from_scratchpad` op.

    The flow of data therefore looks like this:
    ```
                   create_scratchpad                               update_from_scratchpad
    [Host memory] -------------------> [Command processor memory] ------------------------> [NPU partition memory/registers]
          ^               |
          |               |
          -----------------
            XRT sees inst.
            and allocates
    ```

    To get a handle on the allocated scratchpad memory from XRT, use the `run.get_ctrl_scratchpad_bo()` method on the `xrt::run` object in your host application (`test.cpp`).
    An example can be found in `test/npu-xrt/scratchpad_regwrite`.

    The `usage_type` attribute specifies the scratchpad layout; currently only a value of `0` is supported.

    The `size` attribute specifies the size of the scratchpad in bytes.

    The host's main memory address is patched into the instruction at runtime by XRT based on the `.ctrl.scratchpad` section in the ELF. The assembler (aiebu) generates patching information for this address when it encounters this opcode.

    The scratchpad memory contains a `StateTable`, which the `npu.update_from_scratchpad` op indexes in units of 32-bit words. `StateTable` constraints: max 32 entries, max total scratchpad size 128 bytes.
  }];
}
993+
994+
// XAIE_IO_UPDATE_REG (opcode 12)
//
// Adds a value derived from the scratchpad `StateTable` to a register pair.
// The NPU binary translator (appendUpdateRegFromScratchpad in
// lib/Targets/AIETargetNPU.cpp) encodes this op as three 32-bit words:
// opcode | `state_table_idx` << 8 | `func` << 16, then `func_arg`, then the
// address returned by getAbsoluteAddress().
def AIE_NpuUpdateFromScratchpadOp: AIEX_Op<"npu.update_from_scratchpad", []> {
  let summary = "Add a computed value to an 8-byte section of NPU memory from scratchpad state";
  let arguments = (
    ins UI8Attr:$state_table_idx,
        DefaultValuedAttr<StateTableFunc, "StateTableFunc::Incr">:$func,
        DefaultValuedOptionalAttr<UI32Attr, "0">:$func_arg,
        UI32Attr:$address,
        OptionalAttr<FlatSymbolRefAttr>:$buffer,
        OptionalAttr<I32Attr>:$column,
        OptionalAttr<I32Attr>:$row
  );
  let results = (outs );
  let assemblyFormat = [{
    (`<` $func^ `>`)? attr-dict
  }];
  let description = [{
    Add a computed value based on scratchpad contents to an 8-byte section at the target address.

    This instruction reads the contents of scratchpad memory created using the `npu.create_scratchpad` op (the `StateTable`), calculates a value, then adds the result to the memory location or register denoted by the given address as follows:

    1. Reads the existing 64-bit value from the register pair:

    ```
    existing = (*(addr+4) & 0xFFFF) << 32 | (*addr & 0xFFFFFFFC)
    ```

    2. Computes a delta based on the contents of the scratchpad memory created using `npu.create_scratchpad` (the `StateTable`) based on the selected function:

    - `mul`: `delta = StateTable[state_table_idx] * func_arg`
    - `incr`: `delta = StateTable[state_table_idx] + func_arg`
    - `decr`: `delta = StateTable[state_table_idx] - func_arg`

    3. Adds the delta to the existing value:

    ```
    result = existing + delta
    ```

    4. Writes back the lower 48 bits of the result:

    ```
    *addr = result & 0xFFFFFFFC (bits [1:0] forced to 0)
    *(addr+4) = (*(addr+4) & 0xFFFF0000) | ((result >> 32) & 0xFFFF)
    ```

    ### Constraints

    - This is always additive. It adds the computed delta to whatever value is already in the register pair. It cannot set an absolute value.
    - The lower 2 bits of the first register are always cleared (4-byte aligned).
    - The upper 16 bits of the second register are preserved unchanged.
    - Always writes 8 contiguous bytes (both registers in the pair).

    ### Rationale

    The firmware instruction underpinning this operation was originally intended to patch shim buffer descriptor addresses only.
    Because of this, this always writes 48 bits (size of BD addresses) and the lower bits are zeroed (ensuring the value is a multiple of the addressable word size).

    ### Attributes

    - `state_table_idx` is a 32-bit-word index offset into the scratchpad memory. The source value will be read from this offset into the scratchpad.
    - `func` is the function applied to the state table value in firmware, which can be one of `mul`, `incr` or `decr`.
    - `func_arg` is the argument to the function.
    - `address`, `buffer`, `column` and `row` together resolve to the destination address whose contents the computed delta is added to.
      Address resolution is the same as for `npu.write32`:
      - If `buffer` is specified, `address` is a word offset relative to that
        buffer's start address.
      - If `column` and `row` are present, `address` is a local offset within
        that tile's address space.
      - Otherwise, `address` is an absolute address into the AIE array.

  }];
  let extraClassDeclaration = [{
    std::optional<uint32_t> getAbsoluteAddress();
  }];
}
1070+
9441071
def AIE_NpuControlPacketOp: AIEX_Op<"control_packet", []> {
9451072
let summary = "AIE control packet";
9461073
let arguments = (
@@ -1025,6 +1152,12 @@ def AIE_NpuLoadPdiOp: AIEX_Op<"npu.load_pdi", []> {
10251152
runtime by the driver or host program.
10261153

10271154
If a symbol reference is provided, the compiler driver (aiecc.py) will match it to a device symbol name and assign the PDI ID field based on it.
1155+
1156+
### Watch Out!
1157+
1158+
The firmware optimizes out repeated `load_pdi` operations if they refer to the same PDI.
1159+
To force a reload / device reset, intersperse a `load_pdi` to a different PDI between same-device reloads.
1160+
The easiest workaround is to create an empty `aie.device(npu2) @empty {}` and load that before the main device load.
10281161
}];
10291162
let hasCanonicalizeMethod = 1;
10301163
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
//===- AIEXAttrs.td ----------------------------------------*- tablegen -*-===//
2+
//
3+
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
// (c) Copyright 2024 Advanced Micro Devices, Inc.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef AIEX_ATTRS
12+
#define AIEX_ATTRS
13+
14+
include "mlir/IR/EnumAttr.td"
15+
16+
// Function selector for the `npu.update_from_scratchpad` op (`func` attribute).
// The selected function combines the scratchpad state table entry with the
// op's `func_arg`; the result is the delta that gets added to the target
// register pair.
//
// The numeric case values are part of the binary encoding:
// appendUpdateRegFromScratchpad (lib/Targets/AIETargetNPU.cpp) casts the enum
// value directly into byte 2 of the XAIE_IO_UPDATE_REG instruction word, so
// do not renumber existing cases.
def StateTableFuncMul : I32EnumAttrCase<"Mul", 0, "mul">;
def StateTableFuncIncr : I32EnumAttrCase<"Incr", 1, "incr">;
def StateTableFuncDecr : I32EnumAttrCase<"Decr", 2, "decr">;

def StateTableFunc : I32EnumAttr<"StateTableFunc",
    "Function combining a scratchpad state table value with func_arg",
    [StateTableFuncMul, StateTableFuncIncr, StateTableFuncDecr]> {
  let cppNamespace = "::xilinx::AIEX";
}
25+
26+
#endif // AIEX_ATTRS

include/aie/Dialect/AIEX/IR/AIEXDialect.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
// Include dialect declarations such as parseAttributes, parseType
1919
#include "aie/Dialect/AIEX/IR/AIEXDialect.h.inc"
20+
#include "aie/Dialect/AIEX/IR/AIEXEnums.h"
2021
#include "mlir/IR/Operation.h"
2122

2223
// include TableGen generated Op definitions
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
//===- AIEXEnums.h ----------------------------------------------*- C++ -*-===//
2+
//
3+
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
// (c) Copyright 2024 Advanced Micro Devices, Inc.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef MLIR_AIEX_ENUMS_H
12+
#define MLIR_AIEX_ENUMS_H
13+
14+
#include "mlir/IR/BuiltinTypes.h"
15+
#include "mlir/IR/Dialect.h"
16+
17+
#include "aie/Dialect/AIEX/IR/AIEXEnums.h.inc"
18+
19+
#endif // MLIR_AIEX_ENUMS_H

include/aie/Dialect/AIEX/IR/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,10 @@ add_mlir_doc(AIEX AIEXDialect ./ -gen-dialect-doc -dialect=aiex)
1010

1111
# AIEX.td pulls in AIEAttrs.td which includes generated event enums
1212
add_dependencies(MLIRAIEXIncGen GenerateAIEEventsTD)
13+
14+
# Add AIEX enums
# Runs mlir-tablegen over AIEXAttrs.td to generate the enum declarations
# (AIEXEnums.h.inc, included by AIEXEnums.h) and the enum definitions
# (AIEXEnums.cpp.inc, included by AIEXDialect.cpp).
# LLVM_TARGET_DEFINITIONS must be set before the mlir_tablegen() calls, which
# read it to find their input file.
set(LLVM_TARGET_DEFINITIONS AIEXAttrs.td)
mlir_tablegen(AIEXEnums.h.inc -gen-enum-decls)
mlir_tablegen(AIEXEnums.cpp.inc -gen-enum-defs)
add_public_tablegen_target(MLIRAIEXEnumsIncGen)
# Make the aie-headers target depend on the enum generation target.
add_dependencies(aie-headers MLIRAIEXEnumsIncGen)

lib/Dialect/AIEX/IR/AIEXDialect.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ using namespace xilinx;
3030

3131
#include "aie/Dialect/AIEX/IR/AIEXDialect.cpp.inc"
3232

33+
#include "aie/Dialect/AIEX/IR/AIEXEnums.cpp.inc"
34+
3335
#define GET_TYPEDEF_CLASSES
3436
#include "aie/Dialect/AIEX/IR/AIEXTypes.cpp.inc"
3537

@@ -750,6 +752,14 @@ std::optional<uint32_t> AIEX::NpuWrite32Op::getAbsoluteAddress() {
750752
return ::getAbsoluteAddress(this);
751753
}
752754

755+
//===----------------------------------------------------------------------===//
756+
// NpuUpdateFromScratchpadOp
757+
//===----------------------------------------------------------------------===//
758+
759+
std::optional<uint32_t> AIEX::NpuUpdateFromScratchpadOp::getAbsoluteAddress() {
760+
return ::getAbsoluteAddress(this);
761+
}
762+
753763
//===----------------------------------------------------------------------===//
754764
// NpuMaskWrite32Op
755765
//===----------------------------------------------------------------------===//

lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,9 @@ struct AIEXToStandardPass
6262
removepatterns.add<AIEXOpRemoval<NpuWriteBdOp>>(m.getContext(), m);
6363
removepatterns.add<AIEXOpRemoval<NpuAddressPatchOp>>(m.getContext(), m);
6464
removepatterns.add<AIEXOpRemoval<NpuPreemptOp>>(m.getContext(), m);
65+
removepatterns.add<AIEXOpRemoval<NpuCreateScratchpadOp>>(m.getContext(), m);
66+
removepatterns.add<AIEXOpRemoval<NpuUpdateFromScratchpadOp>>(m.getContext(),
67+
m);
6568

6669
if (failed(applyPartialConversion(m, target, std::move(removepatterns))))
6770
signalPassFailure();

lib/Targets/AIETargetNPU.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,44 @@ void appendPreempt(std::vector<uint32_t> &instructions, NpuPreemptOp op) {
199199
words[0] = XAIE_IO_PREEMPT | (op.getLevel() << 8);
200200
}
201201

202+
void appendCreateScratchpad(std::vector<uint32_t> &instructions,
203+
NpuCreateScratchpadOp op) {
204+
// XAIE_IO_CREATE_SCRATCHPAD encoding (4 words = 16 bytes):
205+
// Byte 0: Opcode (10)
206+
// Byte 1: Usage Type
207+
// Bytes 2-3: padding
208+
// Bytes 4-7: Size
209+
// Bytes 8-15: DDR Address (patched at runtime by XRT)
210+
auto words = reserveAndGetTail(instructions, 4);
211+
212+
words[0] = XAIE_IO_CREATE_SCRATCHPAD;
213+
words[0] |= (static_cast<uint32_t>(op.getUsageType()) << 8);
214+
words[1] = op.getSize();
215+
// DDR address words[2] and words[3] are left as 0;
216+
// they will be patched at runtime by XRT/aiebu based on the
217+
// .ctrl.scratchpad section.
218+
words[2] = 0;
219+
words[3] = 0;
220+
}
221+
222+
void appendUpdateRegFromScratchpad(std::vector<uint32_t> &instructions,
223+
NpuUpdateFromScratchpadOp op) {
224+
// XAIE_IO_UPDATE_REG encoding (3 words = 12 bytes):
225+
// Byte 0: Opcode (12)
226+
// Byte 1: StateTableIdx
227+
// Byte 2: Func
228+
// Byte 3: padding
229+
// Bytes 4-7: FuncArg
230+
// Bytes 8-11: RegOff (absolute offset from AIE array base to register pair)
231+
auto words = reserveAndGetTail(instructions, 3);
232+
233+
words[0] = XAIE_IO_UPDATE_REG;
234+
words[0] |= (static_cast<uint32_t>(op.getStateTableIdx()) << 8);
235+
words[0] |= (static_cast<uint32_t>(op.getFunc()) << 16);
236+
words[1] = op.getFuncArg();
237+
words[2] = *op.getAbsoluteAddress();
238+
}
239+
202240
} // namespace
203241

204242
LogicalResult xilinx::AIE::AIETranslateNpuToBinary(
@@ -264,6 +302,14 @@ LogicalResult xilinx::AIE::AIETranslateNpuToBinary(
264302
.Case<NpuPreemptOp>([&](auto op) {
265303
count++;
266304
appendPreempt(instructions, op);
305+
})
306+
.Case<NpuCreateScratchpadOp>([&](auto op) {
307+
count++;
308+
appendCreateScratchpad(instructions, op);
309+
})
310+
.Case<NpuUpdateFromScratchpadOp>([&](auto op) {
311+
count++;
312+
appendUpdateRegFromScratchpad(instructions, op);
267313
});
268314
}
269315
}

0 commit comments

Comments
 (0)