Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions include/aie/Dialect/AIE/IR/AIEDialect.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,18 @@ void collectBuffers(
// linearized by the compiler.
bool isContiguousBDTransfer(llvm::ArrayRef<BDDimLayoutAttr> dims);

// Verify that a BD's per-dimension sizes and strides are realizable for
// hardware with the given address generation granularity.
//
// `inputSizes` and `inputStrides` are given innermost-first (index 0 is the
// innermost dimension), are expressed in elements of width `elemWidthBits`,
// and must have the same length. An empty pair of arrays verifies trivially.
//
// The checks are performed in this order and the first violation is reported
// as an op error on `forOp`:
//   - every size is positive;
//   - the innermost run (inputSizes[0] elements) is a multiple of
//     `addressGranularityBits`;
//   - every dimension whose size is > 1 has a positive stride, except the
//     outermost dimension, whose stride may also be zero (a stride belonging
//     to a size-1 dimension is never applied and is not sign-checked);
//   - every stride is a multiple of the granularity, except an innermost
//     stride of exactly 1, which is always allowed;
//   - when `elemWidthBits` exceeds the granularity, the innermost stride must
//     be 1.
//
// Returns success() when all checks pass, failure() otherwise.
mlir::LogicalResult verifyBDSizesStrides(mlir::Operation *forOp,
unsigned elemWidthBits,
uint32_t addressGranularityBits,
llvm::ArrayRef<int64_t> inputSizes,
llvm::ArrayRef<int64_t> inputStrides);

} // namespace xilinx::AIE

namespace llvm {
Expand Down
93 changes: 83 additions & 10 deletions lib/Dialect/AIE/IR/AIEDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2150,6 +2150,74 @@ void DMABDOp::print(::mlir::OpAsmPrinter &printer) {
// Verify that per-dimension BD sizes and strides are realizable by hardware
// whose address generation works in units of `addressGranularityBits`.
//
// `inputSizes` / `inputStrides` are innermost-first (index 0 is the innermost
// dimension), expressed in elements of `elemWidthBits` bits, and must have the
// same length. Empty arrays verify trivially. The first violated rule is
// reported through `forOp->emitOpError()` and failure is returned:
//   1. every size is positive;
//   2. the innermost run is a whole number of granularity words;
//   3. every dimension with size > 1 has a positive stride; only the outermost
//      dimension may additionally use a stride of zero;
//   4. every stride is a granularity multiple, except an innermost stride of 1;
//   5. elements wider than the granularity require an innermost stride of 1.
//
// NOTE: the byte counts printed in diagnostics use integer division by 8, so
// they are truncated for widths that are not a whole number of bytes.
mlir::LogicalResult xilinx::AIE::verifyBDSizesStrides(
    mlir::Operation *forOp, unsigned elemWidthBits,
    uint32_t addressGranularityBits, llvm::ArrayRef<int64_t> inputSizes,
    llvm::ArrayRef<int64_t> inputStrides) {
  assert(inputSizes.size() == inputStrides.size() &&
         "sizes and strides must describe the same number of dimensions");
  assert(addressGranularityBits != 0 &&
         "address granularity is used as a divisor and must be non-zero");

  const int n = static_cast<int>(inputSizes.size());
  if (n == 0)
    return success();

  // Do all arithmetic in int64_t so size/stride products are never evaluated
  // in a narrower unsigned type.
  const int64_t elemBits = elemWidthBits;
  const int64_t granBits = addressGranularityBits;
  const int64_t elemBytes = elemBits / 8;
  const int64_t granBytes = granBits / 8;

  for (int i = 0; i < n; ++i) {
    if (inputSizes[i] <= 0)
      return forOp->emitOpError("Size ") << i << " must be a positive integer.";
  }

  // Innermost contiguous run must be a multiple of address granularity --
  // hardware moves whole words; a sub-word innermost run is unrealizable.
  if ((inputSizes[0] * elemBits) % granBits != 0)
    return forOp->emitOpError("Transfer sizes must be multiples of ")
           << granBytes << " bytes. " << inputSizes[0] << " elements at "
           << elemBytes << " bytes each equal "
           << (inputSizes[0] * elemBits / 8)
           << " bytes, which is not divisible by " << granBytes << ". ";

  // Stride sign checks. A stride attached to a size-1 dimension is never
  // applied, so anything is allowed there. The outermost dimension may use a
  // zero stride; its diagnostic therefore says "non-negative" rather than
  // "positive", so the message matches what is actually accepted.
  const int outermostDim = n - 1;
  for (int i = 0; i < n; ++i) {
    if (inputSizes[i] <= 1 || inputStrides[i] >= 1)
      continue;
    if (i == outermostDim) {
      if (inputStrides[i] == 0)
        continue;
      return forOp->emitOpError("Stride ")
             << i << " must be a non-negative integer.";
    }
    return forOp->emitOpError("Stride ") << i << " must be a positive integer.";
  }

  // Stride alignment: innermost stride==1 is always allowed (sub-word packed
  // contiguous run, paired with the granularity check above); any other
  // stride must be a granularity multiple.
  for (int i = 0; i < n; ++i) {
    if (i == 0 && inputStrides[i] == 1)
      continue;
    if ((inputStrides[i] * elemBits) % granBits != 0)
      return forOp->emitOpError("Stride ")
             << i << " is " << inputStrides[i] << " elements * " << elemBytes
             << " bytes = " << (inputStrides[i] * elemBits / 8)
             << " bytes, which is not divisible by " << granBytes << ". ";
  }

  // For element widths larger than the granularity (e.g. bfp blocks, i64),
  // the hardware cannot encode a non-1 innermost stride; getHardwareStrides-
  // Wraps would silently drop the stride. Force innermost stride == 1.
  if (elemBits > granBits && inputStrides[0] != 1)
    return forOp->emitOpError(
               "For element widths larger than the address granularity (")
           << granBytes << " bytes), innermost dim stride must be 1.";

  return success();
}

// A BDDimLayoutAttr array (outermost-first) describes a contiguous row-major
// scan when the innermost stride is 1 and each outer stride equals the product
// of all inner sizes. Used by both DMABDOp verification and canonicalization.
bool xilinx::AIE::isContiguousBDTransfer(llvm::ArrayRef<BDDimLayoutAttr> dims) {
if (dims.empty())
return true; // no ND layout = trivially contiguous
Expand Down Expand Up @@ -2260,16 +2328,21 @@ LogicalResult DMABDOp::verify() {
return emitOpError() << "Stride may not exceed " << (1 << 20);
}

// Since streams read 32b words, there's no way to read eg 16b with stride
// of 2 (ie lower halfs of each 32b). So force it to be 1 (and then in
// CDODirect/XAIEV2 scale the size by 4/getBufferElementTypeWidthInBytes).
if (getBufferElementTypeWidthInBytes() < 4 && dims->back().getStride() != 1)
return emitOpError(
"For <32b width datatypes, inner-most dim stride must be 1");

if (getBufferElementTypeWidthInBytes() > 4 && dims->back().getStride() != 1)
return emitOpError(
"For >32b width datatypes, inner-most dim stride must be 1");
// Granularity / sub-word / stride alignment checks, shared with
// AIEX::verifyStridesWraps. dims are stored outermost-first; the helper
// expects innermost-first.
SmallVector<int64_t, 4> inputSizes, inputStrides;
for (auto it = dims->rbegin(); it != dims->rend(); ++it) {
inputSizes.push_back(static_cast<int64_t>(it->getSize()));
inputStrides.push_back(static_cast<int64_t>(it->getStride()));
}
DataLayout dataLayout = DataLayout::closest(getOperation());
unsigned elemWidthBits =
dataLayout.getTypeSizeInBits(buffer.getElementType());
if (failed(xilinx::AIE::verifyBDSizesStrides(
getOperation(), elemWidthBits,
targetModel.getAddressGenGranularity(), inputSizes, inputStrides)))
return failure();
}
if (auto paddims = getPadDimensions(); paddims.has_value()) {
auto dims = getDimensions();
Expand Down
48 changes: 3 additions & 45 deletions lib/Dialect/AIEX/IR/AIEXDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,51 +202,9 @@ AIEX::verifyStridesWraps(mlir::Operation *forOp,
std::to_string(tileRow) + ") Must be ShimNOC, Mem or Core.");
}

for (int i = 0; i < 4; i++) {
if (inputSizes[i] <= 0) {
return forOp->emitOpError("Size ") << i << " must be a positive integer.";
}
}

if (inputSizes[0] * elemWidth % addressGranularity != 0) {
std::stringstream msg;
msg << "Transfer sizes must be multiples of " << (addressGranularity / 8)
<< " bytes. " << inputSizes[0] << " elements at " << (elemWidth / 8)
<< " bytes each equal " << (inputSizes[0] * elemWidth / 8)
<< " bytes, which is not divisible by " << (addressGranularity / 8)
<< ". ";
return forOp->emitOpError(msg.str());
}

for (int i = 0; i < 3; i++) {
if (inputSizes[i] > 1 && inputStrides[i] < 1) {
// If inputSize[i] == 1, anything is allowable in the stride, since that
// stride will never be applied. For any larger size, we must verify that
// the stride is positive.
return forOp->emitOpError("Stride ")
<< i << " must be a positive integer.";
}
}
// A value of zero is allowable for the fourth-dimension stride
// (this indicates an interation stride for the repeat of 0)
if (inputSizes[3] > 1 && inputStrides[3] < 0) {
return forOp->emitOpError("Stride 3 must be a non-negative integer.");
}

for (int i = 0; i < 4; i++) {
// strides[0] == 1 is ok iff the transfer size is a multiple of
// addressGranularity, which is checked below
if (i == 0 && inputStrides[i] == 1)
continue;
if (inputStrides[i] * elemWidth % addressGranularity != 0) {
std::stringstream msg;
msg << "Stride " << i << " is " << inputStrides[i] << " elements * "
<< (elemWidth / 8) << " bytes = " << (inputStrides[i] * elemWidth / 8)
<< " bytes, which is not divisible by " << (addressGranularity / 8)
<< ". ";
return forOp->emitOpError(msg.str());
}
}
if (failed(AIE::verifyBDSizesStrides(forOp, elemWidth, addressGranularity,
inputSizes, inputStrides)))
return failure();

if (!skipTransformationChecks && hardwareSizes[0] > (1 << wrap_bits) - 1)
return forOp->emitOpError(
Expand Down
24 changes: 22 additions & 2 deletions test/dialect/AIE/bad_dma_op.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ module {

// -----

// CHECK: For >32b width datatypes, inner-most dim stride must be 1
// CHECK: For element widths larger than the address granularity (4 bytes), innermost dim stride must be 1
module {
aie.device(npu1) {
%tile14 = aie.tile(1, 4)
Expand All @@ -63,7 +63,27 @@ module {
^bd0:
aie.dma_bd(%buf14 : memref<128x!aiex.bfp<"v8bfp16ebs8">>, 0, 128, [<size = 8, stride = 16>]) {}
aie.next_bd ^end
^end:
^end:
aie.end
}
}
}

// -----

// Sub-word innermost contiguous run on i8: innermost size=2 elements * 1 byte
// = 2 bytes, sub-word and unrealizable by 32-bit-granularity DMA.
// CHECK: 2 elements at 1 bytes each equal 2 bytes, which is not divisible by 4
module {
aie.device(npu1) {
%tile14 = aie.tile(1, 4)
%buf14 = aie.buffer(%tile14) { sym_name = "buf14" } : memref<128xi8>
%mem14 = aie.mem(%tile14) {
%srcDma = aie.dma_start("MM2S", 0, ^bd0, ^end)
^bd0:
aie.dma_bd(%buf14 : memref<128xi8>, 0, 24, [<size = 3, stride = 4>, <size = 2, stride = 1>])
aie.next_bd ^end
^end:
aie.end
}
}
Expand Down
32 changes: 27 additions & 5 deletions test/dialect/AIE/nd-dma-bad-stride.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,46 @@
//
//===----------------------------------------------------------------------===//

// RUN: aie-opt --verify-diagnostics %s
// RUN: aie-opt --split-input-file --verify-diagnostics %s

module @tutorial_2b {
// i16 (16b) inner stride of 2 elements = 32b = address-granularity multiple.
// This is realizable in hardware and must pass verification.
module @stride_word_aligned_ok {
aie.device(xcve2802) {
%tile14 = aie.tile(1, 4)
%tile34 = aie.tile(3, 4)

aie.flow(%tile14, DMA : 0, %tile34, DMA : 0)
%buf14 = aie.buffer(%tile14) : memref<128xi16>
%lock14_done = aie.lock(%tile14, 0) { init = 0 : i32 }
%mem14 = aie.mem(%tile14) {
%srcDma = aie.dma_start("MM2S", 0, ^bd0, ^end)
^bd0:
// expected-error@+1 {{'aie.dma_bd' op For <32b width datatypes, inner-most dim stride must be 1}}
aie.dma_bd(%buf14 : memref<128xi16>, 0, 128, [<size = 32, stride = 2>])
aie.next_bd ^end
^end:
aie.end
}
}
}
}

// -----

// Negative case: i16 (16-bit) elements with an innermost stride of 3 elements,
// i.e. 48 bits = 6 bytes. Per the expected diagnostic below, the stride must be
// divisible by 4 bytes; 6 is not, so verification of aie.dma_bd must fail.
module @stride_not_word_aligned {
aie.device(xcve2802) {
%tile14 = aie.tile(1, 4)
%tile34 = aie.tile(3, 4)
aie.flow(%tile14, DMA : 0, %tile34, DMA : 0)
%buf14 = aie.buffer(%tile14) : memref<128xi16>
%lock14_done = aie.lock(%tile14, 0) { init = 0 : i32 }
%mem14 = aie.mem(%tile14) {
%srcDma = aie.dma_start("MM2S", 0, ^bd0, ^end)
^bd0:
// expected-error@+1 {{'aie.dma_bd' op Stride 0 is 3 elements * 2 bytes = 6 bytes, which is not divisible by 4}}
aie.dma_bd(%buf14 : memref<128xi16>, 0, 128, [<size = 32, stride = 3>])
aie.next_bd ^end
^end:
aie.end
}
}
}
Loading