diff --git a/include/aie/Dialect/AIE/IR/AIEDialect.h b/include/aie/Dialect/AIE/IR/AIEDialect.h index 3634165838b..36d7df97541 100644 --- a/include/aie/Dialect/AIE/IR/AIEDialect.h +++ b/include/aie/Dialect/AIE/IR/AIEDialect.h @@ -228,6 +228,18 @@ void collectBuffers( // linearized by the compiler. bool isContiguousBDTransfer(llvm::ArrayRef dims); +// Verify that a BD's per-dimension sizes and strides (innermost-first) are +// realizable for hardware with the given address generation granularity. +// Checks: positive sizes; innermost contiguous run is a granularity multiple; +// positive strides for non-repeat dims (last stride may be zero); each +// non-innermost stride byte-aligned to granularity (innermost stride==1 is +// always allowed); for elemWidth > granularity, innermost stride must be 1. +mlir::LogicalResult verifyBDSizesStrides(mlir::Operation *forOp, + unsigned elemWidthBits, + uint32_t addressGranularityBits, + llvm::ArrayRef inputSizes, + llvm::ArrayRef inputStrides); + } // namespace xilinx::AIE namespace llvm { diff --git a/lib/Dialect/AIE/IR/AIEDialect.cpp b/lib/Dialect/AIE/IR/AIEDialect.cpp index 98e34be71b7..7f7a9d7838a 100644 --- a/lib/Dialect/AIE/IR/AIEDialect.cpp +++ b/lib/Dialect/AIE/IR/AIEDialect.cpp @@ -2150,6 +2150,74 @@ void DMABDOp::print(::mlir::OpAsmPrinter &printer) { // A BDDimLayoutAttr array (outermost-first) describes a contiguous row-major // scan when the innermost stride is 1 and each outer stride equals the product // of all inner sizes. Used by both DMABDOp verification and canonicalization. 
+
+// NOTE(review): this definition lands between isContiguousBDTransfer and its
+// doc comment (the comment directly above); one of the two should be moved.
+//
+// Shared legality check for buffer-descriptor sizes and strides, called from
+// DMABDOp::verify and AIEX::verifyStridesWraps.
+//
+// `inputSizes` / `inputStrides` are innermost-first and counted in elements of
+// `elemWidthBits` bits; `addressGranularityBits` is the address generation
+// granularity of the target. Checks run in this order and the first violation
+// is reported as an op error on `forOp`:
+//   1. every size is positive;
+//   2. the innermost run (size 0 * element width) is a granularity multiple;
+//   3. strides of dims with size > 1 are positive, except that the outermost
+//      supplied dim (treated as the repeat dim) may also be zero;
+//   4. every stride is a granularity multiple in bits, except an innermost
+//      stride of exactly 1;
+//   5. elements wider than the granularity require an innermost stride of 1.
+// Returns success() when no dims are supplied or all checks pass.
+//
+// NOTE(review): with fewer dims than the hardware supports, the outermost
+// supplied dim still gets the repeat-dim exemption -- confirm this is intended
+// for DMABDOp callers.
+mlir::LogicalResult xilinx::AIE::verifyBDSizesStrides(
+    mlir::Operation *forOp, unsigned elemWidthBits,
+    uint32_t addressGranularityBits, llvm::ArrayRef<int64_t> inputSizes,
+    llvm::ArrayRef<int64_t> inputStrides) {
+  assert(inputSizes.size() == inputStrides.size() &&
+         "sizes and strides must have the same number of dimensions");
+  assert(addressGranularityBits != 0 &&
+         "address granularity is used as a modulus and must be non-zero");
+  const int n = static_cast<int>(inputSizes.size());
+  if (n == 0)
+    return success();
+
+  for (int i = 0; i < n; ++i) {
+    if (inputSizes[i] <= 0)
+      return forOp->emitOpError("Size ") << i << " must be a positive integer.";
+  }
+
+  // Innermost contiguous run must be a multiple of address granularity --
+  // hardware moves whole words; a sub-word innermost run is unrealizable.
+  // Diagnostics are streamed directly so no std::stringstream is required.
+  if (inputSizes[0] * elemWidthBits % addressGranularityBits != 0)
+    return forOp->emitOpError("Transfer sizes must be multiples of ")
+           << (addressGranularityBits / 8) << " bytes. " << inputSizes[0]
+           << " elements at " << (elemWidthBits / 8) << " bytes each equal "
+           << (inputSizes[0] * elemWidthBits / 8)
+           << " bytes, which is not divisible by "
+           << (addressGranularityBits / 8) << ". ";
+
+  // Stride sign check. A dim of size 1 never applies its stride, so any value
+  // is accepted there. The outermost supplied dim is the repeat dim and may
+  // use stride 0; it keeps its own "non-negative" wording so the diagnostic
+  // matches what is actually accepted.
+  const int repeatDim = n - 1;
+  for (int i = 0; i < n; ++i) {
+    if (inputSizes[i] <= 1 || inputStrides[i] >= 1)
+      continue;
+    if (i != repeatDim)
+      return forOp->emitOpError("Stride ")
+             << i << " must be a positive integer.";
+    if (inputStrides[i] < 0)
+      return forOp->emitOpError("Stride ")
+             << i << " must be a non-negative integer.";
+  }
+
+  // Stride byte-alignment: innermost stride==1 is always allowed (sub-word
+  // packed contiguous run, paired with the granularity check above); any other
+  // stride must be a granularity multiple in bytes.
+  for (int i = 0; i < n; ++i) {
+    if (i == 0 && inputStrides[i] == 1)
+      continue;
+    if (inputStrides[i] * elemWidthBits % addressGranularityBits != 0)
+      return forOp->emitOpError("Stride ")
+             << i << " is " << inputStrides[i] << " elements * "
+             << (elemWidthBits / 8)
+             << " bytes = " << (inputStrides[i] * elemWidthBits / 8)
+             << " bytes, which is not divisible by "
+             << (addressGranularityBits / 8) << ". ";
+  }
+
+  // For element widths larger than the granularity (e.g. bfp blocks, i64),
+  // the hardware cannot encode a non-1 innermost stride;
+  // getHardwareStridesWraps would silently drop the stride. Force innermost
+  // stride == 1.
+  if (elemWidthBits > addressGranularityBits && inputStrides[0] != 1)
+    return forOp->emitOpError(
+               "For element widths larger than the address granularity (")
+           << (addressGranularityBits / 8)
+           << " bytes), innermost dim stride must be 1.";
+
+  return success();
+}
+
 bool xilinx::AIE::isContiguousBDTransfer(llvm::ArrayRef dims) { if (dims.empty()) return true; // no ND layout = trivially contiguous @@ -2260,16 +2328,21 @@ LogicalResult DMABDOp::verify() { return emitOpError() << "Stride may not exceed " << (1 << 20); } - // Since streams read 32b words, there's no way to read eg 16b with stride - // of 2 (ie lower halfs of each 32b). So force it to be 1 (and then in - // CDODirect/XAIEV2 scale the size by 4/getBufferElementTypeWidthInBytes). - if (getBufferElementTypeWidthInBytes() < 4 && dims->back().getStride() != 1) - return emitOpError( - "For <32b width datatypes, inner-most dim stride must be 1"); - - if (getBufferElementTypeWidthInBytes() > 4 && dims->back().getStride() != 1) - return emitOpError( - "For >32b width datatypes, inner-most dim stride must be 1"); + // Granularity / sub-word / stride alignment checks, shared with + // AIEX::verifyStridesWraps. dims are stored outermost-first; the helper + // expects innermost-first. 
+ SmallVector inputSizes, inputStrides; + for (auto it = dims->rbegin(); it != dims->rend(); ++it) { + inputSizes.push_back(static_cast(it->getSize())); + inputStrides.push_back(static_cast(it->getStride())); + } + DataLayout dataLayout = DataLayout::closest(getOperation()); + unsigned elemWidthBits = + dataLayout.getTypeSizeInBits(buffer.getElementType()); + if (failed(xilinx::AIE::verifyBDSizesStrides( + getOperation(), elemWidthBits, + targetModel.getAddressGenGranularity(), inputSizes, inputStrides))) + return failure(); } if (auto paddims = getPadDimensions(); paddims.has_value()) { auto dims = getDimensions(); diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index aeda8495d3b..59de8ce36fa 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -202,51 +202,9 @@ AIEX::verifyStridesWraps(mlir::Operation *forOp, std::to_string(tileRow) + ") Must be ShimNOC, Mem or Core."); } - for (int i = 0; i < 4; i++) { - if (inputSizes[i] <= 0) { - return forOp->emitOpError("Size ") << i << " must be a positive integer."; - } - } - - if (inputSizes[0] * elemWidth % addressGranularity != 0) { - std::stringstream msg; - msg << "Transfer sizes must be multiples of " << (addressGranularity / 8) - << " bytes. " << inputSizes[0] << " elements at " << (elemWidth / 8) - << " bytes each equal " << (inputSizes[0] * elemWidth / 8) - << " bytes, which is not divisible by " << (addressGranularity / 8) - << ". "; - return forOp->emitOpError(msg.str()); - } - - for (int i = 0; i < 3; i++) { - if (inputSizes[i] > 1 && inputStrides[i] < 1) { - // If inputSize[i] == 1, anything is allowable in the stride, since that - // stride will never be applied. For any larger size, we must verify that - // the stride is positive. 
- return forOp->emitOpError("Stride ") - << i << " must be a positive integer."; - } - } - // A value of zero is allowable for the fourth-dimension stride - // (this indicates an interation stride for the repeat of 0) - if (inputSizes[3] > 1 && inputStrides[3] < 0) { - return forOp->emitOpError("Stride 3 must be a non-negative integer."); - } - - for (int i = 0; i < 4; i++) { - // strides[0] == 1 is ok iff the transfer size is a multiple of - // addressGranularity, which is checked below - if (i == 0 && inputStrides[i] == 1) - continue; - if (inputStrides[i] * elemWidth % addressGranularity != 0) { - std::stringstream msg; - msg << "Stride " << i << " is " << inputStrides[i] << " elements * " - << (elemWidth / 8) << " bytes = " << (inputStrides[i] * elemWidth / 8) - << " bytes, which is not divisible by " << (addressGranularity / 8) - << ". "; - return forOp->emitOpError(msg.str()); - } - } + if (failed(AIE::verifyBDSizesStrides(forOp, elemWidth, addressGranularity, + inputSizes, inputStrides))) + return failure(); if (!skipTransformationChecks && hardwareSizes[0] > (1 << wrap_bits) - 1) return forOp->emitOpError( diff --git a/test/dialect/AIE/bad_dma_op.mlir b/test/dialect/AIE/bad_dma_op.mlir index 15c13ced2e9..02192a71e55 100644 --- a/test/dialect/AIE/bad_dma_op.mlir +++ b/test/dialect/AIE/bad_dma_op.mlir @@ -53,7 +53,7 @@ module { // ----- -// CHECK: For >32b width datatypes, inner-most dim stride must be 1 +// CHECK: For element widths larger than the address granularity (4 bytes), innermost dim stride must be 1 module { aie.device(npu1) { %tile14 = aie.tile(1, 4) @@ -63,7 +63,27 @@ module { ^bd0: aie.dma_bd(%buf14 : memref<128x!aiex.bfp<"v8bfp16ebs8">>, 0, 128, []) {} aie.next_bd ^end - ^end: + ^end: + aie.end + } + } +} + +// ----- + +// Sub-word innermost contiguous run on i8: innermost size=2 elements * 1 byte +// = 2 bytes, sub-word and unrealizable by 32-bit-granularity DMA. 
+// CHECK: 2 elements at 1 bytes each equal 2 bytes, which is not divisible by 4 +module { + aie.device(npu1) { + %tile14 = aie.tile(1, 4) + %buf14 = aie.buffer(%tile14) { sym_name = "buf14" } : memref<128xi8> + %mem14 = aie.mem(%tile14) { + %srcDma = aie.dma_start("MM2S", 0, ^bd0, ^end) + ^bd0: + aie.dma_bd(%buf14 : memref<128xi8>, 0, 24, [, ]) + aie.next_bd ^end + ^end: aie.end } } diff --git a/test/dialect/AIE/nd-dma-bad-stride.mlir b/test/dialect/AIE/nd-dma-bad-stride.mlir index 7848a7c268a..d5e8d1424e3 100644 --- a/test/dialect/AIE/nd-dma-bad-stride.mlir +++ b/test/dialect/AIE/nd-dma-bad-stride.mlir @@ -8,24 +8,46 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt --verify-diagnostics %s +// RUN: aie-opt --split-input-file --verify-diagnostics %s -module @tutorial_2b { +// i16 (16b) inner stride of 2 elements = 32b = address-granularity multiple. +// This is realizable in hardware and must pass verification. +module @stride_word_aligned_ok { aie.device(xcve2802) { %tile14 = aie.tile(1, 4) %tile34 = aie.tile(3, 4) - aie.flow(%tile14, DMA : 0, %tile34, DMA : 0) %buf14 = aie.buffer(%tile14) : memref<128xi16> %lock14_done = aie.lock(%tile14, 0) { init = 0 : i32 } %mem14 = aie.mem(%tile14) { %srcDma = aie.dma_start("MM2S", 0, ^bd0, ^end) ^bd0: - // expected-error@+1 {{'aie.dma_bd' op For <32b width datatypes, inner-most dim stride must be 1}} aie.dma_bd(%buf14 : memref<128xi16>, 0, 128, []) aie.next_bd ^end ^end: aie.end } } -} \ No newline at end of file +} + +// ----- + +// i16 (16b) inner stride of 3 elements = 48b, not a granularity multiple. 
+module @stride_not_word_aligned { + aie.device(xcve2802) { + %tile14 = aie.tile(1, 4) + %tile34 = aie.tile(3, 4) + aie.flow(%tile14, DMA : 0, %tile34, DMA : 0) + %buf14 = aie.buffer(%tile14) : memref<128xi16> + %lock14_done = aie.lock(%tile14, 0) { init = 0 : i32 } + %mem14 = aie.mem(%tile14) { + %srcDma = aie.dma_start("MM2S", 0, ^bd0, ^end) + ^bd0: + // expected-error@+1 {{'aie.dma_bd' op Stride 0 is 3 elements * 2 bytes = 6 bytes, which is not divisible by 4}} + aie.dma_bd(%buf14 : memref<128xi16>, 0, 128, []) + aie.next_bd ^end + ^end: + aie.end + } + } +}