Xilinx · hunhoffe · May 22, 2026 · Jun 13, 2026 · Jun 13, 2026
@@ -238,6 +238,28 @@ void collectBuffers(
 // linearized by the compiler.
 bool isContiguousBDTransfer(llvm::ArrayRef<BDDimLayoutAttr> dims);
 
+// Verify that a DMA buffer descriptor's data-layout transform and explicit
+// iteration are realizable on `tileType`. `inputSizes`/`inputStrides` are the
+// data-layout dims, innermost-first, in element-width units. They are pure
+// access dims: every size and stride must be positive (no magic stride-0
+// "repeat" -- iteration is a separate input here). `iterSize`/`iterStride` are
+// the logical iteration values in element-width units; iterSize == 0 means no
+// iteration. `tileType` selects the dimension-count limit and the wrap /
+// step / iteration register bit-widths from `targetModel`.
+//
+// Checks: dim count within getDmaBdMaxDims(tileType); positive sizes/strides;
+// innermost contiguous run is a granularity multiple; sub-word innermost runs
+// rejected; each non-innermost stride byte-aligned to granularity (innermost
+// stride==1 always allowed); for elemWidth > granularity innermost stride must
+// be 1; per-dimension wrap/step ranges (skipped when skipTransformationChecks,
+// e.g. a contiguous shim transfer lowered to linear mode); iteration wrap/step
+// ranges and positive iterStride when iterSize > 1.
+mlir::LogicalResult verifyBDDataLayoutAndIteration(
+    mlir::Operation *forOp, const AIETargetModel &targetModel,
+    AIETileType tileType, unsigned elemWidthBits,
+    llvm::ArrayRef<int64_t> inputSizes, llvm::ArrayRef<int64_t> inputStrides,
+    int64_t iterSize, int64_t iterStride, bool skipTransformationChecks);
+
 } // namespace xilinx::AIE
 
 namespace llvm {

@@ -1023,6 +1023,17 @@ def AIE_DMABDOp: AIE_Op<"dma_bd", []> {
         // state-table index whose value is used as the BD address offset.
         // After lowering, $offset_parameter is removed in favor of this attribute.
         OptionalAttr<UI8Attr>:$offset_state_table_idx,
+        // Explicit buffer-descriptor iteration ("repeat with stride"). These
+        // are logical (1-based) values, denominated -- like the data-layout
+        // strides -- in multiples of the element width. iter_size is the
+        // iteration wrap (number of replays of the BD); iter_size == 0 means no
+        // iteration. iter_stride is the address step added between iterations.
+        // iter_current is the starting iteration index (almost always 0).
+        // Iteration is a separate hardware feature from the $dimensions
+        // data-layout transform; do not encode repeats inside $dimensions.
+        DefaultValuedOptionalAttr<AIEI32Attr, "0">:$iter_size,
+        DefaultValuedOptionalAttr<AIEI32Attr, "0">:$iter_stride,
+        DefaultValuedOptionalAttr<AIEI32Attr, "0">:$iter_current,
         // should never be assigned by user...
         OptionalAttr<AIEI32Attr>:$next_bd_id
   );
@@ -1858,8 +1869,10 @@ def AIE_ObjectFifoCreateOp: AIE_Op<"objectfifo", [HasParent<"DeviceOp">, Symbol]
         // disable_synchronization==true will skip lock generation for
         // objectfifo synchronous accesses
         DefaultValuedAttr<BoolAttr, "false">:$disable_synchronization,
-        // repeat_count==1 means "do it once"
-        OptionalAttr<ConfinedAttr<AIEI32Attr, [IntMinValue<1>]>>:$repeat_count,
+        // repeat_count counts EXTRA replays of the BD chain; 0 means "do it
+        // once" (no repeat), matching aie.dma_start / the dma_task ops /
+        // aiex.npu.dma_memcpy_nd. So a value of N runs the chain N+1 times.
+        OptionalAttr<ConfinedAttr<AIEI32Attr, [IntMinValue<0>]>>:$repeat_count,
         // aie_stream==0 means enable aie stream port on producer tile
         // aie_stream==1 means enable aie stream port on consumer tile
         // aie_stream==2 means enable aie stream ports on producer and consumer tiles

@@ -294,6 +294,18 @@ class AIETargetModel {
   /// Return the number of buffer descriptors for a given tile type.
   virtual uint32_t getNumBDs(AIETileType tileType) const = 0;
 
+  /// Return the maximum number of data-layout transformation dimensions a DMA
+  /// buffer descriptor supports on the given tile type. Iteration/repeat is a
+  /// separate buffer-descriptor feature and is not counted here.
+  virtual uint32_t getDmaBdMaxDims(AIETileType tileType) const = 0;
+
+  /// Return the bit width of, respectively, the wrap (size), step (stride), and
+  /// iteration-wrap registers in a DMA buffer descriptor for the given tile
+  /// type. An N-bit field holds an unsigned value in [0, 2^N - 1].
+  virtual uint32_t getDmaBdWrapBits(AIETileType tileType) const = 0;
+  virtual uint32_t getDmaBdStepBits(AIETileType tileType) const = 0;
+  virtual uint32_t getDmaBdIterBits(AIETileType tileType) const = 0;
+
   /// Get stream switch port index for a given port specification
   /// Return port index for Stream_Switch_Event_Port_Selection register, or
   /// nullopt if invalid
@@ -480,6 +492,10 @@ class AIE1TargetModel : public AIETargetModel {
   uint32_t getNumBDs(AIETileType tileType) const override {
     return 16; // AIE1 has no MemTiles, always 16
   }
+  uint32_t getDmaBdMaxDims(AIETileType tileType) const override { return 3; }
+  uint32_t getDmaBdWrapBits(AIETileType tileType) const override { return 10; }
+  uint32_t getDmaBdStepBits(AIETileType tileType) const override { return 20; }
+  uint32_t getDmaBdIterBits(AIETileType tileType) const override { return 6; }
   bool isBdChannelAccessible(int col, int row, uint32_t bd_id,
                              int channel) const override {
     return true;
@@ -586,6 +602,24 @@ class AIE2TargetModel : public AIETargetModel {
     return tileType == AIETileType::MemTile ? 48 : 16;
   }
 
+  uint32_t getDmaBdMaxDims(AIETileType tileType) const override {
+    return tileType == AIETileType::MemTile ? 4 : 3;
+  }
+  uint32_t getDmaBdWrapBits(AIETileType tileType) const override {
+    return tileType == AIETileType::CoreTile ? 8 : 10;
+  }
+  uint32_t getDmaBdStepBits(AIETileType tileType) const override {
+    switch (tileType) {
+    case AIETileType::CoreTile:
+      return 13;
+    case AIETileType::MemTile:
+      return 17;
+    default: // any other tile type (ShimNOC / ShimPL)
+      return 20;
+    }
+  }
+  uint32_t getDmaBdIterBits(AIETileType tileType) const override { return 6; }
+
   bool isBdChannelAccessible(int col, int row, uint32_t bd_id,
                              int channel) const override {
     if (getTileType(col, row) != AIETileType::MemTile) {

@@ -651,16 +651,24 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
     The hardware only supports a single static offset, and this offset is calculated at compile time.
     Thus, all offsets can be equivalently expressed with the lowest dimension only.
 
+    #### Iteration and Repeat
+
+    `sizes`/`strides` describe up to three pure data-layout dimensions. To replay
+    the whole transfer, use either `repeat_count` (a pure repeat, no address
+    increment; `repeat_count == 0` means a single pass) or the explicit `iter_size` /
+    `iter_stride` / `iter_current` attributes (a strided iteration), not both. This
+    matches `aie.dma_bd` and the `dma_task` ops; there is no magic stride-0 dimension.
+
     #### Automatic Linearization of Contiguous Accesses
 
     A canonicalization pattern automatically folds a contiguous row-major access pattern into
-    the canonical linear form `[s3, 1, 1, N][st3, 0, 0, 1]`, where N is the product of the
-    inner three sizes. An access is contiguous when `strides[0] == 1` and each outer stride
+    the canonical linear form `[1, 1, N][0, 0, 1]`, where N is the product of the
+    three sizes. An access is contiguous when the innermost stride (d0) is 1 and each outer stride
     equals the product of the inner sizes (i.e. a standard row-major scan).
 
     This means users can express naturally multidimensional accesses such as a 2D image
-    `[1, 1, height, width][0, 0, width, 1]` or a 3D activation tensor
-    `[1, H, W, C][0, W*C, C, 1]` without worrying about hardware dimension size limits.
+    `[1, height, width][0, width, 1]` or a 3D activation tensor
+    `[H, W, C][W*C, C, 1]` without worrying about hardware dimension size limits.
     The compiler will fold them to the linear form, which uses a wider hardware register
     and avoids the 10-bit d0 wrap-size constraint that applies to ND transfers.
 
@@ -673,13 +681,16 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
 
   let arguments = (
     ins AnyRankedOrUnrankedMemRef:$memref,
-        // NOTE: these are in reverse order: offset3, offset2, ...
+        // NOTE: these are in reverse order: offset2, offset1, offset0
+        // These describe up to three pure data-layout dimensions. Iteration and
+        // repeat are expressed separately (see iter_* and repeat_count below),
+        // consistent with aie.dma_bd and the dma_task ops.
         Variadic<I64>:$offsets,
         Variadic<I64>:$sizes,
         Variadic<I64>:$strides,
-        ConfinedAttr<DenseI64ArrayAttr, [DenseArrayCount<4>]>:$static_offsets,
-        ConfinedAttr<DenseI64ArrayAttr, [DenseArrayCount<4>]>:$static_sizes,
-        ConfinedAttr<DenseI64ArrayAttr, [DenseArrayCount<4>]>:$static_strides,
+        ConfinedAttr<DenseI64ArrayAttr, [DenseArrayCount<3>]>:$static_offsets,
+        ConfinedAttr<DenseI64ArrayAttr, [DenseArrayCount<3>]>:$static_sizes,
+        ConfinedAttr<DenseI64ArrayAttr, [DenseArrayCount<3>]>:$static_strides,
         OptionalAttr<PacketInfoAttr>:$packet,
         SymbolRefAttr:$metadata,
         I64Attr:$id,
@@ -696,7 +707,18 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
         // Set by --aie-lower-scratchpad-parameters from $offset_parameter: the scratchpad
         // state-table index whose value is used as the BD address offset.
         // After lowering, $offset_parameter is removed in favor of this attribute.
-        OptionalAttr<UI8Attr>:$offset_state_table_idx
+        OptionalAttr<UI8Attr>:$offset_state_table_idx,
+        // Explicit buffer-descriptor iteration ("repeat with stride"), logical
+        // (1-based) values in element-width units, matching aie.dma_bd's iter_*.
+        // iter_size == 0 means no iteration. Mutually exclusive with a non-zero
+        // repeat_count.
+        DefaultValuedOptionalAttr<I64Attr, "0">:$iter_size,
+        DefaultValuedOptionalAttr<I64Attr, "0">:$iter_stride,
+        DefaultValuedOptionalAttr<I64Attr, "0">:$iter_current,
+        // Number of extra times to replay the whole buffer descriptor with no
+        // address increment (a pure repeat). The default 0 means a single pass
+        // (no repeat); this matches the convention on aie.dma_start / dma_task.
+        DefaultValuedOptionalAttr<I32Attr, "0">:$repeat_count
   );
 
   let assemblyFormat = [{
@@ -726,7 +748,7 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
 
   let extraClassDefinition = [{
     unsigned $cppClass::getOffsetSizeAndStrideStartOperandIndex() { return 1; }
-    std::array<unsigned, 3> $cppClass::getArrayAttrMaxRanks() { return {4, 4, 4}; }
+    std::array<unsigned, 3> $cppClass::getArrayAttrMaxRanks() { return {3, 3, 3}; }
     uint64_t $cppClass::getElementTypeBitwidth() {
       DataLayout dataLayout = DataLayout::closest(*this);
       return dataLayout.getTypeSizeInBits(getMemref().getType().getElementType());

@@ -41,14 +41,6 @@ void getHardwareStridesWraps(const AIE::AIETargetModel &targetModel,
                              llvm::SmallVector<int64_t, 4> inputStrides,
                              llvm::SmallVector<int64_t, 4> &sizes,
                              llvm::SmallVector<int64_t, 4> &strides);
-mlir::LogicalResult
-verifyStridesWraps(mlir::Operation *forOp,
-                   mlir::BaseMemRefType referencedBufType, int tileCol,
-                   int tileRow, llvm::SmallVector<int64_t, 4> inputSizes,
-                   llvm::SmallVector<int64_t, 4> inputStrides,
-                   llvm::SmallVector<int64_t, 4> hardwareSizes,
-                   llvm::SmallVector<int64_t, 4> hardwareStrides,
-                   bool skipTransformationChecks = false);
 bool isLinearTransfer(llvm::ArrayRef<int64_t> sizes,
                       llvm::ArrayRef<int64_t> strides);