mlir-aie/lib/Dialect/AIEX/Transforms/AIEDMATasksToNPU.cpp at 7d2a761ea19f0a9f55f58850427bfc6d61f8d01f · Xilinx/mlir-aie · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
//===- AIEDMATasksToNPU.cpp -------------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// (c) Copyright 2024 Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//

#include <algorithm>
#include <iterator>

#include "aie/Dialect/AIE/IR/AIEDialect.h"
#include "aie/Dialect/AIEX/AIEUtils.h"
#include "aie/Dialect/AIEX/IR/AIEXDialect.h"
#include "aie/Dialect/AIEX/Transforms/AIEXPasses.h"

#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/TypeSwitch.h"

namespace xilinx::AIEX {
#define GEN_PASS_DEF_AIEDMATASKSTONPU
#include "aie/Dialect/AIEX/Transforms/AIEXPasses.h.inc"
} // namespace xilinx::AIEX

using namespace mlir;
using namespace xilinx;
using namespace xilinx::AIEX;

// Conversion pattern that replaces a DMAStartTaskOp with an NpuPushQueueOp.
//
// The tile coordinates, direction, channel, issue-token flag and repeat count
// are read from the DMAConfigureTaskOp that the start op refers to; the BD ID
// pushed onto the queue is the ID of the first buffer descriptor of that task.
//
// The pattern fails (without emitting a diagnostic) when the start op does not
// refer to a DMAConfigureTaskOp, and fails with a diagnostic when the first
// buffer descriptor of the task has no ID assigned yet.
struct DMAStartTaskOpPattern : OpConversionPattern<DMAStartTaskOp> {
  using OpConversionPattern::OpConversionPattern;

  LogicalResult
  matchAndRewrite(DMAStartTaskOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    DMAConfigureTaskOp task_op = op.getTaskOp();
    if (!task_op) {
      // Cannot rewrite this; probably points to a DMAStartTaskForOp,
      // which we will lower once it has been rewritten into a DMAStartTaskOp.
      return failure();
    }
    AIE::TileOp tile = task_op.getTileOp();
    std::optional<uint32_t> first_bd_id = task_op.getFirstBdId();
    if (!first_bd_id) {
      auto err = op.emitOpError(
          "First buffer descriptor in chain has not been assigned an ID");
      // Name the pass exactly as the BD-level diagnostic in this file does, so
      // that both hints point the user at the same command-line flag.
      err.attachNote() << "Run the `--aie-assign-runtime-sequence-bd-ids` "
                          "pass first or manually assign an ID.";
      return failure();
    }
    rewriter.replaceOpWithNewOp<NpuPushQueueOp>(
        op, tile.getCol(), tile.getRow(), task_op.getDirection(),
        task_op.getChannel(), task_op.getIssueToken(), task_op.getRepeatCount(),
        *first_bd_id);
    return success();
  }
};

// Conversion pattern that replaces a DMAAwaitTaskOp with an NpuSyncOp on the
// tile, direction and channel of the task being awaited.
//
// Fails silently when the awaited value is not a DMAConfigureTaskOp, and fails
// with a diagnostic when that task is not configured to issue a token.
struct DMAAwaitTaskOpPattern : OpConversionPattern<DMAAwaitTaskOp> {
  using OpConversionPattern::OpConversionPattern;

  LogicalResult
  matchAndRewrite(DMAAwaitTaskOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    DMAConfigureTaskOp awaited_task = op.getTaskOp();
    if (!awaited_task)
      return failure();

    // Without a token there is nothing for the sync op to wait on.
    if (!awaited_task.getIssueToken()) {
      auto diag = op.emitOpError(
          "Cannot wait on a BD that is not configured to issue a token.");
      diag.attachNote(awaited_task.getLoc())
          << "Consider adding attribute `issue_token=true` here.";
      return diag;
    }

    AIE::TileOp task_tile = awaited_task.getTileOp();
    rewriter.replaceOpWithNewOp<NpuSyncOp>(
        op, task_tile.getCol(), task_tile.getRow(),
        static_cast<uint32_t>(awaited_task.getDirection()),
        awaited_task.getChannel(), 1, 1);
    return success();
  }
};

struct AIEDMATasksToNPUPass
    : xilinx::AIEX::impl::AIEDMATasksToNPUBase<AIEDMATasksToNPUPass> {

  // Returns true for an entry block that holds nothing besides its terminator.
  // Such blocks are tolerated in the input IR and are not lowered to anything.
  bool shouldSkipBlock(Block &block) {
    if (!block.isEntryBlock())
      return false;
    auto non_terminator_ops = block.without_terminator();
    return non_terminator_ops.begin() == non_terminator_ops.end();
  }

  // Checks that `block` contains exactly one aie.dma_bd operation and that
  // this operation already carries a BD ID. Emits a diagnostic and returns
  // failure otherwise.
  LogicalResult verifyBdInBlock(Block &block) {
    auto bds = block.getOps<AIE::DMABDOp>();
    int bd_count = std::distance(bds.begin(), bds.end());

    // No BD at all.
    if (bd_count < 1) {
      auto diag = block.getTerminator()->emitError(
          "Block ending in this terminator does not contain a required "
          "aie.dma_bd operation.");
      diag.attachNote(block.getParentOp()->getLoc())
          << "Error encountered while lowering this BD configuration.";
      return failure();
    }

    // More than one BD: point at every BD after the first.
    if (bd_count > 1) {
      auto diag = block.getTerminator()->emitOpError(
          "This block contains multiple aie.dma_bd operations. Exactly one is "
          "required.");
      for (auto extra = std::next(bds.begin()); extra != bds.end(); ++extra) {
        diag.attachNote((*extra)->getLoc())
            << "Extra aie.dma_bd operation here.";
      }
      return failure();
    }

    // Exactly one BD; it must have an ID.
    AIE::DMABDOp only_bd = *bds.begin();
    if (only_bd.getBdId().has_value())
      return success();

    auto diag = only_bd.emitOpError(
        "Cannot lower buffer descriptor without assigned ID.");
    diag.attachNote()
        << "Run the `--aie-assign-runtime-sequence-bd-ids` pass first or "
           "manually assign an ID to this buffer descriptor.";
    diag.attachNote(block.getParentOp()->getLoc())
        << "Error encountered while lowering this BD configuration.";
    return failure();
  }

  // Accepts a block that has either no aie.use_lock operations or exactly two
  // of them; any other count is reported on the first lock operation.
  LogicalResult verifyOptionalLocksInBlock(Block &block) {
    auto locks = block.getOps<AIE::UseLockOp>();
    int lock_count = std::distance(locks.begin(), locks.end());
    if (lock_count == 0 || lock_count == 2)
      return success();

    // lock_count is non-zero here, so the range has a first element.
    AIE::UseLockOp first_lock = *locks.begin();
    first_lock.emitOpError(
        "BD blocks must have either 0 or 2 lock operations (acquire and "
        "release). Found ")
        << lock_count << " lock operations.";
    return failure();
  }

  // Walks all operations in `block` and fails on the first one that is not an
  // aie.dma_bd, aie.use_lock, aie.next_bd or aie.end operation. The error is
  // emitted on the block's parent operation, with a note at the offending op.
  LogicalResult verifyNoUnsupportedOpsInBlock(Block &block) {
    WalkResult walk_outcome = block.walk([&](Operation *candidate) {
      if (llvm::isa<AIE::DMABDOp, AIE::UseLockOp, AIE::NextBDOp, AIE::EndOp>(
              candidate)) {
        return WalkResult::advance();
      }
      auto diag = block.getParentOp()->emitOpError(
          "Unsupported operation within BD block.");
      diag.attachNote(candidate->getLoc())
          << "No lowering to NPU instructions available for this "
             "operation.";
      return WalkResult::interrupt();
    });
    return failure(walk_outcome.wasInterrupted());
  }

  // Returns the first aie.dma_bd operation in `block`. Callers must have
  // established beforehand that the block contains one (dereferencing the
  // begin iterator of an empty range is not guarded against here).
  AIE::DMABDOp getBdForBlock(Block &block) {
    return *block.getOps<AIE::DMABDOp>().begin();
  }

  // Looks for an acquire/release pair of aie.use_lock operations in `block`.
  // Returns (acquire_op, release_op) when the block holds exactly two lock
  // operations of which one is an acquire (or acquire-greater-equal) and one
  // is a release; returns std::nullopt in every other case.
  std::optional<std::pair<AIE::UseLockOp, AIE::UseLockOp>>
  getOptionalLockOpsForBlock(Block &block) {
    auto locks = block.getOps<AIE::UseLockOp>();
    if (std::distance(locks.begin(), locks.end()) != 2)
      return std::nullopt;

    AIE::UseLockOp acquire_found = nullptr;
    AIE::UseLockOp release_found = nullptr;
    for (AIE::UseLockOp candidate : locks) {
      const bool is_acquire = candidate.acquire() || candidate.acquireGE();
      if (is_acquire) {
        acquire_found = candidate;
        continue;
      }
      if (candidate.release())
        release_found = candidate;
    }

    if (!acquire_found || !release_found)
      return std::nullopt;
    return std::make_pair(acquire_found, release_found);
  }

  // Emits the operation that writes the buffer address into the BD register
  // of `bd_op` on `tile`.
  //
  // Two kinds of buffers are handled:
  //  * a block argument, reached either directly or via
  //    traceSubviewToBlockArgument(): an NpuAddressPatchOp is emitted; this is
  //    only accepted on shim NOC tiles;
  //  * an aie.buffer with an assigned address: a (masked) 32-bit register
  //    write is emitted, with the encoding depending on the tile type.
  // Any other buffer value is rejected with a diagnostic.
  LogicalResult setAddressForSingleBD(OpBuilder &builder, AIE::DMABDOp &bd_op,
                                      AIE::TileOp &tile) {
    uint32_t id = bd_op.getBdId().value();
    const AIE::AIETargetModel &model = AIE::getTargetModel(bd_op);
    auto buffer_value = bd_op.getBuffer();
    auto tile_col = tile.getCol();
    auto tile_row = tile.getRow();
    uint64_t bd_register = model.getDmaBdAddress(tile_col, tile_row, id) +
                           model.getDmaBdAddressOffset(tile_col, tile_row);

    // Resolve the buffer to its root block argument, if it has one.
    mlir::BlockArgument root_arg = nullptr;
    int64_t arg_offset = 0;
    if (auto as_arg = llvm::dyn_cast<mlir::BlockArgument>(buffer_value)) {
      root_arg = as_arg;
    } else if (auto traced = traceSubviewToBlockArgument(buffer_value)) {
      root_arg = traced->rootArg;
      arg_offset = traced->offsetInBytes;
    }

    // Case 1: runtime argument; the address is patched in at runtime.
    if (root_arg) {
      if (!model.isShimNOCTile(tile.getCol(), tile.getRow())) {
        return bd_op->emitOpError("DDR memory (runtime input arguments) can "
                                  "only be referred to on shim tiles.");
      }
      unsigned arg_index = root_arg.getArgNumber();
      arg_offset += bd_op.getOffsetInBytes();
      NpuAddressPatchOp::create(builder, bd_op.getLoc(),
                                /*addr*/ bd_register,
                                /*arg_idx*/ arg_index,
                                /*arg_plus*/ arg_offset);
      return success();
    }

    // Case 2: statically allocated aie.buffer. The value is not a block
    // argument at this point, so it has a defining operation.
    AIE::BufferOp buffer =
        llvm::dyn_cast<AIE::BufferOp>(buffer_value.getDefiningOp());
    if (!buffer) {
      return bd_op->emitOpError(
          "Buffer argument must be a constant aie.buffer, a runtime sequence "
          "input argument, or a (chain of) subview(s) or cast(s) of a block "
          "argument with constant offsets and strides equal to one.");
    }
    if (!buffer.getAddress().has_value()) {
      return bd_op->emitOpError(
          "Cannot lower buffer without associated address. Run pass "
          "--aie-assign-buffer-addresses first or manually assign an "
          "address.");
    }

    uint64_t address;
    address = *buffer.getAddress();
    address += bd_op.getOffsetInBytes();

    if (model.isCoreTile(tile_col, tile_row)) {
      // Word address, shifted into the field selected by the mask.
      NpuMaskWrite32Op::create(builder, bd_op.getLoc(), bd_register,
                               (address / 4) << 14, 0x0fffc000, nullptr,
                               nullptr, nullptr);
      return success();
    }

    if (model.isMemTile(tile_col, tile_row)) {
      // On AIE2p (NPU2), memtile DMAs use an offset-based address
      // space where the base depends on the relative position of the
      // buffer's tile (west=0, internal=getMemTileSize, east=2x).
      // On AIE2 (NPU1), memtile DMAs address local memory directly
      // starting at 0. Only add the offset for AIE2p.
      if (model.getTargetArch() == AIE::AIEArch::AIE2p) {
        auto local_base = model.getMemLocalBaseAddress(
            tile_col, tile_row, buffer.getTileOp().getCol(),
            buffer.getTileOp().getRow());
        if (local_base)
          address += local_base.value();
      }
      NpuMaskWrite32Op::create(builder, bd_op.getLoc(), bd_register,
                               address / 4, 0x0007FFFF, nullptr, nullptr,
                               nullptr);
      return success();
    }

    // Neither core tile nor memtile: plain 32-bit write of the byte address.
    NpuWrite32Op::create(builder, bd_op.getLoc(), bd_register, address,
                         nullptr, nullptr, nullptr);
    return success();
  }

  // Lowers the single aie.dma_bd operation in `block` to an NpuWriteBdOp and
  // then emits the buffer address write via setAddressForSingleBD().
  //
  // Parameters:
  //   builder    - builder used to create the new operations.
  //   block      - BD block; expected to have passed the verify*InBlock()
  //                checks, since getBdForBlock() is called unconditionally.
  //   tile       - tile whose DMA the buffer descriptor belongs to.
  //   channelDir - direction of the DMA channel; only used to validate
  //                padding (accepted for MM2S only).
  //   packet     - task-level packet info, used when the BD itself carries
  //                none.
  //
  // Returns failure (after emitting a diagnostic on the BD) when the offset is
  // not aligned to the address granularity, the length is below one
  // granularity unit, more than four dimensions are given, padding is used
  // where it is not supported, verifyStridesWraps() rejects the layout, or the
  // BD length disagrees with the product of the three lowest dimension sizes.
  LogicalResult
  rewriteSingleBD(OpBuilder &builder, Block &block, AIE::TileOp &tile,
                  AIE::DMAChannelDir channelDir,
                  std::optional<xilinx::AIE::PacketInfoAttr> packet) {
    AIE::DMABDOp bd_op = getBdForBlock(block);
    const auto &target_model = AIE::getTargetModel(bd_op);
    auto buffer_type = llvm::cast<BaseMemRefType>(bd_op.getBuffer().getType());
    // Granularity is expressed in bits here: it is divided by 8 wherever a
    // byte count is needed below.
    uint32_t addr_granularity = target_model.getAddressGenGranularity();

    uint32_t bd_id = bd_op.getBdId().value();
    int64_t offset = bd_op.getOffsetInBytes();
    uint64_t len = bd_op.getLenInBytes();
    // Transfer length converted from bytes to address-granularity units.
    uint64_t len_addr_granularity = len * 8 / addr_granularity;

    if (offset * 8 % addr_granularity != 0) {
      return bd_op->emitOpError("Offset must be aligned to ")
             << (addr_granularity / 8) << " byte boundary.";
    }

    if (len < addr_granularity / 8) {
      return bd_op->emitOpError("Transfer size of ")
             << len << " bytes falls below minimum hardware transfer unit of "
             << (addr_granularity / 8) << " bytes.";
    }
    // Process strides/wraps
    std::optional<llvm::ArrayRef<AIE::BDDimLayoutAttr>> dims =
        bd_op.getDimensions();
    // Outputs of getHardwareStridesWraps(); all-zero unless dimensions are
    // provided.
    llvm::SmallVector<int64_t, 4> sizes = llvm::SmallVector<int64_t, 4>(4, 0);
    llvm::SmallVector<int64_t, 4> strides = llvm::SmallVector<int64_t, 4>(4, 0);

    // Padding
    std::optional<llvm::ArrayRef<AIE::BDPadLayoutAttr>> padDims =
        bd_op.getPadDimensions();
    llvm::SmallVector<int64_t, 4> padBefore =
        llvm::SmallVector<int64_t, 4>(4, 0);
    llvm::SmallVector<int64_t, 4> padAfter =
        llvm::SmallVector<int64_t, 4>(4, 0);
    // NOTE(review): both vectors were just constructed with four zeros, so
    // these fills look redundant.
    std::fill(padBefore.begin(), padBefore.end(), 0);
    std::fill(padAfter.begin(), padAfter.end(), 0);

    // Defaults for the BD fields; each stays 0 unless overwritten below.
    auto enable_packet = 0;
    auto out_of_order_id = 0;
    auto packet_id = 0;
    auto packet_type = 0;
    auto d0size = 0;
    auto d0stride = 0;
    auto d1size = 0;
    auto d1stride = 0;
    auto d2size = 0;
    auto d2stride = 0;
    auto iteration_size = 0;
    auto iteration_stride = 0;

    if (dims && dims->size() > 0) {
      // Missing dimensions default to size 1 / stride 0.
      llvm::SmallVector<int64_t, 4> input_sizes =
          llvm::SmallVector<int64_t, 4>(4, 1);
      llvm::SmallVector<int64_t, 4> input_strides =
          llvm::SmallVector<int64_t, 4>(4, 0);
      if (dims->size() > 4) {
        return bd_op->emitOpError("At most four data layout transformation "
                                  "dimensions may be provided.");
      }

      for (size_t i = 0; i < dims->size(); i++) {
        // Pass down dimensions in reverse order; in the MLIR, this allows
        // us to specify step sizes/wraps in the same order as we would
        // access a multi-dim C array, with the highest dimension first.
        int j = dims->size() - i - 1;
        input_sizes[i] = (*dims)[j].getSize();
        input_strides[i] = (*dims)[j].getStride();
      }

      // d3 (repeat) is excluded; a repeated linear transfer is still linear.
      // A contiguous row-major ND access on a shim NOC tile is also lowered
      // using the wide buffer_length register, exempt from the 10-bit ND
      // wrap-size limit.  Canonicalization zeroes size-1 strides before this
      // pass runs, so isContiguousTransfer is sufficient.
      bool treatAsLinear =
          isLinearTransfer(input_sizes, input_strides) ||
          (target_model.isShimNOCTile(tile.getCol(), tile.getRow()) &&
           isContiguousTransfer(input_sizes, input_strides));

      // d2_size is only populated on memtiles.
      // NOTE(review): this indexes `dims` in its original (highest dimension
      // first) order, whereas input_sizes/sizes are in reversed order; confirm
      // that (*dims)[2] is the dimension intended for d2_size.
      if (dims->size() > 2) {
        d2size = (target_model.isMemTile(tile.getCol(), tile.getRow()))
                     ? (*dims)[2].getSize()
                     : 0;
      }
      if (padDims.has_value()) {
        if (!target_model.isMemTile(tile.getCol(), tile.getRow()))
          return bd_op->emitOpError()
                 << "Padding is only supported by memtile dma bds.";
        if (padDims->size() > dims->size())
          return bd_op->emitOpError()
                 << "Mismatch number of dimensions between padding(s)"
                 << " and wrap(s) and stride(s).";
        if (channelDir == AIE::DMAChannelDir::MM2S) {
          // Padding dimensions are reversed the same way as the data layout
          // dimensions above.
          for (size_t i = 0; i < padDims->size(); i++) {
            int j = padDims->size() - i - 1;
            padBefore[i] = (*padDims)[j].getConstPadBefore();
            padAfter[i] = (*padDims)[j].getConstPadAfter();
          }
          // Dimensions without a padding entry get no padding.
          for (size_t i = padDims->size(); i < dims->size(); i++) {
            padBefore[i] = 0;
            padAfter[i] = 0;
          }
        } else
          return bd_op->emitOpError()
                 << "supports padding only for MM2S direction on MemTiles.";
      }
      // Fills `sizes` and `strides` from the user-provided input sizes and
      // strides.
      getHardwareStridesWraps(target_model, bd_op, buffer_type, input_sizes,
                              input_strides, sizes, strides);

      if (failed(verifyStridesWraps(bd_op, buffer_type, tile.getCol(),
                                    tile.getRow(), input_sizes, input_strides,
                                    sizes, strides, treatAsLinear))) {
        return failure();
      }

      iteration_size = sizes[3];
      iteration_stride = strides[3];

      // For transfers treated as linear the d0/d1/d2 fields keep their zero
      // defaults (except d2size, which may have been set above).
      if (!treatAsLinear) {
        // d0_size, d0_stride
        d0size = sizes[0];
        d0stride = strides[0];

        // d1_size, d1_stride
        d1size = sizes[1];
        d1stride = strides[1];

        // d2_stride
        d2stride = strides[2];
        // d2_size set elsewhere
      }
      if (input_sizes[3] > 1 && input_strides[3] == 0) {
        // We allow users to encode the repeat_count as a dimension 3 stride
        // of 0. This must lower to a iteration wrap of 0, so no stride is
        // ever added. We then repeat the BD using the repeat_count in
        // NpuPushQueueOp.
        iteration_size = 0;
        iteration_stride = 0;
      }

      // Ensure the total transfer length and the length expressed in the lowest
      // three dimensions of strides/wraps agree. (Fourth dimension is
      // iteration/repeat count and repeats the whole BD, so should not be
      // incorporated in length of a single BD invocation.)
      uint64_t len_dims_addr_granularity = 1;
      for (size_t i = 0; i < 3; i++) {
        len_dims_addr_granularity *= sizes[i];
      }
      if (len_dims_addr_granularity != len_addr_granularity) {
        auto err =
            bd_op->emitOpError(
                "Buffer descriptor length does not match length of transfer "
                "expressed by lowest three dimensions of data layout "
                "transformation strides/wraps. ")
            << "BD length is " << (len_addr_granularity * addr_granularity / 8)
            << " bytes. "
            << "Lowest three dimensions of data layout transformation would "
               "result in transfer of "
            << (len_dims_addr_granularity * addr_granularity / 8) << " bytes. ";
        err.attachNote() << "Do not include the highest dimension size in "
                            "transfer length, as this is the BD repeat count.";
        return failure();
      }
    } else {
      // No data layout dimensions: padding cannot be expressed at all. The two
      // branches only differ in the diagnostic text.
      if (padDims && target_model.isMemTile(tile.getCol(), tile.getRow()) &&
          channelDir == AIE::DMAChannelDir::MM2S) {
        return bd_op->emitOpError()
               << "Padding requires n-d data layouts expressed as "
               << "wrap(s) and stride(s).";
      } else if (padDims) {
        return bd_op->emitOpError() << "Padding is supported only on MemTiles.";
      }
    }
    // find next BD ID, if any
    uint32_t use_next_bd = 0;
    uint32_t next_bd_id = 0;
    if (bd_op.getNextBdId().has_value()) {
      next_bd_id = bd_op.getNextBdId().value();
      use_next_bd = 1;
    }

    // enable_packet
    // Packet info on the BD itself takes precedence over the `packet`
    // parameter; `info` is null when neither is present.
    // auto info = bd_op.getPacket() ? bd_op.getPacket() : packet;
    auto info = bd_op.getPacket().value_or(packet.value_or(nullptr));
    if (info) {
      enable_packet = 1;
      packet_type = info.getPktType();
      packet_id = info.getPktId();
    }

    // Extract lock information if present
    int32_t lock_rel_val = 0;
    int32_t lock_rel_id = 0;
    int32_t lock_acq_enable = 0;
    int32_t lock_acq_val = 0;
    int32_t lock_acq_id = 0;

    // Only an acquire/release pair is honored; otherwise all lock fields stay
    // zero and lock acquisition stays disabled.
    auto lock_ops = getOptionalLockOpsForBlock(block);
    if (lock_ops) {
      auto [acquire_op, release_op] = *lock_ops;

      // Get lock IDs from the lock operations
      AIE::LockOp acq_lock = acquire_op.getLockOp();
      AIE::LockOp rel_lock = release_op.getLockOp();

      if (acq_lock.getLockID().has_value()) {
        lock_acq_id = acq_lock.getLockID().value();
        lock_acq_val = acquire_op.getLockValue();
        // For AcquireGreaterEqual, negate the value to signal the hardware
        // to use >= comparison instead of == comparison.
        if (acquire_op.acquireGE())
          lock_acq_val = -lock_acq_val;
        lock_acq_enable = 1;
      }

      if (rel_lock.getLockID().has_value()) {
        lock_rel_id = rel_lock.getLockID().value();
        lock_rel_val = release_op.getLockValue();
      }

      // For memtile, add lock offset using getLockLocalBaseIndex.
      // This matches AIERT.cpp implementation.
      // NOTE(review): the offset is derived from the acquire lock's tile only
      // and is then applied to the release lock ID as well; confirm both locks
      // are expected to live on the same tile.
      if (target_model.isMemTile(tile.getCol(), tile.getRow())) {
        auto lockOffset = target_model.getLockLocalBaseIndex(
            tile.getCol(), tile.getRow(), acq_lock.colIndex(),
            acq_lock.rowIndex());
        if (lockOffset && acq_lock.getLockID().has_value())
          lock_acq_id += lockOffset.value();
        if (lockOffset && rel_lock.getLockID().has_value())
          lock_rel_id += lockOffset.value();
      }
    }

    NpuWriteBdOp newBdOp = NpuWriteBdOp::create(
        builder, bd_op.getLoc(), tile.getCol(), bd_id, len_addr_granularity,
        offset,
        /*enable_packet=*/enable_packet,
        /*out_of_order_id=*/out_of_order_id,
        /*packet_id=*/packet_id,
        /*packet_type=*/packet_type,
        /*d0_size=*/d0size, /*d0_stride=*/d0stride,
        /*d1_size=*/d1size, /*d1_stride=*/d1stride,
        /*d2_size=*/d2size, /*d2_stride=*/d2stride,
        /*iteration_current=*/0, /*iteration_size=*/iteration_size,
        /*iteration_stride=*/iteration_stride,
        /*next_bd=*/next_bd_id,
        /*row=*/tile.getRow(),
        /*use_next_bd=*/use_next_bd,
        /*valid_bd=*/1,
        /*lock_rel_val=*/lock_rel_val, /*lock_rel_id=*/lock_rel_id,
        /*lock_acq_enable=*/lock_acq_enable,
        /*lock_acq_val=*/lock_acq_val, /*lock_acq_id=*/lock_acq_id,
        /*d0_zero_before=*/padBefore[0],
        /*d1_zero_before=*/padBefore[1], /*d2_zero_before=*/padBefore[2],
        /*d0_zero_after=*/padAfter[0], /*d1_zero_after=*/padAfter[1],
        /*d2_zero_after=*/padAfter[2],
        /*burst_length=*/bd_op.getBurstLength());

    // ObjectFifo lowering pass (AIEObjectFifoStatefulTransform) tagged the
    // source DMABDOp with ``aie.enable_compression = true`` when the
    // originating ObjectFifoCreateOp carried the SparseFifo discardable
    // attrs (``aie.compress_mm2s`` for MM2S, ``aie.decompress_s2mm`` for
    // S2MM). The BD-emit pass (AIEDmaToNpu) reads this attribute back from
    // the NpuWriteBdOp to flip the per-channel ``Enable_Compression`` bit
    // on the AIE2/AIE2P tile DMA BD config word (AM020 Ch. 2 p. 27 +
    // ``aie_registers_aie2.json``).
    // The attribute is only forwarded when it is present and true.
    if (auto compAttr = bd_op->getAttrOfType<BoolAttr>(
            "aie.enable_compression");
        compAttr && compAttr.getValue()) {
      newBdOp->setAttr("aie.enable_compression",
                       BoolAttr::get(newBdOp.getContext(), true));
    }

    // The buffer address is written separately from the BD contents.
    return setAddressForSingleBD(builder, bd_op, tile);
  }

  // For every BD block of `op` that ends in an aie.next_bd terminator, copies
  // the BD ID of the successor block's aie.dma_bd into the `next_bd_id`
  // attribute of this block's aie.dma_bd and replaces the terminator with an
  // aie.end. Fails if a BD has both the attribute and an aie.next_bd
  // terminator.
  LogicalResult hoistNextBdOpsIntoAttrs(DMAConfigureTaskOp op) {
    for (Block &bd_block : op.getBody()) {
      if (shouldSkipBlock(bd_block))
        continue;

      AIE::DMABDOp current_bd = getBdForBlock(bd_block);
      auto chain_op = llvm::dyn_cast<AIE::NextBDOp>(bd_block.getTerminator());
      if (!chain_op)
        continue;

      if (current_bd.getNextBdId().has_value()) {
        auto diag = current_bd.emitOpError(
            "Cannot specify both next_bd_id attribute and "
            "aie.next_bd operation.");
        diag.attachNote(chain_op.getLoc())
            << "Potentially conflicting next buffer descriptor ID specified "
               "here.";
        return failure();
      }

      AIE::DMABDOp successor_bd = getBdForBlock(*chain_op.getDest());
      // The successor BD should already have an ID; verifyBdInBlock() is
      // expected to have checked this before this function runs.
      assert(successor_bd.getBdId().has_value());
      current_bd.setNextBdId(successor_bd.getBdId().value());

      // Swap the aie.next_bd terminator for an aie.end at the same position.
      OpBuilder terminator_builder(chain_op);
      AIE::EndOp::create(terminator_builder, chain_op.getLoc());
      chain_op.erase();
    }
    return success();
  }

  // Lowers one DMAConfigureTaskOp: verifies every non-skipped BD block,
  // folds aie.next_bd terminators into `next_bd_id` attributes, emits the NPU
  // operations for each BD in front of `op`, and finally erases `op`.
  // Fails (with diagnostics) if `op` still has uses or any step fails.
  LogicalResult rewriteSingleDMAConfigureTaskOp(DMAConfigureTaskOp op) {
    OpBuilder builder(op);
    AIE::TileOp tile = op.getTileOp();

    // The op is erased at the end, so nothing may refer to it any more.
    if (!op.use_empty()) {
      auto diag = op.emitOpError("Cannot lower while op still has uses.");
      for (auto &use : op.getOperation()->getUses())
        diag.attachNote(use.getOwner()->getLoc()) << "Used here.";
      return failure();
    }

    Region &body = op.getBody();

    // Verify each BD block first; subsequent functions rely on them being
    // well-formed
    for (Block &candidate : body) {
      if (shouldSkipBlock(candidate))
        continue;
      if (failed(verifyNoUnsupportedOpsInBlock(candidate)) ||
          failed(verifyBdInBlock(candidate)) ||
          failed(verifyOptionalLocksInBlock(candidate))) {
        return failure();
      }
    }

    // Hoist next_bd operations into next_bd_id attribute of the dma_bd
    if (failed(hoistNextBdOpsIntoAttrs(op)))
      return failure();

    auto direction = op.getDirection();
    auto task_packet = op.getPacket();

    // Lower all BDs
    for (Block &bd_block : body) {
      if (shouldSkipBlock(bd_block))
        continue;
      if (failed(rewriteSingleBD(builder, bd_block, tile, direction,
                                 task_packet))) {
        return failure();
      }
    }

    op.erase();
    return success();
  }

  // Lowers every DMAConfigureTaskOp nested in `device`, stopping at the first
  // one that fails to lower.
  LogicalResult rewriteDMAConfigureTaskOp(AIE::DeviceOp device) {
    WalkResult outcome = device.walk([&](DMAConfigureTaskOp task) {
      return failed(rewriteSingleDMAConfigureTaskOp(task))
                 ? WalkResult::interrupt()
                 : WalkResult::advance();
    });
    return failure(outcome.wasInterrupted());
  }

  // Pass entry point. Runs in two phases:
  //  1. a partial dialect conversion that rewrites DMAStartTaskOp and
  //     DMAAwaitTaskOp through the patterns defined in this file;
  //  2. a walk over the device that lowers each DMAConfigureTaskOp.
  // The pass is marked as failed and stops as soon as either phase fails.
  void runOnOperation() override {
    AIE::DeviceOp device = getOperation();

    // Convert DMAStartBD and DMAAwaitBD ops
    ConversionTarget target(getContext());
    target.addLegalDialect<AIEXDialect>();
    target.addIllegalOp<DMAStartTaskOp>();
    target.addIllegalOp<DMAAwaitTaskOp>();
    RewritePatternSet patterns(&getContext());
    patterns.insert<DMAStartTaskOpPattern>(&getContext());
    patterns.insert<DMAAwaitTaskOpPattern>(&getContext());
    if (failed(applyPartialConversion(device, target, std::move(patterns)))) {
      // Do not continue with the second phase: it would operate on IR that
      // did not convert and would only pile secondary diagnostics on top of
      // the one that caused the conversion to fail.
      signalPassFailure();
      return;
    }

    // Lower the configuration for the BDs
    if (failed(rewriteDMAConfigureTaskOp(device))) {
      signalPassFailure();
    }
  }
};

// Factory for the pass defined in this file; returns a fresh, owned instance.
std::unique_ptr<OperationPass<AIE::DeviceOp>>
AIEX::createAIEDMATasksToNPUPass() {
  std::unique_ptr<OperationPass<AIE::DeviceOp>> pass =
      std::make_unique<AIEDMATasksToNPUPass>();
  return pass;
}