Skip to content

Commit 48100f9

Browse files
committed
Delete shim DMA column-affinity heuristic (RFC #1567 Stage C #6)
Drops `colAllocConstraint == "same_column"` from ShimDMAAllocator::allocNewDmaChannel. This was the default for every caller (no override exists), so the effect is universal. Shim DMA columns are now chosen by first-fit, with round-robin overflow when channels are exhausted; compute-shim column adjacency is delegated to mlir-aie's flow-aware placer (#3055).

Also drops the `colAllocConstraint` parameter from the public API entry in AIRToAIESchedulingUtils.h, since it no longer has any value that changes behavior.

Acknowledged behavioral change: workloads that depended on shim DMAs landing in the same column as the compute-side `(col, row)` may see worse routing until follow-up Stage C #7 consolidates the shim path with ShimTileAllocator's unconstrained `(?, ?)` approach.

Tests updated to reflect first-fit shim packing:
- good_shim_packet_flow_npu_4col.mlir: 4 packet flows now share shim col 0 via packet switching (was 4 cols).
- async_gemm_w_pingpong_to_locks_npu.mlir: both compute cols' shim traffic packs into shim col 0 (DMA:0 + DMA:1).
- air_shimcpy_to_npu.mlir (subtest 12): 4 mem-to-shim S2MM flows fill shim cols 0+1 via channel overflow (was 4 cols).
- air_shimcpy_to_npu.mlir (subtest 14): MM2S allocation pattern reflows to (col 0 ch 0, col 0 ch 1, col 1 ch 0, col 1 ch 1) instead of same-column striping.

Tests: 384/384 check-air-mlir (2 pre-existing AIRToROCDL failures unrelated). clang-format-17 clean.
1 parent 7e32ff5 commit 48100f9

5 files changed

Lines changed: 16 additions & 25 deletions

File tree

mlir/include/air/Conversion/AIRToAIESchedulingUtils.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -197,8 +197,7 @@ class ShimDMAAllocator : public DMAAllocator {
197197

198198
FailureOr<allocation_info_t>
199199
allocNewDmaChannel(air::MemcpyInterface &memcpyOp, int col, int row,
200-
std::vector<Operation *> &dma_ops,
201-
std::string colAllocConstraint = "same_column");
200+
std::vector<Operation *> &dma_ops);
202201

203202
FailureOr<allocation_info_t>
204203
allocNewDmaChannel(air::MemcpyInterface &memcpyOp,

mlir/lib/Conversion/AIRToAIESchedulingUtils.cpp

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -998,9 +998,10 @@ air::ShimDMAAllocator::ShimDMAAllocator(AIE::DeviceOp device)
998998
}
999999
}
10001000

1001-
FailureOr<air::allocation_info_t> air::ShimDMAAllocator::allocNewDmaChannel(
1002-
air::MemcpyInterface &memcpyOp, int col, int row,
1003-
std::vector<Operation *> &dma_ops, std::string colAllocConstraint) {
1001+
FailureOr<air::allocation_info_t>
1002+
air::ShimDMAAllocator::allocNewDmaChannel(air::MemcpyInterface &memcpyOp,
1003+
int col, int row,
1004+
std::vector<Operation *> &dma_ops) {
10041005
auto isMM2S = isTileOutbound(memcpyOp, dmaMemorySpace);
10051006
if (failed(isMM2S))
10061007
return failure();
@@ -1027,12 +1028,6 @@ FailureOr<air::allocation_info_t> air::ShimDMAAllocator::allocNewDmaChannel(
10271028
}
10281029
AIE::TileOp tile = nullptr;
10291030
int colIdx = 0;
1030-
if (colAllocConstraint == "same_column") {
1031-
// Attempt to use shim dma channels within the same column.
1032-
auto it = find(dma_columns.begin(), dma_columns.end(), col);
1033-
if (it != dma_columns.end())
1034-
colIdx = it - dma_columns.begin();
1035-
}
10361031
int dma_col = dma_columns[colIdx];
10371032

10381033
// For packet-flow ops, reuse an existing physical channel on this shim tile

mlir/test/Conversion/AIRToAIE/air_shimcpy_to_npu.mlir

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -782,8 +782,6 @@ func.func @func11(%arg0: memref<128xbf16>, %arg1: memref<128xbf16>) {
782782
// CHECK: aie.device(npu1)
783783
// CHECK: %[[tile_0_0:.*]] = aie.tile(0, 0)
784784
// CHECK: %[[tile_1_0:.*]] = aie.tile(1, 0)
785-
// CHECK: %[[tile_2_0:.*]] = aie.tile(2, 0)
786-
// CHECK: %[[tile_3_0:.*]] = aie.tile(3, 0)
787785
// CHECK: %[[tile_0_1:.*]] = aie.tile(0, 1)
788786
// CHECK: %[[tile_1_1:.*]] = aie.tile(1, 1)
789787
// CHECK: %[[tile_2_1:.*]] = aie.tile(2, 1)
@@ -841,9 +839,9 @@ func.func @func11(%arg0: memref<128xbf16>, %arg1: memref<128xbf16>) {
841839
// CHECK: aie.core(%[[tile_1_2]])
842840
// CHECK: aie.core(%[[tile_0_2]])
843841
// CHECK: aie.flow(%[[tile_0_1]], DMA : 0, %[[tile_0_0]], DMA : 0)
844-
// CHECK: aie.flow(%[[tile_1_1]], DMA : 0, %[[tile_1_0]], DMA : 0)
845-
// CHECK: aie.flow(%[[tile_2_1]], DMA : 0, %[[tile_2_0]], DMA : 0)
846-
// CHECK: aie.flow(%[[tile_3_1]], DMA : 0, %[[tile_3_0]], DMA : 0)
842+
// CHECK: aie.flow(%[[tile_1_1]], DMA : 0, %[[tile_0_0]], DMA : 1)
843+
// CHECK: aie.flow(%[[tile_2_1]], DMA : 0, %[[tile_1_0]], DMA : 0)
844+
// CHECK: aie.flow(%[[tile_3_1]], DMA : 0, %[[tile_1_0]], DMA : 1)
847845
// CHECK: aie.flow(%[[tile_0_2]], DMA : 0, %[[tile_0_1]], DMA : 0)
848846
// CHECK: aie.flow(%[[tile_0_3]], DMA : 0, %[[tile_0_1]], DMA : 1)
849847
// CHECK: aie.flow(%[[tile_0_4]], DMA : 0, %[[tile_0_1]], DMA : 2)
@@ -1082,12 +1080,12 @@ module {
10821080
// CHECK: %[[tile_0_3:.*]] = aie.tile(0, 3)
10831081
// CHECK: %[[tile_1_3:.*]] = aie.tile(1, 3)
10841082
// CHECK: aie.flow(%[[tile_0_0]], DMA : 0, %[[tile_0_2]], DMA : 0)
1085-
// CHECK: aie.flow(%[[tile_1_0]], DMA : 0, %[[tile_1_2]], DMA : 0)
1086-
// CHECK: aie.flow(%[[tile_0_0]], DMA : 1, %[[tile_0_3]], DMA : 0)
1083+
// CHECK: aie.flow(%[[tile_0_0]], DMA : 1, %[[tile_1_2]], DMA : 0)
1084+
// CHECK: aie.flow(%[[tile_1_0]], DMA : 0, %[[tile_0_3]], DMA : 0)
10871085
// CHECK: aie.flow(%[[tile_1_0]], DMA : 1, %[[tile_1_3]], DMA : 0)
10881086
// CHECK: aie.shim_dma_allocation @air_channel_0_0(%[[tile_0_0]], MM2S, 0)
1089-
// CHECK: aie.shim_dma_allocation @air_channel_0_1(%[[tile_1_0]], MM2S, 0)
1090-
// CHECK: aie.shim_dma_allocation @air_channel_0_2(%[[tile_0_0]], MM2S, 1)
1087+
// CHECK: aie.shim_dma_allocation @air_channel_0_1(%[[tile_0_0]], MM2S, 1)
1088+
// CHECK: aie.shim_dma_allocation @air_channel_0_2(%[[tile_1_0]], MM2S, 0)
10911089
// CHECK: aie.shim_dma_allocation @air_channel_0_3(%[[tile_1_0]], MM2S, 1)
10921090
// CHECK: func.func @func14
10931091
// CHECK: air.channel.put @channel_0{{.*}} metadataArray = [{base = "air_channel_0_0", index = 0 : i32}, {base = "air_channel_0_1", index = 1 : i32}, {base = "air_channel_0_2", index = 2 : i32}, {base = "air_channel_0_3", index = 3 : i32}]} : (memref<32x16xi32>)

mlir/test/Conversion/AIRToAIE/async_gemm_w_pingpong_to_locks_npu.mlir

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99

1010
// CHECK-LABEL: aie.device(npu1) @segment_0 {
1111
// CHECK: %[[tile_0_0:.*]] = aie.tile(0, 0)
12-
// CHECK: %[[tile_1_0:.*]] = aie.tile(1, 0)
1312
// CHECK: %[[tile_0_1:.*]] = aie.tile(0, 1)
1413
// CHECK: %[[tile_1_1:.*]] = aie.tile(1, 1)
1514
// CHECK: %[[tile_0_2:.*]] = aie.tile(0, 2)
@@ -29,7 +28,7 @@
2928
// CHECK-DAG: aie.buffer(%[[tile_1_1]]) {{{.*}}} : memref<128x64xi32, 1>
3029
// CHECK-COUNT-20: aie.buffer({{.*}}) {{{.*}}} : memref<32x32xi32, 2>
3130
// CHECK: aie.flow(%[[tile_0_0]], DMA : 0, %[[tile_0_1]], DMA : 0)
32-
// CHECK: aie.flow(%[[tile_1_0]], DMA : 0, %[[tile_1_1]], DMA : 0)
31+
// CHECK: aie.flow(%[[tile_0_0]], DMA : 1, %[[tile_1_1]], DMA : 0)
3332
// CHECK: aie.flow(%[[tile_0_1]], DMA : 0, %[[tile_0_0]], DMA : 0)
3433
// CHECK: aie.flow(%[[tile_0_1]], DMA : 1, %[[tile_0_2]], DMA : 0)
3534
// CHECK: aie.flow(%[[tile_0_1]], DMA : 2, %[[tile_0_3]], DMA : 0)

mlir/test/Conversion/AIRToAIE/good_shim_packet_flow_npu_4col.mlir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010
// 4x4 NPU1 array.
1111

1212
// WHOLEARRAY: %[[shim_noc_tile_0_0:.*]] = aie.tile(0, 0)
13-
// WHOLEARRAY: %[[shim_noc_tile_1_0:.*]] = aie.tile(1, 0)
14-
// WHOLEARRAY: %[[shim_noc_tile_2_0:.*]] = aie.tile(2, 0)
15-
// WHOLEARRAY: %[[shim_noc_tile_3_0:.*]] = aie.tile(3, 0)
1613
// WHOLEARRAY: aie.shim_dma_allocation @air_channel_2_0(%[[shim_noc_tile_0_0]], MM2S, 0)
14+
// WHOLEARRAY: aie.shim_dma_allocation @air_channel_2_1(%[[shim_noc_tile_0_0]], MM2S, 0)
15+
// WHOLEARRAY: aie.shim_dma_allocation @air_channel_2_2(%[[shim_noc_tile_0_0]], MM2S, 0)
16+
// WHOLEARRAY: aie.shim_dma_allocation @air_channel_2_3(%[[shim_noc_tile_0_0]], MM2S, 0)
1717

1818

1919
#map = affine_map<()[s0] -> (s0 * 256)>

0 commit comments

Comments
 (0)