Delete shim DMA column-affinity heuristic (RFC #1567 Stage C #6)

erwei-xilinx · erwei-xilinx · commit 48100f962cca · 2026-05-08T16:45:31.000-07:00
Drops the `colAllocConstraint == "same_column"` branch from ShimDMAAllocator::allocNewDmaChannel. `"same_column"` was the default for every caller (no override exists), so the effect is universal. Shim DMA columns are now chosen by first-fit, with round-robin overflow when channels are exhausted; compute-shim column adjacency is delegated to mlir-aie's flow-aware placer (#3055).

Also drops the `colAllocConstraint` parameter from the public API entry in AIRToAIESchedulingUtils.h, since it now has no values that change behavior.

Acknowledged behavioral change: workloads that depended on shim DMAs landing in the same column as the compute-side `(col, row)` may see worse routing until follow-up Stage C #7 consolidates the shim path with ShimTileAllocator's unconstrained `(?, ?)` approach.

Tests updated to reflect first-fit shim packing:
- good_shim_packet_flow_npu_4col.mlir: 4 packet flows now share shim col 0 via packet switching (was 4 cols).
- async_gemm_w_pingpong_to_locks_npu.mlir: both compute cols' shim traffic packs into shim col 0 (DMA:0 + DMA:1).
- air_shimcpy_to_npu.mlir (subtest 12): 4 mem-to-shim S2MM flows fill shim cols 0+1 via channel overflow (was 4 cols).
- air_shimcpy_to_npu.mlir (subtest 14): MM2S allocation pattern reflows to (col 0 ch 0, col 0 ch 1, col 1 ch 0, col 1 ch 1) instead of same-column striping.

Tests: 384/384 check-air-mlir (2 pre-existing AIRToROCDL failures unrelated). clang-format-17 clean.
diff --git a/mlir/include/air/Conversion/AIRToAIESchedulingUtils.h b/mlir/include/air/Conversion/AIRToAIESchedulingUtils.h
@@ -197,8 +197,7 @@ class ShimDMAAllocator : public DMAAllocator {
 
   FailureOr<allocation_info_t>
   allocNewDmaChannel(air::MemcpyInterface &memcpyOp, int col, int row,
-                     std::vector<Operation *> &dma_ops,
-                     std::string colAllocConstraint = "same_column");
+                     std::vector<Operation *> &dma_ops);
 
   FailureOr<allocation_info_t>
   allocNewDmaChannel(air::MemcpyInterface &memcpyOp,
diff --git a/mlir/lib/Conversion/AIRToAIESchedulingUtils.cpp b/mlir/lib/Conversion/AIRToAIESchedulingUtils.cpp
@@ -998,9 +998,10 @@ air::ShimDMAAllocator::ShimDMAAllocator(AIE::DeviceOp device)
   }
 }
 
-FailureOr<air::allocation_info_t> air::ShimDMAAllocator::allocNewDmaChannel(
-    air::MemcpyInterface &memcpyOp, int col, int row,
-    std::vector<Operation *> &dma_ops, std::string colAllocConstraint) {
+FailureOr<air::allocation_info_t>
+air::ShimDMAAllocator::allocNewDmaChannel(air::MemcpyInterface &memcpyOp,
+                                          int col, int row,
+                                          std::vector<Operation *> &dma_ops) {
   auto isMM2S = isTileOutbound(memcpyOp, dmaMemorySpace);
   if (failed(isMM2S))
     return failure();
@@ -1027,12 +1028,6 @@ FailureOr<air::allocation_info_t> air::ShimDMAAllocator::allocNewDmaChannel(
   }
   AIE::TileOp tile = nullptr;
   int colIdx = 0;
-  if (colAllocConstraint == "same_column") {
-    // Attempt to use shim dma channels within the same column.
-    auto it = find(dma_columns.begin(), dma_columns.end(), col);
-    if (it != dma_columns.end())
-      colIdx = it - dma_columns.begin();
-  }
   int dma_col = dma_columns[colIdx];
 
   // For packet-flow ops, reuse an existing physical channel on this shim tile
diff --git a/mlir/test/Conversion/AIRToAIE/air_shimcpy_to_npu.mlir b/mlir/test/Conversion/AIRToAIE/air_shimcpy_to_npu.mlir
@@ -782,8 +782,6 @@ func.func @func11(%arg0: memref<128xbf16>, %arg1: memref<128xbf16>) {
 // CHECK: aie.device(npu1)
 // CHECK: %[[tile_0_0:.*]] = aie.tile(0, 0)
 // CHECK: %[[tile_1_0:.*]] = aie.tile(1, 0)
-// CHECK: %[[tile_2_0:.*]] = aie.tile(2, 0)
-// CHECK: %[[tile_3_0:.*]] = aie.tile(3, 0)
 // CHECK: %[[tile_0_1:.*]] = aie.tile(0, 1)
 // CHECK: %[[tile_1_1:.*]] = aie.tile(1, 1)
 // CHECK: %[[tile_2_1:.*]] = aie.tile(2, 1)
@@ -841,9 +839,9 @@ func.func @func11(%arg0: memref<128xbf16>, %arg1: memref<128xbf16>) {
 // CHECK: aie.core(%[[tile_1_2]])
 // CHECK: aie.core(%[[tile_0_2]])
 // CHECK: aie.flow(%[[tile_0_1]], DMA : 0, %[[tile_0_0]], DMA : 0)
-// CHECK: aie.flow(%[[tile_1_1]], DMA : 0, %[[tile_1_0]], DMA : 0)
-// CHECK: aie.flow(%[[tile_2_1]], DMA : 0, %[[tile_2_0]], DMA : 0)
-// CHECK: aie.flow(%[[tile_3_1]], DMA : 0, %[[tile_3_0]], DMA : 0)
+// CHECK: aie.flow(%[[tile_1_1]], DMA : 0, %[[tile_0_0]], DMA : 1)
+// CHECK: aie.flow(%[[tile_2_1]], DMA : 0, %[[tile_1_0]], DMA : 0)
+// CHECK: aie.flow(%[[tile_3_1]], DMA : 0, %[[tile_1_0]], DMA : 1)
 // CHECK: aie.flow(%[[tile_0_2]], DMA : 0, %[[tile_0_1]], DMA : 0)
 // CHECK: aie.flow(%[[tile_0_3]], DMA : 0, %[[tile_0_1]], DMA : 1)
 // CHECK: aie.flow(%[[tile_0_4]], DMA : 0, %[[tile_0_1]], DMA : 2)
@@ -1082,12 +1080,12 @@ module {
 // CHECK: %[[tile_0_3:.*]] = aie.tile(0, 3)
 // CHECK: %[[tile_1_3:.*]] = aie.tile(1, 3)
 // CHECK: aie.flow(%[[tile_0_0]], DMA : 0, %[[tile_0_2]], DMA : 0)
-// CHECK: aie.flow(%[[tile_1_0]], DMA : 0, %[[tile_1_2]], DMA : 0)
-// CHECK: aie.flow(%[[tile_0_0]], DMA : 1, %[[tile_0_3]], DMA : 0)
+// CHECK: aie.flow(%[[tile_0_0]], DMA : 1, %[[tile_1_2]], DMA : 0)
+// CHECK: aie.flow(%[[tile_1_0]], DMA : 0, %[[tile_0_3]], DMA : 0)
 // CHECK: aie.flow(%[[tile_1_0]], DMA : 1, %[[tile_1_3]], DMA : 0)
 // CHECK: aie.shim_dma_allocation @air_channel_0_0(%[[tile_0_0]], MM2S, 0)
-// CHECK: aie.shim_dma_allocation @air_channel_0_1(%[[tile_1_0]], MM2S, 0)
-// CHECK: aie.shim_dma_allocation @air_channel_0_2(%[[tile_0_0]], MM2S, 1)
+// CHECK: aie.shim_dma_allocation @air_channel_0_1(%[[tile_0_0]], MM2S, 1)
+// CHECK: aie.shim_dma_allocation @air_channel_0_2(%[[tile_1_0]], MM2S, 0)
 // CHECK: aie.shim_dma_allocation @air_channel_0_3(%[[tile_1_0]], MM2S, 1)
 // CHECK: func.func @func14
 // CHECK: air.channel.put  @channel_0{{.*}} metadataArray = [{base = "air_channel_0_0", index = 0 : i32}, {base = "air_channel_0_1", index = 1 : i32}, {base = "air_channel_0_2", index = 2 : i32}, {base = "air_channel_0_3", index = 3 : i32}]} : (memref<32x16xi32>)
diff --git a/mlir/test/Conversion/AIRToAIE/async_gemm_w_pingpong_to_locks_npu.mlir b/mlir/test/Conversion/AIRToAIE/async_gemm_w_pingpong_to_locks_npu.mlir
@@ -9,7 +9,6 @@
 
 // CHECK-LABEL:   aie.device(npu1) @segment_0 {
 // CHECK:   %[[tile_0_0:.*]] = aie.tile(0, 0)
-// CHECK:   %[[tile_1_0:.*]] = aie.tile(1, 0)
 // CHECK:   %[[tile_0_1:.*]] = aie.tile(0, 1)
 // CHECK:   %[[tile_1_1:.*]] = aie.tile(1, 1)
 // CHECK:   %[[tile_0_2:.*]] = aie.tile(0, 2)
@@ -29,7 +28,7 @@
 // CHECK-DAG:    aie.buffer(%[[tile_1_1]]) {{{.*}}} : memref<128x64xi32, 1>
 // CHECK-COUNT-20:    aie.buffer({{.*}}) {{{.*}}} : memref<32x32xi32, 2>
 // CHECK:    aie.flow(%[[tile_0_0]], DMA : 0, %[[tile_0_1]], DMA : 0)
-// CHECK:    aie.flow(%[[tile_1_0]], DMA : 0, %[[tile_1_1]], DMA : 0)
+// CHECK:    aie.flow(%[[tile_0_0]], DMA : 1, %[[tile_1_1]], DMA : 0)
 // CHECK:    aie.flow(%[[tile_0_1]], DMA : 0, %[[tile_0_0]], DMA : 0)
 // CHECK:    aie.flow(%[[tile_0_1]], DMA : 1, %[[tile_0_2]], DMA : 0)
 // CHECK:    aie.flow(%[[tile_0_1]], DMA : 2, %[[tile_0_3]], DMA : 0)
diff --git a/mlir/test/Conversion/AIRToAIE/good_shim_packet_flow_npu_4col.mlir b/mlir/test/Conversion/AIRToAIE/good_shim_packet_flow_npu_4col.mlir
@@ -10,10 +10,10 @@
 // 4x4 NPU1 array.
 
 // WHOLEARRAY: %[[shim_noc_tile_0_0:.*]] = aie.tile(0, 0)
-// WHOLEARRAY: %[[shim_noc_tile_1_0:.*]] = aie.tile(1, 0)
-// WHOLEARRAY: %[[shim_noc_tile_2_0:.*]] = aie.tile(2, 0)
-// WHOLEARRAY: %[[shim_noc_tile_3_0:.*]] = aie.tile(3, 0)
 // WHOLEARRAY: aie.shim_dma_allocation @air_channel_2_0(%[[shim_noc_tile_0_0]], MM2S, 0)
+// WHOLEARRAY: aie.shim_dma_allocation @air_channel_2_1(%[[shim_noc_tile_0_0]], MM2S, 0)
+// WHOLEARRAY: aie.shim_dma_allocation @air_channel_2_2(%[[shim_noc_tile_0_0]], MM2S, 0)
+// WHOLEARRAY: aie.shim_dma_allocation @air_channel_2_3(%[[shim_noc_tile_0_0]], MM2S, 0)
 
 
 #map = affine_map<()[s0] -> (s0 * 256)>