@@ -88,40 +88,17 @@ AIE::LockOp air::allocateLockOp(AIE::DeviceOp aie_device, AIE::TileLike tile,
8888 AIE::LockOp lock = nullptr ;
8989 std::set<int > ids;
9090 Operation *tileOp = tile.getOperation ();
91- bool tileIsLogical = isa<AIE::LogicalTileOp>(tileOp);
92- // For logical tiles, multiple distinct LTOs can collapse onto the same
93- // physical aie.tile during aie-place-tiles only when they share the same
94- // (col, tile_type) — different cols always resolve to different physical
95- // tiles. Reserve IDs across same-col same-type LTOs so post-collapse
96- // assignments don't collide. Reserving across ALL same-type LTOs (across
97- // every col) blows the per-tile lock budget in workloads like
98- // bf16_cascade where 8 memtile LTOs each need 10 locks: union'd IDs
99- // become 0..79, but the per-tile max is 63.
100- AIE::AIETileType tileType = tile.getTileType ();
101- std::optional<int32_t > tileCol;
102- if (tileIsLogical)
103- tileCol = cast<AIE::LogicalTileOp>(tileOp).getCol ();
91+ // Each (logical or physical) tile owns its own lock-ID space. The
92+ // aie-place-tiles pass is invoked with merge-ltos=false from aircc, so
93+ // distinct LTOs never collapse onto a shared physical tile — no need
94+ // to reserve IDs across other LTOs.
10495 aie_device.walk ([&](AIE::LockOp l) {
105- auto lockTileOp = l.getTile ().getDefiningOp ();
106- bool ownerMatches = (lockTileOp == tileOp);
107- if (!ownerMatches && tileIsLogical) {
108- auto otherLT = dyn_cast_if_present<AIE::LogicalTileOp>(lockTileOp);
109- if (otherLT && otherLT.getTileType () == tileType) {
110- // Only reserve across LTOs that COULD share a physical tile post-
111- // collapse: same col hint (or both unhinted, since aie-place-tiles
112- // may put both at the same col). Differently-hinted LTOs always
113- // resolve to different cols.
114- auto otherCol = otherLT.getCol ();
115- if (tileCol == otherCol)
116- ownerMatches = true ;
117- }
118- }
119- if (!ownerMatches)
96+ if (l.getTile ().getDefiningOp () != tileOp)
12097 return ;
12198 if (!l.getLockID ().has_value ())
12299 return ;
123100 auto i = l.getLockIDValue ();
124- if (lockTileOp == tileOp && i == id)
101+ if (i == id)
125102 lock = l;
126103 ids.insert (i);
127104 });
@@ -980,10 +957,6 @@ air::TileDMAAllocator::getBuffer(uint64_t, AIE::TileOp tile,
// Constructs the shim DMA allocator for `device`: forwards the device to the
// air::DMAAllocator base together with air::MemorySpace::L3 and sets
// shim_dma_channels to 2.
// The lines marked `-` below are removed by this patch; they used to fill
// dma_columns with every column index i in [0, tm.columns()) for which
// tm.isShimNOCTile(i, 0) held on the device's target model.
980957air::ShimDMAAllocator::ShimDMAAllocator (AIE::DeviceOp device)
981958 : air::DMAAllocator(device, air::MemorySpace::L3) {
982959 shim_dma_channels = 2 ;
983- const auto &tm = device.getTargetModel ();
984- for (int i = 0 , e = tm.columns (); i < e; i++)
985- if (tm.isShimNOCTile (i, 0 ))
986- dma_columns.push_back (i);
987960}
988961
989962FailureOr<air::allocation_info_t >
@@ -1023,135 +996,88 @@ air::ShimDMAAllocator::allocNewDmaChannel(air::MemcpyInterface &memcpyOp,
1023996 dma_ops_get_id.push_back (-1 );
1024997 }
1025998
1026- // For packet-flow ops, reuse an existing packet-flow allocation (in the
1027- // same direction AND on a shim LTO whose col hint matches the compute
1028- // col) to multiplex via packet IDs at the shim DMA level. Each new entry
1029- // shares the same logical tile and channel; downstream shim_dma_allocation
1030- // metadata is generated per-entry. Reusing across compute cols would
1031- // funnel every herd's packet flows onto a single shim — the packet
1032- // routing pipeline can't disambiguate that many IDs on one port.
1033- if (isPacketFlowOp) {
1034- for (auto &t : *allocs) {
1035- bool isPacketAlloc = false ;
1036- for (auto o : t.memcpyOps ) {
1037- auto mc = dyn_cast_if_present<air::MemcpyInterface>(o);
1038- if (!mc)
999+ // Bucket key: compute col. Flows from the same herd col share unhinted
1000+ // shim LTOs; a new one is opened only when none has a free channel in
1001+ // the requested direction. aie-place-tiles assigns the physical col; the
1002+ // merge-ltos=false pass option (set by aircc) keeps each LTO on its own tile.
1003+ auto walkBucketLTOs = [&](auto fn) {
1004+ llvm::SmallPtrSet<Operation *, 8 > seen;
1005+ for (auto *side : {&mm2s_allocs, &s2mm_allocs}) {
1006+ for (auto &t : *side) {
1007+ if (t.col != col)
10391008 continue ;
1040- auto ct = air::getChannelType (mc);
1041- if (succeeded (ct) && ct.value () == " npu_dma_packet" ) {
1042- isPacketAlloc = true ;
1043- break ;
1044- }
1045- }
1046- if (!isPacketAlloc)
1047- continue ;
1048- // Restrict reuse to allocs whose tile is the LTO at this compute
1049- // col. Without this guard, a second compute col's packet flow would
1050- // glom onto the first col's shim alloc (because we accept any
1051- // packet alloc), producing one shim with N packet IDs instead of
1052- // N shims with 1 packet ID each — which the routing pass rejects
1053- // with "false packet id match".
1054- if (col >= 0 ) {
10551009 auto lt = dyn_cast<AIE::LogicalTileOp>(t.dma_tile .getOperation ());
1056- if (!lt)
1010+ if (!lt || lt. getTileType () != AIE::AIETileType::ShimNOCTile )
10571011 continue ;
1058- auto ltCol = lt.getCol ();
1059- if (!ltCol || (int )*ltCol != col)
1012+ if (!seen.insert (lt.getOperation ()).second )
10601013 continue ;
1014+ if (fn (lt))
1015+ return ;
10611016 }
1062- AIE::DMAChannel aie_chan = {dir, t.dma_channel .channel };
1063- allocs->push_back ({t.dma_tile ,
1017+ }
1018+ };
1019+
1020+ auto channelsUsedOn = [&](AIE::LogicalTileOp lt) {
1021+ std::set<int > used;
1022+ for (auto *side : {&mm2s_allocs, &s2mm_allocs})
1023+ for (auto &t : *side)
1024+ if (t.dma_tile .getOperation () == lt.getOperation () &&
1025+ t.dma_channel .direction == dir)
1026+ used.insert ((int )t.dma_channel .channel );
1027+ return used;
1028+ };
1029+
1030+ // For packet flows: reuse the bucket's existing packet channel if any.
1031+ if (isPacketFlowOp) {
1032+ AIE::LogicalTileOp packetLT = nullptr ;
1033+ int packetCh = -1 ;
1034+ walkBucketLTOs ([&](AIE::LogicalTileOp lt) {
1035+ for (auto *side : {&mm2s_allocs, &s2mm_allocs}) {
1036+ for (auto &t : *side) {
1037+ if (t.dma_tile .getOperation () != lt.getOperation ())
1038+ continue ;
1039+ if (t.dma_channel .direction != dir)
1040+ continue ;
1041+ for (auto o : t.memcpyOps ) {
1042+ auto mc = dyn_cast_if_present<air::MemcpyInterface>(o);
1043+ if (!mc)
1044+ continue ;
1045+ auto ct = air::getChannelType (mc);
1046+ if (succeeded (ct) && ct.value () == " npu_dma_packet" ) {
1047+ packetLT = lt;
1048+ packetCh = (int )t.dma_channel .channel ;
1049+ return true ;
1050+ }
1051+ }
1052+ }
1053+ }
1054+ return false ;
1055+ });
1056+ if (packetLT) {
1057+ AIE::DMAChannel aie_chan = {dir, packetCh};
1058+ allocs->push_back ({packetLT,
10641059 col,
10651060 row,
10661061 aie_chan,
1067- t. dma_channel . channel ,
1062+ packetCh ,
10681063 /* packet_flow_id=*/ -1 ,
10691064 dma_ops_get_id,
10701065 {memcpyOp.getOperation ()}});
10711066 return allocs->back ();
10721067 }
10731068 }
10741069
1075- // Capacity-aware (col, channel) selection — restored to the pre-Path-B
1076- // semantics. The original allocNewDmaChannel walked
1077- // (compute_col, ch=0) -> (compute_col, ch=1) -> (next_col, ch=0) -> ...
1078- // and stopped at the first unused (col, channel) pair. With Path B the
1079- // tile is now an aie.logical_tile<ShimNOCTile>(col, ?) (the placer picks
1080- // the row), but the col hint must match what the placer will satisfy:
1081- // otherwise downstream airrt-to-npu reads a hint that disagrees with the
1082- // placer's eventual physical col, and NPU instructions target the wrong
1083- // shim. We mirror the original loop so each LTO's col hint is the col
1084- // a capacity-aware placer would pick on its own.
1085- AIE::TileLike tileLT = nullptr ;
1086- int dma_channel = -1 ;
1087-
1088- auto isUsedAtColCh = [&](int candidateCol, int ch) -> bool {
1089- for (auto *side : {&mm2s_allocs, &s2mm_allocs}) {
1090- for (auto &t : *side) {
1091- if (t.dma_channel .direction != dir)
1092- continue ;
1093- if ((int )t.dma_channel .channel != ch)
1094- continue ;
1095- auto cand = dyn_cast<AIE::LogicalTileOp>(t.dma_tile .getOperation ());
1096- if (!cand)
1097- continue ;
1098- if (cand.getTileType () != AIE::AIETileType::ShimNOCTile)
1099- continue ;
1100- auto candCol = cand.getCol ();
1101- if (candCol && (int )*candCol == candidateCol)
1102- return true ;
1103- }
1070+ // Find a bucket LTO with a free channel in this direction; else open
1071+ // a new unhinted shim LTO.
1072+ AIE::LogicalTileOp tileLT = nullptr ;
1073+ walkBucketLTOs ([&](AIE::LogicalTileOp lt) {
1074+ if ((int )channelsUsedOn (lt).size () < shim_dma_channels) {
1075+ tileLT = lt;
1076+ return true ;
11041077 }
11051078 return false ;
1106- };
1107- auto findLTOAtCol = [&](int candidateCol) -> AIE::LogicalTileOp {
1108- for (auto *side : {&mm2s_allocs, &s2mm_allocs}) {
1109- for (auto &t : *side) {
1110- auto cand = dyn_cast<AIE::LogicalTileOp>(t.dma_tile .getOperation ());
1111- if (!cand)
1112- continue ;
1113- if (cand.getTileType () != AIE::AIETileType::ShimNOCTile)
1114- continue ;
1115- auto candCol = cand.getCol ();
1116- if (candCol && (int )*candCol == candidateCol)
1117- return cand;
1118- }
1119- }
1120- return nullptr ;
1121- };
1122-
1123- // Find the first (col, channel) pair not yet used. Start at compute col
1124- // (so shim sits near its core) and rotate through ShimNOC cols.
1125- int chosenCol = -1 ;
1126- int chosenCh = -1 ;
1127- if (!dma_columns.empty ()) {
1128- int startIdx = 0 ;
1129- if (col >= 0 ) {
1130- auto it = std::find (dma_columns.begin (), dma_columns.end (), col);
1131- if (it != dma_columns.end ())
1132- startIdx = it - dma_columns.begin ();
1133- }
1134- for (int hops = 0 ; hops < (int )dma_columns.size () && chosenCol < 0 ;
1135- hops++) {
1136- int c = dma_columns[(startIdx + hops) % dma_columns.size ()];
1137- for (int ch = 0 ; ch < shim_dma_channels; ch++) {
1138- if (!isUsedAtColCh (c, ch)) {
1139- chosenCol = c;
1140- chosenCh = ch;
1141- break ;
1142- }
1143- }
1144- }
1145- }
1146- if (chosenCol < 0 )
1147- return memcpyOp.emitOpError (" out of shim DMA channels" );
1148-
1149- // Reuse the existing LTO at chosenCol if one is there; otherwise create
1150- // a new LTO. Reusing keeps the per-physical-shim aie.shim_dma op
1151- // aggregated (one shim_dma per tile rather than several).
1152- if (auto existing = findLTOAtCol (chosenCol)) {
1153- tileLT = existing;
1154- } else {
1079+ });
1080+ if (!tileLT) {
11551081 OpBuilder b (device);
11561082 b.setInsertionPointToStart (device.getBody ());
11571083 for (auto &op : device.getBody ()->getOperations ()) {
@@ -1160,19 +1086,24 @@ air::ShimDMAAllocator::allocNewDmaChannel(air::MemcpyInterface &memcpyOp,
11601086 else
11611087 break ;
11621088 }
1163- auto *ctx = b.getContext ();
1164- IntegerAttr colAttr =
1165- IntegerAttr::get (IntegerType::get (ctx, 32 ), chosenCol);
11661089 tileLT = AIE::LogicalTileOp::create (b, device.getLoc (),
1167- AIE::AIETileType::ShimNOCTile, colAttr,
1090+ AIE::AIETileType::ShimNOCTile,
1091+ /* col=*/ IntegerAttr (),
11681092 /* row=*/ IntegerAttr (),
11691093 /* allocation_scheme=*/ StringAttr ());
11701094 }
1171- dma_channel = chosenCh;
11721095
1173- // The col/row int args here record the other side (compute side) of the
1174- // flow for airrt metadata; they have nothing to do with the shim's
1175- // eventual physical placement.
1096+ auto usedChans = channelsUsedOn (tileLT);
1097+ int dma_channel = -1 ;
1098+ for (int ch = 0 ; ch < shim_dma_channels; ch++) {
1099+ if (!usedChans.count (ch)) {
1100+ dma_channel = ch;
1101+ break ;
1102+ }
1103+ }
1104+ if (dma_channel < 0 )
1105+ return memcpyOp.emitOpError (" out of shim DMA channels" );
1106+
11761107 return air::DMAAllocator::allocNewDmaChannel (memcpyOp, tileLT, dma_channel,
11771108 col, row, dma_ops_get_id);
11781109}
0 commit comments