@@ -1077,11 +1077,26 @@ void MooncakeBackend::advanceRecvOp(RecvOpStateData& opData) {
10771077
10781078 case RecvOpState::READY_TO_ALLOC : {
10791079 // Check if the slot is already allocated by another active operation
1080- // If so, find the next available slot
1081- while (allocatedRecvSlots_.find (opData.baseSlot ) != allocatedRecvSlots_.end ()) {
1080+ // If so, find the next available slot that doesn't cause wrap-around
1081+ int attempts = 0 ;
1082+ while (attempts < static_cast <int >(kP2PNumSlots )) {
1083+ // Check if current slot is free and doesn't cause wrap-around
1084+ if (allocatedRecvSlots_.find (opData.baseSlot ) == allocatedRecvSlots_.end () &&
1085+ opData.baseSlot + opData.numSlotsNeeded <= static_cast <int >(kP2PNumSlots )) {
1086+ // Found a valid slot
1087+ break ;
1088+ }
1089+ // Try next slot, wrapping around if needed
10821090 opData.baseSlot = (opData.baseSlot + 1 ) % static_cast <int >(kP2PNumSlots );
1091+ attempts++;
10831092 }
10841093
1094+ TORCH_CHECK (attempts < static_cast <int >(kP2PNumSlots ),
1095+ " P2P recv: failed to find available slot after " , attempts, " attempts" );
1096+ TORCH_CHECK (opData.baseSlot + opData.numSlotsNeeded <= static_cast <int >(kP2PNumSlots ),
1097+ " P2P recv: slot range would exceed buffer: baseSlot=" , opData.baseSlot ,
1098+ " , numSlots=" , opData.numSlotsNeeded );
1099+
10851100 // Mark slot as allocated
10861101 allocatedRecvSlots_.insert (opData.baseSlot );
10871102
0 commit comments