@@ -1069,30 +1069,39 @@ void MooncakeBackend::advanceRecvOp(RecvOpStateData& opData) {
10691069 }
10701070 }
10711071
1072+ // If we can allocate based on sequence/capacity, check if there's actually a free slot
10721073 if (canAllocate) {
1073- opData.state = RecvOpState::READY_TO_ALLOC ;
1074+ // Try to find an available slot that doesn't conflict with other operations
1075+ int startSlot = opData.baseSlot ;
1076+ int attempts = 0 ;
1077+ bool foundSlot = false ;
1078+
1079+ while (attempts < static_cast <int >(kP2PNumSlots )) {
1080+ // Check if current slot is free and doesn't cause wrap-around
1081+ if (allocatedRecvSlots_.find (opData.baseSlot ) == allocatedRecvSlots_.end () &&
1082+ opData.baseSlot + opData.numSlotsNeeded <= static_cast <int >(kP2PNumSlots )) {
1083+ // Found a valid slot
1084+ foundSlot = true ;
1085+ break ;
1086+ }
1087+ // Try next slot, wrapping around if needed
1088+ opData.baseSlot = (opData.baseSlot + 1 ) % static_cast <int >(kP2PNumSlots );
1089+ attempts++;
1090+ }
1091+
1092+ if (foundSlot) {
1093+ opData.state = RecvOpState::READY_TO_ALLOC ;
1094+ }
1095+ // If no slot found, stay in WAITING_SLOT_AVAIL and wait for slots to be freed
10741096 }
10751097 break ;
10761098 }
10771099
10781100 case RecvOpState::READY_TO_ALLOC : {
1079- // Check if the slot is already allocated by another active operation
1080- // If so, find the next available slot that doesn't cause wrap-around
1081- int attempts = 0 ;
1082- while (attempts < static_cast <int >(kP2PNumSlots )) {
1083- // Check if current slot is free and doesn't cause wrap-around
1084- if (allocatedRecvSlots_.find (opData.baseSlot ) == allocatedRecvSlots_.end () &&
1085- opData.baseSlot + opData.numSlotsNeeded <= static_cast <int >(kP2PNumSlots )) {
1086- // Found a valid slot
1087- break ;
1088- }
1089- // Try next slot, wrapping around if needed
1090- opData.baseSlot = (opData.baseSlot + 1 ) % static_cast <int >(kP2PNumSlots );
1091- attempts++;
1092- }
1093-
1094- TORCH_CHECK (attempts < static_cast <int >(kP2PNumSlots ),
1095- " P2P recv: failed to find available slot after " , attempts, " attempts" );
1101+ // At this point, we've already found a valid slot in WAITING_SLOT_AVAIL
1102+ // Just verify it's still available and allocate it
1103+ TORCH_CHECK (allocatedRecvSlots_.find (opData.baseSlot ) == allocatedRecvSlots_.end (),
1104+ " P2P recv: slot " , opData.baseSlot , " was allocated by another operation" );
10961105 TORCH_CHECK (opData.baseSlot + opData.numSlotsNeeded <= static_cast <int >(kP2PNumSlots ),
10971106 " P2P recv: slot range would exceed buffer: baseSlot=" , opData.baseSlot ,
10981107 " , numSlots=" , opData.numSlotsNeeded );
0 commit comments