Skip to content

Commit a2614cf

Browse files
committed
Debug [skip ci]
1 parent cc5c476 commit a2614cf

1 file changed

Lines changed: 27 additions & 18 deletions

File tree

mooncake-ep/src/mooncake_backend.cpp

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1069,30 +1069,39 @@ void MooncakeBackend::advanceRecvOp(RecvOpStateData& opData) {
10691069
}
10701070
}
10711071

1072+
// If we can allocate based on sequence/capacity, check if there's actually a free slot
10721073
if (canAllocate) {
1073-
opData.state = RecvOpState::READY_TO_ALLOC;
1074+
// Try to find an available slot that doesn't conflict with other operations
1075+
int startSlot = opData.baseSlot;
1076+
int attempts = 0;
1077+
bool foundSlot = false;
1078+
1079+
while (attempts < static_cast<int>(kP2PNumSlots)) {
1080+
// Check if current slot is free and doesn't cause wrap-around
1081+
if (allocatedRecvSlots_.find(opData.baseSlot) == allocatedRecvSlots_.end() &&
1082+
opData.baseSlot + opData.numSlotsNeeded <= static_cast<int>(kP2PNumSlots)) {
1083+
// Found a valid slot
1084+
foundSlot = true;
1085+
break;
1086+
}
1087+
// Try next slot, wrapping around if needed
1088+
opData.baseSlot = (opData.baseSlot + 1) % static_cast<int>(kP2PNumSlots);
1089+
attempts++;
1090+
}
1091+
1092+
if (foundSlot) {
1093+
opData.state = RecvOpState::READY_TO_ALLOC;
1094+
}
1095+
// If no slot found, stay in WAITING_SLOT_AVAIL and wait for slots to be freed
10741096
}
10751097
break;
10761098
}
10771099

10781100
case RecvOpState::READY_TO_ALLOC: {
1079-
// Check if the slot is already allocated by another active operation
1080-
// If so, find the next available slot that doesn't cause wrap-around
1081-
int attempts = 0;
1082-
while (attempts < static_cast<int>(kP2PNumSlots)) {
1083-
// Check if current slot is free and doesn't cause wrap-around
1084-
if (allocatedRecvSlots_.find(opData.baseSlot) == allocatedRecvSlots_.end() &&
1085-
opData.baseSlot + opData.numSlotsNeeded <= static_cast<int>(kP2PNumSlots)) {
1086-
// Found a valid slot
1087-
break;
1088-
}
1089-
// Try next slot, wrapping around if needed
1090-
opData.baseSlot = (opData.baseSlot + 1) % static_cast<int>(kP2PNumSlots);
1091-
attempts++;
1092-
}
1093-
1094-
TORCH_CHECK(attempts < static_cast<int>(kP2PNumSlots),
1095-
"P2P recv: failed to find available slot after ", attempts, " attempts");
1101+
// At this point, we've already found a valid slot in WAITING_SLOT_AVAIL
1102+
// Just verify it's still available and allocate it
1103+
TORCH_CHECK(allocatedRecvSlots_.find(opData.baseSlot) == allocatedRecvSlots_.end(),
1104+
"P2P recv: slot ", opData.baseSlot, " was allocated by another operation");
10961105
TORCH_CHECK(opData.baseSlot + opData.numSlotsNeeded <= static_cast<int>(kP2PNumSlots),
10971106
"P2P recv: slot range would exceed buffer: baseSlot=", opData.baseSlot,
10981107
", numSlots=", opData.numSlotsNeeded);

0 commit comments

Comments
 (0)