Skip to content

Commit cc5c476

Browse files
committed
Debug [skip ci]
1 parent f83add6 commit cc5c476

1 file changed

Lines changed: 17 additions & 2 deletions

File tree

mooncake-ep/src/mooncake_backend.cpp

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1077,11 +1077,26 @@ void MooncakeBackend::advanceRecvOp(RecvOpStateData& opData) {
10771077

10781078
case RecvOpState::READY_TO_ALLOC: {
10791079
// Check if the slot is already allocated by another active operation
1080-
// If so, find the next available slot
1081-
while (allocatedRecvSlots_.find(opData.baseSlot) != allocatedRecvSlots_.end()) {
1080+
// If so, find the next available slot that doesn't cause wrap-around
1081+
int attempts = 0;
1082+
while (attempts < static_cast<int>(kP2PNumSlots)) {
1083+
// Check if current slot is free and doesn't cause wrap-around
1084+
if (allocatedRecvSlots_.find(opData.baseSlot) == allocatedRecvSlots_.end() &&
1085+
opData.baseSlot + opData.numSlotsNeeded <= static_cast<int>(kP2PNumSlots)) {
1086+
// Found a valid slot
1087+
break;
1088+
}
1089+
// Try next slot, wrapping around if needed
10821090
opData.baseSlot = (opData.baseSlot + 1) % static_cast<int>(kP2PNumSlots);
1091+
attempts++;
10831092
}
10841093

1094+
TORCH_CHECK(attempts < static_cast<int>(kP2PNumSlots),
1095+
"P2P recv: failed to find available slot after ", attempts, " attempts");
1096+
TORCH_CHECK(opData.baseSlot + opData.numSlotsNeeded <= static_cast<int>(kP2PNumSlots),
1097+
"P2P recv: slot range would exceed buffer: baseSlot=", opData.baseSlot,
1098+
", numSlots=", opData.numSlotsNeeded);
1099+
10851100
// Mark slot as allocated
10861101
allocatedRecvSlots_.insert(opData.baseSlot);
10871102

0 commit comments

Comments
 (0)