Skip to content

Commit 2df3485

Browse files
authored
Merge pull request #331 from diggerhq/fix/smart-drain-preserve-capacity
stop draining when near cap
2 parents 2adf723 + 3d79942 commit 2df3485

1 file changed

Lines changed: 31 additions & 0 deletions

File tree

internal/controlplane/scaler.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -883,6 +883,37 @@ func (s *Scaler) smartScaleDown(_ context.Context, region string, workers []*Wor
883883
return
884884
}
885885

886+
// Refuse to drain if doing so would leave zero workers with placement room.
887+
// The capacity reporter classifies a worker as "available" only when
888+
// committed_memory_mb * 100 / total_memory_mb < memPressureThresholdPct. Picking
889+
// the lightest-load worker as a drain target is fine when the rest of the
890+
// fleet has spare memory, but if every other worker is already over the
891+
// pressure line, draining the only low-pressure worker leaves the cell
892+
// reporting available_workers=0. The edge then rejects every new
893+
// /api/sandboxes with "no cells available with capacity" — exactly the
894+
// outage we hit on 2026-05-27 (4 workers, 2 overcommitted at 89%/137%,
895+
// scaler drained the only 2 healthy ones).
896+
availableAfter := 0
897+
for _, w := range workers {
898+
if w == nil || w.MachineID == target.MachineID {
899+
continue
900+
}
901+
if s.state.IsDraining(w.MachineID) {
902+
continue
903+
}
904+
if w.TotalMemoryMB <= 0 {
905+
continue
906+
}
907+
if (w.CommittedMemoryMB*100)/w.TotalMemoryMB < memPressureThresholdPct {
908+
availableAfter++
909+
}
910+
}
911+
if availableAfter < 1 {
912+
log.Printf("scaler: refusing smart drain of %s — would leave 0 workers under memory pressure threshold (%d%% committed cutoff); fleet has %d worker(s) but the rest are over the placement line",
913+
target.MachineID, memPressureThresholdPct, len(workers))
914+
return
915+
}
916+
886917
log.Printf("scaler: initiating smart drain of worker %s (machine=%s, sandboxes=%d)",
887918
target.ID, target.MachineID, target.Current)
888919

0 commit comments

Comments
 (0)