You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
// the lightest-load worker as a drain target is fine when the rest of the
890
+
// fleet has spare memory, but if every other worker is already over the
891
+
// pressure line, draining the only low-pressure worker leaves the cell
892
+
// reporting available_workers=0. The edge then rejects every new
893
+
// /api/sandboxes with "no cells available with capacity" — exactly the
894
+
// outage we hit on 2026-05-27 (4 workers, 2 overcommitted at 89%/137%,
895
+
// scaler drained the only 2 healthy ones).
896
+
availableAfter:=0
897
+
for_, w:=rangeworkers {
898
+
ifw==nil||w.MachineID==target.MachineID {
899
+
continue
900
+
}
901
+
ifs.state.IsDraining(w.MachineID) {
902
+
continue
903
+
}
904
+
ifw.TotalMemoryMB<=0 {
905
+
continue
906
+
}
907
+
if (w.CommittedMemoryMB*100)/w.TotalMemoryMB<memPressureThresholdPct {
908
+
availableAfter++
909
+
}
910
+
}
911
+
ifavailableAfter<1 {
912
+
log.Printf("scaler: refusing smart drain of %s — would leave 0 workers under memory pressure threshold (%d%% committed cutoff); fleet has %d worker(s) but the rest are over the placement line",
0 commit comments