Merge pull request #331 from diggerhq/fix/smart-drain-preserve-capacity

breardon2011 · web-flow · commit 2df34854feca · 2026-05-27T12:45:18.000-07:00
stop draining when near cap
diff --git a/internal/controlplane/scaler.go b/internal/controlplane/scaler.go
@@ -883,6 +883,37 @@ func (s *Scaler) smartScaleDown(_ context.Context, region string, workers []*Wor
 		return
 	}
 
+	// Refuse to drain if doing so would leave zero workers with placement room.
+	// The capacity reporter classifies a worker as "available" only when
+	// committed_memory_mb / total_memory_mb < memPressureThresholdPct. Picking
+	// the lightest-load worker as a drain target is fine when the rest of the
+	// fleet has spare memory, but if every other worker is already over the
+	// pressure line, draining the only low-pressure worker leaves the cell
+	// reporting available_workers=0. The edge then rejects every new
+	// /api/sandboxes with "no cells available with capacity" — exactly the
+	// outage we hit on 2026-05-27 (4 workers, 2 overcommitted at 89%/137%,
+	// scaler drained the only 2 healthy ones).
+	availableAfter := 0
+	for _, w := range workers {
+		if w == nil || w.MachineID == target.MachineID {
+			continue
+		}
+		if s.state.IsDraining(w.MachineID) {
+			continue
+		}
+		if w.TotalMemoryMB <= 0 {
+			continue
+		}
+		if (w.CommittedMemoryMB*100)/w.TotalMemoryMB < memPressureThresholdPct {
+			availableAfter++
+		}
+	}
+	if availableAfter < 1 {
+		log.Printf("scaler: refusing smart drain of %s — would leave 0 workers under memory pressure threshold (%d%% committed cutoff); fleet has %d worker(s) but the rest are over the placement line",
+			target.MachineID, memPressureThresholdPct, len(workers))
+		return
+	}
+
 	log.Printf("scaler: initiating smart drain of worker %s (machine=%s, sandboxes=%d)",
 		target.ID, target.MachineID, target.Current)