33
44"""AIMD health tracker for the per-group scale-up token bucket.
55
6- A single per-group ``health: float`` in ``[probe_floor, 1.0]`` drives the
7- bucket's refill rate. State:
6+ A per-group ``health: float`` in ``[probe_floor, 1.0]`` drives the bucket's
7+ refill rate.
88
9- - Starts at ``1.0`` (fully healthy).
10- - On every failure event: ``health *= decay_per_failure`` (multiplicative
9+ - Each failure event applies ``health *= decay_per_failure`` (multiplicative
1110 decrease, clamped at ``probe_floor``).
12- - On every query/event: ``health += recovery_per_second * dt`` (additive
13- increase, clamped at ``1.0``). ``recovery_per_second`` is chosen so the
14- score climbs from ``probe_floor`` to ``1.0`` over ``recovery_duration``.
15-
16- This is AIMD — the same shape TCP congestion control uses. One isolated
17- failure throttles the group for a short time; sustained failures push it
18- to the floor; sustained success lets it climb back smoothly.
19-
20- The bucket's ``refill_rate`` is set to ``base_rate * health`` on every
21- ``try_acquire_scale_up`` call. Cadence — not batch size — is what gets
22- throttled. Each failure also caps the bucket's current token inventory
23- to ``capacity * health``, so a previously-healthy bucket that suddenly
24- degrades can't burst its accumulated tokens before the lowered refill
25- rate takes effect.
26-
27- The probe floor exists so a fully degraded zone still attempts at some low
28- rate — without it, zero attempts means zero samples and the score could
29- never climb back.
30-
31- Quota-exhaustion is a separate hard block (``record_quota_exceeded``).
32- Quota is categorical ("GCP said no") and recovers on GCP's timescale, not
33- ours, so it bypasses the health-score machinery entirely.
34-
35- In-flight ``created_at`` is tracked so a terminated slice can be classified
36- short-lived vs. long-lived by age. Only short-lived deaths (or BOOT_FAILED
37- at any age) feed the decay; long-lived preemptions are normal preemptible
38- behaviour and do nothing.
39-
40- State is in-memory; on controller restart ``health`` resets to ``1.0``.
11+ - Between events, ``health`` accrues additively so it climbs from
12+ ``probe_floor`` to ``1.0`` over ``recovery_duration``.
13+ - ``try_acquire_scale_up`` sets ``bucket.refill_rate = base_rate * health``
14+ and then takes a token, and each failure also caps the bucket inventory
15+ to ``capacity * health`` so an accumulated burst can't bypass the new
16+ rate.
17+
18+ The probe floor keeps a degraded zone probing at a non-zero rate so it can
19+ produce non-failure samples and recover.
20+
21+ Quota-exhaustion is a separate categorical block (``record_quota_exceeded``)
22+ that does not feed the health score.
23+
24+ In-flight ``created_at`` is tracked so a terminated slice's age determines
25+ whether the termination counts as a failure. State is in-memory; on
26+ controller restart ``health`` resets to ``1.0``.
4127"""
4228
4329from __future__ import annotations
5036# Time to recover from probe_floor back to 1.0 under no failures.
5137DEFAULT_RECOVERY_DURATION = Duration .from_hours (1 )
5238
53- # Multiplicative decay applied per failure. 0.7 → 1 failure leaves us at 70%,
54- # 3 failures at 34%, 5 failures at 17%. Aggressive but not crippling .
39+ # Multiplicative decay applied per failure. 0.7 → health drops to 70% after
40+ # one failure, 34% after three, 17% after five .
5541DEFAULT_DECAY_PER_FAILURE = 0.7
5642
57- # Floor on the effective scale-up rate, in attempts per minute. Even a fully
58- # degraded zone keeps probing at this rate so it can produce non-failure
59- # outcomes and let the score climb back. At base 16/min, 0.5/min = 3% floor.
43+ # Minimum effective scale-up rate in attempts/min, applied even when health
44+ # bottoms out so the group keeps producing non-failure samples.
6045DEFAULT_PROBE_FLOOR_PER_MINUTE = 0.5
6146
62- # Slices younger than this at termination count as failures; older ones are
63- # normal preemptible behaviour.
47+ # Slices younger than this at termination count as failures.
6448DEFAULT_SHORT_LIVED_THRESHOLD = Duration .from_minutes (5 )
6549
6650# How long a quota-exhausted error blocks scale-up before we attempt again.
@@ -80,11 +64,8 @@ class SliceFate(StrEnum):
8064
8165
8266class GroupHealth (StrEnum ):
83- """Display label derived from the current health score.
84-
85- Informational only — the detector throttles on the raw float, not on
86- these bands.
87- """
67+ """Display label derived from the current health score. Display only — the
68+ detector throttles on the raw float, not on these bands."""
8869
8970 HEALTHY = "healthy"
9071 SUSPECT = "suspect"
@@ -93,13 +74,7 @@ class GroupHealth(StrEnum):
9374
9475
9576class BackoffDetector :
96- """Per-group AIMD health tracker.
97-
98- Thread-safe. Failure events decay the score multiplicatively; recovery
99- accrues additively as wall time advances. ``try_acquire_scale_up``
100- folds the recovery, updates the bucket's refill rate from the current
101- score, and takes a token.
102- """
77+ """Per-group AIMD health tracker. Thread-safe."""
10378
10479 def __init__ (
10580 self ,
@@ -136,12 +111,10 @@ def __init__(
136111 self ._health : float = 1.0
137112 self ._last_tick : Timestamp | None = None
138113
139- # In-flight slice tracking for age classification at termination time.
140- # Does not contribute to the score; just needed so a long-lived slice's
141- # eventual preemption can be recognised as not-a-failure.
114+ # created_at by slice_id; used only to classify a termination by age.
142115 self ._inflight : dict [str , Timestamp ] = {}
143116
144- # Quota gate — separate from the AIMD state.
117+ # Categorical quota block, separate from the AIMD state.
145118 self ._quota_until : Deadline | None = None
146119 self ._quota_reason : str = ""
147120
@@ -152,22 +125,15 @@ def __init__(
152125 # -----------------------------------------------------------------------
153126
154127 def record_created (self , slice_id : str , created_at : Timestamp ) -> None :
155- """Record a successful slice create. Tracks ``created_at`` for later
156- age-at-death classification. Idempotent (keeps original timestamp)."""
128+ """Record a successful slice create. Idempotent."""
157129 with self ._lock :
158130 self ._inflight .setdefault (slice_id , created_at )
159131
160132 def record_terminated (self , slice_id : str , fate : SliceFate , terminated_at : Timestamp ) -> None :
161- """Record the end of a previously-created slice.
133+ """Record the termination of a previously-created slice.
162134
163- Counts as a failure (decays the score) when:
164- - ``fate == BOOT_FAILED`` (always — slice never produced useful work).
165- - ``fate == PREEMPTED`` and age < ``short_lived_threshold``.
166-
167- Long-lived preemptions are silent — the slice did useful work first.
168- If the slice was never seen by :meth:`record_created` (e.g. controller
169- restart), ``created_at`` defaults to ``terminated_at`` → age 0 →
170- short-lived → one decay applied.
135+ Decays the score on ``BOOT_FAILED`` at any age, or on ``PREEMPTED``
136+ if the slice was younger than ``short_lived_threshold``.
171137 """
172138 with self ._lock :
173139 created_at = self ._inflight .pop (slice_id , terminated_at )
@@ -179,26 +145,19 @@ def record_terminated(self, slice_id: str, fate: SliceFate, terminated_at: Times
179145 self ._apply_decay (terminated_at )
180146
181147 def record_create_failed (self , ts : Timestamp ) -> None :
182- """Record a CreateSlice RPC error (no slice handle was returned).
183-
184- Always a failure event — equivalent to a zero-age BOOT_FAILED.
185- """
148+ """Record a CreateSlice RPC error. Always decays the score."""
186149 with self ._lock :
187150 self ._apply_recovery (ts )
188151 self ._apply_decay (ts )
189152
190153 def record_quota_exceeded (self , reason : str , ts : Timestamp ) -> None :
191- """Record a quota error from GCP. Sets a fixed-duration hard block.
192-
193- Quota is categorical, not probabilistic — it does not feed the AIMD
194- score, just sets ``_quota_until``.
195- """
154+ """Set a fixed-duration quota block. Does not feed the AIMD score."""
196155 with self ._lock :
197156 self ._quota_until = Deadline .after (ts , self ._quota_block_duration )
198157 self ._quota_reason = reason
199158
200159 def clear_quota_block (self ) -> None :
201- """Clear the quota-block deadline (called on a proven successful create) ."""
160+ """Clear the quota-block deadline."""
202161 with self ._lock :
203162 self ._quota_until = None
204163 self ._quota_reason = ""
@@ -208,7 +167,7 @@ def clear_quota_block(self) -> None:
208167 # -----------------------------------------------------------------------
209168
210169 def health (self , now : Timestamp ) -> float :
211- """Current health score in ``[probe_floor, 1.0]`` after recovery to now."""
170+ """Health score in ``[probe_floor, 1.0]`` after applying recovery up to `` now`` ."""
212171 with self ._lock :
213172 self ._apply_recovery (now )
214173 return self ._health
@@ -225,21 +184,14 @@ def health_label(self, now: Timestamp) -> GroupHealth:
225184 return GroupHealth .HOSTILE
226185
227186 def can_scale_up (self , now : Timestamp ) -> bool :
228- """False only when inside the quota-block window.
229-
230- The health score throttles via the bucket's refill rate; it never
231- produces a hard block. Quota is the only categorical gate.
232- """
187+ """False only when inside the quota-block window."""
233188 deadline , _ = self ._snapshot_quota ()
234189 if deadline is not None and not deadline .expired (now = now ):
235190 return False
236191 return True
237192
238193 def try_acquire_scale_up (self , now : Timestamp ) -> bool :
239- """Apply recovery, set the bucket refill rate to ``base * health``, then acquire.
240-
241- Returns False on quota block or empty bucket.
242- """
194+ """Apply recovery, set ``bucket.refill_rate = base * health``, then take a token."""
243195 if not self .can_scale_up (now ):
244196 return False
245197 with self ._lock :
@@ -261,10 +213,7 @@ def quota_deadline(self, now: Timestamp) -> Timestamp | None:
261213 return deadline .as_timestamp ()
262214
263215 def status_label (self , now : Timestamp ) -> str :
264- """Human-readable status string for dashboards/logs.
265-
266- Format: ``"<label> health=<score>"``, e.g. ``"churning health=0.34"``.
267- """
216+ """``"<label> health=<score>"`` (e.g. ``"churning health=0.34"``) for dashboards/logs."""
268217 h = self .health (now )
269218 return f"{ self .health_label (now ).value } health={ h :.2f} "
270219
@@ -273,7 +222,7 @@ def status_label(self, now: Timestamp) -> str:
273222 # -----------------------------------------------------------------------
274223
275224 def _apply_recovery (self , now : Timestamp ) -> None :
276- """Accrue recovery from the last tick to ``now``."""
225+ """Accrue recovery from the last tick to ``now``, clamped at 1.0 ."""
277226 if self ._last_tick is None :
278227 self ._last_tick = now
279228 return
@@ -284,13 +233,7 @@ def _apply_recovery(self, now: Timestamp) -> None:
284233 self ._last_tick = now
285234
286235 def _apply_decay (self , now : Timestamp ) -> None :
287- """Apply one multiplicative decay and drain bucket inventory to match.
288-
289- Health drops to ``health * decay`` (clamped at the probe floor). The
290- bucket's accumulated tokens are then capped at ``capacity * health``,
291- so a previously-healthy group that suddenly degrades can't burst
292- through a full bucket before the new (lower) refill rate matters.
293- """
236+ """Multiply health by ``decay`` (clamped at the floor) and cap bucket inventory at ``capacity * health``."""
294237 self ._health = max (self ._floor , self ._health * self ._decay )
295238 self ._scale_up_bucket .cap_tokens (self ._scale_up_bucket .capacity * self ._health , now = now )
296239
0 commit comments