marin-community
diff --git a/‎lib/iris/src/iris/cluster/controller/autoscaler/backoff_detector.py‎
Lines changed: 42 additions & 99 deletions b/‎lib/iris/src/iris/cluster/controller/autoscaler/backoff_detector.py‎
Lines changed: 42 additions & 99 deletions
diff --git a/‎lib/iris/src/iris/cluster/controller/autoscaler/planning.py‎
Lines changed: 4 additions & 9 deletions b/‎lib/iris/src/iris/cluster/controller/autoscaler/planning.py‎
Lines changed: 4 additions & 9 deletions
@@ -3,41 +3,27 @@
 
 """AIMD health tracker for the per-group scale-up token bucket.
 
-A single per-group ``health: float`` in ``[probe_floor, 1.0]`` drives the
-bucket's refill rate. State:
+A per-group ``health: float`` in ``[probe_floor, 1.0]`` drives the bucket's
+refill rate.
 
-- Starts at ``1.0`` (fully healthy).
-- On every failure event: ``health *= decay_per_failure`` (multiplicative
+- Each failure event applies ``health *= decay_per_failure`` (multiplicative
   decrease, clamped at ``probe_floor``).
-- On every query/event: ``health += recovery_per_second * dt`` (additive
-  increase, clamped at ``1.0``). ``recovery_per_second`` is chosen so the
-  score climbs from ``probe_floor`` to ``1.0`` over ``recovery_duration``.
-
-This is AIMD — the same shape TCP congestion control uses. One isolated
-failure throttles the group for a short time; sustained failures push it
-to the floor; sustained success lets it climb back smoothly.
-
-The bucket's ``refill_rate`` is set to ``base_rate * health`` on every
-``try_acquire_scale_up`` call. Cadence — not batch size — is what gets
-throttled. Each failure also caps the bucket's current token inventory
-to ``capacity * health``, so a previously-healthy bucket that suddenly
-degrades can't burst its accumulated tokens before the lowered refill
-rate takes effect.
-
-The probe floor exists so a fully degraded zone still attempts at some low
-rate — without it, zero attempts means zero samples and the score could
-never climb back.
-
-Quota-exhaustion is a separate hard block (``record_quota_exceeded``).
-Quota is categorical ("GCP said no") and recovers on GCP's timescale, not
-ours, so it bypasses the health-score machinery entirely.
-
-In-flight ``created_at`` is tracked so a terminated slice can be classified
-short-lived vs. long-lived by age. Only short-lived deaths (or BOOT_FAILED
-at any age) feed the decay; long-lived preemptions are normal preemptible
-behaviour and do nothing.
-
-State is in-memory; on controller restart ``health`` resets to ``1.0``.
+- Between events, ``health`` accrues additively so it climbs from
+  ``probe_floor`` to ``1.0`` over ``recovery_duration``.
+- ``try_acquire_scale_up`` sets ``bucket.refill_rate = base_rate * health``
+  and then takes a token, and each failure also caps the bucket inventory
+  to ``capacity * health`` so an accumulated burst can't bypass the new
+  rate.
+
+The probe floor keeps a degraded zone probing at a non-zero rate so it can
+produce non-failure samples and recover.
+
+Quota-exhaustion is a separate categorical block (``record_quota_exceeded``)
+that does not feed the health score.
+
+In-flight ``created_at`` is tracked so a terminated slice's age determines
+whether the termination counts as a failure. State is in-memory; on
+controller restart ``health`` resets to ``1.0``.
 """
 
 from __future__ import annotations
@@ -50,17 +36,15 @@
 # Time to recover from probe_floor back to 1.0 under no failures.
 DEFAULT_RECOVERY_DURATION = Duration.from_hours(1)
 
-# Multiplicative decay applied per failure. 0.7 → 1 failure leaves us at 70%,
-# 3 failures at 34%, 5 failures at 17%. Aggressive but not crippling.
+# Multiplicative decay applied per failure. 0.7 → health drops to 70% after
+# one failure, 34% after three, 17% after five.
 DEFAULT_DECAY_PER_FAILURE = 0.7
 
-# Floor on the effective scale-up rate, in attempts per minute. Even a fully
-# degraded zone keeps probing at this rate so it can produce non-failure
-# outcomes and let the score climb back. At base 16/min, 0.5/min = 3% floor.
+# Minimum effective scale-up rate in attempts/min, applied even when health
+# bottoms out so the group keeps producing non-failure samples.
 DEFAULT_PROBE_FLOOR_PER_MINUTE = 0.5
 
-# Slices younger than this at termination count as failures; older ones are
-# normal preemptible behaviour.
+# Slices younger than this at termination count as failures.
 DEFAULT_SHORT_LIVED_THRESHOLD = Duration.from_minutes(5)
 
 # How long a quota-exhausted error blocks scale-up before we attempt again.
@@ -80,11 +64,8 @@ class SliceFate(StrEnum):
 
 
 class GroupHealth(StrEnum):
-    """Display label derived from the current health score.
-
-    Informational only — the detector throttles on the raw float, not on
-    these bands.
-    """
+    """Display label derived from the current health score. Display only — the
+    detector throttles on the raw float, not on these bands."""
 
     HEALTHY = "healthy"
     SUSPECT = "suspect"
@@ -93,13 +74,7 @@ class GroupHealth(StrEnum):
 
 
 class BackoffDetector:
-    """Per-group AIMD health tracker.
-
-    Thread-safe. Failure events decay the score multiplicatively; recovery
-    accrues additively as wall time advances. ``try_acquire_scale_up``
-    folds the recovery, updates the bucket's refill rate from the current
-    score, and takes a token.
-    """
+    """Per-group AIMD health tracker. Thread-safe."""
 
     def __init__(
         self,
@@ -136,12 +111,10 @@ def __init__(
         self._health: float = 1.0
         self._last_tick: Timestamp | None = None
 
-        # In-flight slice tracking for age classification at termination time.
-        # Does not contribute to the score; just needed so a long-lived slice's
-        # eventual preemption can be recognised as not-a-failure.
+        # created_at by slice_id; used only to classify a termination by age.
         self._inflight: dict[str, Timestamp] = {}
 
-        # Quota gate — separate from the AIMD state.
+        # Categorical quota block, separate from the AIMD state.
         self._quota_until: Deadline | None = None
         self._quota_reason: str = ""
 
@@ -152,22 +125,15 @@ def __init__(
     # -----------------------------------------------------------------------
 
     def record_created(self, slice_id: str, created_at: Timestamp) -> None:
-        """Record a successful slice create. Tracks ``created_at`` for later
-        age-at-death classification. Idempotent (keeps original timestamp)."""
+        """Record a successful slice create. Idempotent."""
         with self._lock:
             self._inflight.setdefault(slice_id, created_at)
 
     def record_terminated(self, slice_id: str, fate: SliceFate, terminated_at: Timestamp) -> None:
-        """Record the end of a previously-created slice.
+        """Record the termination of a previously-created slice.
 
-        Counts as a failure (decays the score) when:
-        - ``fate == BOOT_FAILED`` (always — slice never produced useful work).
-        - ``fate == PREEMPTED`` and age < ``short_lived_threshold``.
-
-        Long-lived preemptions are silent — the slice did useful work first.
-        If the slice was never seen by :meth:`record_created` (e.g. controller
-        restart), ``created_at`` defaults to ``terminated_at`` → age 0 →
-        short-lived → one decay applied.
+        Decays the score on ``BOOT_FAILED`` at any age, or on ``PREEMPTED``
+        if the slice was younger than ``short_lived_threshold``.
         """
         with self._lock:
             created_at = self._inflight.pop(slice_id, terminated_at)
@@ -179,26 +145,19 @@ def record_terminated(self, slice_id: str, fate: SliceFate, terminated_at: Times
                 self._apply_decay(terminated_at)
 
     def record_create_failed(self, ts: Timestamp) -> None:
-        """Record a CreateSlice RPC error (no slice handle was returned).
-
-        Always a failure event — equivalent to a zero-age BOOT_FAILED.
-        """
+        """Record a CreateSlice RPC error. Always decays the score."""
         with self._lock:
             self._apply_recovery(ts)
             self._apply_decay(ts)
 
     def record_quota_exceeded(self, reason: str, ts: Timestamp) -> None:
-        """Record a quota error from GCP. Sets a fixed-duration hard block.
-
-        Quota is categorical, not probabilistic — it does not feed the AIMD
-        score, just sets ``_quota_until``.
-        """
+        """Set a fixed-duration quota block. Does not feed the AIMD score."""
         with self._lock:
             self._quota_until = Deadline.after(ts, self._quota_block_duration)
             self._quota_reason = reason
 
     def clear_quota_block(self) -> None:
-        """Clear the quota-block deadline (called on a proven successful create)."""
+        """Clear the quota-block deadline."""
         with self._lock:
             self._quota_until = None
             self._quota_reason = ""
@@ -208,7 +167,7 @@ def clear_quota_block(self) -> None:
     # -----------------------------------------------------------------------
 
     def health(self, now: Timestamp) -> float:
-        """Current health score in ``[probe_floor, 1.0]`` after recovery to now."""
+        """Health score in ``[probe_floor, 1.0]`` after applying recovery up to ``now``."""
         with self._lock:
             self._apply_recovery(now)
             return self._health
@@ -225,21 +184,14 @@ def health_label(self, now: Timestamp) -> GroupHealth:
         return GroupHealth.HOSTILE
 
     def can_scale_up(self, now: Timestamp) -> bool:
-        """False only when inside the quota-block window.
-
-        The health score throttles via the bucket's refill rate; it never
-        produces a hard block. Quota is the only categorical gate.
-        """
+        """False only when inside the quota-block window."""
         deadline, _ = self._snapshot_quota()
         if deadline is not None and not deadline.expired(now=now):
             return False
         return True
 
     def try_acquire_scale_up(self, now: Timestamp) -> bool:
-        """Apply recovery, set the bucket refill rate to ``base * health``, then acquire.
-
-        Returns False on quota block or empty bucket.
-        """
+        """Apply recovery, set ``bucket.refill_rate = base * health``, then take a token."""
         if not self.can_scale_up(now):
             return False
         with self._lock:
@@ -261,10 +213,7 @@ def quota_deadline(self, now: Timestamp) -> Timestamp | None:
         return deadline.as_timestamp()
 
     def status_label(self, now: Timestamp) -> str:
-        """Human-readable status string for dashboards/logs.
-
-        Format: ``"<label> health=<score>"``, e.g. ``"churning health=0.34"``.
-        """
+        """``"<label> health=<score>"`` (e.g. ``"churning health=0.34"``) for dashboards/logs."""
         h = self.health(now)
         return f"{self.health_label(now).value} health={h:.2f}"
 
@@ -273,7 +222,7 @@ def status_label(self, now: Timestamp) -> str:
     # -----------------------------------------------------------------------
 
     def _apply_recovery(self, now: Timestamp) -> None:
-        """Accrue recovery from the last tick to ``now``."""
+        """Accrue recovery from the last tick to ``now``, clamped at 1.0."""
         if self._last_tick is None:
             self._last_tick = now
             return
@@ -284,13 +233,7 @@ def _apply_recovery(self, now: Timestamp) -> None:
         self._last_tick = now
 
     def _apply_decay(self, now: Timestamp) -> None:
-        """Apply one multiplicative decay and drain bucket inventory to match.
-
-        Health drops to ``health * decay`` (clamped at the probe floor). The
-        bucket's accumulated tokens are then capped at ``capacity * health``,
-        so a previously-healthy group that suddenly degrades can't burst
-        through a full bucket before the new (lower) refill rate matters.
-        """
+        """Multiply health by ``decay`` (clamped at the floor) and cap bucket inventory at ``capacity * health``."""
         self._health = max(self._floor, self._health * self._decay)
         self._scale_up_bucket.cap_tokens(self._scale_up_bucket.capacity * self._health, now=now)
 
 
@@ -93,15 +93,10 @@ def decisions(self) -> list[ScalingDecision]:
 
 
 def build_group_scale_plan(group: ScalingGroup, required_slices: int, ts: Timestamp) -> GroupScalePlan:
-    """Build the actionable scale-up plan for a group.
-
-    ``slices_to_add`` is the honest desired count for this tick — what we
-    would launch if nothing were rate-limiting us. The actual throttle
-    happens later in ``runtime.execute``, where each launch must acquire
-    a token from the detector's scale-up bucket; the bucket's refill rate
-    is set to ``base_rate * health`` on every acquire, so degraded groups
-    still get a plan here every tick but only succeed at acquiring a
-    token at the AIMD-modulated cadence.
+    """Build a scale-up plan for one group.
+
+    ``slices_to_add`` is the desired launch count for this tick before any
+    rate-limiting. Token-bucket throttling is applied at execution time.
     """
 
     counts = GroupSliceCounts.from_group(group)