fix(metrics): raise metrics drain-timeout default to 300s

viraatc · claude · viraatc · commit 8f547af3faa8 · 2026-06-10T20:42:28.000-07:00
A 1M-sample run holds ~2M deferred tokenizations at ENDED; the drain
fans the whole buffer into one encode_batch per shard, so a 60s budget
expires before any chunk returns and the entire backlog is dropped.
300s covers 1M-sample runs with headroom.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/AGENTS.md b/AGENTS.md
@@ -115,7 +115,7 @@ The aggregator is a separate process (`python -m inference_endpoint.async_utils.
 
 - **Series storage**: each `SeriesSampler` keeps three parallel views: O(1) cheap rollups (count/total/min/max/sum_sq, exact), an HDR Histogram (cheap live percentiles), and an in-memory `array.array` of raw values (for exact percentiles in the `COMPLETE` snapshot). Hot path is `registry.record(name, value)` — no allocation, no I/O.
 - **Counter API**: `registry.increment(name, delta=1)` for sample-event counters. `registry.set_counter(name, value)` only for the two duration counters (`total_duration_ns` max-of-elapsed, `tracked_duration_ns` sum-of-blocks).
-- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by the `--drain-timeout` budget — schema default 60 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly.
+- **Lifecycle**: `INITIALIZE` (constructed, awaiting first `STARTED`) → `LIVE` (run in progress, ticking every `--publish-interval` seconds) → `DRAINING` (set on `ENDED`; tick continues; bounded by the `--drain-timeout` budget — schema default 300 s) → terminal: `COMPLETE` (clean end via `publish_final`, exact stats) **or** `INTERRUPTED` (signal-handler-triggered final via SIGTERM/SIGINT; best-effort partial stats). Drain timeout detected by consumers as `state == COMPLETE and n_pending_tasks > 0`; interrupted runs are detected as `state == INTERRUPTED` directly.
 - **Final delivery is dual-path with separated concerns**: `publish_final` atomically writes `final_snapshot.json` (`tmp + fsync(file) + rename + fsync(parent_dir)`) — this is the **primary** Report source — AND emits the terminal-state snapshot over pub/sub as a TUI shutdown signal. Each path is wrapped in its own try/except so one failure cannot suppress the other. Main process consumer reads `final_snapshot.json` (via `json.loads` to dict, no Struct decode); falls back to the subscriber's `latest` live snapshot only if the file is missing (e.g. SIGKILL / OOM before the signal handler ran). The dict form is the canonical consumer contract (see `snapshot_to_dict`).
 - **Histogram bucket edges are dynamic per snapshot**: log-spaced over the observed `[min, max]`. Bucket count is fixed at construction; consumers MUST re-render from the snapshot's `(lo, hi, count)` triples each frame and MUST NOT track bucket-by-index across snapshots.
 
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/aggregator.py
@@ -124,7 +124,7 @@ def __init__(
     ):
         # drain_timeout_s is injected (not derived) because the right
         # value is workload-dependent: long-context tokenize-heavy runs
-        # need more headroom than the schema default 60 s, and the
+        # need more headroom than the schema default 300 s, and the
         # aggregator itself can't measure that ahead of time. Keeping it
         # as an arg lets the __main__ CLI flag plumb the user's choice
         # through without coupling this class to argparse.
diff --git a/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py b/src/inference_endpoint/async_utils/services/metrics_aggregator/snapshot.py
@@ -45,7 +45,7 @@ class SessionState(str, Enum):
     LIVE        → run in progress; tick task publishing live HDR-derived stats.
     DRAINING    → ``SessionEventType.ENDED`` has been received; the aggregator
                   is tokenizing the buffered samples (bounded by the
-                  ``--drain-timeout`` budget — schema default 60 s). Tick task
+                  ``--drain-timeout`` budget — schema default 300 s). Tick task
                   continues at this stage, still HDR-derived; no new events
                   will arrive.
     COMPLETE    → terminal clean state. The ``publish_final()`` snapshot
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
@@ -558,11 +558,11 @@ class DrainConfig(BaseModel):
             ),
         ),
     ] = Field(
-        60.0,
+        300.0,
         ge=0,
         description=(
             "Wall-clock budget (seconds) to finish tokenizing buffered samples "
-            "after ENDED (default: 60.0; 0 = unlimited)."
+            "after ENDED (default: 300.0; 0 = unlimited)."
         ),
     )
     metrics_tokenizer_workers: Annotated[
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
+    metrics_drain_timeout_s: 300.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited).
     metrics_tokenizer_workers: 2  # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain).
   warmup:
     enabled: false  # Enable warmup phase before performance run
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
+    metrics_drain_timeout_s: 300.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited).
     metrics_tokenizer_workers: 2  # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain).
   warmup:
     enabled: false  # Enable warmup phase before performance run
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -79,7 +79,7 @@ settings:
     warmup_timeout_s: 240.0  # Warmup drain timeout in seconds (None = wait indefinitely)
     performance_timeout_s: 240.0  # Performance drain timeout in seconds (None = wait indefinitely)
     accuracy_timeout_s: null  # Accuracy drain timeout in seconds (None = wait indefinitely)
-    metrics_drain_timeout_s: 60.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 60.0; 0 = unlimited).
+    metrics_drain_timeout_s: 300.0  # Wall-clock budget (seconds) to finish tokenizing buffered samples after ENDED (default: 300.0; 0 = unlimited).
     metrics_tokenizer_workers: 2  # In-process tokenizer threads for live (mid-run) ISL/OSL/TPOT (default: 2; 0 = defer everything to the end-of-run drain).
   warmup:
     enabled: false  # Enable warmup phase before performance run
diff --git a/tests/unit/commands/test_benchmark.py b/tests/unit/commands/test_benchmark.py
@@ -489,7 +489,7 @@ def test_defaults(self):
         assert cfg.warmup_timeout_s == 240.0
         assert cfg.performance_timeout_s == 240.0
         assert cfg.accuracy_timeout_s is None
-        assert cfg.metrics_drain_timeout_s == 60.0
+        assert cfg.metrics_drain_timeout_s == 300.0
 
     @pytest.mark.unit
     @pytest.mark.parametrize(