Skip to content

Commit e52e92c

Browse files
authored
Instant ttft oversaturation (#607)
## Summary When over-saturation detection is enabled (`--detect-saturation`), the constraint can only receive TTFT data after a request fully completes. With large models and long contexts, no request completes within the `minimum_duration` window (default 30s), so the constraint falls back to concurrent slope alone and stops prematurely. This PR adds time-bounded instant TTFT notifications: when over-saturation detection is enabled, worker processes monitor for first-token arrival during streaming and send a `"first_token"` status update before the request completes. This gives the constraint real TTFT data for a two-signal decision. Notifications are sent only during the first `minimum_duration` seconds of the benchmark to limit IPC overhead. ## Details - [x] Add `"first_token"` to `RequestInfo.status` literal (`schemas/info.py`) - [x] Add TTFT polling monitor to `WorkerProcess` — spawns an async task per request that detects `first_token_iteration` and sends a `"first_token"` update (`scheduler/worker.py`) - [x] Time-bound the monitor: notifications stop after `minimum_duration` seconds via `instant_ttft_duration` (`scheduler/worker.py`) - [x] Handle `"first_token"` in `WorkerGroupState` — no request count changes, passes through to constraints (`scheduler/worker_group.py`) - [x] Extract `minimum_duration` from `OverSaturationConstraint` to configure worker TTFT duration (`scheduler/worker_group.py`) - [x] Accept TTFT from both `"first_token"` and `"completed"` in the constraint, deduplicated by request ID (`scheduler/constraints/saturation.py`) - [x] Add 8 tests covering happy path, dedup, missing timings, backward compatibility, concurrent isolation, disabled mode, reset, and multi-request slope building ## Test Plan - Run `pytest tests/unit/scheduler/ tests/unit/schemas/ tests/unit/backends/` — 1077 passed - Run `pre-commit run --files` on changed files — all checks pass - Verify with `--detect-saturation` on a large model with long context (>10k tokens) that the benchmark no longer stops prematurely
## Related Issues - Resolves #606 --- - [x] "I certify that all code in this PR is my own, except as noted below." ## Use of AI - [x] Includes AI-assisted code completion - [x] Includes code generated by an AI application - [x] Includes AI-generated tests (NOTE: AI-written tests should have a docstring that includes `## WRITTEN BY AI ##`)
2 parents 4f9d266 + f9267cb commit e52e92c

File tree

7 files changed

+327
-17
lines changed

7 files changed

+327
-17
lines changed

src/guidellm/backends/openai/http.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ async def resolve( # type: ignore[override]
279279
request: GenerationRequest,
280280
request_info: RequestInfo,
281281
history: list[tuple[GenerationRequest, GenerationResponse]] | None = None,
282-
) -> AsyncIterator[tuple[GenerationResponse, RequestInfo]]:
282+
) -> AsyncIterator[tuple[GenerationResponse | None, RequestInfo]]:
283283
"""
284284
Process generation request and yield progressive responses.
285285
@@ -377,6 +377,7 @@ async def resolve( # type: ignore[override]
377377
if request_info.timings.first_token_iteration is None:
378378
request_info.timings.first_token_iteration = iter_time
379379
request_info.timings.token_iterations = 0
380+
yield None, request_info
380381

381382
request_info.timings.last_token_iteration = iter_time
382383
request_info.timings.token_iterations += iterations

src/guidellm/scheduler/constraints/saturation.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,7 @@ def reset(self) -> None:
367367
self.ttft_violations_counter = 0
368368
self.total_finished_ever = 0
369369
self.total_started_ever = 0
370+
self._ttft_reported_request_ids: set[str] = set()
370371
self.concurrent_slope_checker = SlopeChecker(
371372
moe_threshold=self.moe_threshold, confidence=self.confidence, eps=self.eps
372373
)
@@ -519,17 +520,19 @@ def __call__(
519520
self._add_started(
520521
{"concurrent_requests": concurrent_requests, "duration": duration}
521522
)
522-
elif (
523-
request_info.status == "completed"
524-
and request_info.timings
525-
and request_info.timings.first_token_iteration
526-
and request_info.timings.request_start
527-
):
528-
ttft = (
529-
request_info.timings.first_token_iteration
530-
- request_info.timings.request_start
531-
)
532-
self._add_finished({"ttft": ttft, "duration": duration})
523+
elif request_info.status in ("first_token", "completed"):
524+
if (
525+
request_info.request_id not in self._ttft_reported_request_ids
526+
and request_info.timings
527+
and request_info.timings.first_token_iteration
528+
and request_info.timings.request_start
529+
):
530+
self._ttft_reported_request_ids.add(request_info.request_id)
531+
ttft = (
532+
request_info.timings.first_token_iteration
533+
- request_info.timings.request_start
534+
)
535+
self._add_finished({"ttft": ttft, "duration": duration})
533536

534537
self._update_duration(duration)
535538
is_over_saturated = self._check_alert()

src/guidellm/scheduler/schemas.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,14 +119,15 @@ async def resolve(
119119
request: RequestT,
120120
request_info: RequestInfo,
121121
history: list[tuple[RequestT, ResponseT]] | None = None,
122-
) -> AsyncIterator[tuple[ResponseT, RequestInfo]]:
122+
) -> AsyncIterator[tuple[ResponseT | None, RequestInfo]]:
123123
"""
124124
Process a request and yield incremental response updates.
125125
126126
:param request: The request object to process
127127
:param request_info: Scheduling metadata and timing information
128128
:param history: Conversation history for multi-turn requests
129-
:yield: Tuples of (response, updated_request_info) for each response chunk
129+
:yield: Tuples of (response, updated_request_info) for each response chunk.
130+
Response may be None for intermediate updates (e.g., first token arrival).
130131
:raises Exception: Implementation-specific exceptions for processing failures
131132
"""
132133

src/guidellm/scheduler/worker.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -365,11 +365,18 @@ async def _process_next_request(self, target_start: float):
365365
async for resp, info in self.backend.resolve( # type: ignore[attr-defined]
366366
request, request_info, None
367367
):
368-
response = resp
369368
request_info = info
370369
if request_info is None:
371370
raise RuntimeError("Received invalid request info from backend")
372371

372+
if (
373+
resp is None
374+
and request_info.timings.first_token_iteration is not None
375+
):
376+
self._send_update("first_token", None, request, request_info)
377+
378+
response = resp
379+
373380
# Complete the request
374381
request_info.timings.resolve_end = time.time()
375382
self._send_update("completed", response, request, request_info)
@@ -428,7 +435,12 @@ async def _schedule_request(
428435
def _send_update(
429436
self,
430437
new_status: Literal[
431-
"pending", "in_progress", "completed", "errored", "cancelled"
438+
"pending",
439+
"in_progress",
440+
"first_token",
441+
"completed",
442+
"errored",
443+
"cancelled",
432444
],
433445
response: ResponseT | None,
434446
request: RequestT | MultiTurnRequestT[RequestT],

src/guidellm/scheduler/worker_group.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,8 @@ def _update_state_request_counts(self, info: RequestInfo):
643643
self._state.pending_requests = len(self._pending_request_ids)
644644
self._processing_request_ids.add(info.request_id)
645645
self._state.processing_requests = len(self._processing_request_ids)
646+
elif info.status == "first_token":
647+
pass
646648
elif info.status == "completed":
647649
info.timings.finalized = finalized
648650
self._processing_request_ids.remove(info.request_id)

src/guidellm/schemas/info.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,13 @@ class RequestInfo(StandardBaseModel):
126126
default_factory=lambda: str(uuid.uuid4()),
127127
)
128128
status: Literal[
129-
"queued", "pending", "in_progress", "completed", "errored", "cancelled"
129+
"queued",
130+
"pending",
131+
"in_progress",
132+
"first_token",
133+
"completed",
134+
"errored",
135+
"cancelled",
130136
] = Field(description="Current processing status of the request", default="queued")
131137
scheduler_node_id: int = Field(
132138
description="ID/rank of the scheduler node handling the request",

0 commit comments

Comments
 (0)