
Commit b3e167f

Fixes from reviews and types/style

1 parent 73e711f commit b3e167f

12 files changed: +123 -90 lines changed


pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -129,6 +129,7 @@ dev = [
     "mdformat-gfm~=0.3.6",
 
     # type-checking
+    "pandas-stubs",
     "types-PyYAML~=6.0.1",
     "types-requests~=2.32.0",
     "types-toml",

src/guidellm/backends/openai.py

Lines changed: 10 additions & 11 deletions

@@ -54,7 +54,7 @@ class OpenAIHTTPBackend(Backend):
     def __init__(
         self,
         target: str,
-        model: str | None = None,
+        model: str = "",
         api_routes: dict[str, str] | None = None,
         response_handlers: dict[str, Any] | None = None,
         timeout: float = 60.0,
@@ -192,7 +192,7 @@ async def available_models(self) -> list[str]:
 
         return [item["id"] for item in response.json()["data"]]
 
-    async def default_model(self) -> str | None:
+    async def default_model(self) -> str:
         """
         Get the default model for this backend.
 
@@ -202,9 +202,9 @@ async def default_model(self) -> str | None:
             return self.model
 
         models = await self.available_models()
-        return models[0] if models else None
+        return models[0] if models else ""
 
-    async def resolve(
+    async def resolve(  # type: ignore[override]
         self,
         request: GenerationRequest,
         request_info: RequestInfo,
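
Note: replacing `str | None` with `str` and an empty-string sentinel keeps the signatures uniformly typed; callers test truthiness instead of `is None`. A small sketch of the new `default_model` contract (standalone, with an invented helper name, not the class method itself):

```python
async def pick_default_model(configured: str, served: list[str]) -> str:
    # Mirrors the diff: prefer the configured model, else the first served
    # model, else "" as the "no model" sentinel that used to be None.
    if configured:
        return configured
    return served[0] if served else ""
```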
@@ -230,11 +230,9 @@ async def resolve(
         if history is not None:
             raise NotImplementedError("Multi-turn requests not yet supported")
 
-        response_handler = self._resolve_response_handler(
-            request_type=request.request_type
-        )
         if (request_path := self.api_routes.get(request.request_type)) is None:
             raise ValueError(f"Unsupported request type '{request.request_type}'")
+
         request_url = f"{self.target}/{request_path}"
         request_files = (
             {
@@ -246,6 +244,9 @@ async def resolve(
         )
         request_json = request.arguments.body if not request_files else None
         request_data = request.arguments.body if request_files else None
+        response_handler = self._resolve_response_handler(
+            request_type=request.request_type
+        )
 
         if not request.arguments.stream:
             request_info.timings.request_start = time.time()
@@ -288,10 +289,8 @@ async def resolve(
             request_info.timings.request_iterations += 1
 
             iterations = response_handler.add_streaming_line(chunk)
-            if iterations is None or end_reached:
-                end_reached = True
-                continue
-            if iterations <= 0:
+            if iterations is None or iterations <= 0 or end_reached:
+                end_reached = end_reached or iterations is None
                 continue
 
             if request_info.timings.first_token_iteration is None:
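
Note: the merged guard above keeps a single `continue` path: `None` (end of stream) latches `end_reached`, zero or negative iteration counts are simply skipped, and anything after end-of-stream is ignored. A runnable sketch of the new semantics, faking the `add_streaming_line` results as a list:

```python
from __future__ import annotations


def count_tokens(results: list[int | None]) -> int:
    # Each entry mimics response_handler.add_streaming_line(chunk):
    # None = end-of-stream marker, 0 = line with no deltas, >0 = deltas seen.
    end_reached = False
    total = 0
    for iterations in results:
        if iterations is None or iterations <= 0 or end_reached:
            end_reached = end_reached or iterations is None
            continue
        total += iterations
    return total


# Data after the end-of-stream marker is ignored, matching the old two-branch form.
assert count_tokens([1, 0, 2, None, 3]) == 3
```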

src/guidellm/backends/response_handlers.py

Lines changed: 69 additions & 44 deletions

@@ -9,7 +9,7 @@
 
 from __future__ import annotations
 
-from typing import Any, Protocol
+from typing import Any, Protocol, cast
 
 from guidellm.schemas import GenerationRequest, GenerationResponse, UsageMetrics
 from guidellm.utils import RegistryMixin, json
@@ -109,14 +109,15 @@ def compile_non_streaming(
         :return: Standardized GenerationResponse with extracted text and metrics
         """
         choices, usage = self.extract_choices_and_usage(response)
-        input_metrics, output_metrics = self.extract_metrics(usage)
+        text = choices[0].get("text", "") if choices else ""
+        input_metrics, output_metrics = self.extract_metrics(usage, text)
 
         return GenerationResponse(
             request_id=request.request_id,
             request_args=str(
                 request.arguments.model_dump() if request.arguments else None
             ),
-            text=choices[0].get("text", "") if choices else "",
+            text=text,
             input_metrics=input_metrics,
             output_metrics=output_metrics,
         )
@@ -137,7 +138,7 @@ def add_streaming_line(self, line: str) -> int | None:
         updated = False
         choices, usage = self.extract_choices_and_usage(data)
 
-        if text := choices[0].get("text"):
+        if choices and (text := choices[0].get("text")):
             self.streaming_texts.append(text)
             updated = True
 
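Note: the added `choices and` guard in the hunk above prevents an IndexError on streaming chunks whose `choices` list is empty (for example, usage-only or keep-alive lines); the walrus assignment still only fires when a text delta is present. A runnable sketch of just that guard, with invented chunk shapes:

```python
from __future__ import annotations


def first_text(choices: list[dict]) -> str | None:
    # An empty choices list short-circuits before choices[0] is touched,
    # so usage-only chunks no longer raise IndexError.
    if choices and (text := choices[0].get("text")):
        return text
    return None


assert first_text([]) is None
assert first_text([{"text": "hi"}]) == "hi"
assert first_text([{"finish_reason": "stop"}]) is None
```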
@@ -153,14 +154,15 @@ def compile_streaming(self, request: GenerationRequest) -> GenerationResponse:
         :param request: Original generation request
         :return: Standardized GenerationResponse with concatenated text and metrics
         """
-        input_metrics, output_metrics = self.extract_metrics(self.streaming_usage)
+        text = "".join(self.streaming_texts)
+        input_metrics, output_metrics = self.extract_metrics(self.streaming_usage, text)
 
         return GenerationResponse(
             request_id=request.request_id,
             request_args=str(
                 request.arguments.model_dump() if request.arguments else None
             ),
-            text="".join(self.streaming_texts),
+            text=text,
             input_metrics=input_metrics,
             output_metrics=output_metrics,
         )
@@ -194,25 +196,34 @@ def extract_choices_and_usage(
         return response.get("choices", []), response.get("usage", {})
 
     def extract_metrics(
-        self, usage: dict[str, int | dict[str, int]] | None
+        self, usage: dict[str, int | dict[str, int]] | None, text: str
     ) -> tuple[UsageMetrics, UsageMetrics]:
         """
         Extract input and output usage metrics from API response usage data.
 
         :param usage: Usage data dictionary from API response
+        :param text: Generated text for calculating word and character counts
         :return: Tuple of input_metrics and output_metrics as UsageMetrics objects
         """
         if not usage:
-            return UsageMetrics(), UsageMetrics()
+            return UsageMetrics(), UsageMetrics(
+                text_words=len(text.split()) if text else 0,
+                text_characters=len(text) if text else 0,
+            )
 
-        input_details: dict[str, int] = usage.get("prompt_tokens_details", {}) or {}
-        output_details: dict[str, int] = (
-            usage.get("completion_tokens_details", {}) or {}
+        input_details: dict[str, int] = cast(
+            "dict[str, int]", usage.get("prompt_tokens_details", {}) or {}
+        )
+        output_details: dict[str, int] = cast(
+            "dict[str, int]", usage.get("completion_tokens_details", {}) or {}
         )
+        usage_metrics: dict[str, int] = cast("dict[str, int]", usage)
 
         return UsageMetrics(
             text_tokens=(
-                input_details.get("prompt_tokens") or usage.get("prompt_tokens")
+                input_details.get("prompt_tokens")
+                or usage_metrics.get("prompt_tokens")
+                or 0
             ),
             image_tokens=input_details.get("image_tokens"),
             video_tokens=input_details.get("video_tokens"),
@@ -221,8 +232,11 @@ def extract_metrics(
         ), UsageMetrics(
             text_tokens=(
                 output_details.get("completion_tokens")
-                or usage.get("completion_tokens")
+                or usage_metrics.get("completion_tokens")
+                or 0
             ),
+            text_words=len(text.split()) if text else 0,
+            text_characters=len(text) if text else 0,
             image_tokens=output_details.get("image_tokens"),
             video_tokens=output_details.get("video_tokens"),
             audio_tokens=output_details.get("audio_tokens"),
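
Note: the `cast(...)` calls introduced above only narrow the declared `dict[str, int | dict[str, int]]` union for the type checker; at runtime `cast` returns its argument unchanged, so the `or {}` fallbacks still do the real work. An isolated sketch of the pattern with an invented usage payload:

```python
from typing import cast

usage: dict[str, int | dict[str, int]] = {
    "prompt_tokens": 12,
    "prompt_tokens_details": {"prompt_tokens": 12},
}

# cast() is a no-op at runtime; it just lets mypy treat the values as
# dict[str, int] so .get() results type-check as int | None.
details = cast("dict[str, int]", usage.get("prompt_tokens_details", {}) or {})
flat = cast("dict[str, int]", usage)

tokens = details.get("prompt_tokens") or flat.get("prompt_tokens") or 0
assert tokens == 12
```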
@@ -254,14 +268,16 @@ def compile_non_streaming(
         :return: Standardized GenerationResponse with extracted content and metrics
         """
         choices, usage = self.extract_choices_and_usage(response)
-        input_metrics, output_metrics = self.extract_metrics(usage)
+        choice = choices[0] if choices else {}
+        text = choice.get("content", "")
+        input_metrics, output_metrics = self.extract_metrics(usage, text)
 
         return GenerationResponse(
             request_id=request.request_id,
             request_args=str(
                 request.arguments.model_dump() if request.arguments else None
             ),
-            text=(choices[0].get("message", {}).get("content", "") if choices else ""),
+            text=text,
             input_metrics=input_metrics,
             output_metrics=output_metrics,
         )
@@ -298,14 +314,15 @@ def compile_streaming(self, request: GenerationRequest) -> GenerationResponse:
         :param request: Original generation request
         :return: Standardized GenerationResponse with concatenated content and metrics
         """
-        input_metrics, output_metrics = self.extract_metrics(self.streaming_usage)
+        text = "".join(self.streaming_texts)
+        input_metrics, output_metrics = self.extract_metrics(self.streaming_usage, text)
 
         return GenerationResponse(
             request_id=request.request_id,
             request_args=str(
                 request.arguments.model_dump() if request.arguments else None
             ),
-            text="".join(self.streaming_texts),
+            text=text,
             input_metrics=input_metrics,
             output_metrics=output_metrics,
         )
@@ -352,29 +369,18 @@ def compile_non_streaming(
         :param response: Complete API response containing text and usage data
         :return: Standardized GenerationResponse with extracted text and metrics
         """
-        usage: dict[str, int | dict[str, int]] = response.get("usage", {})
-        input_details: dict[str, int] = usage.get("input_token_details", {}) or {}
-        output_details: dict[str, int] = usage.get("output_token_details", {}) or {}
         text: str = response.get("text", "")
+        usage: dict[str, int | dict[str, int]] = response.get("usage", {})
+        input_metrics, output_metrics = self.extract_metrics(usage, text)
 
         return GenerationResponse(
             request_id=request.request_id,
             request_args=str(
                 request.arguments.model_dump() if request.arguments else None
             ),
             text=text,
-            input_metrics=UsageMetrics(
-                text_tokens=input_details.get("text_tokens", usage.get("input_tokens")),
-                audio_tokens=input_details.get(
-                    "audio_tokens", usage.get("input_tokens")
-                ),
-                audio_seconds=input_details.get("seconds", usage.get("seconds")),
-            ),
-            output_metrics=UsageMetrics(
-                text_tokens=output_details.get(
-                    "text_tokens", usage.get("output_tokens")
-                ),
-            ),
+            input_metrics=input_metrics,
+            output_metrics=output_metrics,
         )
 
     def add_streaming_line(self, line: str) -> int | None:
@@ -394,8 +400,6 @@ def add_streaming_line(self, line: str) -> int | None:
             return 0
 
         data: dict[str, Any] = json.loads(line)
-        text: str
-        usage: dict[str, int | dict[str, int]]
         updated = False
 
         if text := data.get("text"):
@@ -414,20 +418,21 @@ def compile_streaming(self, request: GenerationRequest) -> GenerationResponse:
         :param request: Original generation request
         :return: Standardized GenerationResponse with concatenated text and metrics
         """
-        input_metrics, output_metrics = self.extract_metrics(self.streaming_usage)
+        text = "".join(self.streaming_texts)
+        input_metrics, output_metrics = self.extract_metrics(self.streaming_usage, text)
 
         return GenerationResponse(
             request_id=request.request_id,
             request_args=str(
                 request.arguments.model_dump() if request.arguments else None
             ),
-            text="".join(self.streaming_texts),
+            text=text,
             input_metrics=input_metrics,
             output_metrics=output_metrics,
         )
 
     def extract_metrics(
-        self, usage: dict[str, int | dict[str, int]] | None
+        self, usage: dict[str, int | dict[str, int]] | None, text: str
     ) -> tuple[UsageMetrics, UsageMetrics]:
         """
         Extract input and output usage metrics from audio API response usage data.
@@ -436,20 +441,40 @@ def extract_metrics(
         in addition to standard text token counts.
 
         :param usage: Usage data dictionary from audio API response
+        :param text: Generated text for calculating word and character counts
        :return: Tuple of input_metrics and output_metrics as UsageMetrics objects
         """
         if not usage:
-            return UsageMetrics(), UsageMetrics()
+            return UsageMetrics(), UsageMetrics(
+                text_words=len(text.split()) if text else 0,
+                text_characters=len(text) if text else 0,
+            )
 
-        input_details: dict[str, int] = usage.get("input_token_details", {}) or {}
-        output_details: dict[str, int] = usage.get("output_token_details", {}) or {}
+        input_details: dict[str, int] = cast(
+            "dict[str, int]", usage.get("input_token_details", {}) or {}
+        )
+        output_details: dict[str, int] = cast(
+            "dict[str, int]", usage.get("output_token_details", {}) or {}
+        )
+        usage_metrics: dict[str, int] = cast("dict[str, int]", usage)
 
         return UsageMetrics(
-            text_tokens=(input_details.get("text_tokens") or usage.get("input_tokens")),
+            text_tokens=input_details.get("text_tokens") or 0,
             audio_tokens=(
-                input_details.get("audio_tokens") or usage.get("audio_tokens")
+                input_details.get("audio_tokens")
+                or usage_metrics.get("audio_tokens")
+                or usage_metrics.get("input_tokens")
+                or 0
+            ),
+            audio_seconds=(
+                input_details.get("seconds") or usage_metrics.get("seconds") or 0
             ),
-            audio_seconds=(input_details.get("seconds") or usage.get("seconds")),
         ), UsageMetrics(
-            text_tokens=output_details.get("text_tokens") or usage.get("output_tokens"),
+            text_tokens=(
+                output_details.get("text_tokens")
+                or usage_metrics.get("output_tokens")
+                or 0
+            ),
+            text_words=len(text.split()) if text else 0,
+            text_characters=len(text) if text else 0,
         )
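
Note: the rewritten fallbacks are `or`-chains ending in `or 0`, which coalesce missing keys, `None`, and explicit zeros alike. A standalone sketch of the audio input-token fallback order, with invented payloads:

```python
def audio_input_tokens(usage: dict) -> int:
    # Fallback order mirroring the diff: detailed count first, then the
    # flat audio_tokens field, then input_tokens, finally 0. Because `or`
    # treats 0 as falsy, an explicit 0 also falls through to the next source.
    details = usage.get("input_token_details", {}) or {}
    return (
        details.get("audio_tokens")
        or usage.get("audio_tokens")
        or usage.get("input_tokens")
        or 0
    )


assert audio_input_tokens({"input_token_details": {"audio_tokens": 7}}) == 7
assert audio_input_tokens({"input_tokens": 42}) == 42
assert audio_input_tokens({}) == 0
```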

src/guidellm/benchmark/schemas/generative/accumulator.py

Lines changed: 5 additions & 5 deletions

@@ -174,7 +174,7 @@ def update_estimate(
             config.warmup >= 1.0
             and scheduler_state.remaining_duration is not None
             and self.duration is not None
-            and config.warmup >= self.duration
+            and self.duration >= config.warmup
         )
         exceeded_count = (
             config.warmup >= 1.0
@@ -184,7 +184,7 @@ def update_estimate(
         exceeded_fraction = (
             config.warmup < 1.0
             and scheduler_state.remaining_fraction is not None
-            and config.warmup >= 1.0 - scheduler_state.remaining_fraction
+            and 1.0 - scheduler_state.remaining_fraction >= config.warmup
         )
 
         if exceeded_time or exceeded_count or exceeded_fraction:
@@ -198,7 +198,7 @@ def update_estimate(
         exceeded_time = (
             config.cooldown >= 1.0
             and scheduler_state.remaining_duration is not None
-            and config.cooldown <= scheduler_state.remaining_duration
+            and scheduler_state.remaining_duration <= config.cooldown
         )
         exceeded_count = (
             config.cooldown >= 1.0
@@ -208,7 +208,7 @@ def update_estimate(
         exceeded_fraction = (
             config.cooldown < 1.0
             and scheduler_state.remaining_fraction is not None
-            and config.cooldown >= scheduler_state.remaining_fraction
+            and scheduler_state.remaining_fraction <= config.cooldown
         )
 
         if exceeded_time or exceeded_count or exceeded_fraction:
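
Note: not every comparison flip in this file is cosmetic. The cooldown-fraction swap (`config.cooldown >= remaining_fraction` to `remaining_fraction <= config.cooldown`) is logically equivalent, but the warmup-fraction change reverses the predicate: it now reads "the completed fraction has reached the warmup threshold" rather than the other way around. A quick numeric check with invented values:

```python
warmup = 0.2              # warmup covers the first 20% of the run
remaining_fraction = 0.9  # only 10% of the run has completed
completed = 1.0 - remaining_fraction

# Old form: config.warmup >= completed -> True while still inside warmup.
# New form: completed >= config.warmup -> True only once warmup is over.
assert (warmup >= completed) is True
assert (completed >= warmup) is False
```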
@@ -401,7 +401,7 @@ def update_estimate(
             + scheduler_state.cancelled_requests
         )
 
-        # All requests much have queued, dequeued, resolve_end, and finalized timings
+        # All requests must have queued, dequeued, resolve_end, and finalized timings
         timings: RequestTimings = stats.info.timings
         if any(
             timing is None

src/guidellm/benchmark/schemas/generative/metrics.py

Lines changed: 1 addition & 1 deletion

@@ -882,7 +882,7 @@ def compile(cls, accumulator: GenerativeBenchmarkAccumulator) -> GenerativeMetrics:
                 errored=errored,
             ),
             prompt_tokens_per_second=StatusDistributionSummary.rate_distribution_from_timings_function(
-                function=lambda req: req.prompt_tokens_timings,
+                function=lambda req: req.prompt_tokens_timing,
                 successful=successful,
                 incomplete=incomplete,
                 errored=errored,
