Skip to content

Commit 5b309e6

Browse files
committed
Added support for tool calling metrics in Chat Completions API
Generated-by: Cursor AI
Signed-off-by: Jared O'Connell <joconnel@redhat.com>
1 parent c9b5d99 commit 5b309e6

File tree

8 files changed

+529
-6
lines changed

8 files changed

+529
-6
lines changed

src/guidellm/backends/openai/request_handlers.py

Lines changed: 75 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -390,9 +390,14 @@ class ChatCompletionsRequestHandler(TextCompletionsRequestHandler):
390390
391391
Extends TextCompletionsResponseHandler to handle chat completion requests where
392392
generated text is nested within message objects in the choices array. Processes
393-
both streaming and non-streaming chat completion responses.
393+
both streaming and non-streaming chat completion responses, including tool call
394+
responses where the model outputs ``tool_calls`` instead of text content.
394395
"""
395396

397+
def __init__(self):
398+
super().__init__()
399+
self.streaming_tool_calls: dict[int, dict] = {}
400+
396401
def _format_prompts(
397402
self, column_data: list[dict[str, Any]], column_type: str
398403
) -> list[dict[str, Any]]:
@@ -520,6 +525,27 @@ def format( # noqa: C901
520525

521526
return arguments
522527

528+
@staticmethod
529+
def _tool_calls_to_text(tool_calls: list[dict]) -> str:
530+
"""Serialize a ``tool_calls`` array to a JSON string."""
531+
# orjson.dumps returns bytes; stdlib json.dumps returns str
532+
raw = json.dumps(tool_calls)
533+
return raw.decode("utf-8") if isinstance(raw, bytes) else raw
534+
535+
@staticmethod
536+
def _add_tool_call_metrics(
537+
output_metrics: UsageMetrics, tool_call_count: int
538+
) -> None:
539+
"""Tag output metrics with tool call info (subset of text metrics).
540+
541+
Sets ``tool_call_tokens`` equal to ``text_tokens`` (since the server
542+
reports all completion tokens together) and records the number of
543+
individual tool calls. These fields are additive metadata -- they do
544+
not affect ``total_tokens``.
545+
"""
546+
output_metrics.tool_call_tokens = output_metrics.text_tokens
547+
output_metrics.tool_call_count = tool_call_count
548+
523549
def compile_non_streaming(
524550
self,
525551
request: GenerationRequest,
@@ -530,16 +556,24 @@ def compile_non_streaming(
530556
Process a complete chat completion response.
531557
532558
Extracts content from the message object within choices, handling the nested
533-
structure specific to chat completion endpoints.
559+
structure specific to chat completion endpoints. When the model returns tool
560+
calls instead of text content, the tool calls are serialized as JSON text.
534561
535562
:param request: Original generation request
536563
:param response: Complete API response containing choices and usage data
537564
:return: Standardized GenerationResponse with extracted content and metrics
538565
"""
539566
choices, usage = self.extract_choices_and_usage(response)
540567
choice: dict[str, dict] = choices[0] if choices else {}
541-
text = choice.get("message", {}).get("content", "")
568+
message = choice.get("message", {})
569+
text = message.get("content") or ""
570+
# Tool call responses set content=null and put output in tool_calls
571+
tool_calls = message.get("tool_calls") if not text else None
572+
if tool_calls:
573+
text = self._tool_calls_to_text(tool_calls)
542574
input_metrics, output_metrics = self.extract_metrics(usage, text)
575+
if tool_calls:
576+
self._add_tool_call_metrics(output_metrics, len(tool_calls))
543577

544578
return GenerationResponse(
545579
request_id=request.request_id,
@@ -555,7 +589,8 @@ def add_streaming_line(self, line: str) -> int | None:
555589
Process a single line from a chat completion streaming response.
556590
557591
Handles the chat completion specific delta structure where content is nested
558-
within delta objects in the streaming response chunks.
592+
within delta objects in the streaming response chunks. Also accumulates
593+
``tool_calls`` deltas when the model streams function call output.
559594
560595
:param line: Raw SSE line from the streaming response
561596
:return: 1 if content was extracted, 0 if line ignored, None if done
@@ -569,11 +604,34 @@ def add_streaming_line(self, line: str) -> int | None:
569604
updated = False
570605
choices, usage = self.extract_choices_and_usage(data)
571606
choice: dict[str, dict] = choices[0] if choices else {}
607+
delta = choice.get("delta", {}) if choices else {}
572608

573-
if choices and (content := choice.get("delta", {}).get("content")):
609+
if content := delta.get("content"):
574610
self.streaming_texts.append(content)
575611
updated = True
576612

613+
# Tool call streaming sends incremental chunks via delta.tool_calls.
614+
# Each chunk (tc_delta) carries an "index" identifying which tool call
615+
# it belongs to (for parallel tool calls), plus partial fragments of
616+
# function.name and function.arguments that must be concatenated across
617+
# multiple SSE events to reconstruct the complete call.
618+
for tc_delta in delta.get("tool_calls", []):
619+
idx = tc_delta.get("index", 0)
620+
if idx not in self.streaming_tool_calls:
621+
# First chunk for this tool call: initialize with id and type
622+
self.streaming_tool_calls[idx] = {
623+
"id": tc_delta.get("id", ""),
624+
"type": tc_delta.get("type", "function"),
625+
"function": {"name": "", "arguments": ""},
626+
}
627+
tc = self.streaming_tool_calls[idx]
628+
fn_delta = tc_delta.get("function", {})
629+
if fn_name := fn_delta.get("name"):
630+
tc["function"]["name"] += fn_name
631+
if fn_args := fn_delta.get("arguments"):
632+
tc["function"]["arguments"] += fn_args
633+
updated = True
634+
577635
if usage:
578636
self.streaming_usage = usage
579637

@@ -585,11 +643,23 @@ def compile_streaming(
585643
"""
586644
Compile accumulated streaming chat completion content into a final response.
587645
646+
When no text content was streamed but tool calls were accumulated, the tool
647+
calls are serialized as JSON text.
648+
588649
:param request: Original generation request
589650
:return: Standardized GenerationResponse with concatenated content and metrics
590651
"""
591652
text = "".join(self.streaming_texts)
653+
has_tool_calls = not text and bool(self.streaming_tool_calls)
654+
if has_tool_calls:
655+
tool_calls_list = [
656+
self.streaming_tool_calls[idx]
657+
for idx in sorted(self.streaming_tool_calls)
658+
]
659+
text = self._tool_calls_to_text(tool_calls_list)
592660
input_metrics, output_metrics = self.extract_metrics(self.streaming_usage, text)
661+
if has_tool_calls:
662+
self._add_tool_call_metrics(output_metrics, len(self.streaming_tool_calls))
593663

594664
return GenerationResponse(
595665
request_id=request.request_id,

src/guidellm/benchmark/outputs/console.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ async def finalize(self, report: GenerativeBenchmarksReport) -> str:
224224
self.print_image_table(report)
225225
self.print_video_table(report)
226226
self.print_audio_table(report)
227+
self.print_tool_call_table(report)
227228
self.print_request_counts_table(report)
228229
self.print_request_latency_table(report)
229230
self.print_server_throughput_table(report)
@@ -359,6 +360,22 @@ def print_audio_table(self, report: GenerativeBenchmarksReport):
359360
],
360361
)
361362

363+
def print_tool_call_table(self, report: GenerativeBenchmarksReport):
364+
"""
365+
Print tool-call-specific metrics table if any tool call data exists.
366+
367+
:param report: The benchmark report containing tool call metrics
368+
"""
369+
self._print_modality_table(
370+
report=report,
371+
modality="tool_call",
372+
title="Tool Call Metrics Statistics (Completed Requests)",
373+
metric_groups=[
374+
("tokens", "Tokens"),
375+
("count", "Count"),
376+
],
377+
)
378+
362379
def print_request_counts_table(self, report: GenerativeBenchmarksReport):
363380
"""
364381
Print request token count statistics table.
@@ -512,7 +529,7 @@ def print_server_throughput_table(self, report: GenerativeBenchmarksReport):
512529
def _print_modality_table(
513530
self,
514531
report: GenerativeBenchmarksReport,
515-
modality: Literal["text", "image", "video", "audio"],
532+
modality: Literal["text", "image", "video", "audio", "tool_call"],
516533
title: str,
517534
metric_groups: list[tuple[str, str]],
518535
):

src/guidellm/benchmark/outputs/csv.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@
5454
("seconds", "Seconds"),
5555
("bytes", "Bytes"),
5656
],
57+
"tool_call": [
58+
("tokens", "Tokens"),
59+
("count", "Count"),
60+
],
5761
}
5862

5963

src/guidellm/benchmark/schemas/generative/metrics.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
"GenerativeMetrics",
3333
"GenerativeMetricsSummary",
3434
"GenerativeTextMetricsSummary",
35+
"GenerativeToolCallMetricsSummary",
3536
"GenerativeVideoMetricsSummary",
3637
"SchedulerMetrics",
3738
"StatusTypes",
@@ -691,6 +692,53 @@ def compile(
691692
)
692693

693694

695+
class GenerativeToolCallMetricsSummary(StandardBaseDict):
696+
"""
697+
Tool-call-specific metric summaries for generative benchmarks.
698+
699+
Tracks token and count metrics for tool call outputs. These are a subset
700+
of text metrics (tool_call_tokens is a subset of text_tokens) and are only
701+
populated for requests where the model produced tool calls.
702+
"""
703+
704+
tokens: GenerativeMetricsSummary | None = Field(
705+
description="Tool call token count metrics and distributions"
706+
)
707+
count: GenerativeMetricsSummary | None = Field(
708+
description="Tool call count metrics and distributions"
709+
)
710+
711+
@classmethod
712+
def compile(
713+
cls,
714+
successful: list[GenerativeRequestStats],
715+
incomplete: list[GenerativeRequestStats],
716+
errored: list[GenerativeRequestStats],
717+
) -> GenerativeToolCallMetricsSummary:
718+
"""
719+
Compile tool call metrics summary from request statistics.
720+
721+
:param successful: Successfully completed request statistics
722+
:param incomplete: Incomplete/cancelled request statistics
723+
:param errored: Failed request statistics
724+
:return: Compiled tool call metrics summary
725+
"""
726+
return GenerativeToolCallMetricsSummary(
727+
tokens=GenerativeMetricsSummary.compile(
728+
property_name="tool_call_tokens",
729+
successful=successful,
730+
incomplete=incomplete,
731+
errored=errored,
732+
),
733+
count=GenerativeMetricsSummary.compile(
734+
property_name="tool_call_count",
735+
successful=successful,
736+
incomplete=incomplete,
737+
errored=errored,
738+
),
739+
)
740+
741+
694742
class GenerativeMetrics(StandardBaseDict):
695743
"""
696744
Comprehensive metrics for generative AI benchmarks.
@@ -770,6 +818,9 @@ class GenerativeMetrics(StandardBaseDict):
770818
audio: GenerativeAudioMetricsSummary = Field(
771819
description="Audio-specific metrics for tokens, samples, duration, and bytes"
772820
)
821+
tool_call: GenerativeToolCallMetricsSummary = Field(
822+
description="Tool call metrics for tokens and call counts"
823+
)
773824

774825
@classmethod
775826
def compile(cls, accumulator: GenerativeBenchmarkAccumulator) -> GenerativeMetrics:
@@ -924,4 +975,7 @@ def compile(cls, accumulator: GenerativeBenchmarkAccumulator) -> GenerativeMetri
924975
audio=GenerativeAudioMetricsSummary.compile(
925976
successful=successful, incomplete=incomplete, errored=errored
926977
),
978+
tool_call=GenerativeToolCallMetricsSummary.compile(
979+
successful=successful, incomplete=incomplete, errored=errored
980+
),
927981
)

src/guidellm/mock_server/models.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,16 @@ class ChatCompletionsRequest(BaseModel):
146146
user: str | None = Field(
147147
default=None, description="User identifier for tracking and abuse monitoring"
148148
)
149+
tools: list[dict[str, Any]] | None = Field(
150+
default=None, description="Tool definitions for function calling"
151+
)
152+
tool_choice: str | dict[str, Any] | None = Field(
153+
default=None,
154+
description=(
155+
"Controls tool selection: 'auto', 'required', 'none', "
156+
"or a specific function"
157+
),
158+
)
149159

150160
# vLLM extensions
151161
use_beam_search: bool | None = Field(

src/guidellm/schemas/request.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,17 @@ class UsageMetrics(StandardBaseDict):
151151
default=None, description="Number of audio bytes processed/generated."
152152
)
153153

154+
# Tool call stats (subset of text stats, not counted separately in total_tokens)
155+
tool_call_tokens: int | None = Field(
156+
default=None,
157+
description=(
158+
"Number of output tokens that were tool calls (subset of text_tokens)."
159+
),
160+
)
161+
tool_call_count: int | None = Field(
162+
default=None, description="Number of tool calls generated."
163+
)
164+
154165
@computed_field # type: ignore[misc]
155166
@property
156167
def total_tokens(self) -> int | None:

0 commit comments

Comments
 (0)