Skip to content

Commit 5b309e6

Browse files
committed
Added support for tool calling metrics in Chat Completions API
Generated-by: Cursor AI
Signed-off-by: Jared O'Connell <joconnel@redhat.com>
1 parent c9b5d99 commit 5b309e6

File tree

8 files changed

+529
-6
lines changed

8 files changed

+529
-6
lines changed

src/guidellm/backends/openai/request_handlers.py

Lines changed: 75 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -390,9 +390,14 @@ class ChatCompletionsRequestHandler(TextCompletionsRequestHandler):
390390
391391
Extends TextCompletionsResponseHandler to handle chat completion requests where
392392
generated text is nested within message objects in the choices array. Processes
393-
both streaming and non-streaming chat completion responses.
393+
both streaming and non-streaming chat completion responses, including tool call
394+
responses where the model outputs ``tool_calls`` instead of text content.
394395
"""
395396

397+
def __init__(self):
398+
super().__init__()
399+
self.streaming_tool_calls: dict[int, dict] = {}
400+
396401
def _format_prompts(
397402
self, column_data: list[dict[str, Any]], column_type: str
398403
) -> list[dict[str, Any]]:
@@ -520,6 +525,27 @@ def format( # noqa: C901
520525

521526
return arguments
522527

528+
@staticmethod
529+
def _tool_calls_to_text(tool_calls: list[dict]) -> str:
530+
"""Serialize a ``tool_calls`` array to a JSON string."""
531+
# orjson.dumps returns bytes; stdlib json.dumps returns str
532+
raw = json.dumps(tool_calls)
533+
return raw.decode("utf-8") if isinstance(raw, bytes) else raw
534+
535+
@staticmethod
536+
def _add_tool_call_metrics(
537+
output_metrics: UsageMetrics, tool_call_count: int
538+
) -> None:
539+
"""Tag output metrics with tool call info (subset of text metrics).
540+
541+
Sets ``tool_call_tokens`` equal to ``text_tokens`` (since the server
542+
reports all completion tokens together) and records the number of
543+
individual tool calls. These fields are additive metadata -- they do
544+
not affect ``total_tokens``.
545+
"""
546+
output_metrics.tool_call_tokens = output_metrics.text_tokens
547+
output_metrics.tool_call_count = tool_call_count
548+
523549
def compile_non_streaming(
524550
self,
525551
request: GenerationRequest,
@@ -530,16 +556,24 @@ def compile_non_streaming(
530556
Process a complete chat completion response.
531557
532558
Extracts content from the message object within choices, handling the nested
533-
structure specific to chat completion endpoints.
559+
structure specific to chat completion endpoints. When the model returns tool
560+
calls instead of text content, the tool calls are serialized as JSON text.
534561
535562
:param request: Original generation request
536563
:param response: Complete API response containing choices and usage data
537564
:return: Standardized GenerationResponse with extracted content and metrics
538565
"""
539566
choices, usage = self.extract_choices_and_usage(response)
540567
choice: dict[str, dict] = choices[0] if choices else {}
541-
text = choice.get("message", {}).get("content", "")
568+
message = choice.get("message", {})
569+
text = message.get("content") or ""
570+
# Tool call responses set content=null and put output in tool_calls
571+
tool_calls = message.get("tool_calls") if not text else None
572+
if tool_calls:
573+
text = self._tool_calls_to_text(tool_calls)
542574
input_metrics, output_metrics = self.extract_metrics(usage, text)
575+
if tool_calls:
576+
self._add_tool_call_metrics(output_metrics, len(tool_calls))
543577

544578
return GenerationResponse(
545579
request_id=request.request_id,
@@ -555,7 +589,8 @@ def add_streaming_line(self, line: str) -> int | None:
555589
Process a single line from a chat completion streaming response.
556590
557591
Handles the chat completion specific delta structure where content is nested
558-
within delta objects in the streaming response chunks.
592+
within delta objects in the streaming response chunks. Also accumulates
593+
``tool_calls`` deltas when the model streams function call output.
559594
560595
:param line: Raw SSE line from the streaming response
561596
:return: 1 if content was extracted, 0 if line ignored, None if done
@@ -569,11 +604,34 @@ def add_streaming_line(self, line: str) -> int | None:
569604
updated = False
570605
choices, usage = self.extract_choices_and_usage(data)
571606
choice: dict[str, dict] = choices[0] if choices else {}
607+
delta = choice.get("delta", {}) if choices else {}
572608

573-
if choices and (content := choice.get("delta", {}).get("content")):
609+
if content := delta.get("content"):
574610
self.streaming_texts.append(content)
575611
updated = True
576612

613+
# Tool call streaming sends incremental chunks via delta.tool_calls.
614+
# Each chunk (tc_delta) carries an "index" identifying which tool call
615+
# it belongs to (for parallel tool calls), plus partial fragments of
616+
# function.name and function.arguments that must be concatenated across
617+
# multiple SSE events to reconstruct the complete call.
618+
for tc_delta in delta.get("tool_calls", []):
619+
idx = tc_delta.get("index", 0)
620+
if idx not in self.streaming_tool_calls:
621+
# First chunk for this tool call: initialize with id and type
622+
self.streaming_tool_calls[idx] = {
623+
"id": tc_delta.get("id", ""),
624+
"type": tc_delta.get("type", "function"),
625+
"function": {"name": "", "arguments": ""},
626+
}
627+
tc = self.streaming_tool_calls[idx]
628+
fn_delta = tc_delta.get("function", {})
629+
if fn_name := fn_delta.get("name"):
630+
tc["function"]["name"] += fn_name
631+
if fn_args := fn_delta.get("arguments"):
632+
tc["function"]["arguments"] += fn_args
633+
updated = True
634+
577635
if usage:
578636
self.streaming_usage = usage
579637

@@ -585,11 +643,23 @@ def compile_streaming(
585643
"""
586644
Compile accumulated streaming chat completion content into a final response.
587645
646+
When no text content was streamed but tool calls were accumulated, the tool
647+
calls are serialized as JSON text.
648+
588649
:param request: Original generation request
589650
:return: Standardized GenerationResponse with concatenated content and metrics
590651
"""
591652
text = "".join(self.streaming_texts)
653+
has_tool_calls = not text and bool(self.streaming_tool_calls)
654+
if has_tool_calls:
655+
tool_calls_list = [
656+
self.streaming_tool_calls[idx]
657+
for idx in sorted(self.streaming_tool_calls)
658+
]
659+
text = self._tool_calls_to_text(tool_calls_list)
592660
input_metrics, output_metrics = self.extract_metrics(self.streaming_usage, text)
661+
if has_tool_calls:
662+
self._add_tool_call_metrics(output_metrics, len(self.streaming_tool_calls))
593663

594664
return GenerationResponse(
595665
request_id=request.request_id,

src/guidellm/benchmark/outputs/console.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ async def finalize(self, report: GenerativeBenchmarksReport) -> str:
224224
self.print_image_table(report)
225225
self.print_video_table(report)
226226
self.print_audio_table(report)
227+
self.print_tool_call_table(report)
227228
self.print_request_counts_table(report)
228229
self.print_request_latency_table(report)
229230
self.print_server_throughput_table(report)
@@ -359,6 +360,22 @@ def print_audio_table(self, report: GenerativeBenchmarksReport):
359360
],
360361
)
361362

363+
def print_tool_call_table(self, report: GenerativeBenchmarksReport):
364+
"""
365+
Print tool-call-specific metrics table if any tool call data exists.
366+
367+
:param report: The benchmark report containing tool call metrics
368+
"""
369+
self._print_modality_table(
370+
report=report,
371+
modality="tool_call",
372+
title="Tool Call Metrics Statistics (Completed Requests)",
373+
metric_groups=[
374+
("tokens", "Tokens"),
375+
("count", "Count"),
376+
],
377+
)
378+
362379
def print_request_counts_table(self, report: GenerativeBenchmarksReport):
363380
"""
364381
Print request token count statistics table.
@@ -512,7 +529,7 @@ def print_server_throughput_table(self, report: GenerativeBenchmarksReport):
512529
def _print_modality_table(
513530
self,
514531
report: GenerativeBenchmarksReport,
515-
modality: Literal["text", "image", "video", "audio"],
532+
modality: Literal["text", "image", "video", "audio", "tool_call"],
516533
title: str,
517534
metric_groups: list[tuple[str, str]],
518535
):

src/guidellm/benchmark/outputs/csv.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@
5454
("seconds", "Seconds"),
5555
("bytes", "Bytes"),
5656
],
57+
"tool_call": [
58+
("tokens", "Tokens"),
59+
("count", "Count"),
60+
],
5761
}
5862

5963

src/guidellm/benchmark/schemas/generative/metrics.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
"GenerativeMetrics",
3333
"GenerativeMetricsSummary",
3434
"GenerativeTextMetricsSummary",
35+
"GenerativeToolCallMetricsSummary",
3536
"GenerativeVideoMetricsSummary",
3637
"SchedulerMetrics",
3738
"StatusTypes",
@@ -691,6 +692,53 @@ def compile(
691692
)
692693

693694

695+
class GenerativeToolCallMetricsSummary(StandardBaseDict):
696+
"""
697+
Tool-call-specific metric summaries for generative benchmarks.
698+
699+
Tracks token and count metrics for tool call outputs. These are a subset
700+
of text metrics (tool_call_tokens is a subset of text_tokens) and are only
701+
populated for requests where the model produced tool calls.
702+
"""
703+
704+
tokens: GenerativeMetricsSummary | None = Field(
705+
description="Tool call token count metrics and distributions"
706+
)
707+
count: GenerativeMetricsSummary | None = Field(
708+
description="Tool call count metrics and distributions"
709+
)
710+
711+
@classmethod
712+
def compile(
713+
cls,
714+
successful: list[GenerativeRequestStats],
715+
incomplete: list[GenerativeRequestStats],
716+
errored: list[GenerativeRequestStats],
717+
) -> GenerativeToolCallMetricsSummary:
718+
"""
719+
Compile tool call metrics summary from request statistics.
720+
721+
:param successful: Successfully completed request statistics
722+
:param incomplete: Incomplete/cancelled request statistics
723+
:param errored: Failed request statistics
724+
:return: Compiled tool call metrics summary
725+
"""
726+
return GenerativeToolCallMetricsSummary(
727+
tokens=GenerativeMetricsSummary.compile(
728+
property_name="tool_call_tokens",
729+
successful=successful,
730+
incomplete=incomplete,
731+
errored=errored,
732+
),
733+
count=GenerativeMetricsSummary.compile(
734+
property_name="tool_call_count",
735+
successful=successful,
736+
incomplete=incomplete,
737+
errored=errored,
738+
),
739+
)
740+
741+
694742
class GenerativeMetrics(StandardBaseDict):
695743
"""
696744
Comprehensive metrics for generative AI benchmarks.
@@ -770,6 +818,9 @@ class GenerativeMetrics(StandardBaseDict):
770818
audio: GenerativeAudioMetricsSummary = Field(
771819
description="Audio-specific metrics for tokens, samples, duration, and bytes"
772820
)
821+
tool_call: GenerativeToolCallMetricsSummary = Field(
822+
description="Tool call metrics for tokens and call counts"
823+
)
773824

774825
@classmethod
775826
def compile(cls, accumulator: GenerativeBenchmarkAccumulator) -> GenerativeMetrics:
@@ -924,4 +975,7 @@ def compile(cls, accumulator: GenerativeBenchmarkAccumulator) -> GenerativeMetri
924975
audio=GenerativeAudioMetricsSummary.compile(
925976
successful=successful, incomplete=incomplete, errored=errored
926977
),
978+
tool_call=GenerativeToolCallMetricsSummary.compile(
979+
successful=successful, incomplete=incomplete, errored=errored
980+
),
927981
)

src/guidellm/mock_server/models.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,16 @@ class ChatCompletionsRequest(BaseModel):
146146
user: str | None = Field(
147147
default=None, description="User identifier for tracking and abuse monitoring"
148148
)
149+
tools: list[dict[str, Any]] | None = Field(
150+
default=None, description="Tool definitions for function calling"
151+
)
152+
tool_choice: str | dict[str, Any] | None = Field(
153+
default=None,
154+
description=(
155+
"Controls tool selection: 'auto', 'required', 'none', "
156+
"or a specific function"
157+
),
158+
)
149159

150160
# vLLM extensions
151161
use_beam_search: bool | None = Field(

src/guidellm/schemas/request.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,17 @@ class UsageMetrics(StandardBaseDict):
151151
default=None, description="Number of audio bytes processed/generated."
152152
)
153153

154+
# Tool call stats (subset of text stats, not counted separately in total_tokens)
155+
tool_call_tokens: int | None = Field(
156+
default=None,
157+
description=(
158+
"Number of output tokens that were tool calls (subset of text_tokens)."
159+
),
160+
)
161+
tool_call_count: int | None = Field(
162+
default=None, description="Number of tool calls generated."
163+
)
164+
154165
@computed_field # type: ignore[misc]
155166
@property
156167
def total_tokens(self) -> int | None:

0 commit comments

Comments
 (0)