@@ -390,9 +390,14 @@ class ChatCompletionsRequestHandler(TextCompletionsRequestHandler):
390390
391391 Extends TextCompletionsRequestHandler to handle chat completion requests where
392392 generated text is nested within message objects in the choices array. Processes
393- both streaming and non-streaming chat completion responses.
393+ both streaming and non-streaming chat completion responses, including tool call
394+ responses where the model outputs ``tool_calls`` instead of text content.
394395 """
395396
397+ def __init__ (self ):
398+ super ().__init__ ()
399+ self .streaming_tool_calls : dict [int , dict ] = {}
400+
396401 def _format_prompts (
397402 self , column_data : list [dict [str , Any ]], column_type : str
398403 ) -> list [dict [str , Any ]]:
@@ -520,6 +525,27 @@ def format( # noqa: C901
520525
521526 return arguments
522527
528+ @staticmethod
529+ def _tool_calls_to_text (tool_calls : list [dict ]) -> str :
530+ """Serialize a ``tool_calls`` array to a JSON string."""
531+ # orjson.dumps returns bytes; stdlib json.dumps returns str
532+ raw = json .dumps (tool_calls )
533+ return raw .decode ("utf-8" ) if isinstance (raw , bytes ) else raw
534+
535+ @staticmethod
536+ def _add_tool_call_metrics (
537+ output_metrics : UsageMetrics , tool_call_count : int
538+ ) -> None :
539+ """Tag output metrics with tool call info (subset of text metrics).
540+
541+ Sets ``tool_call_tokens`` equal to ``text_tokens`` (since the server
542+ reports all completion tokens together) and records the number of
543+ individual tool calls. These fields are additive metadata -- they do
544+ not affect ``total_tokens``.
545+ """
546+ output_metrics .tool_call_tokens = output_metrics .text_tokens
547+ output_metrics .tool_call_count = tool_call_count
548+
523549 def compile_non_streaming (
524550 self ,
525551 request : GenerationRequest ,
@@ -530,16 +556,24 @@ def compile_non_streaming(
530556 Process a complete chat completion response.
531557
532558 Extracts content from the message object within choices, handling the nested
533- structure specific to chat completion endpoints.
559+ structure specific to chat completion endpoints. When the model returns tool
560+ calls instead of text content, the tool calls are serialized as JSON text.
534561
535562 :param request: Original generation request
536563 :param response: Complete API response containing choices and usage data
537564 :return: Standardized GenerationResponse with extracted content and metrics
538565 """
539566 choices , usage = self .extract_choices_and_usage (response )
540567 choice : dict [str , dict ] = choices [0 ] if choices else {}
541- text = choice .get ("message" , {}).get ("content" , "" )
568+ message = choice .get ("message" , {})
569+ text = message .get ("content" ) or ""
570+ # Tool call responses set content=null and put output in tool_calls
571+ tool_calls = message .get ("tool_calls" ) if not text else None
572+ if tool_calls :
573+ text = self ._tool_calls_to_text (tool_calls )
542574 input_metrics , output_metrics = self .extract_metrics (usage , text )
575+ if tool_calls :
576+ self ._add_tool_call_metrics (output_metrics , len (tool_calls ))
543577
544578 return GenerationResponse (
545579 request_id = request .request_id ,
@@ -555,7 +589,8 @@ def add_streaming_line(self, line: str) -> int | None:
555589 Process a single line from a chat completion streaming response.
556590
557591 Handles the chat completion specific delta structure where content is nested
558- within delta objects in the streaming response chunks.
592+ within delta objects in the streaming response chunks. Also accumulates
593+ ``tool_calls`` deltas when the model streams function call output.
559594
560595 :param line: Raw SSE line from the streaming response
561596 :return: 1 if content was extracted, 0 if line ignored, None if done
@@ -569,11 +604,34 @@ def add_streaming_line(self, line: str) -> int | None:
569604 updated = False
570605 choices , usage = self .extract_choices_and_usage (data )
571606 choice : dict [str , dict ] = choices [0 ] if choices else {}
607+ delta = choice .get ("delta" , {}) if choices else {}
572608
573- if choices and ( content := choice . get ( " delta" , {}) .get ("content" ) ):
609+ if content := delta .get ("content" ):
574610 self .streaming_texts .append (content )
575611 updated = True
576612
613+ # Tool call streaming sends incremental chunks via delta.tool_calls.
614+ # Each chunk (tc_delta) carries an "index" identifying which tool call
615+ # it belongs to (for parallel tool calls), plus partial fragments of
616+ # function.name and function.arguments that must be concatenated across
617+ # multiple SSE events to reconstruct the complete call.
618+ for tc_delta in delta .get ("tool_calls" , []):
619+ idx = tc_delta .get ("index" , 0 )
620+ if idx not in self .streaming_tool_calls :
621+ # First chunk for this tool call: initialize with id and type
622+ self .streaming_tool_calls [idx ] = {
623+ "id" : tc_delta .get ("id" , "" ),
624+ "type" : tc_delta .get ("type" , "function" ),
625+ "function" : {"name" : "" , "arguments" : "" },
626+ }
627+ tc = self .streaming_tool_calls [idx ]
628+ fn_delta = tc_delta .get ("function" , {})
629+ if fn_name := fn_delta .get ("name" ):
630+ tc ["function" ]["name" ] += fn_name
631+ if fn_args := fn_delta .get ("arguments" ):
632+ tc ["function" ]["arguments" ] += fn_args
633+ updated = True
634+
577635 if usage :
578636 self .streaming_usage = usage
579637
@@ -585,11 +643,23 @@ def compile_streaming(
585643 """
586644 Compile accumulated streaming chat completion content into a final response.
587645
646+ When no text content was streamed but tool calls were accumulated, the tool
647+ calls are serialized as JSON text.
648+
588649 :param request: Original generation request
589650 :return: Standardized GenerationResponse with concatenated content and metrics
590651 """
591652 text = "" .join (self .streaming_texts )
653+ has_tool_calls = not text and bool (self .streaming_tool_calls )
654+ if has_tool_calls :
655+ tool_calls_list = [
656+ self .streaming_tool_calls [idx ]
657+ for idx in sorted (self .streaming_tool_calls )
658+ ]
659+ text = self ._tool_calls_to_text (tool_calls_list )
592660 input_metrics , output_metrics = self .extract_metrics (self .streaming_usage , text )
661+ if has_tool_calls :
662+ self ._add_tool_call_metrics (output_metrics , len (self .streaming_tool_calls ))
593663
594664 return GenerationResponse (
595665 request_id = request .request_id ,
0 commit comments