frontend: report reasoning_tokens in chat completion usage

GavinZhu-GMI · GavinZhu-GMI · commit de764e6d9e71 · 2026-06-30T07:03:20.000Z
The SGLang chat processor (--dyn-chat-processor sglang) split reasoning_content
per request but never populated usage.completion_tokens_details.reasoning_tokens.
The engine-side counter is gated on require_reasoning, which the bare-Engine path
does not set, so meta_info["reasoning_tokens"] is always 0; count in the frontend
instead, where the per-request ReasoningParser already runs.

SglangStreamingPostProcessor accumulates the reasoning-text deltas and
reasoning_token_count() re-encodes the span (the streaming detector buffers, so
per-chunk token_ids cannot be tallied). _generate_and_stream injects
completion_tokens_details when a reasoning parser is active for the request;
sibling detail fields are 0 (not null) so strict downstream usage parsers do not
drop the object.

Signed-off-by: Gavin.Zhu <gavin.z@gmicloud.ai>
diff --git a/components/src/dynamo/frontend/sglang_prepost.py b/components/src/dynamo/frontend/sglang_prepost.py
@@ -905,6 +905,12 @@ def __init__(
         self._tool_call_args: dict[int, list[str]] = {}  # tool_index -> arg chunks
         # Full text accumulator for robust finish-time re-parse.
         self._tool_text_parts: list[str] = []
+        # Reasoning text deltas, accumulated so usage can report
+        # reasoning_tokens. Counting here (not in the worker) is the only
+        # per-request-correct place: SGLang's scheduler only counts reasoning
+        # when require_reasoning=True, which the Dynamo bare-Engine path never
+        # sets. See reasoning_token_count().
+        self._reasoning_text_parts: list[str] = []
 
     def _strip_trailing_eos_token_ids(self, token_ids: list[int]) -> list[int]:
         if not self._eos_token_ids:
@@ -959,6 +965,27 @@ def _incremental_decode(self, new_token_ids: list[int]) -> str:
 
         return window_text[len(prefix_text) :]
 
+    def reasoning_token_count(self) -> int:
+        """Return the number of reasoning tokens to report in usage.
+
+        The value feeds usage.completion_tokens_details.reasoning_tokens. It is
+        obtained by joining every reasoning delta accumulated for this request
+        and re-encoding the result without special tokens, so it is an
+        approximation of the generated count: the think markers are not
+        included and re-tokenizing text is not guaranteed to reproduce the
+        original token boundaries. The engine-side counter cannot be used
+        because it is gated on ``require_reasoning``, which the Dynamo
+        bare-Engine path never sets, and per-chunk ``token_ids`` cannot be
+        tallied because the streaming reasoning detector buffers across chunks.
+        Returns 0 when no reasoning text was accumulated.
+        """
+        parts = self._reasoning_text_parts
+        if not parts:
+            return 0
+        reasoning_text = "".join(parts)
+        token_ids = self.tokenizer.encode(reasoning_text, add_special_tokens=False)
+        return len(token_ids)
+
     def process_output(self, engine_response: dict[str, Any]) -> dict[str, Any] | None:
         """Process a single engine response chunk into an OpenAI SSE choice dict.
 
@@ -1001,6 +1028,8 @@ def process_output(self, engine_response: dict[str, Any]) -> dict[str, Any] | No
             r_text, n_text = self.reasoning_parser.parse_stream_chunk(delta_text)
             reasoning_text = r_text or None
             normal_text = n_text or ""
+            if r_text:
+                self._reasoning_text_parts.append(r_text)
 
         # -- Tool call parsing (accumulate deltas) --
         content_text = normal_text
diff --git a/components/src/dynamo/frontend/sglang_processor.py b/components/src/dynamo/frontend/sglang_processor.py
@@ -602,6 +602,24 @@ async def _generate_and_stream(
                             "object": "chat.completion.chunk",
                         }
                         if pending_usage:
+                            # Report reasoning tokens. The worker's completion_usage
+                            # has no completion_tokens_details (the bare-Engine path
+                            # cannot count reasoning), and the per-request reasoning
+                            # parser here is the only correct source. Emit only when a
+                            # reasoning parser is active for this request
+                            # (separate_reasoning honored); reasoning_tokens=0 is still
+                            # correct for a request that reasoned nothing.
+                            if post.reasoning_parser is not None:
+                                completion_details = dict(
+                                    pending_usage.get("completion_tokens_details") or {}
+                                )
+                                completion_details[
+                                    "reasoning_tokens"
+                                ] = post.reasoning_token_count()
+                                pending_usage = {
+                                    **pending_usage,
+                                    "completion_tokens_details": completion_details,
+                                }
                             dynamo_out["usage"] = pending_usage
                             pending_usage = None
                         response_nvext: dict[str, Any] = {}
diff --git a/components/src/dynamo/frontend/tests/test_sglang_processor_unit.py b/components/src/dynamo/frontend/tests/test_sglang_processor_unit.py
@@ -2344,6 +2344,67 @@ def test_reasoning_separated(self, tokenizer):
         assert "42" in content
 
 
+class TestReasoningTokenCount:  # FRONTEND.9 — usage.completion_tokens_details.reasoning_tokens
+    """reasoning_token_count() re-encodes the accumulated reasoning span.
+
+    This is the per-request source for usage.completion_tokens_details.reasoning_tokens:
+    SGLang's scheduler only counts reasoning when require_reasoning=True, which the
+    Dynamo bare-Engine path never sets, so the count must come from the frontend parser.
+    """
+
+    def test_zero_without_reasoning_parser(self, tokenizer):
+        """No reasoning parser → no reasoning span → 0."""
+        post = SglangStreamingPostProcessor(
+            tokenizer=tokenizer, tool_call_parser=None, reasoning_parser=None
+        )
+        post.process_output(
+            {"token_ids": tokenizer.encode("Hello"), "finish_reason": "stop"}
+        )
+        assert post.reasoning_token_count() == 0
+
+    def test_zero_when_nothing_reasoned(self, tokenizer):
+        """Reasoning parser active but the model emitted no reasoning → 0."""
+        from sglang.srt.parser.reasoning_parser import ReasoningParser
+
+        rp = ReasoningParser(model_type="qwen3", stream_reasoning=True)
+        post = SglangStreamingPostProcessor(
+            tokenizer=tokenizer, tool_call_parser=None, reasoning_parser=rp
+        )
+        assert post.reasoning_token_count() == 0
+
+    def test_counts_accumulated_reasoning(self, tokenizer):
+        """<think>...</think> tokens are reported, and match a re-encode of the span."""
+        from sglang.srt.parser.reasoning_parser import ReasoningParser
+
+        rp = ReasoningParser(model_type="qwen3", stream_reasoning=True)
+        post = SglangStreamingPostProcessor(
+            tokenizer=tokenizer, tool_call_parser=None, reasoning_parser=rp
+        )
+        text = (
+            "<think>\nLet me think about this carefully.\n</think>\n\nThe answer is 42."
+        )
+        token_ids = tokenizer.encode(text)
+        reasoning_parts: list[str] = []
+        for i in range(0, len(token_ids), 5):
+            batch = token_ids[i : i + 5]
+            is_last = i + 5 >= len(token_ids)
+            choice = post.process_output(
+                {"token_ids": batch, "finish_reason": "stop" if is_last else None}
+            )
+            if choice:
+                # Tolerate a delta whose reasoning_content is absent or None.
+                delta = choice.get("delta") or {}
+                reasoning_parts.append(delta.get("reasoning_content") or "")
+        reasoning = "".join(reasoning_parts)
+
+        count = post.reasoning_token_count()
+        assert count > 0
+        # Equals a re-encode of exactly the reasoning text the parser surfaced.
+        assert count == len(tokenizer.encode(reasoning, add_special_tokens=False))
+        # And it's strictly less than the whole completion (content excluded).
+        assert count < len(token_ids)
+
+
 # ---------------------------------------------------------------------------
 # Utility functions
 # ---------------------------------------------------------------------------