Skip to content

Commit de764e6

Browse files
committed
frontend: report reasoning_tokens in chat completion usage
The SGLang chat processor (--dyn-chat-processor sglang) split reasoning_content per request but never populated usage.completion_tokens_details.reasoning_tokens. The engine-side counter is gated on require_reasoning, which the bare-Engine path does not set, so meta_info["reasoning_tokens"] is always 0; count in the frontend instead, where the per-request ReasoningParser already runs. SglangStreamingPostProcessor accumulates the reasoning-text deltas and reasoning_token_count() re-encodes the span (the streaming detector buffers, so per-chunk token_ids cannot be tallied). _generate_and_stream injects completion_tokens_details when a reasoning parser is active for the request; sibling detail fields are 0 (not null) so strict downstream usage parsers do not drop the object. Signed-off-by: Gavin.Zhu <gavin.z@gmicloud.ai>
1 parent ac05d48 commit de764e6

3 files changed

Lines changed: 108 additions & 0 deletions

File tree

components/src/dynamo/frontend/sglang_prepost.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -905,6 +905,12 @@ def __init__(
905905
self._tool_call_args: dict[int, list[str]] = {} # tool_index -> arg chunks
906906
# Full text accumulator for robust finish-time re-parse.
907907
self._tool_text_parts: list[str] = []
908+
# Reasoning text deltas, accumulated so usage can report
909+
# reasoning_tokens. Counting here (not in the worker) is the only
910+
# per-request-correct place: SGLang's scheduler only counts reasoning
911+
# when require_reasoning=True, which the Dynamo bare-Engine path never
912+
# sets. See reasoning_token_count().
913+
self._reasoning_text_parts: list[str] = []
908914

909915
def _strip_trailing_eos_token_ids(self, token_ids: list[int]) -> list[int]:
910916
if not self._eos_token_ids:
@@ -959,6 +965,27 @@ def _incremental_decode(self, new_token_ids: list[int]) -> str:
959965

960966
return window_text[len(prefix_text) :]
961967

968+
def reasoning_token_count(self) -> int:
    """Return the token length of the reasoning text accumulated so far.

    The value feeds ``usage.completion_tokens_details.reasoning_tokens``.
    It is obtained by joining every reasoning delta stored in
    ``self._reasoning_text_parts`` and encoding the result once with
    ``self.tokenizer`` (``add_special_tokens=False``).

    The number is an approximation of the generated count: the think
    markers are not part of the accumulated text, and it assumes BPE
    re-tokenization of the decoded text is faithful. It is still
    per-request and version-stable, and it is the only viable source
    here, because the engine's own scheduler counter is gated on
    ``require_reasoning``, which the Dynamo bare-Engine path never sets.

    The full text is re-encoded instead of summing per-chunk
    ``token_ids`` because the streaming reasoning detector buffers
    across chunks, so chunk ``token_ids`` do not line up with the
    reasoning/normal split.

    Returns:
        0 when no reasoning delta has been recorded, otherwise the
        number of token ids the tokenizer produces for the joined text.
    """
    parts = self._reasoning_text_parts
    if parts:
        reasoning_text = "".join(parts)
        token_ids = self.tokenizer.encode(
            reasoning_text, add_special_tokens=False
        )
        return len(token_ids)
    # Nothing accumulated: skip the tokenizer call entirely.
    return 0
988+
962989
def process_output(self, engine_response: dict[str, Any]) -> dict[str, Any] | None:
963990
"""Process a single engine response chunk into an OpenAI SSE choice dict.
964991
@@ -1001,6 +1028,8 @@ def process_output(self, engine_response: dict[str, Any]) -> dict[str, Any] | No
10011028
r_text, n_text = self.reasoning_parser.parse_stream_chunk(delta_text)
10021029
reasoning_text = r_text or None
10031030
normal_text = n_text or ""
1031+
if r_text:
1032+
self._reasoning_text_parts.append(r_text)
10041033

10051034
# -- Tool call parsing (accumulate deltas) --
10061035
content_text = normal_text

components/src/dynamo/frontend/sglang_processor.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,24 @@ async def _generate_and_stream(
602602
"object": "chat.completion.chunk",
603603
}
604604
if pending_usage:
605+
# Report reasoning tokens. The worker's completion_usage
606+
# has no completion_tokens_details (the bare-Engine path
607+
# cannot count reasoning), and the per-request reasoning
608+
# parser here is the only correct source. Emit only when a
609+
# reasoning parser is active for this request
610+
# (separate_reasoning honored); reasoning_tokens=0 is still
611+
# correct for a request that reasoned nothing.
612+
if post.reasoning_parser is not None:
613+
completion_details = dict(
614+
pending_usage.get("completion_tokens_details") or {}
615+
)
616+
completion_details[
617+
"reasoning_tokens"
618+
] = post.reasoning_token_count()
619+
pending_usage = {
620+
**pending_usage,
621+
"completion_tokens_details": completion_details,
622+
}
605623
dynamo_out["usage"] = pending_usage
606624
pending_usage = None
607625
response_nvext: dict[str, Any] = {}

components/src/dynamo/frontend/tests/test_sglang_processor_unit.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2344,6 +2344,67 @@ def test_reasoning_separated(self, tokenizer):
23442344
assert "42" in content
23452345

23462346

2347+
class TestReasoningTokenCount:  # FRONTEND.9 — usage.completion_tokens_details.reasoning_tokens
    """reasoning_token_count() re-encodes the accumulated reasoning span.

    This is the per-request source for usage.completion_tokens_details.reasoning_tokens:
    SGLang's scheduler only counts reasoning when require_reasoning=True, which the
    Dynamo bare-Engine path never sets, so the count must come from the frontend parser.
    """

    def test_zero_without_reasoning_parser(self, tokenizer):
        """No reasoning parser → no reasoning span → 0."""
        post = SglangStreamingPostProcessor(
            tokenizer=tokenizer, tool_call_parser=None, reasoning_parser=None
        )
        post.process_output(
            {"token_ids": tokenizer.encode("Hello"), "finish_reason": "stop"}
        )
        assert post.reasoning_token_count() == 0

    def test_zero_when_nothing_reasoned(self, tokenizer):
        """Reasoning parser active but the model emitted no reasoning → 0."""
        from sglang.srt.parser.reasoning_parser import ReasoningParser

        rp = ReasoningParser(model_type="qwen3", stream_reasoning=True)
        post = SglangStreamingPostProcessor(
            tokenizer=tokenizer, tool_call_parser=None, reasoning_parser=rp
        )
        # Fresh processor: nothing accumulated yet.
        assert post.reasoning_token_count() == 0

        # Actually stream a completion that contains no <think> span, so the
        # test covers "parser active, model reasoned nothing" and not only
        # the freshly-constructed state.
        post.process_output(
            {
                "token_ids": tokenizer.encode("The answer is 42."),
                "finish_reason": "stop",
            }
        )
        assert post.reasoning_token_count() == 0

    def test_counts_accumulated_reasoning(self, tokenizer):
        """<think>...</think> tokens are reported, and match a re-encode of the span."""
        from sglang.srt.parser.reasoning_parser import ReasoningParser

        rp = ReasoningParser(model_type="qwen3", stream_reasoning=True)
        post = SglangStreamingPostProcessor(
            tokenizer=tokenizer, tool_call_parser=None, reasoning_parser=rp
        )
        text = (
            "<think>\nLet me think about this carefully.\n</think>\n\nThe answer is 42."
        )
        token_ids = tokenizer.encode(text)
        reasoning_parts: list[str] = []
        # Feed the completion in 5-token chunks to exercise the streaming path.
        for i in range(0, len(token_ids), 5):
            batch = token_ids[i : i + 5]
            is_last = i + 5 >= len(token_ids)
            choice = post.process_output(
                {"token_ids": batch, "finish_reason": "stop" if is_last else None}
            )
            if choice:
                # `or` (not a .get default) so a present-but-None delta or
                # reasoning_content cannot put None into the join below.
                delta = choice.get("delta") or {}
                reasoning_parts.append(delta.get("reasoning_content") or "")
        reasoning = "".join(reasoning_parts)

        count = post.reasoning_token_count()
        assert count > 0
        # Equals a re-encode of exactly the reasoning text the parser surfaced.
        assert count == len(tokenizer.encode(reasoning, add_special_tokens=False))
        # And it's strictly less than the whole completion (content excluded).
        assert count < len(token_ids)
2406+
2407+
23472408
# ---------------------------------------------------------------------------
23482409
# Utility functions
23492410
# ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)