|
12 | 12 |
|
13 | 13 | import contextvars |
14 | 14 | import json as _json |
| 15 | +import re |
15 | 16 | from dataclasses import dataclass, field |
16 | 17 | from typing import Any, AsyncIterator, Union |
17 | 18 |
|
18 | 19 | from openai import AsyncOpenAI, OpenAI |
19 | 20 |
|
20 | 21 |
|
| 22 | +# ──────────────────────────────────────────────────────────────── |
| 23 | +# Reasoning-trace stripping (non-streaming defensive cleanup) |
| 24 | +# ──────────────────────────────────────────────────────────────── |
| 25 | +# Well-formed <think>...</think> / <thinking>...</thinking> blocks. |
| 26 | +_THINK_PAIRED_RE = re.compile(r"<think(?:ing)?\s*>.*?</think(?:ing)?\s*>", re.IGNORECASE | re.DOTALL) |
| 27 | +# A *dangling* close tag with no matching open. This is the Qwen3.5/3.6 |
| 28 | +# OpenAI-compat leak shape: unlike qwen3-vl-* (which route reasoning to the |
| 29 | +# ``reasoning_content`` field), the 3.5/3.6 hybrid models never populate |
| 30 | +# ``reasoning_content`` — the whole chain-of-thought lands in ``content`` with |
| 31 | +# only a lone ``</think>`` (implicit open) separating it from the real answer. |
| 32 | +# A paired-tag regex alone can't catch this; we strip everything up to and |
| 33 | +# including the first unmatched close tag. |
| 34 | +_THINK_DANGLING_CLOSE_RE = re.compile(r"^.*?</think(?:ing)?\s*>", re.IGNORECASE | re.DOTALL) |
| 35 | +_THINK_ANY_CLOSE_RE = re.compile(r"</think(?:ing)?\s*>", re.IGNORECASE) |
| 36 | + |
| 37 | + |
| 38 | +def strip_thinking_segments(text: str | None) -> str: |
| 39 | + """Remove leaked chain-of-thought from a *non-streaming* model reply. |
| 40 | +
|
| 41 | + Handles two shapes: |
| 42 | + 1. Well-formed ``<think>...</think>`` blocks (any count). |
| 43 | + 2. Qwen3.5/3.6 leak: reasoning dumped into ``content`` with only a |
| 44 | + dangling ``</think>`` (no opening tag) before the answer. |
| 45 | +
|
| 46 | + Conservative — only acts when a think tag is present, so clean replies |
| 47 | + (qwen3-vl-*, gpt, claude, etc.) pass through untouched. Streaming is *not* |
| 48 | + covered here on purpose: when the chain-of-thought arrives token-by-token |
| 49 | + in ``delta.content`` with no delimiter there's nothing reliable to strip. |
| 50 | + """ |
| 51 | + if not text: |
| 52 | + return text or "" |
| 53 | + s = str(text) |
| 54 | + # 1) drop well-formed blocks first |
| 55 | + s = _THINK_PAIRED_RE.sub("", s) |
| 56 | + # 2) any close tag still present is unmatched → preceding text is thinking |
| 57 | + if _THINK_ANY_CLOSE_RE.search(s): |
| 58 | + s = _THINK_DANGLING_CLOSE_RE.sub("", s, count=1) |
| 59 | + return s.strip() |
| 60 | + |
| 61 | + |
21 | 62 | # ──────────────────────────────────────────────────────────────── |
22 | 63 | # Active-character context — used by ChatOpenAI._params to substitute |
23 | 64 | # ``{MASTER_NAME}`` / ``{LANLAN_NAME}`` placeholders that originated from |
@@ -475,18 +516,18 @@ async def ainvoke(self, messages: Any, **overrides: Any) -> LLMResponse: |
475 | 516 | # message=None 的合法响应,直接 .message.content 会 NoneType 崩溃。 |
476 | 517 | choice = resp.choices[0] if resp.choices else None |
477 | 518 | msg = choice.message if choice else None |
478 | | - content = getattr(msg, "content", None) |
| 519 | + content = strip_thinking_segments(getattr(msg, "content", None)) |
479 | 520 | usage_dict = resp.usage.model_dump() if resp.usage else {} |
480 | | - return LLMResponse(content=content or "", response_metadata={"token_usage": usage_dict}) |
| 521 | + return LLMResponse(content=content, response_metadata={"token_usage": usage_dict}) |
481 | 522 |
|
def invoke(self, messages: Any, **overrides: Any) -> LLMResponse:
    """Blocking counterpart of ``ainvoke``.

    ``messages`` and ``overrides`` are handed to ``self._params`` to build
    the request; see ``ainvoke`` for what ``overrides`` means.
    """
    create = self._client.chat.completions.create
    response = create(**self._params(messages, **overrides))

    # A response may legitimately carry no choices, or a choice without a
    # message, so every hop is guarded instead of chaining attribute access.
    first_choice = response.choices[0] if response.choices else None
    message = first_choice.message if first_choice else None
    raw_content = getattr(message, "content", None)

    # Defensive cleanup of leaked reasoning; always yields a ``str``.
    cleaned = strip_thinking_segments(raw_content)

    if response.usage:
        token_usage = response.usage.model_dump()
    else:
        token_usage = {}
    return LLMResponse(content=cleaned, response_metadata={"token_usage": token_usage})
490 | 531 |
|
491 | 532 | # --- raw-resp invoke (for callers needing reasoning_content / raw choices) --- |
492 | 533 |
|
|
0 commit comments