fix(llm): 非流式收口剥离 Qwen3.5/3.6 泄漏进 content 的思考链 (#1529)

wehos · Hongzhi Wen · claude · web-flow · commit 5b07a594918e · 2026-05-26T01:57:57.000+08:00
qwen3-vl-* 会把推理放进 reasoning_content（content 干净），但 Qwen3.5/3.6
混合模型走 OpenAI 兼容端点时 reasoning_content 恒空，整段思考直接落在
content 里，只有一个无开标签的孤立 </think> 分隔答案——普通成对正则抓不到。

新增共享 strip_thinking_segments()：先删成对 <think>…</think>，再把剩下
的无头 </think> 之前的内容（必为思考）整段切掉；无标签的干净回复原样透传。
挂在 ChatOpenAI.ainvoke/invoke 这个非流式唯一收口处（astream 读
reasoning_content 的对偶位置），一处覆盖 proactive、memory 等所有非流式调用。
openclaw 原来的 _strip_reasoning_trace 去重复用该函数，顺带获得无头 </think>
处理能力。流式不动（思考逐 token 无分隔，无可靠切点）。

Co-authored-by: Hongzhi Wen <cartabio.coder1@gmail.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/brain/openclaw_adapter.py b/brain/openclaw_adapter.py
@@ -20,7 +20,7 @@
 
 from config import OPENCLAW_MAGIC_INTENT_MAX_TOKENS
 from utils.file_utils import robust_json_loads
-from utils.llm_client import create_chat_llm
+from utils.llm_client import create_chat_llm, strip_thinking_segments
 from utils.config_manager import get_config_manager
 from utils.logger_config import get_module_logger
 
@@ -458,7 +458,10 @@ async def stop_running(
 
     @staticmethod
     def _strip_reasoning_trace(text: str) -> str:
-        cleaned = re.sub(r"<think>.*?</think>", "", str(text or ""), flags=re.IGNORECASE | re.DOTALL).strip()
+        # Shared stripper handles both paired <think>...</think> and the
+        # Qwen3.5/3.6 dangling-</think> leak shape; ReAct line filtering below
+        # is openclaw-specific and stays here.
+        cleaned = strip_thinking_segments(text)
         if not cleaned:
             return ""
 
diff --git a/tests/unit/test_strip_thinking_segments.py b/tests/unit/test_strip_thinking_segments.py
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+"""``strip_thinking_segments`` — defensive chain-of-thought removal for
+non-streaming replies.
+
+Background: qwen3-vl-* route reasoning to the ``reasoning_content`` field
+(``content`` stays clean), but the Qwen3.5/3.6 hybrid models never populate
+``reasoning_content`` over the OpenAI-compatible endpoint — the whole
+chain-of-thought lands in ``content`` with only a *dangling* ``</think>`` (no
+opening tag) before the real answer. A paired-tag regex can't catch that;
+these cases pin the dangling-close behavior plus the well-formed and
+passthrough cases.
+"""
+import os
+import sys
+
+import pytest
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
+
+from utils.llm_client import strip_thinking_segments
+
+
+@pytest.mark.parametrize(
+    ("raw", "expected"),
+    [
+        # 1) Qwen3.5 leak: implicit-open thinking + lone </think> + answer.
+        ("用户让我描述图片。草稿2更准确简洁。\n</think>\n\n这张图片包含一个红色的矩形。",
+         "这张图片包含一个红色的矩形。"),
+        # 2) Well-formed paired block.
+        ("<think>reason here</think>final answer", "final answer"),
+        # 3) <thinking> long-form variant, paired.
+        ("<thinking>step 1\nstep 2</thinking>\nDone.", "Done."),
+        # 4) Multiple paired blocks.
+        ("<think>a</think>X<think>b</think>Y", "XY"),
+        # 5) Clean reply (qwen3-vl path) passes through untouched.
+        ("图中左侧是一个红色矩形，右侧是一个蓝色圆形。",
+         "图中左侧是一个红色矩形，右侧是一个蓝色圆形。"),
+        # 6) Multiline reasoning before the dangling close (real probe shape).
+        ("1. 识别主体\n2. 组织语言\n精简一下：\n</think>\n\n答案在这里", "答案在这里"),
+        # 7) Case-insensitive close tag.
+        ("thinking...</THINK>answer", "answer"),
+        # 8) Empty / falsy inputs.
+        ("", ""),
+        (None, ""),
+    ],
+)
+def test_strip(raw, expected):
+    assert strip_thinking_segments(raw) == expected
+
+
+def test_no_answer_after_dangling_close_yields_empty():
+    """Pure-thinking reply with a trailing close tag → nothing left."""
+    assert strip_thinking_segments("just reasoning, no answer\n</think>") == ""
+
+
+def test_plain_text_with_no_tags_is_identity():
+    txt = "这是一段普通回复，没有任何思考标签，应原样返回。"
+    assert strip_thinking_segments(txt) == txt
diff --git a/utils/llm_client.py b/utils/llm_client.py
@@ -12,12 +12,53 @@
 
 import contextvars
 import json as _json
+import re
 from dataclasses import dataclass, field
 from typing import Any, AsyncIterator, Union
 
 from openai import AsyncOpenAI, OpenAI
 
 
+# ────────────────────────────────────────────────────────────────
+# Reasoning-trace stripping (non-streaming defensive cleanup)
+# ────────────────────────────────────────────────────────────────
+# Well-formed <think>...</think> / <thinking>...</thinking> blocks.
+_THINK_PAIRED_RE = re.compile(r"<think(?:ing)?\s*>.*?</think(?:ing)?\s*>", re.IGNORECASE | re.DOTALL)
+# A *dangling* close tag with no matching open. This is the Qwen3.5/3.6
+# OpenAI-compat leak shape: unlike qwen3-vl-* (which route reasoning to the
+# ``reasoning_content`` field), the 3.5/3.6 hybrid models never populate
+# ``reasoning_content`` — the whole chain-of-thought lands in ``content`` with
+# only a lone ``</think>`` (implicit open) separating it from the real answer.
+# A paired-tag regex alone can't catch this; we strip everything up to and
+# including the first unmatched close tag.
+_THINK_DANGLING_CLOSE_RE = re.compile(r"^.*?</think(?:ing)?\s*>", re.IGNORECASE | re.DOTALL)
+_THINK_ANY_CLOSE_RE = re.compile(r"</think(?:ing)?\s*>", re.IGNORECASE)
+
+
+def strip_thinking_segments(text: str | None) -> str:
+    """Remove leaked chain-of-thought from a *non-streaming* model reply.
+
+    Handles two shapes:
+      1. Well-formed ``<think>...</think>`` blocks (any count).
+      2. Qwen3.5/3.6 leak: reasoning dumped into ``content`` with only a
+         dangling ``</think>`` (no opening tag) before the answer.
+
+    Conservative — only acts when a think tag is present, so clean replies
+    (qwen3-vl-*, gpt, claude, etc.) pass through untouched. Streaming is *not*
+    covered here on purpose: when the chain-of-thought arrives token-by-token
+    in ``delta.content`` with no delimiter there's nothing reliable to strip.
+    """
+    if not text:
+        return text or ""
+    s = str(text)
+    # 1) drop well-formed blocks first
+    s = _THINK_PAIRED_RE.sub("", s)
+    # 2) any close tag still present is unmatched → preceding text is thinking
+    if _THINK_ANY_CLOSE_RE.search(s):
+        s = _THINK_DANGLING_CLOSE_RE.sub("", s, count=1)
+    return s.strip()
+
+
 # ────────────────────────────────────────────────────────────────
 # Active-character context — used by ChatOpenAI._params to substitute
 # ``{MASTER_NAME}`` / ``{LANLAN_NAME}`` placeholders that originated from
@@ -475,18 +516,18 @@ async def ainvoke(self, messages: Any, **overrides: Any) -> LLMResponse:
         # message=None 的合法响应，直接 .message.content 会 NoneType 崩溃。
         choice = resp.choices[0] if resp.choices else None
         msg = choice.message if choice else None
-        content = getattr(msg, "content", None)
+        content = strip_thinking_segments(getattr(msg, "content", None))
         usage_dict = resp.usage.model_dump() if resp.usage else {}
-        return LLMResponse(content=content or "", response_metadata={"token_usage": usage_dict})
+        return LLMResponse(content=content, response_metadata={"token_usage": usage_dict})
 
     def invoke(self, messages: Any, **overrides: Any) -> LLMResponse:
         """Sync twin of ``ainvoke``. See its docstring for ``overrides``."""
         resp = self._client.chat.completions.create(**self._params(messages, **overrides))
         choice = resp.choices[0] if resp.choices else None
         msg = choice.message if choice else None
-        content = getattr(msg, "content", None)
+        content = strip_thinking_segments(getattr(msg, "content", None))
         usage_dict = resp.usage.model_dump() if resp.usage else {}
-        return LLMResponse(content=content or "", response_metadata={"token_usage": usage_dict})
+        return LLMResponse(content=content, response_metadata={"token_usage": usage_dict})
 
     # --- raw-resp invoke (for callers needing reasoning_content / raw choices) ---