Skip to content

Commit 26d274a

Browse files
henrypark133, claude, and ilblackdragon
authored
fix(llm): fix reasoning model response parsing bugs (#564) (#580)
Three related fixes for reasoning model artifacts (GLM-4/5, DeepSeek R1, Qwen3): 1. reasoning_content no longer leaks into tool-call assistant messages in nearai_chat — only used as fallback for final text responses. 2. plan() and evaluate_success() now apply clean_response() before JSON parsing, preventing <think> tag prefixes from breaking plan/eval. 3. Unclosed <think> before <final> no longer discards the answer — the strict discard path now extracts <final> content first. 8 regression tests added. Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Co-authored-by: Illia Polosukhin <ilblackdragon@gmail.com>
1 parent b425213 commit 26d274a

2 files changed

Lines changed: 183 additions & 6 deletions

File tree

src/llm/nearai_chat.rs

Lines changed: 117 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -522,9 +522,6 @@ impl LlmProvider for NearAiChatProvider {
522522
reason: "No choices in response".to_string(),
523523
})?;
524524

525-
// Fall back to reasoning_content when content is null (e.g. GLM-5
526-
// returns its answer in reasoning_content instead of content).
527-
let content = choice.message.content.or(choice.message.reasoning_content);
528525
let tool_calls: Vec<ToolCall> = choice
529526
.message
530527
.tool_calls
@@ -541,6 +538,18 @@ impl LlmProvider for NearAiChatProvider {
541538
})
542539
.collect();
543540

541+
// Fall back to reasoning_content when content is null (e.g. GLM-5
542+
// returns its answer in reasoning_content instead of content), but
543+
// only for final text responses. Tool-call responses often have
544+
// content: null + reasoning_content filled with chain-of-thought;
545+
// leaking that into conversation history inflates context and
546+
// confuses the model.
547+
let content = if tool_calls.is_empty() {
548+
choice.message.content.or(choice.message.reasoning_content)
549+
} else {
550+
choice.message.content
551+
};
552+
544553
let finish_reason = match choice.finish_reason.as_deref() {
545554
Some("stop") => FinishReason::Stop,
546555
Some("length") => FinishReason::Length,
@@ -1285,4 +1294,109 @@ mod tests {
12851294
assert_eq!(input, default_in);
12861295
assert_eq!(output, default_out);
12871296
}
1297+
1298+
/// Regression: reasoning_content must NOT leak into tool-call responses.
1299+
#[test]
1300+
fn test_reasoning_content_not_leaked_into_tool_call_response() {
1301+
let response: ChatCompletionResponse = serde_json::from_value(serde_json::json!({
1302+
"id": "chatcmpl-test",
1303+
"choices": [{
1304+
"message": {
1305+
"role": "assistant",
1306+
"content": null,
1307+
"reasoning_content": "Let me think about which tool to call...",
1308+
"tool_calls": [{
1309+
"id": "call_abc123",
1310+
"type": "function",
1311+
"function": {
1312+
"name": "search",
1313+
"arguments": "{\"query\":\"test\"}"
1314+
}
1315+
}]
1316+
},
1317+
"finish_reason": "tool_calls"
1318+
}],
1319+
"usage": { "prompt_tokens": 100, "completion_tokens": 50 }
1320+
}))
1321+
.unwrap();
1322+
1323+
let choice = response.choices.into_iter().next().unwrap();
1324+
let tool_calls: Vec<ToolCall> = choice
1325+
.message
1326+
.tool_calls
1327+
.unwrap_or_default()
1328+
.into_iter()
1329+
.map(|tc| {
1330+
let arguments = serde_json::from_str(&tc.function.arguments)
1331+
.unwrap_or(serde_json::Value::Object(Default::default()));
1332+
ToolCall {
1333+
id: tc.id,
1334+
name: tc.function.name,
1335+
arguments,
1336+
}
1337+
})
1338+
.collect();
1339+
1340+
let content = if tool_calls.is_empty() {
1341+
choice.message.content.or(choice.message.reasoning_content)
1342+
} else {
1343+
choice.message.content
1344+
};
1345+
1346+
assert!(
1347+
content.is_none(),
1348+
"reasoning_content should NOT leak into tool-call responses, got: {:?}",
1349+
content
1350+
);
1351+
assert_eq!(tool_calls.len(), 1);
1352+
assert_eq!(tool_calls[0].name, "search");
1353+
}
1354+
1355+
/// Regression: reasoning_content SHOULD be used as fallback for text responses.
1356+
#[test]
1357+
fn test_reasoning_content_used_for_text_response() {
1358+
let response: ChatCompletionResponse = serde_json::from_value(serde_json::json!({
1359+
"id": "chatcmpl-test",
1360+
"choices": [{
1361+
"message": {
1362+
"role": "assistant",
1363+
"content": null,
1364+
"reasoning_content": "The answer is 42."
1365+
},
1366+
"finish_reason": "stop"
1367+
}],
1368+
"usage": { "prompt_tokens": 50, "completion_tokens": 20 }
1369+
}))
1370+
.unwrap();
1371+
1372+
let choice = response.choices.into_iter().next().unwrap();
1373+
let tool_calls: Vec<ToolCall> = choice
1374+
.message
1375+
.tool_calls
1376+
.unwrap_or_default()
1377+
.into_iter()
1378+
.map(|tc| {
1379+
let arguments = serde_json::from_str(&tc.function.arguments)
1380+
.unwrap_or(serde_json::Value::Object(Default::default()));
1381+
ToolCall {
1382+
id: tc.id,
1383+
name: tc.function.name,
1384+
arguments,
1385+
}
1386+
})
1387+
.collect();
1388+
1389+
let content = if tool_calls.is_empty() {
1390+
choice.message.content.or(choice.message.reasoning_content)
1391+
} else {
1392+
choice.message.content
1393+
};
1394+
1395+
assert_eq!(
1396+
content,
1397+
Some("The answer is 42.".to_string()),
1398+
"reasoning_content should be used as fallback for text responses"
1399+
);
1400+
assert!(tool_calls.is_empty());
1401+
}
12881402
}

src/llm/reasoning.rs

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -335,8 +335,9 @@ impl Reasoning {
335335

336336
let response = self.llm.complete(request).await?;
337337

338-
// Parse the plan from the response
339-
self.parse_plan(&response.content)
338+
// Clean reasoning model artifacts before parsing JSON
339+
let cleaned = clean_response(&response.content);
340+
self.parse_plan(&cleaned)
340341
}
341342

342343
/// Select the best tool for the current situation.
@@ -429,7 +430,9 @@ Respond in JSON format:
429430

430431
let response = self.llm.complete(request).await?;
431432

432-
self.parse_evaluation(&response.content)
433+
// Clean reasoning model artifacts before parsing JSON
434+
let cleaned = clean_response(&response.content);
435+
self.parse_evaluation(&cleaned)
433436
}
434437

435438
/// Generate a response to a user message.
@@ -1292,8 +1295,15 @@ fn strip_thinking_tags_regex(text: &str, code_regions: &[CodeRegion]) -> String
12921295
}
12931296

12941297
// Strict mode: if still inside an unclosed thinking tag, discard trailing text
1298+
// BUT preserve any <final> block embedded in the discarded region
12951299
if !in_thinking {
12961300
result.push_str(&text[last_index..]);
1301+
} else {
1302+
let trailing = &text[last_index..];
1303+
let trailing_regions = find_code_regions(trailing);
1304+
if let Some(final_content) = extract_final_content(trailing, &trailing_regions) {
1305+
result.push_str(&final_content);
1306+
}
12971307
}
12981308

12991309
result
@@ -1918,6 +1928,59 @@ That's my plan."#;
19181928
assert_eq!(calls[0].name, "tool_list");
19191929
}
19201930

1931+
// ---- plan/evaluate bypass clean_response (Bug #564-2) ----
1932+
1933+
#[test]
1934+
fn test_clean_response_strips_think_before_json_plan() {
1935+
let raw = r#"<think>I need to plan the steps carefully...</think>{"steps": [{"description": "Step 1", "tool": "search", "expected_outcome": "results"}], "reasoning": "Simple plan"}"#;
1936+
let cleaned = clean_response(raw);
1937+
// After cleaning, the JSON should be parseable
1938+
let json_str = extract_json(&cleaned).unwrap();
1939+
let parsed: serde_json::Value = serde_json::from_str(json_str).unwrap();
1940+
assert!(parsed.get("steps").is_some());
1941+
}
1942+
1943+
#[test]
1944+
fn test_clean_response_strips_think_before_json_evaluation() {
1945+
let raw = r#"<think>Let me evaluate whether this was successful...</think>{"success": true, "confidence": 0.95, "reasoning": "Task completed", "issues": [], "suggestions": []}"#;
1946+
let cleaned = clean_response(raw);
1947+
let json_str = extract_json(&cleaned).unwrap();
1948+
let eval: SuccessEvaluation = serde_json::from_str(json_str).unwrap();
1949+
assert!(eval.success);
1950+
assert_eq!(eval.confidence, 0.95);
1951+
}
1952+
1953+
// ---- Unclosed think before final (Bug #564-3) ----
1954+
1955+
#[test]
1956+
fn test_unclosed_think_before_final() {
1957+
assert_eq!(
1958+
clean_response("<think>reasoning no close tag <final>actual answer</final>"),
1959+
"actual answer"
1960+
);
1961+
}
1962+
1963+
#[test]
1964+
fn test_unclosed_thinking_before_final() {
1965+
assert_eq!(
1966+
clean_response("<thinking>long reasoning... <final>the real answer</final>"),
1967+
"the real answer"
1968+
);
1969+
}
1970+
1971+
#[test]
1972+
fn test_unclosed_think_before_final_with_prefix() {
1973+
assert_eq!(
1974+
clean_response("Hello <think>reasoning <final>world</final>"),
1975+
"Hello world"
1976+
);
1977+
}
1978+
1979+
#[test]
1980+
fn test_unclosed_think_no_final_still_discards() {
1981+
assert_eq!(clean_response("Hello <thinking>this never closes"), "Hello");
1982+
}
1983+
19211984
#[test]
19221985
fn test_recover_bracket_format_tool_call() {
19231986
let tools = make_tools(&["http"]);

0 commit comments

Comments (0)