Skip to content

Commit 26d274a

Browse files
henrypark133, claude, and ilblackdragon
authored
fix(llm): fix reasoning model response parsing bugs (#564) (#580)
Three related fixes for reasoning model artifacts (GLM-4/5, DeepSeek R1, Qwen3): 1. reasoning_content no longer leaks into tool-call assistant messages in nearai_chat — only used as fallback for final text responses. 2. plan() and evaluate_success() now apply clean_response() before JSON parsing, preventing <think> tag prefixes from breaking plan/eval. 3. Unclosed <think> before <final> no longer discards the answer — the strict discard path now extracts <final> content first. 8 regression tests added. Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Co-authored-by: Illia Polosukhin <ilblackdragon@gmail.com>
1 parent b425213 commit 26d274a

2 files changed

Lines changed: 183 additions & 6 deletions

File tree

src/llm/nearai_chat.rs

Lines changed: 117 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -522,9 +522,6 @@ impl LlmProvider for NearAiChatProvider {
522522
reason: "No choices in response".to_string(),
523523
})?;
524524

525-
// Fall back to reasoning_content when content is null (e.g. GLM-5
526-
// returns its answer in reasoning_content instead of content).
527-
let content = choice.message.content.or(choice.message.reasoning_content);
528525
let tool_calls: Vec<ToolCall> = choice
529526
.message
530527
.tool_calls
@@ -541,6 +538,18 @@ impl LlmProvider for NearAiChatProvider {
541538
})
542539
.collect();
543540

541+
// Fall back to reasoning_content when content is null (e.g. GLM-5
542+
// returns its answer in reasoning_content instead of content), but
543+
// only for final text responses. Tool-call responses often have
544+
// content: null + reasoning_content filled with chain-of-thought;
545+
// leaking that into conversation history inflates context and
546+
// confuses the model.
547+
let content = if tool_calls.is_empty() {
548+
choice.message.content.or(choice.message.reasoning_content)
549+
} else {
550+
choice.message.content
551+
};
552+
544553
let finish_reason = match choice.finish_reason.as_deref() {
545554
Some("stop") => FinishReason::Stop,
546555
Some("length") => FinishReason::Length,
@@ -1285,4 +1294,109 @@ mod tests {
12851294
assert_eq!(input, default_in);
12861295
assert_eq!(output, default_out);
12871296
}
1297+
1298+
/// Regression: reasoning_content must NOT leak into tool-call responses.
1299+
#[test]
1300+
fn test_reasoning_content_not_leaked_into_tool_call_response() {
1301+
let response: ChatCompletionResponse = serde_json::from_value(serde_json::json!({
1302+
"id": "chatcmpl-test",
1303+
"choices": [{
1304+
"message": {
1305+
"role": "assistant",
1306+
"content": null,
1307+
"reasoning_content": "Let me think about which tool to call...",
1308+
"tool_calls": [{
1309+
"id": "call_abc123",
1310+
"type": "function",
1311+
"function": {
1312+
"name": "search",
1313+
"arguments": "{\"query\":\"test\"}"
1314+
}
1315+
}]
1316+
},
1317+
"finish_reason": "tool_calls"
1318+
}],
1319+
"usage": { "prompt_tokens": 100, "completion_tokens": 50 }
1320+
}))
1321+
.unwrap();
1322+
1323+
let choice = response.choices.into_iter().next().unwrap();
1324+
let tool_calls: Vec<ToolCall> = choice
1325+
.message
1326+
.tool_calls
1327+
.unwrap_or_default()
1328+
.into_iter()
1329+
.map(|tc| {
1330+
let arguments = serde_json::from_str(&tc.function.arguments)
1331+
.unwrap_or(serde_json::Value::Object(Default::default()));
1332+
ToolCall {
1333+
id: tc.id,
1334+
name: tc.function.name,
1335+
arguments,
1336+
}
1337+
})
1338+
.collect();
1339+
1340+
let content = if tool_calls.is_empty() {
1341+
choice.message.content.or(choice.message.reasoning_content)
1342+
} else {
1343+
choice.message.content
1344+
};
1345+
1346+
assert!(
1347+
content.is_none(),
1348+
"reasoning_content should NOT leak into tool-call responses, got: {:?}",
1349+
content
1350+
);
1351+
assert_eq!(tool_calls.len(), 1);
1352+
assert_eq!(tool_calls[0].name, "search");
1353+
}
1354+
1355+
/// Regression: reasoning_content SHOULD be used as fallback for text responses.
1356+
#[test]
1357+
fn test_reasoning_content_used_for_text_response() {
1358+
let response: ChatCompletionResponse = serde_json::from_value(serde_json::json!({
1359+
"id": "chatcmpl-test",
1360+
"choices": [{
1361+
"message": {
1362+
"role": "assistant",
1363+
"content": null,
1364+
"reasoning_content": "The answer is 42."
1365+
},
1366+
"finish_reason": "stop"
1367+
}],
1368+
"usage": { "prompt_tokens": 50, "completion_tokens": 20 }
1369+
}))
1370+
.unwrap();
1371+
1372+
let choice = response.choices.into_iter().next().unwrap();
1373+
let tool_calls: Vec<ToolCall> = choice
1374+
.message
1375+
.tool_calls
1376+
.unwrap_or_default()
1377+
.into_iter()
1378+
.map(|tc| {
1379+
let arguments = serde_json::from_str(&tc.function.arguments)
1380+
.unwrap_or(serde_json::Value::Object(Default::default()));
1381+
ToolCall {
1382+
id: tc.id,
1383+
name: tc.function.name,
1384+
arguments,
1385+
}
1386+
})
1387+
.collect();
1388+
1389+
let content = if tool_calls.is_empty() {
1390+
choice.message.content.or(choice.message.reasoning_content)
1391+
} else {
1392+
choice.message.content
1393+
};
1394+
1395+
assert_eq!(
1396+
content,
1397+
Some("The answer is 42.".to_string()),
1398+
"reasoning_content should be used as fallback for text responses"
1399+
);
1400+
assert!(tool_calls.is_empty());
1401+
}
12881402
}

src/llm/reasoning.rs

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -335,8 +335,9 @@ impl Reasoning {
335335

336336
let response = self.llm.complete(request).await?;
337337

338-
// Parse the plan from the response
339-
self.parse_plan(&response.content)
338+
// Clean reasoning model artifacts before parsing JSON
339+
let cleaned = clean_response(&response.content);
340+
self.parse_plan(&cleaned)
340341
}
341342

342343
/// Select the best tool for the current situation.
@@ -429,7 +430,9 @@ Respond in JSON format:
429430

430431
let response = self.llm.complete(request).await?;
431432

432-
self.parse_evaluation(&response.content)
433+
// Clean reasoning model artifacts before parsing JSON
434+
let cleaned = clean_response(&response.content);
435+
self.parse_evaluation(&cleaned)
433436
}
434437

435438
/// Generate a response to a user message.
@@ -1292,8 +1295,15 @@ fn strip_thinking_tags_regex(text: &str, code_regions: &[CodeRegion]) -> String
12921295
}
12931296

12941297
// Strict mode: if still inside an unclosed thinking tag, discard trailing text
1298+
// BUT preserve any <final> block embedded in the discarded region
12951299
if !in_thinking {
12961300
result.push_str(&text[last_index..]);
1301+
} else {
1302+
let trailing = &text[last_index..];
1303+
let trailing_regions = find_code_regions(trailing);
1304+
if let Some(final_content) = extract_final_content(trailing, &trailing_regions) {
1305+
result.push_str(&final_content);
1306+
}
12971307
}
12981308

12991309
result
@@ -1918,6 +1928,59 @@ That's my plan."#;
19181928
assert_eq!(calls[0].name, "tool_list");
19191929
}
19201930

1931+
// ---- plan/evaluate bypass clean_response (Bug #564-2) ----
1932+
1933+
#[test]
1934+
fn test_clean_response_strips_think_before_json_plan() {
1935+
let raw = r#"<think>I need to plan the steps carefully...</think>{"steps": [{"description": "Step 1", "tool": "search", "expected_outcome": "results"}], "reasoning": "Simple plan"}"#;
1936+
let cleaned = clean_response(raw);
1937+
// After cleaning, the JSON should be parseable
1938+
let json_str = extract_json(&cleaned).unwrap();
1939+
let parsed: serde_json::Value = serde_json::from_str(json_str).unwrap();
1940+
assert!(parsed.get("steps").is_some());
1941+
}
1942+
1943+
#[test]
1944+
fn test_clean_response_strips_think_before_json_evaluation() {
1945+
let raw = r#"<think>Let me evaluate whether this was successful...</think>{"success": true, "confidence": 0.95, "reasoning": "Task completed", "issues": [], "suggestions": []}"#;
1946+
let cleaned = clean_response(raw);
1947+
let json_str = extract_json(&cleaned).unwrap();
1948+
let eval: SuccessEvaluation = serde_json::from_str(json_str).unwrap();
1949+
assert!(eval.success);
1950+
assert_eq!(eval.confidence, 0.95);
1951+
}
1952+
1953+
// ---- Unclosed think before final (Bug #564-3) ----
1954+
1955+
#[test]
1956+
fn test_unclosed_think_before_final() {
1957+
assert_eq!(
1958+
clean_response("<think>reasoning no close tag <final>actual answer</final>"),
1959+
"actual answer"
1960+
);
1961+
}
1962+
1963+
#[test]
1964+
fn test_unclosed_thinking_before_final() {
1965+
assert_eq!(
1966+
clean_response("<thinking>long reasoning... <final>the real answer</final>"),
1967+
"the real answer"
1968+
);
1969+
}
1970+
1971+
#[test]
1972+
fn test_unclosed_think_before_final_with_prefix() {
1973+
assert_eq!(
1974+
clean_response("Hello <think>reasoning <final>world</final>"),
1975+
"Hello world"
1976+
);
1977+
}
1978+
1979+
#[test]
1980+
fn test_unclosed_think_no_final_still_discards() {
1981+
assert_eq!(clean_response("Hello <thinking>this never closes"), "Hello");
1982+
}
1983+
19211984
#[test]
19221985
fn test_recover_bracket_format_tool_call() {
19231986
let tools = make_tools(&["http"]);

0 commit comments

Comments (0)