Skip to content

Commit aeab9b0

Browse files
henrypark133 and claude authored
fix(llm): invert reasoning default — unknown models skip think/final tags (nearai#1952)
* fix(llm): invert reasoning default — unknown models skip <think>/<final> injection When NEAR AI model="auto" resolves server-side to Qwen 3.5, the system prompt injected <think>/<final> tags because "auto" didn't match any known native-thinking pattern. This caused empty responses: 1. Qwen 3.5's native thinking puts reasoning in a `reasoning` field (not `reasoning_content`) — silently dropped due to field name mismatch 2. Content contained only <think> tags or <tool_call> XML, which clean_response() stripped to empty → "I'm not sure how to respond" Three fixes: - Invert the default: new requires_think_final_tags() with empty allowlist means unknown/alias models get the safe direct-answer prompt - Add #[serde(alias = "reasoning")] so vLLM's field name is accepted - Update active_model from API response.model so capability checks use the resolved model name after the first call Confirmed via direct API testing against NEAR AI staging with Qwen/Qwen3.5-122B-A10B. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * remove model alias resolution from nearai_chat auto should stay as the active model name — no reason to overwrite it with the resolved model since requires_think_final_tags() returns false for both "auto" and the resolved name. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix wording: remove native-thinking assumption from direct-answer prompt The direct-answer prompt is now the default for all models, not just native-thinking ones. Remove misleading "handled natively" language. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8fd4f17 commit aeab9b0

3 files changed

Lines changed: 229 additions & 85 deletions

File tree

src/llm/nearai_chat.rs

Lines changed: 91 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1040,9 +1040,10 @@ struct ChatCompletionResponseMessage {
10401040
#[allow(dead_code)]
10411041
role: String,
10421042
content: Option<String>,
1043-
/// Some models (e.g. GLM-5) return chain-of-thought reasoning here
1044-
/// instead of in `content`.
1045-
#[serde(default)]
1043+
/// Some models return chain-of-thought reasoning here instead of in
1044+
/// `content`. vLLM/SGLang backends (used by NEAR AI) return the field
1045+
/// as `reasoning`; other APIs (GLM-5, DeepSeek) use `reasoning_content`.
1046+
#[serde(default, alias = "reasoning")]
10461047
reasoning_content: Option<String>,
10471048
tool_calls: Option<Vec<ChatCompletionToolCall>>,
10481049
}
@@ -1531,6 +1532,93 @@ mod tests {
15311532
assert!(tool_calls.is_empty());
15321533
}
15331534

1535+
/// The vLLM/SGLang API returns `reasoning` (not `reasoning_content`).
1536+
/// Verify that the serde alias deserializes it correctly.
1537+
#[test]
1538+
fn test_reasoning_field_alias_accepted() {
1539+
let response: ChatCompletionResponse = serde_json::from_value(serde_json::json!({
1540+
"id": "chatcmpl-test",
1541+
"model": "Qwen/Qwen3.5-122B-A10B",
1542+
"choices": [{
1543+
"message": {
1544+
"role": "assistant",
1545+
"content": null,
1546+
"reasoning": "The answer is 42."
1547+
},
1548+
"finish_reason": "stop"
1549+
}],
1550+
"usage": { "prompt_tokens": 50, "completion_tokens": 20 }
1551+
}))
1552+
.unwrap();
1553+
1554+
let choice = response.choices.into_iter().next().unwrap();
1555+
let content = choice.message.content.or(choice.message.reasoning_content);
1556+
1557+
assert_eq!(
1558+
content,
1559+
Some("The answer is 42.".to_string()),
1560+
"reasoning field (vLLM alias) should deserialize into reasoning_content"
1561+
);
1562+
}
1563+
1564+
/// Verify that `reasoning` field does NOT leak into tool-call responses
1565+
/// (same logic as reasoning_content — only used for text fallback).
1566+
#[test]
1567+
fn test_reasoning_alias_not_leaked_into_tool_calls() {
1568+
let response: ChatCompletionResponse = serde_json::from_value(serde_json::json!({
1569+
"id": "chatcmpl-test",
1570+
"model": "Qwen/Qwen3.5-122B-A10B",
1571+
"choices": [{
1572+
"message": {
1573+
"role": "assistant",
1574+
"content": null,
1575+
"reasoning": "Let me think about which tool to call...",
1576+
"tool_calls": [{
1577+
"id": "call_xyz",
1578+
"type": "function",
1579+
"function": {
1580+
"name": "web_search",
1581+
"arguments": "{\"query\":\"test\"}"
1582+
}
1583+
}]
1584+
},
1585+
"finish_reason": "tool_calls"
1586+
}],
1587+
"usage": { "prompt_tokens": 100, "completion_tokens": 50 }
1588+
}))
1589+
.unwrap();
1590+
1591+
let choice = response.choices.into_iter().next().unwrap();
1592+
let tool_calls: Vec<ToolCall> = choice
1593+
.message
1594+
.tool_calls
1595+
.unwrap_or_default()
1596+
.into_iter()
1597+
.map(|tc| {
1598+
let arguments = serde_json::from_str(&tc.function.arguments)
1599+
.unwrap_or(serde_json::Value::Object(Default::default()));
1600+
ToolCall {
1601+
id: tc.id,
1602+
name: tc.function.name,
1603+
arguments,
1604+
reasoning: None,
1605+
}
1606+
})
1607+
.collect();
1608+
1609+
let content = if tool_calls.is_empty() {
1610+
choice.message.content.or(choice.message.reasoning_content)
1611+
} else {
1612+
choice.message.content
1613+
};
1614+
1615+
assert!(
1616+
content.is_none(),
1617+
"reasoning (alias) should NOT leak into tool-call responses"
1618+
);
1619+
assert_eq!(tool_calls.len(), 1);
1620+
}
1621+
15341622
#[tokio::test]
15351623
async fn test_resolve_bearer_token_config_api_key() {
15361624
// When config.api_key is set, it takes top priority.

src/llm/reasoning.rs

Lines changed: 45 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1003,21 +1003,16 @@ Respond with a JSON plan in this format:
10031003
.to_string()
10041004
};
10051005

1006-
// Models with native thinking (Qwen3, DeepSeek-R1, etc.) produce their
1007-
// own <think> tags or reasoning_content. Injecting our <think>/<final>
1008-
// format collides with their native behavior, causing thinking-only
1009-
// responses that clean to empty strings. See issue #789.
1010-
let has_native_thinking = self
1006+
// Default: direct-answer format. Only inject <think>/<final> tags for
1007+
// models explicitly known to require them. Unknown models, aliases like
1008+
// "auto", and native-thinking models all get the safe direct-answer
1009+
// format. See issue #789.
1010+
let needs_tags = self
10111011
.model_name
10121012
.as_ref()
1013-
.is_some_and(|n| crate::llm::reasoning_models::has_native_thinking(n));
1013+
.is_some_and(|n| crate::llm::reasoning_models::requires_think_final_tags(n));
10141014

1015-
let response_format = if has_native_thinking {
1016-
r#"## Response Format
1017-
1018-
Respond directly with your answer. Do not wrap your response in any special tags.
1019-
Your reasoning process is handled natively — just provide the final user-facing answer."#
1020-
} else {
1015+
let response_format = if needs_tags {
10211016
r#"## Response Format — CRITICAL
10221017
10231018
ALL internal reasoning MUST be inside <think>...</think> tags.
@@ -1029,6 +1024,10 @@ Only text inside <final> is shown to the user; everything else is discarded.
10291024
Example:
10301025
<think>The user is asking about X.</think>
10311026
<final>Here is the answer about X.</final>"#
1027+
} else {
1028+
r#"## Response Format
1029+
1030+
Respond directly with your final answer. Do not wrap your response in any special tags."#
10321031
};
10331032

10341033
format!(
@@ -3048,41 +3047,61 @@ That's my plan."#;
30483047
}
30493048

30503049
#[test]
3051-
fn test_system_prompt_skips_think_final_for_native_thinking() {
3050+
fn test_system_prompt_direct_answer_for_native_thinking_model() {
30523051
let reasoning = make_reasoning_with_model("qwen3-8b");
30533052
let prompt = reasoning.build_system_prompt_with_tools(&[]);
30543053
assert!(
30553054
!prompt.contains("<think>"),
30563055
"Native thinking model should NOT have <think> in system prompt"
30573056
);
3058-
assert!(prompt.contains("Respond directly with your answer"));
3057+
assert!(prompt.contains("Respond directly"));
30593058
}
30603059

30613060
#[test]
3062-
fn test_system_prompt_includes_think_final_for_regular_model() {
3061+
fn test_system_prompt_direct_answer_for_regular_model() {
3062+
// Regular models also get direct-answer format by default (inverted default)
30633063
let reasoning = make_reasoning_with_model("llama-3.1-70b");
30643064
let prompt = reasoning.build_system_prompt_with_tools(&[]);
3065-
assert!(prompt.contains("<think>"));
3066-
assert!(prompt.contains("<final>"));
3065+
assert!(!prompt.contains("<think>"));
3066+
assert!(!prompt.contains("<final>"));
3067+
assert!(prompt.contains("Respond directly"));
30673068
}
30683069

30693070
#[test]
3070-
fn test_system_prompt_defaults_to_think_final_when_no_model() {
3071+
fn test_system_prompt_defaults_to_direct_answer_when_no_model() {
30713072
use crate::testing::StubLlm;
30723073
let reasoning = Reasoning::new(Arc::new(StubLlm::new("test")));
30733074
let prompt = reasoning.build_system_prompt_with_tools(&[]);
3074-
assert!(prompt.contains("<think>"));
3075-
assert!(prompt.contains("<final>"));
3075+
// No model name → safe default → direct-answer (no tags)
3076+
assert!(!prompt.contains("<think>"));
3077+
assert!(!prompt.contains("<final>"));
3078+
assert!(prompt.contains("Respond directly"));
30763079
}
30773080

30783081
#[test]
3079-
fn test_system_prompt_deepseek_r1_skips_think_final() {
3082+
fn test_system_prompt_direct_answer_for_deepseek_r1() {
30803083
let reasoning = make_reasoning_with_model("deepseek-r1-distill-qwen-32b");
30813084
let prompt = reasoning.build_system_prompt_with_tools(&[]);
30823085
assert!(!prompt.contains("CRITICAL"));
30833086
assert!(prompt.contains("Respond directly"));
30843087
}
30853088

3089+
#[test]
3090+
fn test_system_prompt_direct_answer_for_auto_alias() {
3091+
let reasoning = make_reasoning_with_model("auto");
3092+
let prompt = reasoning.build_system_prompt_with_tools(&[]);
3093+
assert!(!prompt.contains("<think>"));
3094+
assert!(prompt.contains("Respond directly"));
3095+
}
3096+
3097+
#[test]
3098+
fn test_system_prompt_direct_answer_for_resolved_qwen() {
3099+
let reasoning = make_reasoning_with_model("Qwen/Qwen3.5-122B-A10B");
3100+
let prompt = reasoning.build_system_prompt_with_tools(&[]);
3101+
assert!(!prompt.contains("<think>"));
3102+
assert!(prompt.contains("Respond directly"));
3103+
}
3104+
30863105
// ---- Issue #789: additional edge case tests for truncate_at_tool_tags ----
30873106

30883107
#[test]
@@ -3373,13 +3392,15 @@ That's my plan."#;
33733392
);
33743393
assert!(prompt.contains("Respond directly"));
33753394

3376-
// Now create reasoning WITHOUT with_model_name — should get default prompt
3395+
// Now create reasoning WITHOUT with_model_name — should get direct-answer
3396+
// default (inverted default: unknown models are native-thinking-safe)
33773397
let reasoning_no_model = Reasoning::new(llm);
33783398
let prompt2 = reasoning_no_model.build_system_prompt_with_tools(&[]);
33793399
assert!(
3380-
prompt2.contains("<think>"),
3381-
"Without model name, should get default think/final prompt"
3400+
!prompt2.contains("<think>"),
3401+
"Without model name, should get direct-answer prompt (safe default)"
33823402
);
3403+
assert!(prompt2.contains("Respond directly"));
33833404
}
33843405

33853406
// ---- Issue #789: case-insensitive truncation ----

0 commit comments

Comments (0)