Skip to content

Commit aeab9b0

Browse files
henrypark133 and claude authored
fix(llm): invert reasoning default — unknown models skip think/final tags (nearai#1952)
* fix(llm): invert reasoning default — unknown models skip <think>/<final> injection When NEAR AI model="auto" resolves server-side to Qwen 3.5, the system prompt injected <think>/<final> tags because "auto" didn't match any known native-thinking pattern. This caused empty responses: 1. Qwen 3.5's native thinking puts reasoning in a `reasoning` field (not `reasoning_content`) — silently dropped due to field name mismatch 2. Content contained only <think> tags or <tool_call> XML, which clean_response() stripped to empty → "I'm not sure how to respond" Three fixes: - Invert the default: new requires_think_final_tags() with empty allowlist means unknown/alias models get the safe direct-answer prompt - Add #[serde(alias = "reasoning")] so vLLM's field name is accepted - Update active_model from API response.model so capability checks use the resolved model name after the first call Confirmed via direct API testing against NEAR AI staging with Qwen/Qwen3.5-122B-A10B. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * remove model alias resolution from nearai_chat auto should stay as the active model name — no reason to overwrite it with the resolved model since requires_think_final_tags() returns false for both "auto" and the resolved name. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix wording: remove native-thinking assumption from direct-answer prompt The direct-answer prompt is now the default for all models, not just native-thinking ones. Remove misleading "handled natively" language. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8fd4f17 commit aeab9b0

3 files changed

Lines changed: 229 additions & 85 deletions

File tree

src/llm/nearai_chat.rs

Lines changed: 91 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1040,9 +1040,10 @@ struct ChatCompletionResponseMessage {
10401040
#[allow(dead_code)]
10411041
role: String,
10421042
content: Option<String>,
1043-
/// Some models (e.g. GLM-5) return chain-of-thought reasoning here
1044-
/// instead of in `content`.
1045-
#[serde(default)]
1043+
/// Some models return chain-of-thought reasoning here instead of in
1044+
/// `content`. vLLM/SGLang backends (used by NEAR AI) return the field
1045+
/// as `reasoning`; other APIs (GLM-5, DeepSeek) use `reasoning_content`.
1046+
#[serde(default, alias = "reasoning")]
10461047
reasoning_content: Option<String>,
10471048
tool_calls: Option<Vec<ChatCompletionToolCall>>,
10481049
}
@@ -1531,6 +1532,93 @@ mod tests {
15311532
assert!(tool_calls.is_empty());
15321533
}
15331534

1535+
/// The vLLM/SGLang API returns `reasoning` (not `reasoning_content`).
1536+
/// Verify that the serde alias deserializes it correctly.
1537+
#[test]
1538+
fn test_reasoning_field_alias_accepted() {
1539+
let response: ChatCompletionResponse = serde_json::from_value(serde_json::json!({
1540+
"id": "chatcmpl-test",
1541+
"model": "Qwen/Qwen3.5-122B-A10B",
1542+
"choices": [{
1543+
"message": {
1544+
"role": "assistant",
1545+
"content": null,
1546+
"reasoning": "The answer is 42."
1547+
},
1548+
"finish_reason": "stop"
1549+
}],
1550+
"usage": { "prompt_tokens": 50, "completion_tokens": 20 }
1551+
}))
1552+
.unwrap();
1553+
1554+
let choice = response.choices.into_iter().next().unwrap();
1555+
let content = choice.message.content.or(choice.message.reasoning_content);
1556+
1557+
assert_eq!(
1558+
content,
1559+
Some("The answer is 42.".to_string()),
1560+
"reasoning field (vLLM alias) should deserialize into reasoning_content"
1561+
);
1562+
}
1563+
1564+
/// Verify that `reasoning` field does NOT leak into tool-call responses
1565+
/// (same logic as reasoning_content — only used for text fallback).
1566+
#[test]
1567+
fn test_reasoning_alias_not_leaked_into_tool_calls() {
1568+
let response: ChatCompletionResponse = serde_json::from_value(serde_json::json!({
1569+
"id": "chatcmpl-test",
1570+
"model": "Qwen/Qwen3.5-122B-A10B",
1571+
"choices": [{
1572+
"message": {
1573+
"role": "assistant",
1574+
"content": null,
1575+
"reasoning": "Let me think about which tool to call...",
1576+
"tool_calls": [{
1577+
"id": "call_xyz",
1578+
"type": "function",
1579+
"function": {
1580+
"name": "web_search",
1581+
"arguments": "{\"query\":\"test\"}"
1582+
}
1583+
}]
1584+
},
1585+
"finish_reason": "tool_calls"
1586+
}],
1587+
"usage": { "prompt_tokens": 100, "completion_tokens": 50 }
1588+
}))
1589+
.unwrap();
1590+
1591+
let choice = response.choices.into_iter().next().unwrap();
1592+
let tool_calls: Vec<ToolCall> = choice
1593+
.message
1594+
.tool_calls
1595+
.unwrap_or_default()
1596+
.into_iter()
1597+
.map(|tc| {
1598+
let arguments = serde_json::from_str(&tc.function.arguments)
1599+
.unwrap_or(serde_json::Value::Object(Default::default()));
1600+
ToolCall {
1601+
id: tc.id,
1602+
name: tc.function.name,
1603+
arguments,
1604+
reasoning: None,
1605+
}
1606+
})
1607+
.collect();
1608+
1609+
let content = if tool_calls.is_empty() {
1610+
choice.message.content.or(choice.message.reasoning_content)
1611+
} else {
1612+
choice.message.content
1613+
};
1614+
1615+
assert!(
1616+
content.is_none(),
1617+
"reasoning (alias) should NOT leak into tool-call responses"
1618+
);
1619+
assert_eq!(tool_calls.len(), 1);
1620+
}
1621+
15341622
#[tokio::test]
15351623
async fn test_resolve_bearer_token_config_api_key() {
15361624
// When config.api_key is set, it takes top priority.

src/llm/reasoning.rs

Lines changed: 45 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1003,21 +1003,16 @@ Respond with a JSON plan in this format:
10031003
.to_string()
10041004
};
10051005

1006-
// Models with native thinking (Qwen3, DeepSeek-R1, etc.) produce their
1007-
// own <think> tags or reasoning_content. Injecting our <think>/<final>
1008-
// format collides with their native behavior, causing thinking-only
1009-
// responses that clean to empty strings. See issue #789.
1010-
let has_native_thinking = self
1006+
// Default: direct-answer format. Only inject <think>/<final> tags for
1007+
// models explicitly known to require them. Unknown models, aliases like
1008+
// "auto", and native-thinking models all get the safe direct-answer
1009+
// format. See issue #789.
1010+
let needs_tags = self
10111011
.model_name
10121012
.as_ref()
1013-
.is_some_and(|n| crate::llm::reasoning_models::has_native_thinking(n));
1013+
.is_some_and(|n| crate::llm::reasoning_models::requires_think_final_tags(n));
10141014

1015-
let response_format = if has_native_thinking {
1016-
r#"## Response Format
1017-
1018-
Respond directly with your answer. Do not wrap your response in any special tags.
1019-
Your reasoning process is handled natively — just provide the final user-facing answer."#
1020-
} else {
1015+
let response_format = if needs_tags {
10211016
r#"## Response Format — CRITICAL
10221017
10231018
ALL internal reasoning MUST be inside <think>...</think> tags.
@@ -1029,6 +1024,10 @@ Only text inside <final> is shown to the user; everything else is discarded.
10291024
Example:
10301025
<think>The user is asking about X.</think>
10311026
<final>Here is the answer about X.</final>"#
1027+
} else {
1028+
r#"## Response Format
1029+
1030+
Respond directly with your final answer. Do not wrap your response in any special tags."#
10321031
};
10331032

10341033
format!(
@@ -3048,41 +3047,61 @@ That's my plan."#;
30483047
}
30493048

30503049
#[test]
3051-
fn test_system_prompt_skips_think_final_for_native_thinking() {
3050+
fn test_system_prompt_direct_answer_for_native_thinking_model() {
30523051
let reasoning = make_reasoning_with_model("qwen3-8b");
30533052
let prompt = reasoning.build_system_prompt_with_tools(&[]);
30543053
assert!(
30553054
!prompt.contains("<think>"),
30563055
"Native thinking model should NOT have <think> in system prompt"
30573056
);
3058-
assert!(prompt.contains("Respond directly with your answer"));
3057+
assert!(prompt.contains("Respond directly"));
30593058
}
30603059

30613060
#[test]
3062-
fn test_system_prompt_includes_think_final_for_regular_model() {
3061+
fn test_system_prompt_direct_answer_for_regular_model() {
3062+
// Regular models also get direct-answer format by default (inverted default)
30633063
let reasoning = make_reasoning_with_model("llama-3.1-70b");
30643064
let prompt = reasoning.build_system_prompt_with_tools(&[]);
3065-
assert!(prompt.contains("<think>"));
3066-
assert!(prompt.contains("<final>"));
3065+
assert!(!prompt.contains("<think>"));
3066+
assert!(!prompt.contains("<final>"));
3067+
assert!(prompt.contains("Respond directly"));
30673068
}
30683069

30693070
#[test]
3070-
fn test_system_prompt_defaults_to_think_final_when_no_model() {
3071+
fn test_system_prompt_defaults_to_direct_answer_when_no_model() {
30713072
use crate::testing::StubLlm;
30723073
let reasoning = Reasoning::new(Arc::new(StubLlm::new("test")));
30733074
let prompt = reasoning.build_system_prompt_with_tools(&[]);
3074-
assert!(prompt.contains("<think>"));
3075-
assert!(prompt.contains("<final>"));
3075+
// No model name → safe default → direct-answer (no tags)
3076+
assert!(!prompt.contains("<think>"));
3077+
assert!(!prompt.contains("<final>"));
3078+
assert!(prompt.contains("Respond directly"));
30763079
}
30773080

30783081
#[test]
3079-
fn test_system_prompt_deepseek_r1_skips_think_final() {
3082+
fn test_system_prompt_direct_answer_for_deepseek_r1() {
30803083
let reasoning = make_reasoning_with_model("deepseek-r1-distill-qwen-32b");
30813084
let prompt = reasoning.build_system_prompt_with_tools(&[]);
30823085
assert!(!prompt.contains("CRITICAL"));
30833086
assert!(prompt.contains("Respond directly"));
30843087
}
30853088

3089+
#[test]
3090+
fn test_system_prompt_direct_answer_for_auto_alias() {
3091+
let reasoning = make_reasoning_with_model("auto");
3092+
let prompt = reasoning.build_system_prompt_with_tools(&[]);
3093+
assert!(!prompt.contains("<think>"));
3094+
assert!(prompt.contains("Respond directly"));
3095+
}
3096+
3097+
#[test]
3098+
fn test_system_prompt_direct_answer_for_resolved_qwen() {
3099+
let reasoning = make_reasoning_with_model("Qwen/Qwen3.5-122B-A10B");
3100+
let prompt = reasoning.build_system_prompt_with_tools(&[]);
3101+
assert!(!prompt.contains("<think>"));
3102+
assert!(prompt.contains("Respond directly"));
3103+
}
3104+
30863105
// ---- Issue #789: additional edge case tests for truncate_at_tool_tags ----
30873106

30883107
#[test]
@@ -3373,13 +3392,15 @@ That's my plan."#;
33733392
);
33743393
assert!(prompt.contains("Respond directly"));
33753394

3376-
// Now create reasoning WITHOUT with_model_name — should get default prompt
3395+
// Now create reasoning WITHOUT with_model_name — should get direct-answer
3396+
// default (inverted default: unknown models are native-thinking-safe)
33773397
let reasoning_no_model = Reasoning::new(llm);
33783398
let prompt2 = reasoning_no_model.build_system_prompt_with_tools(&[]);
33793399
assert!(
3380-
prompt2.contains("<think>"),
3381-
"Without model name, should get default think/final prompt"
3400+
!prompt2.contains("<think>"),
3401+
"Without model name, should get direct-answer prompt (safe default)"
33823402
);
3403+
assert!(prompt2.contains("Respond directly"));
33833404
}
33843405

33853406
// ---- Issue #789: case-insensitive truncation ----

0 commit comments

Comments (0)