From d978023b29b019ce66b615d6a4262083206b5136 Mon Sep 17 00:00:00 2001
From: Exergenion
Date: Tue, 21 Apr 2026 11:35:57 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20local=20LLM=20ingest=20=E2=80=94=20add?=
 =?UTF-8?q?=20max=5Ftokens=20and=20handle=20reasoning=5Fcontent?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two bugs prevented analysis from working with local OpenAI-compatible
endpoints (LM Studio, llama.cpp):

1. buildOpenAiBody() sent no max_tokens. Many local servers default to
   ~2 048 output tokens, which is too short to emit all FILE blocks for
   a typical wiki ingest. Set an explicit 8 192 token limit that covers
   multi-file generation while staying within common local model limits.

2. parseOpenAiLine() only read delta.content and returned null while a
   reasoning model (Nemotron, DeepSeek-R1) was in its thinking phase
   (tokens appear in delta.reasoning_content, not delta.content). The
   accumulated text was therefore empty and no FILE blocks were found,
   producing "(Analysis not available)" for every ingest attempt. Fall
   back to reasoning_content so thinking-phase tokens are included.

Fixes: #41, #26
---
 src/lib/llm-providers.ts | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/lib/llm-providers.ts b/src/lib/llm-providers.ts
index 23702aee..0653e84e 100644
--- a/src/lib/llm-providers.ts
+++ b/src/lib/llm-providers.ts
@@ -20,9 +20,13 @@ function parseOpenAiLine(line: string): string | null {
   if (data === "[DONE]") return null
   try {
     const parsed = JSON.parse(data) as {
-      choices: Array<{ delta: { content?: string } }>
+      choices: Array<{ delta: { content?: string; reasoning_content?: string } }>
     }
-    return parsed.choices?.[0]?.delta?.content ?? null
+    const delta = parsed.choices?.[0]?.delta
+    // Reasoning models (e.g. Nemotron, DeepSeek-R1) emit thinking tokens in
+    // reasoning_content while content is null. Fall back so the accumulated
+    // text includes both the thinking phase and the final answer.
+    return delta?.content ?? delta?.reasoning_content ?? null
   } catch {
     return null
   }
@@ -64,7 +68,11 @@ function parseGoogleLine(line: string): string | null {
 }
 
 function buildOpenAiBody(messages: ChatMessage[]): Record<string, unknown> {
-  return { messages, stream: true }
+  // max_tokens must be set explicitly: without it many local servers (LM Studio,
+  // llama.cpp) fall back to a small default (often 2 048) that is too short for
+  // multi-file wiki generation. 8 192 matches common local model limits while
+  // leaving ample room for reasoning models that spend tokens on thinking first.
+  return { messages, stream: true, max_tokens: 8192 }
 }
 
 function buildAnthropicBody(messages: ChatMessage[]): Record<string, unknown> {