From d978023b29b019ce66b615d6a4262083206b5136 Mon Sep 17 00:00:00 2001
From: Exergenion
Date: Tue, 21 Apr 2026 11:35:57 +0200
Subject: [PATCH] =?UTF-8?q?fix:=20local=20LLM=20ingest=20=E2=80=94=20add?=
 =?UTF-8?q?=20max=5Ftokens=20and=20handle=20reasoning=5Fcontent?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two bugs prevented analysis from working with local OpenAI-compatible
endpoints (LM Studio, llama.cpp):

1. buildOpenAiBody() sent no max_tokens. Many local servers default to
   ~2 048 output tokens, which is too short to emit all FILE blocks for
   a typical wiki ingest. Set an explicit 8 192 token limit that covers
   multi-file generation while staying within common local model limits.

2. parseOpenAiLine() only read delta.content and returned null while a
   reasoning model (Nemotron, DeepSeek-R1) was in its thinking phase
   (tokens appear in delta.reasoning_content, not delta.content). The
   accumulated text was therefore empty and no FILE blocks were found,
   producing "(Analysis not available)" for every ingest attempt. Fall
   back to reasoning_content so thinking-phase tokens are included.

Fixes: #41, #26
---
 src/lib/llm-providers.ts | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/lib/llm-providers.ts b/src/lib/llm-providers.ts
index 23702aee..0653e84e 100644
--- a/src/lib/llm-providers.ts
+++ b/src/lib/llm-providers.ts
@@ -20,9 +20,13 @@ function parseOpenAiLine(line: string): string | null {
   if (data === "[DONE]") return null
   try {
     const parsed = JSON.parse(data) as {
-      choices: Array<{ delta: { content?: string } }>
+      choices: Array<{ delta: { content?: string; reasoning_content?: string } }>
     }
-    return parsed.choices?.[0]?.delta?.content ?? null
+    const delta = parsed.choices?.[0]?.delta
+    // Reasoning models (e.g. Nemotron, DeepSeek-R1) emit thinking tokens in
+    // reasoning_content while content is null. Fall back so the accumulated
+    // text includes both the thinking phase and the final answer.
+    return delta?.content ?? delta?.reasoning_content ?? null
   } catch {
     return null
   }
@@ -64,7 +68,11 @@ function parseGoogleLine(line: string): string | null {
 }
 
 function buildOpenAiBody(messages: ChatMessage[]): Record<string, unknown> {
-  return { messages, stream: true }
+  // max_tokens must be set explicitly: without it many local servers (LM Studio,
+  // llama.cpp) fall back to a small default (often 2 048) that is too short for
+  // multi-file wiki generation. 8 192 matches common local model limits while
+  // leaving ample room for reasoning models that spend tokens on thinking first.
+  return { messages, stream: true, max_tokens: 8192 }
 }
 
 function buildAnthropicBody(messages: ChatMessage[]): Record<string, unknown> {