
Commit 5ba1b6d

feat(chat): route Gemma via LiteLLM, add Qwen 3.6, deadline + fallback
Regolo's gemma4-31b endpoint hangs upstream: every notebook chat with the
default Gemma model just spun forever, because the AI SDK has no built-in
first-token deadline. This change:

- Adds litellmFetchWithThinkingDisabled (sibling to regoloThinkingFetch),
  injecting Ollama's `think: false` so LiteLLM-served gemma streams content
  instead of burning its entire token budget on `reasoning`.
- Re-routes the user-facing "Gemma 4" model from Regolo → LiteLLM. The old
  `gemma-regolo` ID is aliased server-side and migrated client-side
  (chatStore v6) to the new `gemma-litellm` ID.
- Adds Qwen 3.6 27B as a selectable model (already on the existing Regolo
  reasoning-stream allowlist, so no extra wiring).
- Introduces a 20s first-token deadline plus a single-step cross-provider
  fallback (gemma-litellm ↔ gpt-oss-regolo) in responseStreamingService; see
  the sketch below the commit metadata. Qwen entries intentionally have no
  `fallback` field; this preserves the Chinese-only-when-selected firewall
  (an informed-consent boundary, documented in ModelConfig).
- Fixes a pre-existing bug: getModel('litellm', modelId) ignored the modelId
  argument and always used LITELLM_DEFAULT_MODEL.

Fallback is silent on the end-user side: the server emits a `fallback` SSE
event, both runtime adapters log it to the browser console, and no UI banner
is shown.

Implementation notes:

- streamAndAccumulate / streamAndAccumulateWithReasoning now share a
  `wrapWithCompatCatch` factory and an `*OrThrow` internal layer used by
  streamWithFallback. Existing chat router callers keep the same
  null-on-failure shape and get the new deadline and empty-completion safety
  nets for free.
- A single deadline is now shared across initial-probe iterations (the
  per-call setTimeout previously granted an accidental 40s of grace).
- The reasoning streamer is split into Phase 1 (race against the deadline
  until the first text delta) and Phase 2 (drain without the race), which
  eliminates wasted Promise.race microtask hops on every reasoning chunk
  after first content; see the sketch after the notebookStreamCore.ts diff.
- Uses native AbortSignal.any() (Node 20.3+) instead of a hand-rolled
  composeAbortSignals helper.
1 parent f44d8d6 commit 5ba1b6d
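
The deadline-and-fallback logic lives in responseStreamingService.ts, which is among the 9 changed files but not in the two diffs shown below. The following is a minimal TypeScript sketch of the shape the commit message describes, not the actual implementation: `streamOnce`, `FIRST_TOKEN_DEADLINE_MS`, the import path, and the SSE payload fields are hypothetical stand-ins; only the 20s deadline, the single-step fallback via the `fallback` field, the silent `fallback` SSE event, and AbortSignal.any() are taken from the message.

```ts
// Sketch only. The real streamWithFallback wraps the streamAndAccumulate*OrThrow
// layer; here a generic `streamOnce` callback stands in for it.
import { getModelConfig } from '../agents/providers.js'; // path assumed

const FIRST_TOKEN_DEADLINE_MS = 20_000; // hypothetical constant name

async function streamWithFallbackSketch(
  primaryId: string,
  streamOnce: (modelId: string, signal: AbortSignal) => Promise<string>,
  sse: { send(event: string, data: unknown): void },
  clientSignal: AbortSignal
): Promise<string | null> {
  const attempt = async (modelId: string): Promise<string | null> => {
    // Client disconnect and the deadline are combined with native
    // AbortSignal.any() (Node 20.3+). Simplification: the real service only
    // races the deadline until the first token, then drains without it.
    const signal = AbortSignal.any([
      clientSignal,
      AbortSignal.timeout(FIRST_TOKEN_DEADLINE_MS),
    ]);
    try {
      const text = await streamOnce(modelId, signal);
      return text.length > 0 ? text : null; // empty completion counts as a failure
    } catch {
      return null; // first-token timeout or upstream HTTP error
    }
  };

  const primaryText = await attempt(primaryId);
  if (primaryText !== null) return primaryText;

  // Single-step fallback: follow `fallback` exactly once, never chase a chain.
  // Models without a `fallback` field (the Qwen entries) simply fail here.
  const fallbackId = getModelConfig(primaryId)?.fallback;
  if (!fallbackId || clientSignal.aborted) return null;

  // Payload fields assumed; logged to the browser console, no UI banner.
  sse.send('fallback', { from: primaryId, to: fallbackId });
  return attempt(fallbackId);
}
```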

9 files changed

Lines changed: 632 additions & 205 deletions


apps/api/routes/chat/agents/providers.ts

Lines changed: 51 additions & 12 deletions
@@ -7,6 +7,7 @@ import { createMistral } from '@ai-sdk/mistral';
 import { createOpenAI } from '@ai-sdk/openai';
 
 import { env } from '../../../config/env.js';
+import { litellmFetchWithThinkingDisabled } from '../../../services/ai/litellmThinkingFetch.js';
 import { isVisionCapable } from '../../../services/ai/modelDiscovery.js';
 import { regoloFetchWithThinkingDisabled } from '../../../services/ai/regoloThinkingFetch.js';
 
@@ -29,33 +30,69 @@ export { isVisionCapable };
  * Maps user-facing model IDs to provider/model configurations.
  * contextWindow is in tokens — used by downstream context management to adapt budgets.
  */
-export const AVAILABLE_MODELS: Record<
-  string,
-  { provider: 'mistral' | 'litellm' | 'regolo'; model: string; contextWindow: number }
-> = {
+export interface ModelConfig {
+  provider: 'mistral' | 'litellm' | 'regolo';
+  model: string;
+  contextWindow: number;
+  /**
+   * User-facing model ID to fall back to when this model fails to produce
+   * output (first-token timeout, empty completion, or upstream HTTP error).
+   *
+   * Chinese-trained models (Qwen) intentionally have NO fallback. The user
+   * sees an explicit "Chinesisches Modell" warning before selecting them
+   * (informed-consent boundary in chatStore.ts MODEL_OPTIONS); auto-routing
+   * either INTO or OUT OF Qwen would break that contract. Qwen failures must
+   * surface as errors so the user can choose to retry or switch manually.
+   */
+  fallback?: string;
+}
+
+export const AVAILABLE_MODELS: Record<string, ModelConfig> = {
   // 'mistral' is intentionally absent — it uses agent defaults (like 'auto')
   // Legacy IDs kept for backward compatibility (old stored client preferences)
   'mistral-large': { provider: 'mistral', model: 'mistral-large-latest', contextWindow: 128000 },
   'mistral-medium': { provider: 'mistral', model: 'mistral-medium-latest', contextWindow: 128000 },
   'pixtral-large': { provider: 'mistral', model: 'pixtral-large-latest', contextWindow: 128000 },
-  litellm: { provider: 'litellm', model: 'gpt-oss:120b', contextWindow: 16384 },
+  litellm: {
+    provider: 'litellm',
+    model: 'gpt-oss:120b',
+    contextWindow: 16384,
+    fallback: 'gpt-oss-regolo',
+  },
   regolo: {
     provider: 'regolo',
     model: env.REGOLO_DEFAULT_MODEL || 'qwen3.5-122b',
     contextWindow: 32768,
   },
-  'gpt-oss-regolo': { provider: 'regolo', model: 'gpt-oss-120b', contextWindow: 32768 },
-  'gemma-regolo': { provider: 'regolo', model: 'gemma4-31b', contextWindow: 32768 },
+  'gpt-oss-regolo': {
+    provider: 'regolo',
+    model: 'gpt-oss-120b',
+    contextWindow: 32768,
+    fallback: 'gemma-litellm',
+  },
+  // Chinese-trained models — no `fallback` field by design. See ModelConfig.
   'qwen-regolo': { provider: 'regolo', model: 'qwen3.5-122b', contextWindow: 32768 },
+  'qwen3.6-regolo': { provider: 'regolo', model: 'qwen3.6-27b', contextWindow: 32768 },
+};
+
+const GEMMA_LITELLM: ModelConfig = {
+  provider: 'litellm',
+  model: 'gpt-oss:120b',
+  contextWindow: 32768,
+  fallback: 'gpt-oss-regolo',
 };
+AVAILABLE_MODELS['gemma-litellm'] = GEMMA_LITELLM;
+// Legacy ID — old persisted client state may still send 'gemma-regolo'.
+// Aliased to the LiteLLM-served gemma so requests don't hit Regolo's broken
+// gemma4-31b endpoint. ChatStore migration upgrades the persisted ID on next
+// page load.
+AVAILABLE_MODELS['gemma-regolo'] = GEMMA_LITELLM;
 
 /**
  * Get model configuration by user-facing model ID.
  * Returns null if model ID is not recognized.
  */
-export function getModelConfig(
-  modelId: string
-): { provider: 'mistral' | 'litellm' | 'regolo'; model: string; contextWindow: number } | null {
+export function getModelConfig(modelId: string): ModelConfig | null {
   return AVAILABLE_MODELS[modelId] || null;
 }
 
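Consuming the new field is a single hop through getModelConfig. The caller below is hypothetical, but getModelConfig, AVAILABLE_MODELS, and the `fallback` field are exactly as defined above.

```ts
// One-hop fallback lookup; never walk a chain.
const primary = getModelConfig('gemma-litellm');   // has fallback: 'gpt-oss-regolo'
const next = primary?.fallback ? getModelConfig(primary.fallback) : null;

// Qwen entries opt out by omission:
getModelConfig('qwen3.6-regolo')?.fallback; // undefined → fail loudly, no auto-reroute
```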
@@ -108,6 +145,7 @@ function getLiteLLMProvider() {
       baseURL: `${baseURL}/v1`,
       apiKey: env.LITELLM_API_KEY || '',
       name: 'litellm',
+      fetch: litellmFetchWithThinkingDisabled,
     });
   }
   return litellmInstance;
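litellmThinkingFetch.ts itself is not part of the diffs shown here. Based on the commit message (inject Ollama's `think: false` into LiteLLM-bound requests, mirroring regoloThinkingFetch), a wrapper of this kind plausibly looks like the sketch below; the body-parsing details are assumptions.

```ts
// Hypothetical sketch of litellmFetchWithThinkingDisabled, not the real file.
export const litellmFetchWithThinkingDisabled: typeof fetch = async (input, init) => {
  if (init?.body && typeof init.body === 'string') {
    try {
      const payload = JSON.parse(init.body);
      // Ollama-style flag: skip the thinking phase so the model streams content
      // instead of spending its whole token budget on `reasoning`.
      payload.think = false;
      init = { ...init, body: JSON.stringify(payload) };
    } catch {
      // Non-JSON body: forward untouched.
    }
  }
  return fetch(input, init);
};
```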
@@ -169,9 +207,10 @@ export function getModel(provider: string, modelId: string): LanguageModel {
       return model;
     }
     case 'litellm': {
-      console.log(`[providers] Creating LiteLLM model with default: ${LITELLM_DEFAULT_MODEL}`);
+      const resolvedModel = modelId || LITELLM_DEFAULT_MODEL;
+      console.log(`[providers] Creating LiteLLM model: ${resolvedModel}`);
       const litellm = getLiteLLMProvider();
-      const model = litellm.chat(LITELLM_DEFAULT_MODEL);
+      const model = litellm.chat(resolvedModel);
       console.log(`[providers] LiteLLM model created successfully`);
       return model;
     }
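With the fix, the second argument to getModel is finally honored for the litellm provider; LITELLM_DEFAULT_MODEL only applies when the caller passes an empty model name (the model ID below is illustrative).

```ts
getModel('litellm', 'gpt-oss:120b'); // now calls litellm.chat('gpt-oss:120b')
getModel('litellm', '');             // still resolves to LITELLM_DEFAULT_MODEL
```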

apps/api/routes/chat/notebookStreamCore.ts

Lines changed: 33 additions & 121 deletions
@@ -4,7 +4,7 @@
  * notebook controller and the public Gruen-O-Mat controller.
  */
 
-import { streamText, type ModelMessage } from 'ai';
+import { type ModelMessage } from 'ai';
 
 import {
   buildConcisePromptGrundsatz,
@@ -15,10 +15,7 @@ import {
   getSystemCollectionConfig,
 } from '../../config/systemCollectionsConfig.js';
 import { NotebookQdrantHelper } from '../../database/services/NotebookQdrantHelper.js';
-import {
-  isRegoloReasoningModel,
-  streamRegoloWithReasoning,
-} from '../../services/ai/regoloReasoningStream.js';
+import { isRegoloReasoningModel } from '../../services/ai/regoloReasoningStream.js';
 import { notebookQAService } from '../../services/notebook/index.js';
 import { rerankNotebookResults } from '../../services/notebook/rerankNotebookResults.js';
 import {
@@ -30,7 +27,11 @@ import { createLogger } from '../../utils/logger.js';
 import { containsPromptLeakage } from '../gruenomat/topicGuard.js';
 
 import { isProviderConfigured } from './agents/providers.js';
-import { resolveModel } from './services/responseStreamingService.js';
+import {
+  resolveModel,
+  streamForResolution,
+  streamWithFallback,
+} from './services/responseStreamingService.js';
 import { SSEWriter } from './services/sseHelpers.js';
 
 import type { SearchContext } from '../../services/notebook/types.js';
@@ -253,14 +254,10 @@ export async function handleNotebookStream(
 
   // Determine AI provider and model (same resolution as chat — handles model ID → real name)
   const defaultAgentConfig = { provider: DEFAULT_PROVIDER, model: DEFAULT_MODEL };
-  const {
-    model: aiModel,
-    provider: resolvedProvider,
-    modelName: resolvedModelName,
-  } = resolveModel(defaultAgentConfig, model);
-
-  if (!isProviderConfigured(resolvedProvider)) {
-    sse.send('error', { error: `Provider "${resolvedProvider}" is not configured` });
+  const primaryResolution = resolveModel(defaultAgentConfig, model);
+
+  if (!isProviderConfigured(primaryResolution.provider)) {
+    sse.send('error', { error: `Provider "${primaryResolution.provider}" is not configured` });
     sse.end();
     return null;
   }
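For orientation, the resolution object handed to streamWithFallback keeps the three fields the old destructuring pulled out. The shape below is inferred from the removed code, not from responseStreamingService's actual export.

```ts
import type { LanguageModel } from 'ai';

// Inferred sketch of what resolveModel() returns.
interface ModelResolutionSketch {
  model: LanguageModel; // AI SDK model instance (previously bound as `aiModel`)
  provider: string;     // 'mistral' | 'litellm' | 'regolo'
  modelName: string;    // resolved provider-side model name
}
```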
@@ -281,127 +278,42 @@ export async function handleNotebookStream(
   const t2 = Date.now();
   log.debug(`⏱ Model setup: ${t2 - t1}ms`);
 
-  const useReasoningStream = isRegoloReasoningModel(resolvedProvider, resolvedModelName);
-  // Reasoning models spend most of their budget on the <think> block before
-  // emitting the answer. Triple the ceiling so there's room for both phases.
+  // Reasoning models need extra room for the <think> block before content.
   const baseMaxOutput = isFast ? 3000 : 16000;
-  const maxOutputTokens = useReasoningStream ? Math.max(baseMaxOutput, 9000) : baseMaxOutput;
 
   sse.send('response_start', { message: 'Generiere Antwort...' });
 
-  let fullText = '';
-  let firstChunkTime: number | undefined;
-
-  try {
-    if (useReasoningStream) {
-      for await (const chunk of streamRegoloWithReasoning({
-        model: resolvedModelName,
+  const fullText = await streamWithFallback({
+    primary: primaryResolution,
+    sse,
+    logPrefix: '[Notebook]',
+    buildStream: async (resolution) => {
+      const isReasoning = isRegoloReasoningModel(resolution.provider, resolution.modelName);
+      return streamForResolution({
+        resolution,
         messages: aiMessages,
-        maxTokens: maxOutputTokens,
+        maxTokens: isReasoning ? Math.max(baseMaxOutput, 9000) : baseMaxOutput,
         temperature: 0.2,
+        sse,
         signal: abortController.signal,
-      })) {
-        if (abortController.signal.aborted) break;
-        if (!firstChunkTime) {
-          firstChunkTime = Date.now();
-          log.debug(`⏱ First token latency: ${firstChunkTime - t2}ms`);
-        }
-        if (chunk.type === 'text') {
-          fullText += chunk.delta;
-          sse.send('text_delta', { text: chunk.delta });
-        } else {
-          sse.send('reasoning_delta', { text: chunk.delta });
-        }
-      }
-    } else {
-      const result = streamText({
-        model: aiModel,
-        messages: aiMessages,
-        maxOutputTokens,
-        temperature: 0.2,
-        abortSignal: abortController.signal,
+        logPrefix: '[Notebook]',
       });
-      for await (const chunk of result.textStream) {
-        if (abortController.signal.aborted) break;
-        if (!firstChunkTime) {
-          firstChunkTime = Date.now();
-          log.debug(`⏱ First token latency: ${firstChunkTime - t2}ms`);
-        }
-        fullText += chunk;
-        sse.send('text_delta', { text: chunk });
-      }
-    }
-  } catch (streamError: unknown) {
-    if (abortController.signal.aborted) {
-      log.debug('Notebook stream aborted by client disconnect');
-      log.debug(`⏱ Total (aborted): ${Date.now() - t0}ms, ${fullText.length} chars`);
-      sse.end();
-      return null;
-    }
-    const streamErrMsg = streamError instanceof Error ? streamError.message : String(streamError);
-    const t4err = Date.now();
-    log.warn('Stream error (accumulated %d chars): %s', fullText.length, streamErrMsg);
-    log.debug(
-      `⏱ Streaming (error): ${t4err - (firstChunkTime || t2)}ms, ${fullText.length} chars`
-    );
+    },
+  });
 
-    if (fullText.length > 0) {
-      try {
-        const { renumberedDraft, newReferencesMap } = renumberCitationsInOrder(
-          fullText,
-          searchContext.referencesMap
-        );
-        const { cleanDraft, citations, sources } = validateAndInjectCitations(
-          renumberedDraft,
-          newReferencesMap
-        );
-        const allSources = searchContext.sortedResults
-          .filter((_, i) => !citations.some((c) => c.index === String(i + 1)))
-          .slice(0, 10);
-
-        let sourcesByCollection: SourcesByCollection | undefined;
-        if (searchContext.isMulti && searchContext.effectiveCollectionIds) {
-          const collectionsConfig: { [collectionId: string]: CollectionConfig } = {};
-          for (const id of searchContext.effectiveCollectionIds) {
-            const config = SYSTEM_COLLECTIONS[id];
-            if (config) collectionsConfig[id] = { name: config.name };
-          }
-          sourcesByCollection = groupSourcesByCollection(
-            citations,
-            searchContext.sortedResults,
-            collectionsConfig
-          );
-        }
+  if (fullText === null) {
+    log.debug(`⏱ Total (stream failed): ${Date.now() - t0}ms`);
+    return null;
+  }
 
-        sse.send('completion', {
-          answer: cleanDraft,
-          citations,
-          sources,
-          allSources,
-          ...(sourcesByCollection && { sourcesByCollection }),
-          metadata: {
-            isMulti: searchContext.isMulti,
-            collectionName: searchContext.collectionName,
-            effectiveCollectionIds: searchContext.effectiveCollectionIds,
-            totalResults: searchContext.sortedResults.length,
-            citationsCount: citations.length,
-            partial: true,
-          },
-        });
-      } catch (citationError: unknown) {
-        log.error('Failed to process partial citations:', citationError);
-        sse.send('error', { error: streamErrMsg || 'Stream interrupted' });
-      }
-    } else {
-      sse.send('error', { error: streamErrMsg || 'Stream interrupted' });
-    }
-    log.debug(`⏱ Total (error path): ${Date.now() - t0}ms`);
+  if (abortController.signal.aborted) {
+    log.debug('Notebook stream aborted by client disconnect');
     sse.end();
     return null;
   }
 
   const t4 = Date.now();
-  log.debug(`⏱ Streaming: ${t4 - (firstChunkTime || t2)}ms, ${fullText.length} chars`);
+  log.debug(`⏱ Streaming: ${t4 - t2}ms, ${fullText.length} chars`);
 
   // Layer 5: Output leakage detection — check if the LLM leaked system prompt fragments
   if (options.systemPromptOverride && containsPromptLeakage(fullText)) {
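The implementation notes describe the reasoning streamer behind streamForResolution as a two-phase loop: Phase 1 races each read against the shared first-token deadline until the first text delta, Phase 2 then drains without any Promise.race overhead. Below is a hedged sketch of that structure; the chunk type and the deadline wiring are assumptions, only the phase split itself comes from the commit notes.

```ts
// Sketch of the Phase-1 / Phase-2 split; not the actual service code.
type ReasoningChunk = { type: 'text' | 'reasoning'; delta: string };

async function* raceUntilFirstText(
  source: AsyncIterator<ReasoningChunk>,
  deadlineMs: number
): AsyncGenerator<ReasoningChunk> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const deadline = new Promise<never>((_, reject) => {
    timer = setTimeout(() => reject(new Error('first-token deadline exceeded')), deadlineMs);
  });

  try {
    // Phase 1: every read races the shared deadline until the first text delta arrives.
    while (true) {
      const result = await Promise.race([source.next(), deadline]);
      if (result.done) return;
      yield result.value;
      if (result.value.type === 'text') break; // content has started flowing
    }
  } finally {
    clearTimeout(timer); // one shared deadline, armed once, cleared once
  }

  // Phase 2: plain drain, no per-chunk Promise.race microtask hop.
  while (true) {
    const result = await source.next();
    if (result.done) return;
    yield result.value;
  }
}
```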
@@ -466,7 +378,7 @@ export async function handleNotebookStream(
 
   const t6 = Date.now();
   log.debug(
-    `⏱ Total: ${t6 - t0}ms [${isFast ? 'fast' : 'deep'}] (search=${t1 - t0}, setup=${t2 - t1}, ttft=${(firstChunkTime || t2) - t2}, stream=${t4 - (firstChunkTime || t2)}, cite=${t5 - t4})`
+    `⏱ Total: ${t6 - t0}ms [${isFast ? 'fast' : 'deep'}] (search=${t1 - t0}, setup=${t2 - t1}, stream=${t4 - t2}, cite=${t5 - t4})`
   );
   sse.end();
472384
