netzbegruenung · Movm · Jan 30, 2026 · Jan 30, 2026 · Jan 30, 2026 · Jan 30, 2026
diff --git a/.github/workflows/build-images.yml b/.github/workflows/build-images.yml
@@ -62,6 +62,8 @@ jobs:
       shared: ${{ steps.filter.outputs.shared }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
 
       - uses: dorny/paths-filter@v3
         id: filter

diff --git a/.gitignore b/.gitignore
@@ -135,6 +135,9 @@ typecheck_output.txt
 .turbo/
 **/.turbo/
 
+# Remotion build output
+apps/api/services/remotion/out/
+
 # ComfyUI (local-only, not deployed via CI)
 apps/api/services/comfyui/
 services/comfyui/

diff --git a/apps/api/config/keycloakOIDCStrategy.ts b/apps/api/config/keycloakOIDCStrategy.ts
@@ -8,6 +8,7 @@ import {
 } from 'openid-client';
 
 import { isAllowedDomain, buildDomainUrl, URLS } from '../utils/domainUtils.js';
+import { storeOIDCState, consumeOIDCState } from '../utils/redis/OIDCStateStore.js';
 
 import type { Request } from 'express';
 import type { Strategy } from 'passport';
@@ -32,7 +33,7 @@ export interface PassportProfile {
 /**
  * OIDC session data stored in express-session
  */
-interface OIDCSessionData {
+export interface OIDCSessionData {
   state: string;
   redirectTo: string | null;
   originDomain: string | null;
@@ -228,6 +229,16 @@ class KeycloakOIDCStrategy extends (class {} as any as typeof Strategy) {
           throw new Error('Session data verification failed - data not found after save');
         }
 
+        // Store state in Redis as fallback for privacy browsers that block cookies
+        try {
+          await storeOIDCState(state, req.session['oidc:keycloak']!);
+        } catch (redisErr) {
+          console.warn(
+            `[KeycloakOIDC:${correlationId}] Redis state store failed (cookie flow still works):`,
+            redisErr
+          );
+        }
+
         this.redirect(authUrl.href);
       } catch (saveError) {
         console.error(`[KeycloakOIDC:${correlationId}] Failed to save session:`, saveError);
@@ -258,7 +269,30 @@ class KeycloakOIDCStrategy extends (class {} as any as typeof Strategy) {
         console.log(`[KeycloakOIDC:callback] gruenerator.sid cookie present: ${hasOurCookie}`);
       }
 
-      const sessionData = req.session['oidc:keycloak'];
+      let sessionData = req.session['oidc:keycloak'];
+
+      // Fallback: if session cookie was blocked (privacy browsers), recover from Redis
+      if (!sessionData) {
+        const stateParam = req.query.state as string | undefined;
+        if (stateParam) {
+          console.warn(
+            `[KeycloakOIDC:callback] Session cookie missing, attempting Redis state fallback`
+          );
+          try {
+            const redisData = await consumeOIDCState(stateParam);
+            if (redisData) {
+              console.log(
+                `[KeycloakOIDC:${redisData.correlationId}] Redis state fallback successful`
+              );
+              sessionData = redisData;
+              req.session['oidc:keycloak'] = sessionData;
+            }
+          } catch (redisErr) {
+            console.error('[KeycloakOIDC:callback] Redis state fallback failed:', redisErr);
+          }
+        }
+      }
+
       const correlationId = sessionData?.correlationId || 'unknown';
 
       if (!sessionData) {

diff --git a/apps/api/services/OcrService/OcrService.ts b/apps/api/services/OcrService/OcrService.ts
@@ -4,15 +4,26 @@
  * Handles document processing, embedding generation, and database updates
  */
 
+import { createRequire } from 'module';
 import path from 'path';
-import { mistralEmbeddingService } from '../mistral/index.js';
-import { smartChunkDocument } from '../document-services/index.js';
+
+import { vectorConfig } from '../../config/vectorConfig.js';
 import { getPostgresInstance } from '../../database/services/PostgresService.js';
 import { getQdrantInstance } from '../../database/services/QdrantService.js';
-import { vectorConfig } from '../../config/vectorConfig.js';
+import { smartChunkDocument } from '../document-services/index.js';
+import { mistralEmbeddingService } from '../mistral/index.js';
 
 // Import module functions
-import { validateDocumentLimits as validateLimits, getMediaType } from './validation.js';
+import {
+  updateDocumentStatus as updateStatus,
+  updateDocumentWithResults as updateResults,
+  generateAndStoreEmbeddings as generateEmbeddings,
+} from './databaseOperations.js';
+import {
+  extractTextWithDocling as extractDocling,
+  isDoclingAvailable as checkDocling,
+} from './doclingIntegration.js';
+import { extractTextWithMistralOCR as extractMistral } from './mistralIntegration.js';
 import {
   getPdfJs as loadPdfJs,
   openPdfDocument as openPdf,
@@ -22,17 +33,12 @@ import {
   extractPageTextDirectly as extractPage,
   extractTextFromBase64PDF as extractBase64,
 } from './pdfOperations.js';
-import { extractTextWithMistralOCR as extractMistral } from './mistralIntegration.js';
 import {
   applyMarkdownFormatting as formatMarkdown,
   isLikelyHeading,
   determineHeadingLevel,
 } from './textFormatting.js';
-import {
-  updateDocumentStatus as updateStatus,
-  updateDocumentWithResults as updateResults,
-  generateAndStoreEmbeddings as generateEmbeddings,
-} from './databaseOperations.js';
+import { validateDocumentLimits as validateLimits, getMediaType } from './validation.js';
 
 import type {
   DocumentLimits,
@@ -71,11 +77,10 @@ export class OCRService {
 
     const pdfjsLib = await loadPdfJs();
 
-    // Configure worker path
-    const workerPath = path.resolve(
-      path.dirname(new URL(import.meta.url).pathname),
-      '../../node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs'
-    );
+    // Configure worker path — use createRequire to resolve from the actual
+    // installed location, which may be hoisted to the monorepo root.
+    const require = createRequire(import.meta.url);
+    const workerPath = require.resolve('pdfjs-dist/legacy/build/pdf.worker.mjs');
     pdfjsLib.GlobalWorkerOptions.workerSrc = `file://${workerPath}`;
 
     this._pdfjsLib = pdfjsLib;
@@ -150,13 +155,16 @@ export class OCRService {
   }
 
   /**
-   * Extract text from documents using Mistral OCR exclusively
+   * Extract text from documents using the configured OCR provider.
+   * Provider is controlled by OCR_PROVIDER env var: 'mistral' (default) or 'docling'.
+   * When set to 'docling', falls back to Mistral if the Docling sidecar is unreachable.
    */
   async extractTextFromDocument(filePath: string): Promise<DocumentExtractionResult> {
     const startTime = Date.now();
     const fileExtension = path.extname(filePath).toLowerCase();
+    const configuredProvider = (process.env.OCR_PROVIDER || 'mistral').toLowerCase();
     console.log(
-      `[OCRService] Starting document text extraction with Mistral OCR: ${filePath} (${fileExtension})`
+      `[OCRService] Starting document text extraction (provider=${configuredProvider}): ${filePath} (${fileExtension})`
     );
 
     try {
@@ -169,23 +177,48 @@ export class OCRService {
         parseCheck = await this.canExtractTextDirectly(filePath);
       }
 
-      // Always use Mistral OCR - supports PDF, DOCX, PPTX, images
-      console.log(`[OCRService] Using Mistral OCR for ${fileExtension} document`);
-      const result = await this.extractTextWithMistralOCR(filePath);
-      const totalTime = Date.now() - startTime;
+      let result: ExtractionResult;
+      let usedProvider: string;
+
+      if (configuredProvider === 'docling') {
+        // Try Docling first, fall back to Mistral if unavailable or on error
+        const doclingReady = await checkDocling();
+        if (doclingReady) {
+          try {
+            console.log(`[OCRService] Using Docling for ${fileExtension} document`);
+            result = await this.extractTextWithDocling(filePath);
+            usedProvider = 'docling';
+          } catch (doclingError) {
+            console.warn(`[OCRService] Docling failed, falling back to Mistral OCR:`, doclingError);
+            result = await this.extractTextWithMistralOCR(filePath);
+            usedProvider = 'mistral-ocr';
+          }
+        } else {
+          console.log(
+            `[OCRService] Docling unavailable, falling back to Mistral OCR for ${fileExtension} document`
+          );
+          result = await this.extractTextWithMistralOCR(filePath);
+          usedProvider = 'mistral-ocr';
+        }
+      } else {
+        console.log(`[OCRService] Using Mistral OCR for ${fileExtension} document`);
+        result = await this.extractTextWithMistralOCR(filePath);
+        usedProvider = 'mistral-ocr';
+      }
 
-      console.log(`[OCRService] Mistral OCR completed successfully in ${totalTime}ms`);
+      const totalTime = Date.now() - startTime;
+      console.log(`[OCRService] ${usedProvider} completed successfully in ${totalTime}ms`);
 
       return {
         ...result,
-        extractionMethod: 'mistral-ocr',
+        extractionMethod: usedProvider,
         fileType: fileExtension,
         parseabilityStats: parseCheck?.stats || null,
         totalProcessingTimeMs: totalTime,
       };
     } catch (error) {
       const totalTime = Date.now() - startTime;
-      console.error(`[OCRService] Mistral OCR extraction failed after ${totalTime}ms:`, error);
+      console.error(`[OCRService] OCR extraction failed after ${totalTime}ms:`, error);
       throw error;
     }
   }
@@ -197,6 +230,13 @@ export class OCRService {
     return await extractMistral(filePath, getMediaType);
   }
 
+  /**
+   * Use Docling-Serve sidecar to extract text as markdown.
+   *
+   * Thin wrapper: forwards `filePath` to `extractTextWithDocling` from
+   * ./doclingIntegration.js (imported in this file as `extractDocling`) and
+   * returns its result unchanged. Any error thrown there propagates to the caller.
+   */
+  async extractTextWithDocling(filePath: string): Promise<ExtractionResult> {
+    return await extractDocling(filePath);
+  }
+
   /**
    * Check if PDF text can be extracted directly
    */

diff --git a/apps/api/services/OcrService/doclingIntegration.ts b/apps/api/services/OcrService/doclingIntegration.ts
@@ -0,0 +1,117 @@
+/**
+ * Docling-Serve integration
+ * Calls the self-hosted docling-serve sidecar container for document-to-markdown conversion.
+ * See: https://github.com/docling-project/docling-serve
+ */
+
+import { promises as fs } from 'fs';
+import path from 'path';
+
+import type { ExtractionResult } from './types.js';
+
+// Base URL of the docling-serve sidecar (override with DOCLING_URL). Trailing
+// slashes are stripped so appending an endpoint path such as `/health` never
+// produces a double slash.
+const DOCLING_BASE_URL = (process.env.DOCLING_URL || 'http://ocr:5001').replace(/\/+$/, '');
+
+/**
+ * Extract text from a document using the Docling-Serve sidecar.
+ *
+ * Reads the whole file into memory, uploads it as multipart/form-data to
+ * `/v1/convert/file`, and joins the Markdown of every returned document
+ * (Markdown matches the format Mistral OCR produces).
+ *
+ * @param filePath - Path of the document to convert.
+ * @returns Extracted Markdown, page count, and timing stats. `confidence` is a
+ *   fixed 0.9; it is not derived from the response.
+ * @throws Error with a `Docling extraction failed: …` message when the file
+ *   cannot be read, the request fails or returns a non-2xx status, or the
+ *   response contains no text.
+ */
+export async function extractTextWithDocling(filePath: string): Promise<ExtractionResult> {
+  const startTime = Date.now();
+
+  try {
+    console.log(`[DoclingOCR] Starting extraction:`, { filePath });
+
+    const fileBuffer = await fs.readFile(filePath);
+    const fileName = path.basename(filePath);
+
+    // Build multipart form with the file and conversion options
+    const formData = new FormData();
+    formData.append('files', new Blob([fileBuffer]), fileName);
+
+    // docling-serve reads conversion options as individual form fields (list
+    // options such as to_formats are sent as repeated fields), not as a JSON
+    // part. Request markdown output to match the existing pipeline format and
+    // image placeholders so the markdown does not carry inlined image data.
+    formData.append('to_formats', 'md');
+    formData.append('image_export_mode', 'placeholder');
+    formData.append('do_ocr', 'true');
+    formData.append('force_ocr', 'false');
+
+    console.log(
+      `[DoclingOCR] Sending to ${DOCLING_BASE_URL}/v1/convert/file (${fileBuffer.length} bytes)`
+    );
+
+    const response = await fetch(`${DOCLING_BASE_URL}/v1/convert/file`, {
+      method: 'POST',
+      body: formData,
+    });
+
+    if (!response.ok) {
+      const errorText = await response.text().catch(() => 'unknown');
+      throw new Error(`Docling API returned ${response.status}: ${errorText}`);
+    }
+
+    const result = await response.json();
+
+    // docling-serve returns { document: { md_content, filename, ... }, status, processing_time }
+    const documents = result?.document ?? result?.documents ?? [result];
+    const markdownParts: string[] = [];
+    let totalPages = 0;
+
+    for (const doc of Array.isArray(documents) ? documents : [documents]) {
+      // The markdown content can be in different fields depending on the version.
+      // Only accept strings so an unexpected payload shape cannot crash on .trim().
+      const md: unknown = doc?.md_content ?? doc?.markdown ?? doc?.md ?? doc?.text;
+      const trimmed = typeof md === 'string' ? md.trim() : '';
+      if (trimmed) {
+        markdownParts.push(trimmed);
+      }
+      // Count one page per document when no numeric page count is reported.
+      const pages: unknown = doc?.num_pages ?? doc?.page_count;
+      totalPages += typeof pages === 'number' ? pages : 1;
+    }
+
+    if (markdownParts.length === 0) {
+      throw new Error('Docling returned no text content');
+    }
+
+    // Parts are already trimmed and non-empty, so the joined text needs no further trim.
+    const allText = markdownParts.join('\n\n---\n\n');
+
+    const processingTimeMs = Date.now() - startTime;
+    console.log(
+      `[DoclingOCR] Extraction completed in ${processingTimeMs}ms: ${totalPages} pages, ${allText.length} characters`
+    );
+
+    return {
+      text: allText,
+      pageCount: totalPages,
+      method: 'docling',
+      confidence: 0.9,
+      stats: {
+        pages: totalPages,
+        successfulPages: totalPages,
+        processingTimeMs,
+        method: 'docling-serve',
+      },
+    };
+  } catch (error: unknown) {
+    const elapsed = Date.now() - startTime;
+    // Narrow before reading .message so non-Error throwables still yield a useful message.
+    const errorMessage = error instanceof Error ? error.message : String(error);
+    console.error(`[DoclingOCR] Extraction FAILED after ${elapsed}ms:`, {
+      errorMessage,
+      errorType: error instanceof Error ? error.constructor.name : typeof error,
+      filePath,
+    });
+    throw new Error(`Docling extraction failed: ${errorMessage}`);
+  }
+}
+
+/**
+ * Probe the Docling-Serve sidecar's `/health` endpoint.
+ *
+ * @returns `true` when the endpoint answers with a 2xx status within 5 seconds;
+ *   `false` on any other status, timeout, or network error (never throws).
+ */
+export async function isDoclingAvailable(): Promise<boolean> {
+  const healthUrl = `${DOCLING_BASE_URL}/health`;
+  let healthy = false;
+  try {
+    const { ok } = await fetch(healthUrl, { signal: AbortSignal.timeout(5000) });
+    healthy = ok;
+  } catch {
+    // Unreachable, timed out, or aborted: report the sidecar as unavailable.
+  }
+  return healthy;
+}
diff --git a/apps/api/services/OcrService/types.ts b/apps/api/services/OcrService/types.ts
@@ -26,7 +26,7 @@ export interface ParseabilityCheck {
 export interface ExtractionResult {
   text: string;
   pageCount: number;
-  method: 'mistral-ocr' | 'direct' | 'pdfjs-dist';
+  method: 'mistral-ocr' | 'docling' | 'direct' | 'pdfjs-dist';
   confidence?: number;
   stats?: {
     pages?: number;