Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/build-images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ jobs:
shared: ${{ steps.filter.outputs.shared }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 2

- uses: dorny/paths-filter@v3
id: filter
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,9 @@ typecheck_output.txt
.turbo/
**/.turbo/

# Remotion build output
apps/api/services/remotion/out/

# ComfyUI (local-only, not deployed via CI)
apps/api/services/comfyui/
services/comfyui/
Expand Down
38 changes: 36 additions & 2 deletions apps/api/config/keycloakOIDCStrategy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import {
} from 'openid-client';

import { isAllowedDomain, buildDomainUrl, URLS } from '../utils/domainUtils.js';
import { storeOIDCState, consumeOIDCState } from '../utils/redis/OIDCStateStore.js';

import type { Request } from 'express';
import type { Strategy } from 'passport';
Expand All @@ -32,7 +33,7 @@ export interface PassportProfile {
/**
* OIDC session data stored in express-session
*/
interface OIDCSessionData {
export interface OIDCSessionData {
state: string;
redirectTo: string | null;
originDomain: string | null;
Expand Down Expand Up @@ -228,6 +229,16 @@ class KeycloakOIDCStrategy extends (class {} as any as typeof Strategy) {
throw new Error('Session data verification failed - data not found after save');
}

// Store state in Redis as fallback for privacy browsers that block cookies
try {
await storeOIDCState(state, req.session['oidc:keycloak']!);
} catch (redisErr) {
console.warn(
`[KeycloakOIDC:${correlationId}] Redis state store failed (cookie flow still works):`,
redisErr
);
}

this.redirect(authUrl.href);
} catch (saveError) {
console.error(`[KeycloakOIDC:${correlationId}] Failed to save session:`, saveError);
Expand Down Expand Up @@ -258,7 +269,30 @@ class KeycloakOIDCStrategy extends (class {} as any as typeof Strategy) {
console.log(`[KeycloakOIDC:callback] gruenerator.sid cookie present: ${hasOurCookie}`);
}

const sessionData = req.session['oidc:keycloak'];
let sessionData = req.session['oidc:keycloak'];

// Fallback: if session cookie was blocked (privacy browsers), recover from Redis
if (!sessionData) {
const stateParam = req.query.state as string | undefined;
if (stateParam) {
console.warn(
`[KeycloakOIDC:callback] Session cookie missing, attempting Redis state fallback`
);
try {
const redisData = await consumeOIDCState(stateParam);
if (redisData) {
console.log(
`[KeycloakOIDC:${redisData.correlationId}] Redis state fallback successful`
);
sessionData = redisData;
req.session['oidc:keycloak'] = sessionData;
}
} catch (redisErr) {
console.error('[KeycloakOIDC:callback] Redis state fallback failed:', redisErr);
}
}
}

const correlationId = sessionData?.correlationId || 'unknown';

if (!sessionData) {
Expand Down
88 changes: 64 additions & 24 deletions apps/api/services/OcrService/OcrService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,26 @@
* Handles document processing, embedding generation, and database updates
*/

import { createRequire } from 'module';
import path from 'path';
import { mistralEmbeddingService } from '../mistral/index.js';
import { smartChunkDocument } from '../document-services/index.js';

import { vectorConfig } from '../../config/vectorConfig.js';
import { getPostgresInstance } from '../../database/services/PostgresService.js';
import { getQdrantInstance } from '../../database/services/QdrantService.js';
import { vectorConfig } from '../../config/vectorConfig.js';
import { smartChunkDocument } from '../document-services/index.js';
import { mistralEmbeddingService } from '../mistral/index.js';

// Import module functions
import { validateDocumentLimits as validateLimits, getMediaType } from './validation.js';
import {
updateDocumentStatus as updateStatus,
updateDocumentWithResults as updateResults,
generateAndStoreEmbeddings as generateEmbeddings,
} from './databaseOperations.js';
import {
extractTextWithDocling as extractDocling,
isDoclingAvailable as checkDocling,
} from './doclingIntegration.js';
import { extractTextWithMistralOCR as extractMistral } from './mistralIntegration.js';
import {
getPdfJs as loadPdfJs,
openPdfDocument as openPdf,
Expand All @@ -22,17 +33,12 @@ import {
extractPageTextDirectly as extractPage,
extractTextFromBase64PDF as extractBase64,
} from './pdfOperations.js';
import { extractTextWithMistralOCR as extractMistral } from './mistralIntegration.js';
import {
applyMarkdownFormatting as formatMarkdown,
isLikelyHeading,
determineHeadingLevel,
} from './textFormatting.js';
import {
updateDocumentStatus as updateStatus,
updateDocumentWithResults as updateResults,
generateAndStoreEmbeddings as generateEmbeddings,
} from './databaseOperations.js';
import { validateDocumentLimits as validateLimits, getMediaType } from './validation.js';

import type {
DocumentLimits,
Expand Down Expand Up @@ -71,11 +77,10 @@ export class OCRService {

const pdfjsLib = await loadPdfJs();

// Configure worker path
const workerPath = path.resolve(
path.dirname(new URL(import.meta.url).pathname),
'../../node_modules/pdfjs-dist/legacy/build/pdf.worker.mjs'
);
// Configure worker path — use createRequire to resolve from the actual
// installed location, which may be hoisted to the monorepo root.
const require = createRequire(import.meta.url);
const workerPath = require.resolve('pdfjs-dist/legacy/build/pdf.worker.mjs');
pdfjsLib.GlobalWorkerOptions.workerSrc = `file://${workerPath}`;

this._pdfjsLib = pdfjsLib;
Expand Down Expand Up @@ -150,13 +155,16 @@ export class OCRService {
}

/**
* Extract text from documents using Mistral OCR exclusively
* Extract text from documents using the configured OCR provider.
* Provider is controlled by OCR_PROVIDER env var: 'mistral' (default) or 'docling'.
* When set to 'docling', falls back to Mistral if the Docling sidecar is unreachable.
*/
async extractTextFromDocument(filePath: string): Promise<DocumentExtractionResult> {
const startTime = Date.now();
const fileExtension = path.extname(filePath).toLowerCase();
const configuredProvider = (process.env.OCR_PROVIDER || 'mistral').toLowerCase();
console.log(
`[OCRService] Starting document text extraction with Mistral OCR: ${filePath} (${fileExtension})`
`[OCRService] Starting document text extraction (provider=${configuredProvider}): ${filePath} (${fileExtension})`
);

try {
Expand All @@ -169,23 +177,48 @@ export class OCRService {
parseCheck = await this.canExtractTextDirectly(filePath);
}

// Always use Mistral OCR - supports PDF, DOCX, PPTX, images
console.log(`[OCRService] Using Mistral OCR for ${fileExtension} document`);
const result = await this.extractTextWithMistralOCR(filePath);
const totalTime = Date.now() - startTime;
let result: ExtractionResult;
let usedProvider: string;

if (configuredProvider === 'docling') {
// Try Docling first, fall back to Mistral if unavailable or on error
const doclingReady = await checkDocling();
if (doclingReady) {
try {
console.log(`[OCRService] Using Docling for ${fileExtension} document`);
result = await this.extractTextWithDocling(filePath);
usedProvider = 'docling';
} catch (doclingError) {
console.warn(`[OCRService] Docling failed, falling back to Mistral OCR:`, doclingError);
result = await this.extractTextWithMistralOCR(filePath);
usedProvider = 'mistral-ocr';
}
} else {
console.log(
`[OCRService] Docling unavailable, falling back to Mistral OCR for ${fileExtension} document`
);
result = await this.extractTextWithMistralOCR(filePath);
usedProvider = 'mistral-ocr';
}
} else {
console.log(`[OCRService] Using Mistral OCR for ${fileExtension} document`);
result = await this.extractTextWithMistralOCR(filePath);
usedProvider = 'mistral-ocr';
}

console.log(`[OCRService] Mistral OCR completed successfully in ${totalTime}ms`);
const totalTime = Date.now() - startTime;
console.log(`[OCRService] ${usedProvider} completed successfully in ${totalTime}ms`);

return {
...result,
extractionMethod: 'mistral-ocr',
extractionMethod: usedProvider,
fileType: fileExtension,
parseabilityStats: parseCheck?.stats || null,
totalProcessingTimeMs: totalTime,
};
} catch (error) {
const totalTime = Date.now() - startTime;
console.error(`[OCRService] Mistral OCR extraction failed after ${totalTime}ms:`, error);
console.error(`[OCRService] OCR extraction failed after ${totalTime}ms:`, error);
throw error;
}
}
Expand All @@ -197,6 +230,13 @@ export class OCRService {
return await extractMistral(filePath, getMediaType);
}

/**
 * Extract text from a document through the Docling integration module.
 *
 * Thin delegate around the imported `extractDocling` helper; the awaited
 * result is returned unchanged and any rejection propagates to the caller.
 *
 * @param filePath - Path of the document to convert.
 * @returns The extraction result produced by the Docling integration.
 */
async extractTextWithDocling(filePath: string): Promise<ExtractionResult> {
  const extraction = await extractDocling(filePath);
  return extraction;
}

/**
* Check if PDF text can be extracted directly
*/
Expand Down
117 changes: 117 additions & 0 deletions apps/api/services/OcrService/doclingIntegration.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
/**
* Docling-Serve integration
* Calls the self-hosted docling-serve sidecar container for document-to-markdown conversion.
* See: https://github.com/docling-project/docling-serve
*/

import { promises as fs } from 'fs';
import path from 'path';

import type { ExtractionResult } from './types.js';

// Base URL of the Docling-Serve sidecar; override with the DOCLING_URL env var.
// Trailing slashes are stripped so `${DOCLING_BASE_URL}/v1/...` never contains
// a double slash when the configured value ends with "/".
const DOCLING_BASE_URL = (process.env.DOCLING_URL || 'http://ocr:5001').replace(/\/+$/, '');

/**
 * Extract text from a document using the Docling-Serve sidecar.
 *
 * Reads the file from disk, uploads it as multipart/form-data to
 * `/v1/convert/file` and requests Markdown output (matching the format the
 * Mistral OCR path produces).
 *
 * @param filePath - Path of the document to convert.
 * @returns Markdown text, page count and timing stats for the conversion.
 * @throws Error prefixed with "Docling extraction failed:" when the file
 *   cannot be read, the request fails, the sidecar responds with a non-2xx
 *   status, or the response contains no text.
 */
export async function extractTextWithDocling(filePath: string): Promise<ExtractionResult> {
  const startTime = Date.now();

  try {
    console.log(`[DoclingOCR] Starting extraction:`, { filePath });

    const fileBuffer = await fs.readFile(filePath);
    const fileName = path.basename(filePath);

    // Build the multipart form: the file itself plus the conversion options.
    const formData = new FormData();
    formData.append('files', new Blob([fileBuffer]), fileName);

    // docling-serve reads conversion options as individual form fields of the
    // multipart request (not as a nested JSON part), so send each one as its
    // own field. Markdown output matches the existing pipeline format.
    formData.append('to_formats', 'md');
    formData.append('image_export_mode', 'placeholder');
    formData.append('do_ocr', 'true');
    formData.append('force_ocr', 'false');

    console.log(
      `[DoclingOCR] Sending to ${DOCLING_BASE_URL}/v1/convert/file (${fileBuffer.length} bytes)`
    );

    const response = await fetch(`${DOCLING_BASE_URL}/v1/convert/file`, {
      method: 'POST',
      body: formData,
    });

    if (!response.ok) {
      const errorText = await response.text().catch(() => 'unknown');
      throw new Error(`Docling API returned ${response.status}: ${errorText}`);
    }

    const result = await response.json();

    // docling-serve returns { document: { md_content, filename, ... }, status, processing_time }.
    // Also accept a `documents` array or a bare document object.
    const documents = result?.document ?? result?.documents ?? [result];
    const markdownParts: string[] = [];
    let totalPages = 0;

    for (const doc of Array.isArray(documents) ? documents : [documents]) {
      // The markdown content can be in different fields depending on the version.
      // Only accept strings so an unexpected object/number cannot crash `.trim()`.
      const candidate = doc?.md_content ?? doc?.markdown ?? doc?.md ?? doc?.text;
      const md = typeof candidate === 'string' ? candidate.trim() : '';
      if (md) {
        markdownParts.push(md);
      }

      // Count one page per document when no numeric page count is reported;
      // the numeric check prevents string concatenation into the total.
      const pages = doc?.num_pages ?? doc?.page_count;
      totalPages += typeof pages === 'number' && Number.isFinite(pages) ? pages : 1;
    }

    // Parts are already trimmed and non-empty, so the joined text needs no further trimming.
    const allText = markdownParts.join('\n\n---\n\n');

    if (!allText) {
      throw new Error('Docling returned no text content');
    }

    const processingTimeMs = Date.now() - startTime;
    console.log(
      `[DoclingOCR] Extraction completed in ${processingTimeMs}ms: ${totalPages} pages, ${allText.length} characters`
    );

    return {
      text: allText,
      pageCount: totalPages,
      method: 'docling',
      confidence: 0.9,
      stats: {
        pages: totalPages,
        successfulPages: totalPages,
        processingTimeMs,
        method: 'docling-serve',
      },
    };
  } catch (error: unknown) {
    const elapsed = Date.now() - startTime;
    // Narrow the caught value instead of assuming it is an Error.
    const errorMessage = error instanceof Error ? error.message : String(error);
    const errorType = error instanceof Error ? error.constructor.name : typeof error;
    console.error(`[DoclingOCR] Extraction FAILED after ${elapsed}ms:`, {
      errorMessage,
      errorType,
      filePath,
    });
    throw new Error(`Docling extraction failed: ${errorMessage}`);
  }
}

/**
 * Probe the Docling-Serve sidecar's `/health` endpoint.
 *
 * The request is aborted after 5 seconds. Any failure (network error,
 * timeout, abort) is treated as "not available" rather than thrown.
 *
 * @returns `true` when the health endpoint answers with a 2xx status,
 *   `false` otherwise.
 */
export async function isDoclingAvailable(): Promise<boolean> {
  const healthUrl = `${DOCLING_BASE_URL}/health`;
  let reachable = false;

  try {
    const probe = await fetch(healthUrl, { signal: AbortSignal.timeout(5000) });
    reachable = probe.ok;
  } catch {
    // Unreachable, timed out or aborted: keep `reachable` as false.
  }

  return reachable;
}
2 changes: 1 addition & 1 deletion apps/api/services/OcrService/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ export interface ParseabilityCheck {
export interface ExtractionResult {
text: string;
pageCount: number;
method: 'mistral-ocr' | 'direct' | 'pdfjs-dist';
method: 'mistral-ocr' | 'docling' | 'direct' | 'pdfjs-dist';
confidence?: number;
stats?: {
pages?: number;
Expand Down
Loading
Loading