# backend/app/services/pdf_extractor.py (new-side contents of the patch hunk, reformatted)
# --- LLM-only extractor (no Docling/pdfplumber) ---
import io  # NOTE(review): not referenced in the code shown here; kept as-is
import json
import logging
import os
import re  # NOTE(review): not referenced in the code shown here; kept as-is
import tempfile
import time

try:
    import google.generativeai as genai
except Exception:
    # Any import-time failure leaves ``genai`` as None; extract_pdf_data()
    # checks for that and raises a RuntimeError with install instructions.
    genai = None

logger = logging.getLogger(__name__)


def _llm_only_prompt() -> str:
    """Return the fixed instruction text sent to the model alongside the PDF."""
    return (
        "You are a PDF→JSON structurer for manufacturing/robotics documents. "
        "Return ONE valid JSON object only (no markdown). Keep only meaningful specs "
        "(manufacturer, model, document identifiers, key specs like payload/reach/"
        "repeatability/mass/mounting/protection/environment, axis JT1..JTn ranges & speeds). "
        "Preserve symbols like ±, °/s, φ. Normalize ranges as strings (e.g., '±180', '+140 to -105'). "
        "Do not invent values; omit if missing."
    )


def _file_state_name(file_obj):
    """Return the state of an uploaded file as a comparable value.

    Some SDK versions expose ``state.name``, others just ``state``; prefer the
    former and fall back to the raw ``state`` value (``None`` when absent).
    """
    state = getattr(file_obj, "state", None)
    return getattr(state, "name", None) or state


def _wait_until_processed(file_obj, refresh, *, timeout, poll_interval):
    """Poll until *file_obj* leaves the ``PROCESSING`` state and return it.

    Args:
        file_obj: Uploaded-file handle exposing ``state`` and ``name``.
        refresh: Callable taking the file name and returning a fresh handle.
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between polls.

    Raises:
        TimeoutError: The file is still ``PROCESSING`` once *timeout* elapsed.
        RuntimeError: The file ended in the ``FAILED`` state.
    """
    # monotonic() is immune to wall-clock adjustments while we wait.
    deadline = time.monotonic() + timeout
    while _file_state_name(file_obj) == "PROCESSING":
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Uploaded file {file_obj.name!r} was still PROCESSING after {timeout} seconds"
            )
        time.sleep(poll_interval)
        file_obj = refresh(file_obj.name)

    if _file_state_name(file_obj) == "FAILED":
        raise RuntimeError(f"Processing of uploaded file {file_obj.name!r} failed")
    return file_obj


def _parse_llm_json(text):
    """Decode the model's reply, tolerating prose wrapped around the JSON.

    The whole reply is tried first. If that fails, the span from the first
    ``{`` to the last ``}`` is tried. When neither decodes, the sentinel
    ``{"error": "LLM did not return JSON"}`` is returned instead of raising.
    """
    text = (text or "").strip()
    try:
        return json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        pass

    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        try:
            return json.loads(text[start:end + 1])
        except ValueError:
            pass
    return {"error": "LLM did not return JSON"}


def extract_pdf_data(
    pdf_bytes: bytes,
    file_name: str,
    model: str = "gemini-2.5-pro",  # any Gemini model name accepted by GenerativeModel
    *,
    processing_timeout: float = 300.0,
    poll_interval: float = 0.5,
) -> dict:
    """Upload a PDF to Gemini and return the structured JSON it extracts.

    Args:
        pdf_bytes: Raw PDF content.
        file_name: Display name for the upload; echoed back in the result.
        model: Name passed to ``genai.GenerativeModel``.
        processing_timeout: Seconds to wait for the upload to leave the
            ``PROCESSING`` state before giving up.
        poll_interval: Seconds between state polls.

    Returns:
        ``{"file_name": ..., "result": <parsed JSON or error dict>,
        "meta": {"llm_model": model, "source": "gemini-pdf-only"}}``.

    Raises:
        RuntimeError: The SDK is not importable, ``GEMINI_API_KEY`` is unset,
            or the uploaded file ended in the ``FAILED`` state.
        TimeoutError: The upload did not finish processing in time.
    """
    if genai is None:
        raise RuntimeError(
            "google-generativeai is not installed. `pip install google-generativeai`"
        )

    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise RuntimeError("GEMINI_API_KEY is not set")
    genai.configure(api_key=api_key)

    tmp_path = None
    uploaded = None
    try:
        # 1) Write bytes to a temp file and upload via path=...
        #    Created inside the try so a failed write still gets cleaned up.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(pdf_bytes)

        uploaded = genai.upload_file(
            path=tmp_path,
            mime_type="application/pdf",
            display_name=file_name,
        )

        # 2) Wait (bounded) until Gemini finishes processing the upload.
        uploaded = _wait_until_processed(
            uploaded,
            genai.get_file,
            timeout=processing_timeout,
            poll_interval=poll_interval,
        )

        # 3) Ask for JSON only
        response = genai.GenerativeModel(model).generate_content(
            [
                {"text": _llm_only_prompt()},
                uploaded,
            ],
            generation_config={"response_mime_type": "application/json"},
        )
        data = _parse_llm_json(getattr(response, "text", "") or "")

        return {
            "file_name": file_name,
            "result": data,
            "meta": {"llm_model": model, "source": "gemini-pdf-only"},
        }
    finally:
        # Cleanup is best-effort: failures are logged, never raised, so they
        # cannot mask the real outcome of the extraction.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                logger.warning("Could not remove temporary file %s", tmp_path, exc_info=True)
        # Optional: delete the uploaded file from Gemini storage
        if uploaded is not None:
            try:
                genai.delete_file(uploaded.name)
            except Exception:
                logger.warning("Could not delete uploaded file %s", uploaded.name, exc_info=True)

# (the collapsed diff continues with the next file section; its header shared this line)
# diff --git a/backend/requirements.txt b/backend/requirements.txt index 124c4e9..ed9aeb1 100644 --- a/backend/requirements.txt +++
b/backend/requirements.txt @@ -4,4 +4,15 @@ supabase==2.10.0 python-dotenv==1.0.1 pydantic[email]==2.10.3 pydantic-settings==2.6.1 -ruff==0.14.0 \ No newline at end of file +ruff==0.14.0 +docling==2.55.1 +docling-core==2.48.4 +docling-ibm-models==3.9.1 +docling-parse==4.5.0 +pdfplumber==0.11.7 +google-ai-generativelanguage==0.6.15 +google-api-core==2.25.2 +google-api-python-client==2.184.0 +google-auth==2.41.1 +google-auth-httplib2==0.2.0 +google-generativeai==0.8.5 \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 6ced3d1..32a8c4f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,6 +12,7 @@ services: SUPABASE_SERVICE_ROLE_KEY: ${BACKEND_SUPABASE_SERVICE_ROLE_KEY} WEBHOOK_BASE_URL: ${WEBHOOK_BASE_URL} WEBHOOK_SECRET: ${WEBHOOK_SECRET} + GEMINI_API_KEY: ${GEMINI_API_KEY} volumes: - ./backend:/app extra_hosts: diff --git a/init-dev.js b/init-dev.js index c54682f..e9e5821 100644 --- a/init-dev.js +++ b/init-dev.js @@ -1,3 +1,4 @@ +require('dotenv').config() const fs = require("fs"); const crypto = require("crypto"); const { exec, execSync } = require("child_process"); @@ -95,6 +96,9 @@ STUDIO_URL=${cliEnv.STUDIO_URL || "http://localhost:54323"} DB_URL=${ cliEnv.DB_URL || "postgresql://postgres:postgres@localhost:54322/postgres" } + +# Gemini API Key +GEMINI_API_KEY=${process.env.GEMINI_API_KEY} `; } diff --git a/package-lock.json b/package-lock.json index 338ce5f..60a1f76 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,15 @@ { - "name": "cortex-etl-source", + "name": "cortex-etl", "version": "1.0.0", "lockfileVersion": 3, "requires": true, "packages": { "": { + "dependencies": { + "dotenv": "^17.2.3" + }, "devDependencies": { - "supabase": "^2.47.2" + "supabase": "2.47.2" } }, "node_modules/@isaacs/fs-minipass": { @@ -97,6 +100,18 @@ } } }, + "node_modules/dotenv": { + "version": "17.2.3", + "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.2.3.tgz", + "integrity": 
"sha512-JVUnt+DUIzu87TABbhPmNfVdBDt18BLOWjMUFJMSi/Qqg7NTYtabbvSNJGOJ7afbRuv9D/lngizHtP7QyLQ+9w==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://dotenvx.com" + } + }, "node_modules/fetch-blob": { "version": "3.2.0", "resolved": "https://registry.npmjs.org/fetch-blob/-/fetch-blob-3.2.0.tgz", diff --git a/package.json b/package.json index 0d1fc95..3ebf7bc 100644 --- a/package.json +++ b/package.json @@ -15,5 +15,8 @@ }, "devDependencies": { "supabase": "2.47.2" + }, + "dependencies": { + "dotenv": "^17.2.3" } }