Merge pull request #7 from GenerateNU/pdf_extraction_test

bamarler · web-flow · commit 46c75d2478d5 · 2025-10-14T17:04:27.000-04:00
Added pdf extraction logic
diff --git a/backend/app/services/pdf_extractor.py b/backend/app/services/pdf_extractor.py
@@ -1,30 +1,90 @@
-from datetime import datetime
-
-
-def extract_pdf_data(pdf_bytes: bytes, file_name: str) -> dict:
-    """
-    Pure function: Takes PDF bytes, returns extracted JSON data
-    No database or storage operations - just PDF → JSON transformation
-
-    TODO: Replace placeholder with actual PDF extraction logic
-    """
-
-    # Placeholder extraction logic
-    extracted_data = {
-        "source_pdf": file_name,
-        "extracted_at": datetime.now().isoformat(),
-        "placeholder": True,
-        "data": {
-            "title": f"Extracted from {file_name}",
-            "content": "PDF content will be extracted here",
-            "tables": [],
-            "text_blocks": [],
-        },
-        "metadata": {
-            "extractor_version": "0.1.0-placeholder",
+# --- LLM-only extractor (no Docling/pdfplumber) ---
+import io, os, re, json, time, tempfile
+try:
+    import google.generativeai as genai
+except Exception:
+    genai = None
+
+def _llm_only_prompt() -> str:
+    """Return the fixed instruction text sent to the model together with the PDF."""
+    # Fragments are joined with no separator; all spacing lives inside the
+    # fragments themselves, so do not strip or re-wrap them.
+    fragments = (
+        "You are a PDF→JSON structurer for manufacturing/robotics documents. ",
+        "Return ONE valid JSON object only (no markdown). Keep only meaningful specs ",
+        "(manufacturer, model, document identifiers, key specs like payload/reach/",
+        "repeatability/mass/mounting/protection/environment, axis JT1..JTn ranges & speeds). ",
+        "Preserve symbols like ±, °/s, φ. Normalize ranges as strings (e.g., '±180', '+140 to -105'). ",
+        "Do not invent values; omit if missing.",
+    )
+    return "".join(fragments)
+
+def extract_pdf_data(
+    pdf_bytes: bytes,
+    file_name: str,
+    model: str = "gemini-2.5-pro",
+    *,
+    processing_timeout: float = 300.0,
+) -> dict:
+    """Upload a PDF to Gemini and return the JSON structure the model extracts.
+
+    The bytes are written to a temporary ``.pdf`` file, uploaded with
+    ``genai.upload_file``, polled until the upload leaves the ``PROCESSING``
+    state, and then passed to ``generate_content`` together with the prompt
+    from ``_llm_only_prompt()``. The temporary file and the uploaded file are
+    removed on every exit path.
+
+    Args:
+        pdf_bytes: Raw PDF content.
+        file_name: Used as the upload's display name and echoed in the result.
+        model: Model name passed to ``genai.GenerativeModel``.
+        processing_timeout: Maximum number of seconds to wait for the upload
+            to leave the ``PROCESSING`` state.
+
+    Returns:
+        ``{"file_name": ..., "result": ..., "meta": {...}}`` where ``result``
+        is the parsed JSON, or ``{"error": "LLM did not return JSON"}`` when
+        no JSON object could be parsed from the response text.
+
+    Raises:
+        RuntimeError: If the SDK is not importable, ``GEMINI_API_KEY`` is not
+            set, the upload reports ``FAILED``, or processing does not finish
+            within ``processing_timeout`` seconds.
+    """
+    if genai is None:
+        raise RuntimeError("google-generativeai is not installed. `pip install google-generativeai`")
+
+    api_key = os.getenv("GEMINI_API_KEY")
+    if not api_key:
+        raise RuntimeError("GEMINI_API_KEY is not set")
+    genai.configure(api_key=api_key)
+
+    def _state_name(f):
+        # Some SDK versions expose state.name, others just state.
+        try:
+            return getattr(getattr(f, "state", None), "name", None) or getattr(f, "state", None)
+        except Exception:
+            return None
+
+    no_json = {"error": "LLM did not return JSON"}
+
+    # 1) Reserve a temp path; the write happens inside the try so that a failed
+    #    write still reaches the cleanup below (the file is created with
+    #    delete=False and would otherwise be left behind).
+    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
+    tmp_path = tmp.name
+
+    uploaded = None
+    try:
+        with tmp:
+            tmp.write(pdf_bytes)
+
+        uploaded = genai.upload_file(
+            path=tmp_path,                       # <-- use path, not file
+            mime_type="application/pdf",
+            display_name=file_name,
+        )
+
+        # 2) Wait until Gemini finishes processing, but never wait forever.
+        deadline = time.monotonic() + processing_timeout
+        while _state_name(uploaded) == "PROCESSING":
+            if time.monotonic() >= deadline:
+                raise RuntimeError(
+                    f"Gemini did not finish processing {file_name!r} "
+                    f"within {processing_timeout:g}s"
+                )
+            time.sleep(0.5)
+            uploaded = genai.get_file(uploaded.name)
+
+        # A failed upload cannot be used as model input; stop here with a
+        # clear error instead of sending it to generate_content.
+        if _state_name(uploaded) == "FAILED":
+            raise RuntimeError(f"Gemini failed to process {file_name!r}")
+
+        # 3) Ask for JSON only
+        model_obj = genai.GenerativeModel(model)
+        resp = model_obj.generate_content(
+            [
+                {"text": _llm_only_prompt()},
+                uploaded,
+            ],
+            generation_config={"response_mime_type": "application/json"},
+        )
+
+        text = (getattr(resp, "text", "") or "").strip()
+        try:
+            data = json.loads(text)
+        except json.JSONDecodeError:
+            # Fallback if the model ignored the JSON MIME and returned prose:
+            # try the outermost {...} span, and fall back to the error marker
+            # if that span is not valid JSON either.
+            i, j = text.find("{"), text.rfind("}")
+            try:
+                data = json.loads(text[i:j+1]) if i != -1 and j > i else no_json
+            except json.JSONDecodeError:
+                data = no_json
+
+        return {
             "file_name": file_name,
-            "byte_size": len(pdf_bytes),
-        },
-    }
+            "result": data,
+            "meta": {"llm_model": model, "source": "gemini-pdf-only"},
+        }
 
-    return extracted_data
+    finally:
+        try:
+            os.unlink(tmp_path)
+        except OSError:
+            pass  # best-effort: the temp file may already be gone
+        # Optional: delete the uploaded file from Gemini storage
+        if uploaded is not None:
+            try:
+                genai.delete_file(uploaded.name)
+            except Exception:
+                # Best-effort: a remote cleanup failure must not mask the
+                # extraction result or the original exception.
+                pass
diff --git a/backend/requirements.txt b/backend/requirements.txt
@@ -4,4 +4,15 @@ supabase==2.10.0
 python-dotenv==1.0.1
 pydantic[email]==2.10.3
 pydantic-settings==2.6.1
-ruff==0.14.0
+ruff==0.14.0
+docling==2.55.1
+docling-core==2.48.4
+docling-ibm-models==3.9.1
+docling-parse==4.5.0
+pdfplumber==0.11.7
+google-ai-generativelanguage==0.6.15
+google-api-core==2.25.2
+google-api-python-client==2.184.0
+google-auth==2.41.1
+google-auth-httplib2==0.2.0
+google-generativeai==0.8.5
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -12,6 +12,7 @@ services:
       SUPABASE_SERVICE_ROLE_KEY: ${BACKEND_SUPABASE_SERVICE_ROLE_KEY}
       WEBHOOK_BASE_URL: ${WEBHOOK_BASE_URL}
       WEBHOOK_SECRET: ${WEBHOOK_SECRET}
+      GEMINI_API_KEY: ${GEMINI_API_KEY}
     volumes:
       - ./backend:/app
     extra_hosts:
diff --git a/init-dev.js b/init-dev.js
@@ -1,3 +1,4 @@
+require('dotenv').config()
 const fs = require("fs");
 const crypto = require("crypto");
 const { exec, execSync } = require("child_process");
@@ -95,6 +96,9 @@ STUDIO_URL=${cliEnv.STUDIO_URL || "http://localhost:54323"}
 DB_URL=${
     cliEnv.DB_URL || "postgresql://postgres:postgres@localhost:54322/postgres"
   }
+
+# Gemini API Key
+GEMINI_API_KEY=${process.env.GEMINI_API_KEY}
 `;
 }
 
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -15,5 +15,8 @@
     },
     "devDependencies": {
         "supabase": "2.47.2"
+    },
+    "dependencies": {
+        "dotenv": "^17.2.3"
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+require('dotenv').config()`
`1`	`2`	`const fs = require("fs");`
`2`	`3`	`const crypto = require("crypto");`
`3`	`4`	`const { exec, execSync } = require("child_process");`
`@@ -95,6 +96,9 @@ STUDIO_URL=${cliEnv.STUDIO_URL \|\| "http://localhost:54323"}`
`95`	`96`	`DB_URL=${`
`96`	`97`	`cliEnv.DB_URL \|\| "postgresql://postgres:postgres@localhost:54322/postgres"`
`97`	`98`	`}`
	`99`	`+`
	`100`	`+# Gemini API Key`
	`101`	`+GEMINI_API_KEY=${process.env.GEMINI_API_KEY}`
`98`	`102`	`` `; ``
`99`	`103`	`}`
`100`	`104`
Original file line number	Diff line number	Diff line change
`@@ -15,5 +15,8 @@`
`15`	`15`	`},`
`16`	`16`	`"devDependencies": {`
`17`	`17`	`"supabase": "2.47.2"`
	`18`	`+ },`
	`19`	`+ "dependencies": {`
	`20`	`+ "dotenv": "^17.2.3"`
`18`	`21`	`}`
`19`	`22`	`}`