Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 88 additions & 28 deletions backend/app/services/pdf_extractor.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,90 @@
from datetime import datetime


def extract_pdf_data(pdf_bytes: bytes, file_name: str) -> dict:
"""
Pure function: Takes PDF bytes, returns extracted JSON data
No database or storage operations - just PDF → JSON transformation

TODO: Replace placeholder with actual PDF extraction logic
"""

# Placeholder extraction logic
extracted_data = {
"source_pdf": file_name,
"extracted_at": datetime.now().isoformat(),
"placeholder": True,
"data": {
"title": f"Extracted from {file_name}",
"content": "PDF content will be extracted here",
"tables": [],
"text_blocks": [],
},
"metadata": {
"extractor_version": "0.1.0-placeholder",
# --- LLM-only extractor (no Docling/pdfplumber) ---
import io, os, re, json, time, tempfile
try:
import google.generativeai as genai
except Exception:
genai = None

def _llm_only_prompt() -> str:
return (
"You are a PDF→JSON structurer for manufacturing/robotics documents. "
"Return ONE valid JSON object only (no markdown). Keep only meaningful specs "
"(manufacturer, model, document identifiers, key specs like payload/reach/"
"repeatability/mass/mounting/protection/environment, axis JT1..JTn ranges & speeds). "
"Preserve symbols like ±, °/s, φ. Normalize ranges as strings (e.g., '±180', '+140 to -105'). "
"Do not invent values; omit if missing."
)

def extract_pdf_data(
    pdf_bytes: bytes,
    file_name: str,
    model: str = "gemini-2.5-pro",
    *,
    poll_interval: float = 0.5,
    processing_timeout: float = 300.0,
) -> dict:
    """Upload a PDF to Gemini and return the JSON structure the model extracts.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        file_name: Name used as the display name of the uploaded file.
        model: Name of the Gemini model passed to ``genai.GenerativeModel``.
        poll_interval: Seconds to sleep between checks of the upload state.
        processing_timeout: Maximum seconds to wait for the uploaded file to
            leave the ``PROCESSING`` state before giving up.

    Returns:
        ``{"result": <parsed JSON or {"error": ...}>, "meta": {...}}`` where
        ``meta`` records the model name and the extraction source.

    Raises:
        RuntimeError: If the SDK is not installed, ``GEMINI_API_KEY`` is not
            set, the uploaded file fails processing, or processing does not
            finish within ``processing_timeout`` seconds.
    """
    if genai is None:
        raise RuntimeError("google-generativeai is not installed. `pip install google-generativeai`")

    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise RuntimeError("GEMINI_API_KEY is not set")
    genai.configure(api_key=api_key)

    tmp_path = None
    uploaded = None
    try:
        # 1) Write bytes to a temp file and upload it by path. The file is
        #    created inside the try so it is removed even if the write fails.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(pdf_bytes)

        uploaded = genai.upload_file(
            path=tmp_path,
            mime_type="application/pdf",
            display_name=file_name,
        )

        # 2) Wait until Gemini finishes processing, but never wait forever.
        deadline = time.monotonic() + processing_timeout
        while _uploaded_file_state(uploaded) == "PROCESSING":
            if time.monotonic() >= deadline:
                raise RuntimeError(
                    f"Gemini did not finish processing {file_name!r} "
                    f"within {processing_timeout} seconds"
                )
            time.sleep(poll_interval)
            uploaded = genai.get_file(uploaded.name)

        if _uploaded_file_state(uploaded) == "FAILED":
            raise RuntimeError(f"Gemini failed to process uploaded file {file_name!r}")

        # 3) Ask for JSON only.
        model_obj = genai.GenerativeModel(model)
        resp = model_obj.generate_content(
            [
                {"text": _llm_only_prompt()},
                uploaded,
            ],
            generation_config={"response_mime_type": "application/json"},
        )

        try:
            text = (getattr(resp, "text", "") or "").strip()
        except ValueError:
            # NOTE(review): the SDK's `.text` accessor appears to raise
            # ValueError when the response carries no text part (e.g. a
            # blocked response) - treated here as "no JSON returned".
            text = ""

        return {
            "result": _parse_llm_json(text),
            "meta": {"llm_model": model, "source": "gemini-pdf-only"},
        }
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
        # Optional: delete the uploaded file from Gemini storage. This is
        # deliberately best-effort so a cleanup failure never masks the
        # result or the original exception.
        if uploaded is not None:
            try:
                genai.delete_file(uploaded.name)
            except Exception:
                pass


def _uploaded_file_state(uploaded_file):
    """Return the upload state as a comparable value, or None if unavailable.

    Some SDK versions expose ``state.name``, others just ``state``.
    """
    try:
        state = getattr(uploaded_file, "state", None)
        return getattr(state, "name", None) or state
    except Exception:
        return None


def _parse_llm_json(text: str):
    """Parse the model output as JSON, tolerating surrounding prose.

    Falls back to the outermost ``{...}`` slice when the whole text is not
    valid JSON, and returns an error dict when nothing parses.
    """
    try:
        return json.loads(text)
    except ValueError:
        pass

    # Fallback if the model ignored the JSON MIME type and returned prose.
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        try:
            return json.loads(text[start:end + 1])
        except ValueError:
            pass
    return {"error": "LLM did not return JSON"}
13 changes: 12 additions & 1 deletion backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,15 @@ supabase==2.10.0
python-dotenv==1.0.1
pydantic[email]==2.10.3
pydantic-settings==2.6.1
ruff==0.14.0
ruff==0.14.0
docling==2.55.1
docling-core==2.48.4
docling-ibm-models==3.9.1
docling-parse==4.5.0
pdfplumber==0.11.7
google-ai-generativelanguage==0.6.15
google-api-core==2.25.2
google-api-python-client==2.184.0
google-auth==2.41.1
google-auth-httplib2==0.2.0
google-generativeai==0.8.5
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ services:
SUPABASE_SERVICE_ROLE_KEY: ${BACKEND_SUPABASE_SERVICE_ROLE_KEY}
WEBHOOK_BASE_URL: ${WEBHOOK_BASE_URL}
WEBHOOK_SECRET: ${WEBHOOK_SECRET}
GEMINI_API_KEY: ${GEMINI_API_KEY}
volumes:
- ./backend:/app
extra_hosts:
Expand Down
4 changes: 4 additions & 0 deletions init-dev.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
require('dotenv').config()
const fs = require("fs");
const crypto = require("crypto");
const { exec, execSync } = require("child_process");
Expand Down Expand Up @@ -95,6 +96,9 @@ STUDIO_URL=${cliEnv.STUDIO_URL || "http://localhost:54323"}
DB_URL=${
cliEnv.DB_URL || "postgresql://postgres:postgres@localhost:54322/postgres"
}

# Gemini API Key
GEMINI_API_KEY=${process.env.GEMINI_API_KEY}
`;
}

Expand Down
19 changes: 17 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,8 @@
},
"devDependencies": {
"supabase": "2.47.2"
},
"dependencies": {
"dotenv": "^17.2.3"
}
}
Loading