Skip to content

Commit 46c75d2

Browse files
authored
Merge pull request #7 from GenerateNU/pdf_extraction_test
Added pdf extraction logic
2 parents 9986594 + 38a3f66 commit 46c75d2

File tree

6 files changed

+125
-31
lines changed

6 files changed

+125
-31
lines changed
Lines changed: 88 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,90 @@
1-
from datetime import datetime
2-
3-
4-
def extract_pdf_data(pdf_bytes: bytes, file_name: str) -> dict:
5-
"""
6-
Pure function: Takes PDF bytes, returns extracted JSON data
7-
No database or storage operations - just PDF → JSON transformation
8-
9-
TODO: Replace placeholder with actual PDF extraction logic
10-
"""
11-
12-
# Placeholder extraction logic
13-
extracted_data = {
14-
"source_pdf": file_name,
15-
"extracted_at": datetime.now().isoformat(),
16-
"placeholder": True,
17-
"data": {
18-
"title": f"Extracted from {file_name}",
19-
"content": "PDF content will be extracted here",
20-
"tables": [],
21-
"text_blocks": [],
22-
},
23-
"metadata": {
24-
"extractor_version": "0.1.0-placeholder",
1+
# --- LLM-only extractor (no Docling/pdfplumber) ---
2+
import io, os, re, json, time, tempfile
3+
try:
4+
import google.generativeai as genai
5+
except Exception:
6+
genai = None
7+
8+
def _llm_only_prompt() -> str:
    """Return the instruction prompt sent to the LLM alongside the PDF.

    The prompt asks for a single JSON object (no markdown) containing only
    meaningful specifications, with symbols preserved and ranges normalized
    as strings, and tells the model to omit values that are missing rather
    than invent them.

    Returns:
        The full prompt as one string, built from adjacent string literals.
    """
    return (
        "You are a PDF→JSON structurer for manufacturing/robotics documents. "
        "Return ONE valid JSON object only (no markdown). Keep only meaningful specs "
        "(manufacturer, model, document identifiers, key specs like payload/reach/"
        "repeatability/mass/mounting/protection/environment, axis JT1..JTn ranges & speeds). "
        "Preserve symbols like ±, °/s, φ. Normalize ranges as strings (e.g., '±180', '+140 to -105'). "
        "Do not invent values; omit if missing."
    )
17+
18+
def _file_state_name(file_obj):
    """Return the state of an uploaded file as a comparable value.

    Some SDK versions expose ``state`` as an object with a ``name``
    attribute, others as a plain value; this returns ``state.name`` when it
    is truthy and otherwise ``state`` itself. Returns ``None`` when the
    object has no ``state`` or reading it raises.
    """
    try:
        state = getattr(file_obj, "state", None)
        return getattr(state, "name", None) or state
    except Exception:
        # Attribute access on SDK objects may raise more than AttributeError;
        # an unreadable state is treated as "unknown", not as an error.
        return None


def _parse_llm_json(text):
    """Parse the model's reply into JSON, tolerating surrounding prose.

    First tries to parse ``text`` as-is. If that fails, tries the span from
    the first ``{`` to the last ``}``. If neither attempt yields valid JSON,
    returns ``{"error": "LLM did not return JSON"}`` instead of raising.
    """
    failure = {"error": "LLM did not return JSON"}
    try:
        return json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        pass

    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end <= start:
        return failure
    try:
        return json.loads(text[start:end + 1])
    except ValueError:
        # The brace-delimited span was not valid JSON either.
        return failure


def extract_pdf_data(
    pdf_bytes: bytes,
    file_name: str,
    model: str = "gemini-2.5-pro",
    *,
    processing_timeout: float = 120.0,
    poll_interval: float = 0.5,
) -> dict:
    """Extract structured JSON from a PDF by sending it to a Gemini model.

    The bytes are written to a temporary ``.pdf`` file, uploaded with
    ``genai.upload_file``, polled until the upload leaves the ``PROCESSING``
    state, and then passed to the model together with the extraction prompt.
    The temporary file is always removed, and deletion of the uploaded file
    is attempted, whether or not extraction succeeds.

    Args:
        pdf_bytes: Raw content of the PDF.
        file_name: Name used as the upload's display name and echoed back
            in the result.
        model: Name of the generative model to use.
        processing_timeout: Maximum number of seconds to wait for the
            uploaded file to leave the ``PROCESSING`` state.
        poll_interval: Seconds to sleep between state checks.

    Returns:
        A dict with ``file_name``, ``result`` (the parsed JSON, or
        ``{"error": "LLM did not return JSON"}`` when the reply could not be
        parsed) and ``meta`` (model name and source tag).

    Raises:
        RuntimeError: If the ``google-generativeai`` package is unavailable,
            ``GEMINI_API_KEY`` is not set, the uploaded file reports the
            ``FAILED`` state, or processing does not finish within
            ``processing_timeout`` seconds.
    """
    if genai is None:
        raise RuntimeError("google-generativeai is not installed. `pip install google-generativeai`")

    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise RuntimeError("GEMINI_API_KEY is not set")
    genai.configure(api_key=api_key)

    tmp_path = None
    uploaded = None
    try:
        # 1) Write bytes to a temp file and upload via path=...
        # The path is recorded before writing so that a failed write still
        # gets cleaned up in the finally block.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(pdf_bytes)

        uploaded = genai.upload_file(
            path=tmp_path,
            mime_type="application/pdf",
            display_name=file_name,
        )

        # 2) Wait until processing finishes, but never wait forever.
        deadline = time.monotonic() + processing_timeout
        while _file_state_name(uploaded) == "PROCESSING":
            if time.monotonic() >= deadline:
                raise RuntimeError(
                    f"Timed out after {processing_timeout}s waiting for uploaded PDF to finish processing"
                )
            time.sleep(poll_interval)
            uploaded = genai.get_file(uploaded.name)

        if _file_state_name(uploaded) == "FAILED":
            raise RuntimeError("Uploaded PDF failed processing")

        # 3) Ask for JSON only
        resp = genai.GenerativeModel(model).generate_content(
            [
                {"text": _llm_only_prompt()},
                uploaded,
            ],
            generation_config={"response_mime_type": "application/json"},
        )

        text = (getattr(resp, "text", "") or "").strip()
        return {
            "file_name": file_name,
            "result": _parse_llm_json(text),
            "meta": {"llm_model": model, "source": "gemini-pdf-only"},
        }
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask
                # the result or the original exception.
                pass
        # Optional: delete the uploaded file from remote storage.
        if uploaded is not None:
            try:
                genai.delete_file(uploaded.name)
            except Exception:
                # Deliberately best-effort for the same reason as above.
                pass

backend/requirements.txt

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,15 @@ supabase==2.10.0
44
python-dotenv==1.0.1
55
pydantic[email]==2.10.3
66
pydantic-settings==2.6.1
7-
ruff==0.14.0
7+
ruff==0.14.0
8+
docling==2.55.1
9+
docling-core==2.48.4
10+
docling-ibm-models==3.9.1
11+
docling-parse==4.5.0
12+
pdfplumber==0.11.7
13+
google-ai-generativelanguage==0.6.15
14+
google-api-core==2.25.2
15+
google-api-python-client==2.184.0
16+
google-auth==2.41.1
17+
google-auth-httplib2==0.2.0
18+
google-generativeai==0.8.5

docker-compose.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ services:
1212
SUPABASE_SERVICE_ROLE_KEY: ${BACKEND_SUPABASE_SERVICE_ROLE_KEY}
1313
WEBHOOK_BASE_URL: ${WEBHOOK_BASE_URL}
1414
WEBHOOK_SECRET: ${WEBHOOK_SECRET}
15+
GEMINI_API_KEY: ${GEMINI_API_KEY}
1516
volumes:
1617
- ./backend:/app
1718
extra_hosts:

init-dev.js

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
require('dotenv').config()
12
const fs = require("fs");
23
const crypto = require("crypto");
34
const { exec, execSync } = require("child_process");
@@ -95,6 +96,9 @@ STUDIO_URL=${cliEnv.STUDIO_URL || "http://localhost:54323"}
9596
DB_URL=${
9697
cliEnv.DB_URL || "postgresql://postgres:postgres@localhost:54322/postgres"
9798
}
99+
100+
# Gemini API Key
101+
GEMINI_API_KEY=${process.env.GEMINI_API_KEY}
98102
`;
99103
}
100104

package-lock.json

Lines changed: 17 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,8 @@
1515
},
1616
"devDependencies": {
1717
"supabase": "2.47.2"
18+
},
19+
"dependencies": {
20+
"dotenv": "^17.2.3"
1821
}
1922
}

0 commit comments

Comments
 (0)