1- from datetime import datetime
2-
3-
def extract_pdf_data(pdf_bytes: bytes, file_name: str) -> dict:
    """Return placeholder extraction data for a PDF.

    Pure function: takes PDF bytes and returns a JSON-serializable dict.
    No database or storage operations are performed.

    TODO: Replace placeholder with actual PDF extraction logic.

    Args:
        pdf_bytes: Raw bytes of the PDF; only its length is used here.
        file_name: Name of the source PDF, echoed back in the result.

    Returns:
        A dict with the keys ``source_pdf``, ``extracted_at`` (ISO-8601
        timestamp in UTC), ``placeholder`` (always ``True``), ``data`` and
        ``metadata``.
    """
    # Imported locally because the module header only imports ``datetime``.
    from datetime import datetime, timezone

    return {
        "source_pdf": file_name,
        # Timezone-aware so consumers never have to guess the local zone.
        "extracted_at": datetime.now(timezone.utc).isoformat(),
        "placeholder": True,
        "data": {
            "title": f"Extracted from {file_name}",
            "content": "PDF content will be extracted here",
            "tables": [],
            "text_blocks": [],
        },
        "metadata": {
            "extractor_version": "0.1.0-placeholder",
            "file_name": file_name,
            "byte_size": len(pdf_bytes),
        },
    }
1+ # --- LLM-only extractor (no Docling/pdfplumber) ---
2+ import io , os , re , json , time , tempfile
3+ try :
4+ import google .generativeai as genai
5+ except Exception :
6+ genai = None
7+
8+ def _llm_only_prompt () -> str :
9+ return (
10+ "You are a PDF→JSON structurer for manufacturing/robotics documents. "
11+ "Return ONE valid JSON object only (no markdown). Keep only meaningful specs "
12+ "(manufacturer, model, document identifiers, key specs like payload/reach/"
13+ "repeatability/mass/mounting/protection/environment, axis JT1..JTn ranges & speeds). "
14+ "Preserve symbols like ±, °/s, φ. Normalize ranges as strings (e.g., '±180', '+140 to -105'). "
15+ "Do not invent values; omit if missing."
16+ )
17+
def _file_state_name(remote_file):
    """Return the processing state of an uploaded file as a comparable value.

    Some SDK versions expose ``state`` as an enum with a ``name`` attribute,
    others as a plain value; both forms are handled.
    """
    state = getattr(remote_file, "state", None)
    return getattr(state, "name", None) or state


def _parse_llm_json(text: str):
    """Parse the model's reply as JSON, tolerating surrounding prose.

    Tries the whole text first, then the span from the first ``{`` to the
    last ``}``. Returns ``{"error": "LLM did not return JSON"}`` when neither
    attempt yields valid JSON.
    """
    try:
        return json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        pass

    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        try:
            return json.loads(text[start:end + 1])
        except ValueError:
            pass
    return {"error": "LLM did not return JSON"}


def extract_pdf_data(
    pdf_bytes: bytes,
    file_name: str,
    model: str = "gemini-2.5-pro",
) -> dict:
    """Extract structured JSON from a PDF using a Gemini model only.

    The bytes are written to a temporary file, uploaded through the Gemini
    file API, and the model is asked to return a single JSON object. Both
    the temporary file and the uploaded remote file are removed afterwards
    on a best-effort basis.

    Args:
        pdf_bytes: Raw bytes of the PDF document; must not be empty.
        file_name: Display name for the upload, echoed back in the result.
        model: Name of the Gemini model to query.

    Returns:
        A dict with ``file_name``, ``result`` (the parsed JSON, or
        ``{"error": "LLM did not return JSON"}`` if the reply could not be
        parsed) and ``meta`` (model name and source tag).

    Raises:
        ValueError: If ``pdf_bytes`` is empty.
        RuntimeError: If the SDK is not installed, ``GEMINI_API_KEY`` is not
            set, or the uploaded file fails or times out while processing.
    """
    if not pdf_bytes:
        raise ValueError("pdf_bytes is empty")
    if genai is None:
        raise RuntimeError("google-generativeai is not installed. `pip install google-generativeai`")

    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise RuntimeError("GEMINI_API_KEY is not set")
    genai.configure(api_key=api_key)

    poll_interval_s = 0.5
    # Upper bound on server-side processing so a stuck upload cannot hang
    # the caller forever.
    processing_timeout_s = 300.0

    tmp_path = None
    uploaded = None
    try:
        # 1) Write bytes to a temp file and upload via path=...
        # Created inside the try so a failed write still gets cleaned up.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(pdf_bytes)

        uploaded = genai.upload_file(
            path=tmp_path,
            mime_type="application/pdf",
            display_name=file_name,
        )

        # 2) Wait until Gemini finishes processing the upload.
        deadline = time.monotonic() + processing_timeout_s
        while _file_state_name(uploaded) == "PROCESSING":
            if time.monotonic() >= deadline:
                raise RuntimeError(
                    f"Gemini did not finish processing {file_name!r} "
                    f"within {processing_timeout_s:.0f}s"
                )
            time.sleep(poll_interval_s)
            uploaded = genai.get_file(uploaded.name)

        if _file_state_name(uploaded) == "FAILED":
            raise RuntimeError(f"Gemini failed to process {file_name!r}")

        # 3) Ask for JSON only.
        model_obj = genai.GenerativeModel(model)
        resp = model_obj.generate_content(
            [
                {"text": _llm_only_prompt()},
                uploaded,
            ],
            generation_config={"response_mime_type": "application/json"},
        )

        # The ``text`` accessor raises ValueError when the response has no
        # valid text part (e.g. a blocked reply); treat that as "no JSON".
        try:
            raw_text = resp.text
        except (AttributeError, ValueError):
            raw_text = ""
        data = _parse_llm_json((raw_text or "").strip())

        return {
            "file_name": file_name,
            "result": data,
            "meta": {"llm_model": model, "source": "gemini-pdf-only"},
        }
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass  # best-effort: a leftover temp file must not mask the result
        # Optional: delete the uploaded file from Gemini storage.
        if uploaded is not None:
            try:
                genai.delete_file(uploaded.name)
            except Exception:
                pass  # best-effort: remote cleanup must not mask the result
0 commit comments