-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf_processor.py
More file actions
114 lines (94 loc) · 3.5 KB
/
pdf_processor.py
File metadata and controls
114 lines (94 loc) · 3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import io
import pdfplumber
from langchain_core.documents import Document
from minio import Minio
from app.core.config import settings
def get_minio_client() -> Minio:
    """
    Build a MinIO client configured from the application settings.

    The endpoint, access key, secret key and ``secure`` flag are all read
    from ``settings``; a new client is constructed on every call.

    Returns:
        A newly constructed Minio client
    """
    connection_options = {
        "endpoint": settings.minio_endpoint,
        "access_key": settings.minio_access_key,
        "secret_key": settings.minio_secret_key,
        "secure": settings.minio_secure,
    }
    return Minio(**connection_options)
def _tables_to_text(tables: list[list[list[str | None]]]) -> str:
    """Render extracted tables as text: one line per row, cells joined by " | "."""
    # Falsy cells (None or empty string) are rendered as empty strings.
    # A single join avoids the quadratic cost of repeated string `+=`.
    return "".join(
        " | ".join(str(cell) if cell else "" for cell in row) + "\n"
        for table in tables
        for row in table
    )


def pdf_to_document(
    object_name: str,
    bucket_name: str | None = None,
    minio_client: Minio | None = None,
) -> list[Document]:
    """
    Load a PDF file from MinIO and return a list of Document objects.

    Each page becomes a separate Document. Its content is the page's
    extracted text, followed by a "[Tables]" section when the page contains
    tables. Exceptions raised by the MinIO client or pdfplumber are not
    caught here and propagate to the caller.

    Args:
        object_name: Path/name of the PDF object in the bucket
        bucket_name: Name of the MinIO bucket (defaults to settings.minio_bucket)
        minio_client: Optional MinIO client (creates one if not provided)

    Returns:
        List of Document objects, one per page, each with metadata keys
        "source", "bucket", "object_name", "page" (1-based),
        "total_pages" and "filename"
    """
    if bucket_name is None:
        bucket_name = settings.minio_bucket
    if minio_client is None:
        minio_client = get_minio_client()

    # Download the whole PDF into memory; always close the response and
    # release its connection, even if the read fails.
    response = minio_client.get_object(bucket_name, object_name)
    try:
        pdf_bytes = response.read()
    finally:
        response.close()
        response.release_conn()

    # These values are identical for every page, so compute them once.
    source = f"minio://{bucket_name}/{object_name}"
    filename = object_name.split("/")[-1]

    documents: list[Document] = []
    # Open PDF from bytes using pdfplumber
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        pages = pdf.pages
        total_pages = len(pages)
        for page_num, page in enumerate(pages, start=1):
            # extract_text() may yield a falsy value; normalise it to "".
            full_content = page.extract_text() or ""

            # Append tables (if any) after the page text.
            table_text = _tables_to_text(page.extract_tables())
            if table_text:
                full_content += f"\n\n[Tables]\n{table_text}"

            documents.append(
                Document(
                    page_content=full_content,
                    metadata={
                        "source": source,
                        "bucket": bucket_name,
                        "object_name": object_name,
                        "page": page_num,
                        "total_pages": total_pages,
                        "filename": filename,
                    },
                )
            )
    return documents
def pdf_to_single_document(
    object_name: str,
    bucket_name: str | None = None,
    minio_client: Minio | None = None,
) -> Document:
    """
    Fetch a PDF from MinIO and merge all of its pages into one Document.

    The per-page Documents produced by ``pdf_to_document`` are concatenated
    in page order, separated by a blank line.

    Args:
        object_name: Path/name of the PDF object in the bucket
        bucket_name: Name of the MinIO bucket (defaults to settings.minio_bucket)
        minio_client: Optional MinIO client (passed through to pdf_to_document)

    Returns:
        Single Document holding the combined content, with metadata keys
        "source", "bucket", "object_name", "filename" and "total_pages"
    """
    resolved_bucket = settings.minio_bucket if bucket_name is None else bucket_name

    page_docs = pdf_to_document(object_name, resolved_bucket, minio_client)
    page_texts = [page_doc.page_content for page_doc in page_docs]

    metadata = {
        "source": f"minio://{resolved_bucket}/{object_name}",
        "bucket": resolved_bucket,
        "object_name": object_name,
        "filename": object_name.rsplit("/", 1)[-1],
        "total_pages": len(page_docs),
    }
    return Document(page_content="\n\n".join(page_texts), metadata=metadata)