-
Notifications
You must be signed in to change notification settings - Fork 0
pdf-processor #19
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
pdf-processor #19
Changes from 2 commits
5086552
108a36a
95646fb
5d3b0cd
ba32b9f
e0619c1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| .env |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,23 +1,114 @@ | ||
| import io | ||
| import pdfplumber | ||
| from langchain_core.documents import Document | ||
| from minio import Minio | ||
|
|
||
| from app.core.config import settings | ||
|
|
||
|
|
||
| def pdf_to_document(minio_url: str) -> Document: | ||
| """ | ||
| Placeholder function - to be implemented later. | ||
|
|
||
| This function will: | ||
| 1. Download the PDF file from MinIO using the provided URL | ||
| 2. Parse the PDF content | ||
| 3. Convert it to a LangChain Document object | ||
| def get_minio_client() -> Minio: | ||
| return Minio( | ||
| endpoint=settings.minio_endpoint, | ||
| access_key=settings.minio_access_key, | ||
| secret_key=settings.minio_secret_key, | ||
| secure=settings.minio_secure, | ||
| ) | ||
|
|
||
| def pdf_to_document( | ||
| object_name: str, | ||
| bucket_name: str | None = None, | ||
| minio_client: Minio | None = None, | ||
| ) -> list[Document]: | ||
|
JuanPalbo marked this conversation as resolved.
JuanPalbo marked this conversation as resolved.
|
||
| """ | ||
| Load a PDF file from MinIO and return a list of Document objects. | ||
| Each page becomes a separate Document with metadata. | ||
|
|
||
| Args: | ||
| minio_url: URL pointing to the PDF file in MinIO | ||
| object_name: Path/name of the PDF object in the bucket | ||
| bucket_name: Name of the MinIO bucket (defaults to settings.minio_bucket) | ||
| minio_client: Optional MinIO client (creates one if not provided) | ||
|
|
||
| Returns: | ||
| Document: LangChain Document object containing the PDF content | ||
| List of Document objects, one per page | ||
| """ | ||
|
Comment on lines
+90
to
+106
|
||
| if bucket_name is None: | ||
| bucket_name = settings.minio_bucket | ||
| if minio_client is None: | ||
| minio_client = get_minio_client() | ||
|
|
||
| documents: list[Document] = [] | ||
|
|
||
| # Download the PDF from MinIO into memory | ||
| response = minio_client.get_object(bucket_name, object_name) | ||
| try: | ||
| pdf_bytes = response.read() | ||
|
||
| finally: | ||
| response.close() | ||
| response.release_conn() | ||
|
||
|
|
||
| # Open PDF from bytes using pdfplumber | ||
| with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: | ||
|
||
| for page_num, page in enumerate(pdf.pages, start=1): | ||
| text = page.extract_text() or "" | ||
|
|
||
| # Extract tables and convert to text format | ||
| tables = page.extract_tables() | ||
| table_text = "" | ||
| for table in tables: | ||
| for row in table: | ||
| table_text += " | ".join(str(cell) if cell else "" for cell in row) + "\n" | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🧹 Nitpick | 🔵 Trivial
Consider more robust table formatting. The current table-to-text conversion is basic and may not preserve table structure well. The simple pipe-delimited format could be ambiguous if cells contain pipe characters. Consider alternative approaches:
Example with Markdown tables:
```python
table_text = ""
for table in tables:
    if not table:
        continue
    # Add header separator for Markdown tables
    for i, row in enumerate(table):
        cells = [str(cell).strip() if cell else "" for cell in row]
        table_text += "| " + " | ".join(cells) + " |\n"
        if i == 0 and len(table) > 1:  # Add separator after header
            table_text += "|" + "|".join([" --- " for _ in cells]) + "|\n"
    table_text += "\n"
```
🤖 Prompt for AI Agents
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
@copilot Does that work for chunking further down the pipeline?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
|
|
||
| # Combine text and tables | ||
| full_content = text | ||
| if table_text: | ||
| full_content += f"\n\n[Tables]\n{table_text}" | ||
|
coderabbitai[bot] marked this conversation as resolved.
Outdated
|
||
|
|
||
| Raises: | ||
| NotImplementedError: This function is not yet implemented | ||
| doc = Document( | ||
| page_content=full_content, | ||
| metadata={ | ||
| "source": f"minio://{bucket_name}/{object_name}", | ||
| "bucket": bucket_name, | ||
| "object_name": object_name, | ||
| "page": page_num, | ||
| "total_pages": len(pdf.pages), | ||
| "filename": object_name.split("/")[-1], | ||
|
||
| }, | ||
| ) | ||
| documents.append(doc) | ||
|
|
||
| return documents | ||
|
|
||
| def pdf_to_single_document( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Is this being used? |
||
| object_name: str, | ||
| bucket_name: str | None = None, | ||
| minio_client: Minio | None = None, | ||
| ) -> Document: | ||
| """ | ||
| raise NotImplementedError("This function will be implemented later") | ||
| Load a PDF file from MinIO and return a single Document with all pages combined. | ||
|
|
||
| Args: | ||
| object_name: Path/name of the PDF object in the bucket | ||
| bucket_name: Name of the MinIO bucket (defaults to settings.minio_bucket) | ||
| minio_client: Optional MinIO client (creates one if not provided) | ||
|
|
||
| Returns: | ||
| Single Document object with all content | ||
| """ | ||
| if bucket_name is None: | ||
| bucket_name = settings.minio_bucket | ||
| documents = pdf_to_document(object_name, bucket_name, minio_client) | ||
|
|
||
| combined_content = "\n\n".join(doc.page_content for doc in documents) | ||
|
JuanPalbo marked this conversation as resolved.
|
||
|
|
||
| return Document( | ||
| page_content=combined_content, | ||
| metadata={ | ||
| "source": f"minio://{bucket_name}/{object_name}", | ||
| "bucket": bucket_name, | ||
| "object_name": object_name, | ||
| "filename": object_name.split("/")[-1], | ||
| "total_pages": len(documents), | ||
| }, | ||
| ) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.