reto-xmas-2025-goland-ia-backend/RAGManager/app/services/pdf_processor.py at 108a36a89fd33132e7156c6e31d8926f0b2daed8 · ucudal/reto-xmas-2025-goland-ia-backend · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import io
import pdfplumber
from langchain_core.documents import Document
from minio import Minio

from app.core.config import settings


def get_minio_client() -> Minio:
    """Build a MinIO client from the connection values stored in ``settings``.

    Returns:
        A ``Minio`` instance created with the configured endpoint, access
        key, secret key and ``secure`` flag.
    """
    connection_options = {
        "endpoint": settings.minio_endpoint,
        "access_key": settings.minio_access_key,
        "secret_key": settings.minio_secret_key,
        "secure": settings.minio_secure,
    }
    return Minio(**connection_options)

def _tables_to_text(tables: list[list[list[str | None]]]) -> str:
    """Render extracted tables as text: one line per row, cells joined by " | "."""
    # Falsy cells (None or empty string) are rendered as empty strings so the
    # column positions of the remaining cells are preserved.
    return "".join(
        " | ".join(str(cell) if cell else "" for cell in row) + "\n"
        for table in tables
        for row in table
    )


def pdf_to_document(
    object_name: str,
    bucket_name: str | None = None,
    minio_client: Minio | None = None,
) -> list[Document]:
    """
    Load a PDF file from MinIO and return a list of Document objects.
    Each page becomes a separate Document with metadata.

    The page content is the text returned by ``page.extract_text()`` (an empty
    string when nothing is returned). If the page has tables, their rows are
    appended after a ``[Tables]`` marker, one row per line with cells joined
    by " | ".

    Args:
        object_name: Path/name of the PDF object in the bucket
        bucket_name: Name of the MinIO bucket (defaults to settings.minio_bucket)
        minio_client: Optional MinIO client (creates one if not provided)

    Returns:
        List of Document objects, one per page. Each carries the metadata keys
        ``source``, ``bucket``, ``object_name``, ``page`` (1-based),
        ``total_pages`` and ``filename``.

    Raises:
        Exceptions raised by the MinIO client or by pdfplumber are not caught
        here and propagate to the caller.
    """
    if bucket_name is None:
        bucket_name = settings.minio_bucket
    if minio_client is None:
        minio_client = get_minio_client()

    # Download the PDF from MinIO into memory
    response = minio_client.get_object(bucket_name, object_name)
    try:
        pdf_bytes = response.read()
    finally:
        # Nested so the connection is released even if close() raises.
        try:
            response.close()
        finally:
            response.release_conn()

    # These values are identical for every page, so compute them once.
    source = f"minio://{bucket_name}/{object_name}"
    filename = object_name.split("/")[-1]

    documents: list[Document] = []

    # Open PDF from bytes using pdfplumber
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        pages = pdf.pages
        total_pages = len(pages)

        for page_num, page in enumerate(pages, start=1):
            full_content = page.extract_text() or ""

            # Extract tables and append them in text form after the page text
            table_text = _tables_to_text(page.extract_tables())
            if table_text:
                full_content += f"\n\n[Tables]\n{table_text}"

            documents.append(
                Document(
                    page_content=full_content,
                    metadata={
                        "source": source,
                        "bucket": bucket_name,
                        "object_name": object_name,
                        "page": page_num,
                        "total_pages": total_pages,
                        "filename": filename,
                    },
                )
            )

    return documents

def pdf_to_single_document(
    object_name: str,
    bucket_name: str | None = None,
    minio_client: Minio | None = None,
) -> Document:
    """
    Load a PDF from MinIO and merge all of its pages into one Document.

    The per-page Documents produced by ``pdf_to_document`` are concatenated in
    page order, separated by a blank line.

    Args:
        object_name: Path/name of the PDF object in the bucket
        bucket_name: Name of the MinIO bucket (defaults to settings.minio_bucket)
        minio_client: Optional MinIO client, passed through to pdf_to_document

    Returns:
        A single Document holding the combined content, with the metadata keys
        ``source``, ``bucket``, ``object_name``, ``filename`` and
        ``total_pages``.
    """
    resolved_bucket = settings.minio_bucket if bucket_name is None else bucket_name
    page_documents = pdf_to_document(object_name, resolved_bucket, minio_client)

    page_texts = [page_doc.page_content for page_doc in page_documents]
    merged_text = "\n\n".join(page_texts)

    metadata = {
        "source": f"minio://{resolved_bucket}/{object_name}",
        "bucket": resolved_bucket,
        "object_name": object_name,
        "filename": object_name.split("/")[-1],
        "total_pages": len(page_documents),
    }
    return Document(page_content=merged_text, metadata=metadata)