ucudal · brunoalbin23 · Dec 14, 2025 · Dec 13, 2025 · Dec 13, 2025 · Dec 13, 2025
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+.env
diff --git a/RAGManager/app/services/pdf_processor.py b/RAGManager/app/services/pdf_processor.py
@@ -1,23 +1,187 @@
+import io
+import logging
+
+import pdfplumber
 from langchain_core.documents import Document
+from minio import Minio
 
+from app.core.config import settings
 
-def pdf_to_document(minio_url: str) -> Document:
-    """
-    Placeholder function - to be implemented later.
+logger = logging.getLogger(__name__)
+
+
+def _sanitize_cell(cell) -> str:
+    """Return a printable string for a table cell.
+
+    ``None`` becomes the empty string and plain scalars go straight through
+    ``str()``. Any other value is converted under a guard: ``str()`` is tried
+    first, then ``repr()``, and the empty string is returned if both raise.
+    """
+    if cell is None:
+        return ""
+    if isinstance(cell, (str, int, float, bool)):
+        return str(cell)
+    # Arbitrary objects may have a failing __str__/__repr__; never let that escape.
+    for convert in (str, repr):
+        try:
+            return convert(cell)
+        except Exception:
+            continue
+    return ""
+
+
+def _extract_tables_safely(page, page_num: int) -> str:
+    """Render every table on a page as pipe-separated text, never raising.
+
+    Each usable row becomes one line of ``" | "``-joined cell strings ending
+    in a newline. Tables or rows that are not lists/tuples are skipped with a
+    warning. Any exception raised during extraction is logged and stops the
+    extraction; rows collected up to that point are still returned.
+
+    Args:
+        page: Object exposing ``extract_tables()``.
+        page_num: Page number, used only in log messages.
+
+    Returns:
+        The concatenated row lines, or ``""`` when nothing could be extracted.
+    """
+    # Collect the lines and join once instead of growing a string with ``+=``.
+    lines: list[str] = []
+    try:
+        tables = page.extract_tables()
+        if not isinstance(tables, (list, tuple)):
+            logger.warning(f"Page {page_num}: extract_tables() returned non-iterable type {type(tables)}, skipping tables")
+            return ""
+
+        for table_idx, table in enumerate(tables):
+            if not isinstance(table, (list, tuple)):
+                logger.warning(f"Page {page_num}, Table {table_idx}: table is not iterable, skipping")
+                continue
+
+            for row_idx, row in enumerate(table):
+                try:
+                    if not isinstance(row, (list, tuple)):
+                        logger.warning(f"Page {page_num}, Table {table_idx}, Row {row_idx}: row is not iterable, skipping")
+                        continue
+                    lines.append(" | ".join(_sanitize_cell(cell) for cell in row) + "\n")
+                except Exception as e:
+                    logger.warning(f"Page {page_num}, Table {table_idx}, Row {row_idx}: error processing row: {e}")
+                    continue
 
-    This function will:
-    1. Download the PDF file from MinIO using the provided URL
-    2. Parse the PDF content
-    3. Convert it to a LangChain Document object
+    except Exception as e:
+        logger.warning(f"Page {page_num}: error extracting tables: {e}")
+
+    return "".join(lines)
+
+
+def get_minio_client() -> Minio:
+    """Create a MinIO client configured from the application settings."""
+    connection_options = {
+        "endpoint": settings.minio_endpoint,
+        "access_key": settings.minio_access_key,
+        "secret_key": settings.minio_secret_key,
+        "secure": settings.minio_secure,
+    }
+    return Minio(**connection_options)
+
+def pdf_to_document(
+    object_name: str,
+    bucket_name: str | None = None,
+    minio_client: Minio | None = None,
+) -> list[Document]:
+    """
+    Load a PDF file from MinIO and return a list of Document objects.
+    Each page becomes a separate Document with metadata.
 
     Args:
-        minio_url: URL pointing to the PDF file in MinIO
+        object_name: Path/name of the PDF object in the bucket
+        bucket_name: Name of the MinIO bucket (defaults to settings.minio_bucket)
+        minio_client: Optional MinIO client (creates one if not provided)
 
     Returns:
-        Document: LangChain Document object containing the PDF content
+        List of Document objects, one per page. The list is empty when the
+        downloaded bytes cannot be opened as a PDF (the failure is logged).
+
+    Raises:
+        ValueError: If object_name is empty or whitespace, or if the object
+            cannot be fetched from or read out of MinIO.
+    """
+    # Validate the cheap input first so no client is built for a bad request.
+    if not object_name or not object_name.strip():
+        raise ValueError("object_name cannot be empty or whitespace")
+
+    if bucket_name is None:
+        bucket_name = settings.minio_bucket
+    if minio_client is None:
+        minio_client = get_minio_client()
+
+    documents: list[Document] = []
 
-    Raises:
-        NotImplementedError: This function is not yet implemented
+    # Download the PDF from MinIO into memory
+    # Note: For very large files, consider streaming to disk instead of loading entirely into memory
+    try:
+        response = minio_client.get_object(bucket_name, object_name)
+    except Exception as e:
+        logger.error(f"Failed to get object from MinIO - bucket: '{bucket_name}', object: '{object_name}': {e}")
+        raise ValueError(f"Failed to retrieve '{object_name}' from bucket '{bucket_name}': {e}") from e
+
+    try:
+        # Check the Content-Length header before reading, so an oversized
+        # download is flagged before it is pulled into memory.
+        content_length = None
+        if hasattr(response, "getheader"):
+            content_length = response.getheader("Content-Length")
+        elif hasattr(response, "headers") and "Content-Length" in response.headers:
+            content_length = response.headers["Content-Length"]
+        if content_length is not None:
+            try:
+                file_size_mb = int(content_length) / (1024 * 1024)
+                if file_size_mb > 100:
+                    logger.warning(f"Large PDF (Content-Length): {file_size_mb:.1f} MB for '{object_name}' - loading into memory")
+            except Exception as e:
+                logger.warning(f"Could not parse Content-Length header for '{object_name}': {e}")
+        else:
+            logger.info(f"Content-Length header not found for '{object_name}', proceeding to read file into memory")
+
+        # Read the body exactly once, inside the block whose finally releases it.
+        pdf_bytes = response.read()
+        # Fallback size warning for responses without a usable Content-Length
+        file_size_mb = len(pdf_bytes) / (1024 * 1024)
+        if file_size_mb > 100:
+            logger.warning(f"Large PDF loaded into memory: {file_size_mb:.1f} MB for '{object_name}'")
+    except Exception as e:
+        logger.error(f"Failed to read PDF content from MinIO - bucket: '{bucket_name}', object: '{object_name}': {e}")
+        raise ValueError(f"Failed to read content of '{object_name}' from bucket '{bucket_name}': {e}") from e
+    finally:
+        # Release the connection even if close() itself fails.
+        try:
+            response.close()
+        finally:
+            response.release_conn()
+
+    # Open PDF from bytes using pdfplumber
+    try:
+        pdf = pdfplumber.open(io.BytesIO(pdf_bytes))
+    except Exception as e:
+        # Best effort: an unreadable PDF yields no documents instead of an error.
+        logger.error(f"Failed to open PDF '{object_name}': {e} (possibly corrupted or password-protected)")
+        return documents
+
+    try:
+        # Values shared by every page's metadata are computed once.
+        total_pages = len(pdf.pages)
+        source = f"minio://{bucket_name}/{object_name}"
+        filename = object_name.split("/")[-1]
+
+        for page_num, page in enumerate(pdf.pages, start=1):
+            try:
+                text = page.extract_text() or ""
+            except Exception as e:
+                logger.warning(f"Page {page_num}: error extracting text: {e}")
+                text = ""
+
+            # Extract tables and convert to text format
+            table_text = _extract_tables_safely(page, page_num)
+
+            # Combine text and tables
+            full_content = text
+            if table_text:
+                full_content += f"\n\n[Tables]\n{table_text}"
+
+            documents.append(
+                Document(
+                    page_content=full_content,
+                    metadata={
+                        "source": source,
+                        "bucket": bucket_name,
+                        "object_name": object_name,
+                        "page": page_num,
+                        "total_pages": total_pages,
+                        "filename": filename,
+                    },
+                )
+            )
+    finally:
+        pdf.close()
+
+    return documents
+
+def pdf_to_single_document(
+    object_name: str,
+    bucket_name: str | None = None,
+    minio_client: Minio | None = None,
+) -> Document:
     """
-    raise NotImplementedError("This function will be implemented later")
+    Fetch a PDF from MinIO and merge all of its pages into one Document.
+
+    The per-page Documents produced by pdf_to_document are joined with a
+    blank line between consecutive pages.
+
+    Args:
+        object_name: Path/name of the PDF object in the bucket
+        bucket_name: Name of the MinIO bucket (defaults to settings.minio_bucket)
+        minio_client: Optional MinIO client (creates one if not provided)
+
+    Returns:
+        Single Document whose metadata holds the source URI, bucket,
+        object name, filename and page count
+    """
+    # Resolve the bucket here as well, because the metadata below needs it.
+    resolved_bucket = settings.minio_bucket if bucket_name is None else bucket_name
+    pages = pdf_to_document(object_name, resolved_bucket, minio_client)
+
+    merged_text = "\n\n".join(page.page_content for page in pages)
+    metadata = {
+        "source": f"minio://{resolved_bucket}/{object_name}",
+        "bucket": resolved_bucket,
+        "object_name": object_name,
+        "filename": object_name.split("/")[-1],
+        "total_pages": len(pages),
+    }
+    return Document(page_content=merged_text, metadata=metadata)
 
diff --git a/RAGManager/app/services/pipeline.py b/RAGManager/app/services/pipeline.py
@@ -10,7 +10,7 @@
 logger = logging.getLogger(__name__)
 
 
-def process_pdf_pipeline(minio_url: str) -> int:
+def process_pdf_pipeline(object_name: str) -> int:
     """
     Orchestrates the PDF processing pipeline.
 
@@ -21,20 +21,20 @@ def process_pdf_pipeline(minio_url: str) -> int:
     4. Store in database (to be implemented)
 
     Args:
-        minio_url: URL pointing to the PDF file in MinIO
+        object_name: Path/name of the PDF object in the MinIO bucket
 
     Returns:
         int: document_id of the created document (mock value for now)
 
     Raises:
         NotImplementedError: If any of the pipeline stages are not yet implemented
     """
-    logger.info(f"Starting PDF processing pipeline for URL: {minio_url}")
+    logger.info(f"Starting PDF processing pipeline for object: {object_name}")
 
     try:
         # Stage 1: PDF to Document
         logger.info("Stage 1: Converting PDF to LangChain Document")
-        document = pdf_to_document(minio_url)
+        document = pdf_to_document(object_name)
         logger.info("Stage 1 completed successfully")
 
         # Stage 2: Document to Chunks