Add PDF page number to Zerox ingestion (#1976)

NolanTrem · web-flow · commit bced80f58f2d · 2025-02-17T16:01:23.000-08:00
* Add PDF page number to Zerox ingestion

* Remove dump.txt
diff --git a/py/core/main/services/ingestion_service.py b/py/core/main/services/ingestion_service.py
@@ -32,7 +32,7 @@
 )
 from core.base.api.models import User
 from core.telemetry.telemetry_decorator import telemetry_event
-from shared.abstractions import PDFParsingError, PopperNotFoundError
+from shared.abstractions import PDFParsingError, PopplerNotFoundError
 
 from ..abstractions import R2RProviders
 from ..config import R2RConfig
@@ -274,7 +274,7 @@ async def parse_file(
                 extraction.metadata["version"] = version
                 yield extraction
 
-        except (PopperNotFoundError, PDFParsingError) as e:
+        except (PopplerNotFoundError, PDFParsingError) as e:
             raise R2RDocumentProcessingError(
                 error_message=e.message,
                 document_id=document_info.id,
diff --git a/py/core/parsers/media/pdf_parser.py b/py/core/parsers/media/pdf_parser.py
@@ -24,7 +24,7 @@
     DatabaseProvider,
     IngestionConfig,
 )
-from shared.abstractions import PDFParsingError, PopperNotFoundError
+from shared.abstractions import PDFParsingError, PopplerNotFoundError
 
 logger = logging.getLogger()
 
@@ -75,7 +75,7 @@ async def convert_pdf_to_images(
             logger.error(
                 "PDFInfoNotInstalledError encountered during PDF conversion."
             )
-            raise PopperNotFoundError()
+            raise PopplerNotFoundError()
         except Exception as err:
             logger.error(
                 f"Error converting PDF to images: {err} type: {type(err)}"
@@ -130,7 +130,7 @@ async def process_page(
             if response.choices and response.choices[0].message:
                 content = response.choices[0].message.content
                 page_elapsed = time.perf_counter() - page_start
-                logger.info(
+                logger.debug(
                     f"Processed page {page_num} in {page_elapsed:.2f} seconds."
                 )
                 return {"page": str(page_num), "content": content}
@@ -146,7 +146,7 @@ async def process_page(
 
     async def ingest(
         self, data: str | bytes, maintain_order: bool = True, **kwargs
-    ) -> AsyncGenerator[str, None]:
+    ) -> AsyncGenerator[dict[str, str | int], None]:
         """
         Ingest PDF data and yield the text description for each page using the vision model.
         (This version yields a string per page rather than a dictionary.)
@@ -185,16 +185,21 @@ async def ingest(
                         result = await task
                         page_num = int(result["page"])
                         results[page_num] = result
-                        # **Fix:** Yield only the content string instead of the whole dictionary.
                         while next_page in results:
-                            yield results.pop(next_page)["content"]
+                            yield {
+                                "content": results[next_page]["content"],
+                                "page_number": next_page,
+                            }
+                            results.pop(next_page)
                             next_page += 1
             else:
                 # Yield results as tasks complete
                 for coro in asyncio.as_completed(tasks.keys()):
                     result = await coro
-                    # **Fix:** Yield only the content string.
-                    yield result["content"]
+                    yield {
+                        "content": result["content"],
+                        "page_number": int(result["page"]),
+                    }
             total_elapsed = time.perf_counter() - ingest_start
             logger.info(
                 f"Completed PDF ingestion in {total_elapsed:.2f} seconds using VLMPDFParser."
diff --git a/py/core/providers/ingestion/r2r/base.py b/py/core/providers/ingestion/r2r/base.py
@@ -228,7 +228,7 @@ async def parse(  # type: ignore
             )
         else:
             t0 = time.time()
-            contents = ""
+            contents = []
             parser_overrides = ingestion_config_override.get(
                 "parser_overrides", {}
             )
@@ -244,37 +244,48 @@ async def parse(  # type: ignore
                     raise ValueError(
                         "Only Zerox PDF parser override is available."
                     )
-                async for text in self.parsers[
+                async for chunk in self.parsers[
                     f"zerox_{DocumentType.PDF.value}"
                 ].ingest(file_content, **ingestion_config_override):
-                    if text is not None:
-                        contents += text + "\n"
+                    if isinstance(chunk, dict) and chunk.get("content"):
+                        contents.append(chunk)
+                    elif (
+                        chunk
+                    ):  # Handle string output for backward compatibility
+                        contents.append({"content": chunk})
             else:
                 async for text in self.parsers[document.document_type].ingest(
                     file_content, **ingestion_config_override
                 ):
                     if text is not None:
-                        contents += text + "\n"
+                        contents.append({"content": text})
 
-            if not contents.strip():
+            if not contents:
                 logging.warning(
                     "No valid text content was extracted during parsing"
                 )
                 return
 
             iteration = 0
-            chunks = self.chunk(contents, ingestion_config_override)
-            for chunk in chunks:
-                extraction = DocumentChunk(
-                    id=generate_extraction_id(document.id, iteration),
-                    document_id=document.id,
-                    owner_id=document.owner_id,
-                    collection_ids=document.collection_ids,
-                    data=chunk,
-                    metadata={**document.metadata, "chunk_order": iteration},
-                )
-                iteration += 1
-                yield extraction
+            for content_item in contents:
+                chunk_text = content_item["content"]
+                chunks = self.chunk(chunk_text, ingestion_config_override)
+
+                for chunk in chunks:
+                    metadata = {**document.metadata, "chunk_order": iteration}
+                    if "page_number" in content_item:
+                        metadata["page_number"] = content_item["page_number"]
+
+                    extraction = DocumentChunk(
+                        id=generate_extraction_id(document.id, iteration),
+                        document_id=document.id,
+                        owner_id=document.owner_id,
+                        collection_ids=document.collection_ids,
+                        data=chunk,
+                        metadata=metadata,
+                    )
+                    iteration += 1
+                    yield extraction
 
             logger.debug(
                 f"Parsed document with id={document.id}, title={document.metadata.get('title', None)}, "
diff --git a/py/shared/abstractions/__init__.py b/py/shared/abstractions/__init__.py
@@ -14,7 +14,7 @@
 from .embedding import EmbeddingPurpose, default_embedding_prefixes
 from .exception import (
     PDFParsingError,
-    PopperNotFoundError,
+    PopplerNotFoundError,
     R2RDocumentProcessingError,
     R2RException,
 )
@@ -92,7 +92,7 @@
     "R2RDocumentProcessingError",
     "R2RException",
     "PDFParsingError",
-    "PopperNotFoundError",
+    "PopplerNotFoundError",
     # Graph abstractions
     "Entity",
     "Community",
diff --git a/py/shared/abstractions/exception.py b/py/shared/abstractions/exception.py
@@ -51,7 +51,7 @@ def __init__(
         super().__init__(message, status_code, detail)
 
 
-class PopperNotFoundError(PDFParsingError):
+class PopplerNotFoundError(PDFParsingError):
     """Specific error for when Poppler is not installed."""
 
     def __init__(self):