Skip to content

Commit

Permalink
Add PDF page to Zerox ingestion (#1976)
Browse files (browse the repository at this point in the history)
* Add PDF page to Zerox ingestion

* Remove dump.txt
  • Loading branch information
NolanTrem authored Feb 18, 2025
1 parent 6601162 commit bced80f
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 31 deletions.
4 changes: 2 additions & 2 deletions py/core/main/services/ingestion_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
)
from core.base.api.models import User
from core.telemetry.telemetry_decorator import telemetry_event
from shared.abstractions import PDFParsingError, PopperNotFoundError
from shared.abstractions import PDFParsingError, PopplerNotFoundError

from ..abstractions import R2RProviders
from ..config import R2RConfig
Expand Down Expand Up @@ -274,7 +274,7 @@ async def parse_file(
extraction.metadata["version"] = version
yield extraction

except (PopperNotFoundError, PDFParsingError) as e:
except (PopplerNotFoundError, PDFParsingError) as e:
raise R2RDocumentProcessingError(
error_message=e.message,
document_id=document_info.id,
Expand Down
21 changes: 13 additions & 8 deletions py/core/parsers/media/pdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
DatabaseProvider,
IngestionConfig,
)
from shared.abstractions import PDFParsingError, PopperNotFoundError
from shared.abstractions import PDFParsingError, PopplerNotFoundError

logger = logging.getLogger()

Expand Down Expand Up @@ -75,7 +75,7 @@ async def convert_pdf_to_images(
logger.error(
"PDFInfoNotInstalledError encountered during PDF conversion."
)
raise PopperNotFoundError()
raise PopplerNotFoundError()
except Exception as err:
logger.error(
f"Error converting PDF to images: {err} type: {type(err)}"
Expand Down Expand Up @@ -130,7 +130,7 @@ async def process_page(
if response.choices and response.choices[0].message:
content = response.choices[0].message.content
page_elapsed = time.perf_counter() - page_start
logger.info(
logger.debug(
f"Processed page {page_num} in {page_elapsed:.2f} seconds."
)
return {"page": str(page_num), "content": content}
Expand All @@ -146,7 +146,7 @@ async def process_page(

async def ingest(
self, data: str | bytes, maintain_order: bool = True, **kwargs
) -> AsyncGenerator[str, None]:
) -> AsyncGenerator[dict[str, str | int], None]:
"""
Ingest PDF data and yield the text description for each page using the vision model.
(This version yields a dictionary per page, with "content" and "page_number" keys, rather than a bare string.)
Expand Down Expand Up @@ -185,16 +185,21 @@ async def ingest(
result = await task
page_num = int(result["page"])
results[page_num] = result
# **Fix:** Yield only the content string instead of the whole dictionary.
while next_page in results:
yield results.pop(next_page)["content"]
yield {
"content": results[next_page]["content"],
"page_number": next_page,
}
results.pop(next_page)
next_page += 1
else:
# Yield results as tasks complete
for coro in asyncio.as_completed(tasks.keys()):
result = await coro
# **Fix:** Yield only the content string.
yield result["content"]
yield {
"content": result["content"],
"page_number": int(result["page"]),
}
total_elapsed = time.perf_counter() - ingest_start
logger.info(
f"Completed PDF ingestion in {total_elapsed:.2f} seconds using VLMPDFParser."
Expand Down
47 changes: 29 additions & 18 deletions py/core/providers/ingestion/r2r/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ async def parse( # type: ignore
)
else:
t0 = time.time()
contents = ""
contents = []
parser_overrides = ingestion_config_override.get(
"parser_overrides", {}
)
Expand All @@ -244,37 +244,48 @@ async def parse( # type: ignore
raise ValueError(
"Only Zerox PDF parser override is available."
)
async for text in self.parsers[
async for chunk in self.parsers[
f"zerox_{DocumentType.PDF.value}"
].ingest(file_content, **ingestion_config_override):
if text is not None:
contents += text + "\n"
if isinstance(chunk, dict) and chunk.get("content"):
contents.append(chunk)
elif (
chunk
): # Handle string output for backward compatibility
contents.append({"content": chunk})
else:
async for text in self.parsers[document.document_type].ingest(
file_content, **ingestion_config_override
):
if text is not None:
contents += text + "\n"
contents.append({"content": text})

if not contents.strip():
if not contents:
logging.warning(
"No valid text content was extracted during parsing"
)
return

iteration = 0
chunks = self.chunk(contents, ingestion_config_override)
for chunk in chunks:
extraction = DocumentChunk(
id=generate_extraction_id(document.id, iteration),
document_id=document.id,
owner_id=document.owner_id,
collection_ids=document.collection_ids,
data=chunk,
metadata={**document.metadata, "chunk_order": iteration},
)
iteration += 1
yield extraction
for content_item in contents:
chunk_text = content_item["content"]
chunks = self.chunk(chunk_text, ingestion_config_override)

for chunk in chunks:
metadata = {**document.metadata, "chunk_order": iteration}
if "page_number" in content_item:
metadata["page_number"] = content_item["page_number"]

extraction = DocumentChunk(
id=generate_extraction_id(document.id, iteration),
document_id=document.id,
owner_id=document.owner_id,
collection_ids=document.collection_ids,
data=chunk,
metadata=metadata,
)
iteration += 1
yield extraction

logger.debug(
f"Parsed document with id={document.id}, title={document.metadata.get('title', None)}, "
Expand Down
4 changes: 2 additions & 2 deletions py/shared/abstractions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .embedding import EmbeddingPurpose, default_embedding_prefixes
from .exception import (
PDFParsingError,
PopperNotFoundError,
PopplerNotFoundError,
R2RDocumentProcessingError,
R2RException,
)
Expand Down Expand Up @@ -92,7 +92,7 @@
"R2RDocumentProcessingError",
"R2RException",
"PDFParsingError",
"PopperNotFoundError",
"PopplerNotFoundError",
# Graph abstractions
"Entity",
"Community",
Expand Down
2 changes: 1 addition & 1 deletion py/shared/abstractions/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def __init__(
super().__init__(message, status_code, detail)


class PopperNotFoundError(PDFParsingError):
class PopplerNotFoundError(PDFParsingError):
"""Specific error for when Poppler is not installed."""

def __init__(self):
Expand Down

0 comments on commit bced80f

Please sign in to comment.