Skip to content

Commit bced80f

Browse files
authored
Add PDF page to Zerox ingestion (#1976)
* Add PDF page to Zerox ingestion * Remove dump.txt
1 parent 6601162 commit bced80f

File tree

5 files changed

+47
-31
lines changed

5 files changed

+47
-31
lines changed

py/core/main/services/ingestion_service.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
)
3333
from core.base.api.models import User
3434
from core.telemetry.telemetry_decorator import telemetry_event
35-
from shared.abstractions import PDFParsingError, PopperNotFoundError
35+
from shared.abstractions import PDFParsingError, PopplerNotFoundError
3636

3737
from ..abstractions import R2RProviders
3838
from ..config import R2RConfig
@@ -274,7 +274,7 @@ async def parse_file(
274274
extraction.metadata["version"] = version
275275
yield extraction
276276

277-
except (PopperNotFoundError, PDFParsingError) as e:
277+
except (PopplerNotFoundError, PDFParsingError) as e:
278278
raise R2RDocumentProcessingError(
279279
error_message=e.message,
280280
document_id=document_info.id,

py/core/parsers/media/pdf_parser.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
DatabaseProvider,
2525
IngestionConfig,
2626
)
27-
from shared.abstractions import PDFParsingError, PopperNotFoundError
27+
from shared.abstractions import PDFParsingError, PopplerNotFoundError
2828

2929
logger = logging.getLogger()
3030

@@ -75,7 +75,7 @@ async def convert_pdf_to_images(
7575
logger.error(
7676
"PDFInfoNotInstalledError encountered during PDF conversion."
7777
)
78-
raise PopperNotFoundError()
78+
raise PopplerNotFoundError()
7979
except Exception as err:
8080
logger.error(
8181
f"Error converting PDF to images: {err} type: {type(err)}"
@@ -130,7 +130,7 @@ async def process_page(
130130
if response.choices and response.choices[0].message:
131131
content = response.choices[0].message.content
132132
page_elapsed = time.perf_counter() - page_start
133-
logger.info(
133+
logger.debug(
134134
f"Processed page {page_num} in {page_elapsed:.2f} seconds."
135135
)
136136
return {"page": str(page_num), "content": content}
@@ -146,7 +146,7 @@ async def process_page(
146146

147147
async def ingest(
148148
self, data: str | bytes, maintain_order: bool = True, **kwargs
149-
) -> AsyncGenerator[str, None]:
149+
) -> AsyncGenerator[dict[str, str | int], None]:
150150
"""
151151
Ingest PDF data and yield the text description for each page using the vision model.
152152
(This version yields a dictionary per page containing the page content and its page number.)
@@ -185,16 +185,21 @@ async def ingest(
185185
result = await task
186186
page_num = int(result["page"])
187187
results[page_num] = result
188-
# **Fix:** Yield only the content string instead of the whole dictionary.
189188
while next_page in results:
190-
yield results.pop(next_page)["content"]
189+
yield {
190+
"content": results[next_page]["content"],
191+
"page_number": next_page,
192+
}
193+
results.pop(next_page)
191194
next_page += 1
192195
else:
193196
# Yield results as tasks complete
194197
for coro in asyncio.as_completed(tasks.keys()):
195198
result = await coro
196-
# **Fix:** Yield only the content string.
197-
yield result["content"]
199+
yield {
200+
"content": result["content"],
201+
"page_number": int(result["page"]),
202+
}
198203
total_elapsed = time.perf_counter() - ingest_start
199204
logger.info(
200205
f"Completed PDF ingestion in {total_elapsed:.2f} seconds using VLMPDFParser."

py/core/providers/ingestion/r2r/base.py

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ async def parse( # type: ignore
228228
)
229229
else:
230230
t0 = time.time()
231-
contents = ""
231+
contents = []
232232
parser_overrides = ingestion_config_override.get(
233233
"parser_overrides", {}
234234
)
@@ -244,37 +244,48 @@ async def parse( # type: ignore
244244
raise ValueError(
245245
"Only Zerox PDF parser override is available."
246246
)
247-
async for text in self.parsers[
247+
async for chunk in self.parsers[
248248
f"zerox_{DocumentType.PDF.value}"
249249
].ingest(file_content, **ingestion_config_override):
250-
if text is not None:
251-
contents += text + "\n"
250+
if isinstance(chunk, dict) and chunk.get("content"):
251+
contents.append(chunk)
252+
elif (
253+
chunk
254+
): # Handle string output for backward compatibility
255+
contents.append({"content": chunk})
252256
else:
253257
async for text in self.parsers[document.document_type].ingest(
254258
file_content, **ingestion_config_override
255259
):
256260
if text is not None:
257-
contents += text + "\n"
261+
contents.append({"content": text})
258262

259-
if not contents.strip():
263+
if not contents:
260264
logging.warning(
261265
"No valid text content was extracted during parsing"
262266
)
263267
return
264268

265269
iteration = 0
266-
chunks = self.chunk(contents, ingestion_config_override)
267-
for chunk in chunks:
268-
extraction = DocumentChunk(
269-
id=generate_extraction_id(document.id, iteration),
270-
document_id=document.id,
271-
owner_id=document.owner_id,
272-
collection_ids=document.collection_ids,
273-
data=chunk,
274-
metadata={**document.metadata, "chunk_order": iteration},
275-
)
276-
iteration += 1
277-
yield extraction
270+
for content_item in contents:
271+
chunk_text = content_item["content"]
272+
chunks = self.chunk(chunk_text, ingestion_config_override)
273+
274+
for chunk in chunks:
275+
metadata = {**document.metadata, "chunk_order": iteration}
276+
if "page_number" in content_item:
277+
metadata["page_number"] = content_item["page_number"]
278+
279+
extraction = DocumentChunk(
280+
id=generate_extraction_id(document.id, iteration),
281+
document_id=document.id,
282+
owner_id=document.owner_id,
283+
collection_ids=document.collection_ids,
284+
data=chunk,
285+
metadata=metadata,
286+
)
287+
iteration += 1
288+
yield extraction
278289

279290
logger.debug(
280291
f"Parsed document with id={document.id}, title={document.metadata.get('title', None)}, "

py/shared/abstractions/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from .embedding import EmbeddingPurpose, default_embedding_prefixes
1515
from .exception import (
1616
PDFParsingError,
17-
PopperNotFoundError,
17+
PopplerNotFoundError,
1818
R2RDocumentProcessingError,
1919
R2RException,
2020
)
@@ -92,7 +92,7 @@
9292
"R2RDocumentProcessingError",
9393
"R2RException",
9494
"PDFParsingError",
95-
"PopperNotFoundError",
95+
"PopplerNotFoundError",
9696
# Graph abstractions
9797
"Entity",
9898
"Community",

py/shared/abstractions/exception.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def __init__(
5151
super().__init__(message, status_code, detail)
5252

5353

54-
class PopperNotFoundError(PDFParsingError):
54+
class PopplerNotFoundError(PDFParsingError):
5555
"""Specific error for when Poppler is not installed."""
5656

5757
def __init__(self):

0 commit comments

Comments
 (0)