Skip to content

Commit b6b30ac

Browse files
Authored commit: Faster VLM Ingestion (#2157)
* Faster VLM Ingestion * Fix typo in comment * Remove explicit gc
1 parent 4f451d3 commit b6b30ac

File tree

8 files changed

+99
-38
lines changed

8 files changed

+99
-38
lines changed

js/sdk/package-lock.json

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

js/sdk/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "r2r-js",
3-
"version": "0.4.38",
3+
"version": "0.4.39",
44
"description": "",
55
"main": "dist/index.js",
66
"browser": "dist/index.browser.js",

py/core/base/providers/ingestion.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ class IngestionConfig(ProviderConfig):
3636
"audio_transcription_model": None,
3737
"vlm": None,
3838
"vlm_batch_size": 5,
39+
"max_concurrent_vlm_tasks": 5,
3940
"vlm_ocr_one_page_per_chunk": True,
4041
"skip_document_summary": False,
4142
"document_summary_system_prompt": "system",
@@ -82,6 +83,11 @@ class IngestionConfig(ProviderConfig):
8283
vlm_batch_size: int = Field(
8384
default_factory=lambda: IngestionConfig._defaults["vlm_batch_size"]
8485
)
86+
max_concurrent_vlm_tasks: int = Field(
87+
default_factory=lambda: IngestionConfig._defaults[
88+
"max_concurrent_vlm_tasks"
89+
]
90+
)
8591
vlm_ocr_one_page_per_chunk: bool = Field(
8692
default_factory=lambda: IngestionConfig._defaults[
8793
"vlm_ocr_one_page_per_chunk"

py/core/parsers/media/pdf_parser.py

Lines changed: 87 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,11 @@ def __init__(
8484
self.llm_provider = llm_provider
8585
self.config = config
8686
self.vision_prompt_text = None
87+
self.vlm_batch_size = self.config.vlm_batch_size or 5
88+
self.max_concurrent_vlm_tasks = (
89+
self.config.max_concurrent_vlm_tasks or 5
90+
)
91+
self.semaphore = None
8792

8893
async def process_page(self, image, page_num: int) -> dict[str, str]:
8994
"""Process a single PDF page using the vision model."""
@@ -213,6 +218,15 @@ async def process_page(self, image, page_num: int) -> dict[str, str]:
213218
"content": f"Error processing page: {str(e)}",
214219
}
215220

221+
async def process_and_yield(self, image, page_num: int):
222+
"""Process a page and yield the result."""
223+
async with self.semaphore:
224+
result = await self.process_page(image, page_num)
225+
return {
226+
"content": result.get("content", "") or "",
227+
"page_number": page_num,
228+
}
229+
216230
async def ingest(
217231
self, data: str | bytes, **kwargs
218232
) -> AsyncGenerator[dict[str, str | int], None]:
@@ -228,9 +242,9 @@ async def ingest(
228242
)
229243
logger.info("Retrieved vision prompt text from database.")
230244

231-
try:
232-
batch_size = self.config.vlm_batch_size or 5
245+
self.semaphore = asyncio.Semaphore(self.max_concurrent_vlm_tasks)
233246

247+
try:
234248
if isinstance(data, str):
235249
pdf_info = pdf2image.pdfinfo_from_path(data)
236250
else:
@@ -240,52 +254,95 @@ async def ingest(
240254
max_pages = pdf_info["Pages"]
241255
logger.info(f"PDF has {max_pages} pages to process")
242256

243-
# Convert and process each batch of rasterized pages
244-
for batch_start in range(0, max_pages, batch_size):
245-
batch_end = min(batch_start + batch_size, max_pages)
246-
logger.info(
247-
f"Processing batch: pages {batch_start + 1}-{batch_end}/{max_pages}"
257+
# Create a task queue to process pages in order
258+
pending_tasks = []
259+
completed_tasks = []
260+
next_page_to_yield = 1
261+
262+
# Process pages with a sliding window, in batches
263+
for batch_start in range(1, max_pages + 1, self.vlm_batch_size):
264+
batch_end = min(
265+
batch_start + self.vlm_batch_size - 1, max_pages
266+
)
267+
logger.debug(
268+
f"Preparing batch of pages {batch_start}-{batch_end}/{max_pages}"
248269
)
249270

271+
# Convert the batch of pages to images
250272
if isinstance(data, str):
251-
batch_images = pdf2image.convert_from_path(
273+
images = pdf2image.convert_from_path(
252274
data,
253275
dpi=150,
254-
first_page=batch_start + 1,
276+
first_page=batch_start,
255277
last_page=batch_end,
256278
)
257279
else:
258280
pdf_bytes = BytesIO(data)
259-
batch_images = pdf2image.convert_from_bytes(
281+
images = pdf2image.convert_from_bytes(
260282
pdf_bytes.getvalue(),
261283
dpi=150,
262-
first_page=batch_start + 1,
284+
first_page=batch_start,
263285
last_page=batch_end,
264286
)
265287

266-
batch_tasks = []
267-
for i, image in enumerate(batch_images):
268-
page_num = batch_start + i + 1
269-
batch_tasks.append(self.process_page(image, page_num))
270-
271-
# Process the batch concurrently
272-
batch_results = await asyncio.gather(*batch_tasks)
273-
274-
for i, result in enumerate(batch_results):
275-
page_num = batch_start + i + 1
276-
yield {
277-
"content": result.get("content", "") or "",
278-
"page_number": page_num,
279-
}
280-
281-
# Force garbage collection after each batch
282-
import gc
288+
# Create tasks for each page in the batch
289+
for i, image in enumerate(images):
290+
page_num = batch_start + i
291+
task = asyncio.create_task(
292+
self.process_and_yield(image, page_num)
293+
)
294+
task.page_num = page_num # Store page number for sorting
295+
pending_tasks.append(task)
296+
297+
# Check if any tasks have completed and yield them in order
298+
while pending_tasks:
299+
# Get the first done task without waiting
300+
done_tasks, pending_tasks_set = await asyncio.wait(
301+
pending_tasks,
302+
timeout=0.01,
303+
return_when=asyncio.FIRST_COMPLETED,
304+
)
283305

284-
gc.collect()
306+
if not done_tasks:
307+
break
308+
309+
# Add completed tasks to our completed list
310+
pending_tasks = list(pending_tasks_set)
311+
completed_tasks.extend(iter(done_tasks))
312+
313+
# Sort completed tasks by page number
314+
completed_tasks.sort(key=lambda t: t.page_num)
315+
316+
# Yield results in order
317+
while (
318+
completed_tasks
319+
and completed_tasks[0].page_num == next_page_to_yield
320+
):
321+
task = completed_tasks.pop(0)
322+
yield await task
323+
next_page_to_yield += 1
324+
325+
# Wait for and yield any remaining tasks in order
326+
while pending_tasks:
327+
done_tasks, _ = await asyncio.wait(pending_tasks)
328+
completed_tasks.extend(done_tasks)
329+
pending_tasks = []
330+
331+
# Sort and yield remaining completed tasks
332+
completed_tasks.sort(key=lambda t: t.page_num)
333+
334+
# Yield results in order
335+
while (
336+
completed_tasks
337+
and completed_tasks[0].page_num == next_page_to_yield
338+
):
339+
task = completed_tasks.pop(0)
340+
yield await task
341+
next_page_to_yield += 1
285342

286343
total_elapsed = time.perf_counter() - ingest_start
287344
logger.info(
288-
f"Completed PDF ingestion in {total_elapsed:.2f} seconds"
345+
f"Completed PDF conversion in {total_elapsed:.2f} seconds"
289346
)
290347

291348
except Exception as e:

py/core/providers/ingestion/unstructured/base.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ async def parse_fallback(
254254
metadata["page_number"] = content_item["page_number"]
255255

256256
yield FallbackElement(
257-
text=text,
257+
text=text or "No content extracted.",
258258
metadata=metadata,
259259
)
260260
iteration += 1
@@ -433,10 +433,6 @@ async def parse(
433433
metadata=metadata,
434434
)
435435

436-
# TODO: explore why this is throwing inadvertedly
437-
# if iteration == 0:
438-
# raise ValueError(f"No chunks found for document {document.id}")
439-
440436
logger.debug(
441437
f"Parsed document with id={document.id}, title={document.metadata.get('title', None)}, "
442438
f"user_id={document.metadata.get('user_id', None)}, metadata={document.metadata} "

py/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "r2r"
7-
version = "3.5.15"
7+
version = "3.5.16"
88
description = "SciPhi R2R"
99
readme = "README.md"
1010
license = {text = "MIT"}

py/r2r/r2r.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ chunk_overlap = 512
106106
excluded_parsers = ["mp4"]
107107
automatic_extraction = true # enable automatic extraction of entities and relations
108108
vlm_batch_size=20
109+
max_concurrent_vlm_tasks=20
109110
vlm_ocr_one_page_per_chunk = true
110111

111112
[ingestion.chunk_enrichment_settings]

py/shared/abstractions/document.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,7 @@ class IngestionConfig(R2RSerializable):
326326

327327
vlm: Optional[str] = None
328328
vlm_batch_size: int = 5
329+
max_concurrent_vlm_tasks: int = 5
329330
vlm_ocr_one_page_per_chunk: bool = True
330331

331332
skip_document_summary: bool = False

0 commit comments

Comments (0)