Skip to content

Commit b6b30ac

Browse files
Authored commit: Faster VLM Ingestion (#2157)
* Faster VLM Ingestion * Fix typo in comment * Remove explicit gc
1 parent 4f451d3 commit b6b30ac

File tree

8 files changed

+99
-38
lines changed

8 files changed

+99
-38
lines changed

js/sdk/package-lock.json

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

js/sdk/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "r2r-js",
3-
"version": "0.4.38",
3+
"version": "0.4.39",
44
"description": "",
55
"main": "dist/index.js",
66
"browser": "dist/index.browser.js",

py/core/base/providers/ingestion.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ class IngestionConfig(ProviderConfig):
3636
"audio_transcription_model": None,
3737
"vlm": None,
3838
"vlm_batch_size": 5,
39+
"max_concurrent_vlm_tasks": 5,
3940
"vlm_ocr_one_page_per_chunk": True,
4041
"skip_document_summary": False,
4142
"document_summary_system_prompt": "system",
@@ -82,6 +83,11 @@ class IngestionConfig(ProviderConfig):
8283
vlm_batch_size: int = Field(
8384
default_factory=lambda: IngestionConfig._defaults["vlm_batch_size"]
8485
)
86+
max_concurrent_vlm_tasks: int = Field(
87+
default_factory=lambda: IngestionConfig._defaults[
88+
"max_concurrent_vlm_tasks"
89+
]
90+
)
8591
vlm_ocr_one_page_per_chunk: bool = Field(
8692
default_factory=lambda: IngestionConfig._defaults[
8793
"vlm_ocr_one_page_per_chunk"

py/core/parsers/media/pdf_parser.py

Lines changed: 87 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,11 @@ def __init__(
8484
self.llm_provider = llm_provider
8585
self.config = config
8686
self.vision_prompt_text = None
87+
self.vlm_batch_size = self.config.vlm_batch_size or 5
88+
self.max_concurrent_vlm_tasks = (
89+
self.config.max_concurrent_vlm_tasks or 5
90+
)
91+
self.semaphore = None
8792

8893
async def process_page(self, image, page_num: int) -> dict[str, str]:
8994
"""Process a single PDF page using the vision model."""
@@ -213,6 +218,15 @@ async def process_page(self, image, page_num: int) -> dict[str, str]:
213218
"content": f"Error processing page: {str(e)}",
214219
}
215220

221+
async def process_and_yield(self, image, page_num: int):
222+
"""Process a page and yield the result."""
223+
async with self.semaphore:
224+
result = await self.process_page(image, page_num)
225+
return {
226+
"content": result.get("content", "") or "",
227+
"page_number": page_num,
228+
}
229+
216230
async def ingest(
217231
self, data: str | bytes, **kwargs
218232
) -> AsyncGenerator[dict[str, str | int], None]:
@@ -228,9 +242,9 @@ async def ingest(
228242
)
229243
logger.info("Retrieved vision prompt text from database.")
230244

231-
try:
232-
batch_size = self.config.vlm_batch_size or 5
245+
self.semaphore = asyncio.Semaphore(self.max_concurrent_vlm_tasks)
233246

247+
try:
234248
if isinstance(data, str):
235249
pdf_info = pdf2image.pdfinfo_from_path(data)
236250
else:
@@ -240,52 +254,95 @@ async def ingest(
240254
max_pages = pdf_info["Pages"]
241255
logger.info(f"PDF has {max_pages} pages to process")
242256

243-
# Convert and process each batch of rasterized pages
244-
for batch_start in range(0, max_pages, batch_size):
245-
batch_end = min(batch_start + batch_size, max_pages)
246-
logger.info(
247-
f"Processing batch: pages {batch_start + 1}-{batch_end}/{max_pages}"
257+
# Create a task queue to process pages in order
258+
pending_tasks = []
259+
completed_tasks = []
260+
next_page_to_yield = 1
261+
262+
# Process pages with a sliding window, in batches
263+
for batch_start in range(1, max_pages + 1, self.vlm_batch_size):
264+
batch_end = min(
265+
batch_start + self.vlm_batch_size - 1, max_pages
266+
)
267+
logger.debug(
268+
f"Preparing batch of pages {batch_start}-{batch_end}/{max_pages}"
248269
)
249270

271+
# Convert the batch of pages to images
250272
if isinstance(data, str):
251-
batch_images = pdf2image.convert_from_path(
273+
images = pdf2image.convert_from_path(
252274
data,
253275
dpi=150,
254-
first_page=batch_start + 1,
276+
first_page=batch_start,
255277
last_page=batch_end,
256278
)
257279
else:
258280
pdf_bytes = BytesIO(data)
259-
batch_images = pdf2image.convert_from_bytes(
281+
images = pdf2image.convert_from_bytes(
260282
pdf_bytes.getvalue(),
261283
dpi=150,
262-
first_page=batch_start + 1,
284+
first_page=batch_start,
263285
last_page=batch_end,
264286
)
265287

266-
batch_tasks = []
267-
for i, image in enumerate(batch_images):
268-
page_num = batch_start + i + 1
269-
batch_tasks.append(self.process_page(image, page_num))
270-
271-
# Process the batch concurrently
272-
batch_results = await asyncio.gather(*batch_tasks)
273-
274-
for i, result in enumerate(batch_results):
275-
page_num = batch_start + i + 1
276-
yield {
277-
"content": result.get("content", "") or "",
278-
"page_number": page_num,
279-
}
280-
281-
# Force garbage collection after each batch
282-
import gc
288+
# Create tasks for each page in the batch
289+
for i, image in enumerate(images):
290+
page_num = batch_start + i
291+
task = asyncio.create_task(
292+
self.process_and_yield(image, page_num)
293+
)
294+
task.page_num = page_num # Store page number for sorting
295+
pending_tasks.append(task)
296+
297+
# Check if any tasks have completed and yield them in order
298+
while pending_tasks:
299+
# Get the first done task without waiting
300+
done_tasks, pending_tasks_set = await asyncio.wait(
301+
pending_tasks,
302+
timeout=0.01,
303+
return_when=asyncio.FIRST_COMPLETED,
304+
)
283305

284-
gc.collect()
306+
if not done_tasks:
307+
break
308+
309+
# Add completed tasks to our completed list
310+
pending_tasks = list(pending_tasks_set)
311+
completed_tasks.extend(iter(done_tasks))
312+
313+
# Sort completed tasks by page number
314+
completed_tasks.sort(key=lambda t: t.page_num)
315+
316+
# Yield results in order
317+
while (
318+
completed_tasks
319+
and completed_tasks[0].page_num == next_page_to_yield
320+
):
321+
task = completed_tasks.pop(0)
322+
yield await task
323+
next_page_to_yield += 1
324+
325+
# Wait for and yield any remaining tasks in order
326+
while pending_tasks:
327+
done_tasks, _ = await asyncio.wait(pending_tasks)
328+
completed_tasks.extend(done_tasks)
329+
pending_tasks = []
330+
331+
# Sort and yield remaining completed tasks
332+
completed_tasks.sort(key=lambda t: t.page_num)
333+
334+
# Yield results in order
335+
while (
336+
completed_tasks
337+
and completed_tasks[0].page_num == next_page_to_yield
338+
):
339+
task = completed_tasks.pop(0)
340+
yield await task
341+
next_page_to_yield += 1
285342

286343
total_elapsed = time.perf_counter() - ingest_start
287344
logger.info(
288-
f"Completed PDF ingestion in {total_elapsed:.2f} seconds"
345+
f"Completed PDF conversion in {total_elapsed:.2f} seconds"
289346
)
290347

291348
except Exception as e:

py/core/providers/ingestion/unstructured/base.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ async def parse_fallback(
254254
metadata["page_number"] = content_item["page_number"]
255255

256256
yield FallbackElement(
257-
text=text,
257+
text=text or "No content extracted.",
258258
metadata=metadata,
259259
)
260260
iteration += 1
@@ -433,10 +433,6 @@ async def parse(
433433
metadata=metadata,
434434
)
435435

436-
# TODO: explore why this is throwing inadvertedly
437-
# if iteration == 0:
438-
# raise ValueError(f"No chunks found for document {document.id}")
439-
440436
logger.debug(
441437
f"Parsed document with id={document.id}, title={document.metadata.get('title', None)}, "
442438
f"user_id={document.metadata.get('user_id', None)}, metadata={document.metadata} "

py/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "r2r"
7-
version = "3.5.15"
7+
version = "3.5.16"
88
description = "SciPhi R2R"
99
readme = "README.md"
1010
license = {text = "MIT"}

py/r2r/r2r.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ chunk_overlap = 512
106106
excluded_parsers = ["mp4"]
107107
automatic_extraction = true # enable automatic extraction of entities and relations
108108
vlm_batch_size=20
109+
max_concurrent_vlm_tasks=20
109110
vlm_ocr_one_page_per_chunk = true
110111

111112
[ingestion.chunk_enrichment_settings]

py/shared/abstractions/document.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,7 @@ class IngestionConfig(R2RSerializable):
326326

327327
vlm: Optional[str] = None
328328
vlm_batch_size: int = 5
329+
max_concurrent_vlm_tasks: int = 5
329330
vlm_ocr_one_page_per_chunk: bool = True
330331

331332
skip_document_summary: bool = False

0 commit comments

Comments (0)