zai-org · ShayDuane · Feb 6, 2026
diff --git a/glmocr/dataloader/page_loader.py b/glmocr/dataloader/page_loader.py
@@ -25,6 +25,7 @@
 from glmocr.utils.image_utils import (
     load_image_to_base64,
     pdf_to_images_pil,
+    pdf_to_images_pil_iter,
     PYPDFIUM2_AVAILABLE,
 )
 from glmocr.utils.logging import get_logger, get_profiler
@@ -141,6 +142,61 @@ def load_pages_with_unit_indices(
             unit_indices.extend([unit_idx] * len(pages))
         return all_pages, unit_indices
 
+    def iter_pages_with_unit_indices(
+        self, sources: Union[str, List[str]]
+    ):
+        """Lazily produce every page together with the index of its source.
+
+        Each entry of ``sources`` counts as one unit. Its pages are yielded
+        in order, tagged with that entry's position in the input, so a
+        consumer can handle a page before later ones have been loaded.
+
+        Args:
+            sources: One path/URL string, or a list of them. A bare string
+                is treated as a one-element list.
+
+        Yields:
+            ``(page, unit_idx)`` tuples, where ``page`` is an item produced
+            by ``self._iter_source`` for the source at position ``unit_idx``.
+        """
+        units = [sources] if isinstance(sources, str) else sources
+        for position, unit in enumerate(units):
+            for rendered in self._iter_source(unit):
+                yield rendered, position
+
+    def _iter_source(self, source: str):
+        """Yield the page(s) of a single source lazily.
+
+        A leading ``file://`` is stripped to obtain a local path. If that
+        path is an existing file whose name ends in ``.pdf`` (case-insensitive)
+        its pages are streamed through ``_iter_pdf``; every other source
+        yields exactly one item, the result of ``_load_image(source)``.
+        """
+        local_path = source[7:] if source.startswith("file://") else source
+        if not (os.path.isfile(local_path) and local_path.lower().endswith(".pdf")):
+            # Not a local PDF: hand the untouched source string to the image loader.
+            yield self._load_image(source)
+            return
+        yield from self._iter_pdf(local_path)
+
+    def _iter_pdf(self, file_path: str):
+        """Yield PDF pages one at a time (streaming).
+
+        Delegates rendering to ``pdf_to_images_pil_iter`` starting at page 0,
+        using ``self.pdf_dpi`` and a fixed ``max_width_or_height`` of 3500.
+        When ``self.pdf_max_pages`` converts to a positive integer it caps
+        the number of pages; ``None``, a non-positive value, or a value that
+        cannot be converted to ``int`` means no cap.
+
+        Args:
+            file_path: Path of the PDF file to render.
+
+        Yields:
+            The items produced by ``pdf_to_images_pil_iter``, one per page.
+
+        Raises:
+            RuntimeError: If ``PYPDFIUM2_AVAILABLE`` is false.
+        """
+        if not PYPDFIUM2_AVAILABLE:
+            raise RuntimeError(
+                "PDF support requires pypdfium2. Install: pip install pypdfium2"
+            )
+        end_page = None  # None = render through the last page
+        if self.pdf_max_pages is not None:
+            try:
+                max_pages = int(self.pdf_max_pages)
+            except (TypeError, ValueError, OverflowError):
+                # Best-effort: an unparsable limit is ignored rather than
+                # fatal, but unrelated errors are no longer swallowed.
+                max_pages = 0
+            if max_pages > 0:
+                end_page = max_pages - 1  # 0-based inclusive
+        # Delegating with ``yield from`` also forwards close() to the inner
+        # generator, so its cleanup runs as soon as this one is closed.
+        yield from pdf_to_images_pil_iter(
+            file_path,
+            dpi=self.pdf_dpi,
+            max_width_or_height=3500,
+            start_page_id=0,
+            end_page_id=end_page,
+        )
+
     def _load_source(self, source: str) -> List[Image.Image]:
         """Load a single source and return a list of pages.
 

diff --git a/glmocr/pipeline/pipeline.py b/glmocr/pipeline/pipeline.py
@@ -305,14 +305,17 @@ def process(
 
         def data_loading_thread() -> None:
             try:
-                pages, unit_indices = self.page_loader.load_pages_with_unit_indices(
+                img_idx = 0
+                unit_indices_list: List[int] = []
+                for page, unit_idx in self.page_loader.iter_pages_with_unit_indices(
                     image_urls
-                )
-                state.num_images_loaded[0] = len(pages)
-                state.unit_indices_holder[0] = unit_indices
-                for img_idx, page in enumerate(pages):
+                ):
                     state.images_dict[img_idx] = page
                     state.page_queue.put(("image", img_idx, page))
+                    unit_indices_list.append(unit_idx)
+                    img_idx += 1
+                state.num_images_loaded[0] = img_idx
+                state.unit_indices_holder[0] = unit_indices_list
                 state.page_queue.put(("done", None, None))
             except Exception as e:
                 logger.exception("Data loading thread error: %s", e)

diff --git a/glmocr/utils/image_utils.py b/glmocr/utils/image_utils.py
@@ -318,18 +318,71 @@ def pdf_to_images_pil(
         )
     import pypdfium2 as pdfium
 
-    pdf = pdfium.PdfDocument(pdf_path)
-    page_count = len(pdf)
-    if end_page_id is None or end_page_id < 0:
-        end_page_id = page_count - 1
-    if end_page_id >= page_count:
-        end_page_id = page_count - 1
-    images = []
-    for i in range(start_page_id, end_page_id + 1):
-        page = pdf[i]
-        image, _ = _page_to_image(
-            page, dpi=dpi, max_width_or_height=max_width_or_height
+    try:
+        pdf = pdfium.PdfDocument(pdf_path)
+        page_count = len(pdf)
+        if end_page_id is None or end_page_id < 0:
+            end_page_id = page_count - 1
+        if end_page_id >= page_count:
+            end_page_id = page_count - 1
+        images = []
+        for i in range(start_page_id, end_page_id + 1):
+            page = pdf[i]
+            try:
+                image, _ = _page_to_image(
+                    page, dpi=dpi, max_width_or_height=max_width_or_height
+                )
+                images.append(image)
+            finally:
+                page.close()
+        return images
+    finally:
+        pdf.close()
+
+
+def pdf_to_images_pil_iter(
+    pdf_path: str,
+    dpi: int = 200,
+    max_width_or_height: int = 3500,
+    start_page_id: int = 0,
+    end_page_id: int = None,
+):
+    """Convert PDF to PIL Images one page at a time (generator).
+
+    Use for streaming: each page is rendered and yielded immediately so
+    downstream can start processing before the whole PDF is loaded.
+
+    Because this is a generator, nothing runs (including the pypdfium2
+    availability check) until the first item is requested. The document is
+    closed when iteration finishes, fails, or the generator is closed.
+
+    Args:
+        pdf_path: PDF file path.
+        dpi: Render DPI.
+        max_width_or_height: Max width or height.
+        start_page_id: Start page index (0-based).
+        end_page_id: End page index (inclusive). None, a negative value, or
+            a value past the last page all mean "last page".
+
+    Yields:
+        PIL.Image per page.
+
+    Raises:
+        ImportError: If pypdfium2 is not available.
+    """
+    if not PYPDFIUM2_AVAILABLE:
+        raise ImportError(
+            "PDF support requires pypdfium2. Install with: pip install pypdfium2"
         )
-        images.append(image)
-    pdf.close()
-    return images
+    import pypdfium2 as pdfium
+
+    # Open the document *before* the try block: if opening fails there is
+    # nothing to close, and a ``finally: pdf.close()`` would hit an unbound
+    # name and mask the real error with UnboundLocalError.
+    pdf = pdfium.PdfDocument(pdf_path)
+    try:
+        last_page = len(pdf) - 1
+        if end_page_id is None or end_page_id < 0 or end_page_id > last_page:
+            end_page_id = last_page
+        for i in range(start_page_id, end_page_id + 1):
+            page = pdf[i]
+            try:
+                image, _ = _page_to_image(
+                    page, dpi=dpi, max_width_or_height=max_width_or_height
+                )
+                yield image
+            finally:
+                # Runs once the consumer resumes (or closes) the generator.
+                page.close()
+    finally:
+        pdf.close()