Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions glmocr/dataloader/page_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from glmocr.utils.image_utils import (
load_image_to_base64,
pdf_to_images_pil,
pdf_to_images_pil_iter,
PYPDFIUM2_AVAILABLE,
)
from glmocr.utils.logging import get_logger, get_profiler
Expand Down Expand Up @@ -141,6 +142,61 @@ def load_pages_with_unit_indices(
unit_indices.extend([unit_idx] * len(pages))
return all_pages, unit_indices

def iter_pages_with_unit_indices(
    self, sources: Union[str, List[str]]
):
    """Lazily yield every page of every source, tagged with its source index.

    Pages are produced one by one via ``self._iter_source`` so a consumer
    can start working on a page as soon as it exists instead of waiting
    for all sources to be loaded.

    Args:
        sources: A single path/URL string, or a list of them. A bare
            string is treated as a one-element list.

    Yields:
        ``(page, unit_idx)`` tuples, where ``unit_idx`` is the 0-based
        position of the originating source within ``sources``.
    """
    units = [sources] if isinstance(sources, str) else sources
    for position, unit in enumerate(units):
        for rendered in self._iter_source(unit):
            yield rendered, position

def _iter_source(self, source: str):
"""Yield pages from a single source one at a time."""
if source.startswith("file://"):
file_path = source[7:]
else:
file_path = source

if os.path.isfile(file_path) and file_path.lower().endswith(".pdf"):
yield from self._iter_pdf(file_path)
else:
yield self._load_image(source)

def _iter_pdf(self, file_path: str):
    """Yield the pages of a PDF one at a time (streaming).

    Rendering is delegated to ``pdf_to_images_pil_iter`` using
    ``self.pdf_dpi`` and a fixed ``max_width_or_height`` of 3500. When
    ``self.pdf_max_pages`` converts to a positive integer, only that many
    leading pages are rendered; otherwise every page is rendered.

    Args:
        file_path: Filesystem path of the PDF.

    Yields:
        One rendered page per iteration, as produced by
        ``pdf_to_images_pil_iter``.

    Raises:
        RuntimeError: If ``PYPDFIUM2_AVAILABLE`` is false.
    """
    if not PYPDFIUM2_AVAILABLE:
        raise RuntimeError(
            "PDF support requires pypdfium2. Install: pip install pypdfium2"
        )

    end_page = None
    if self.pdf_max_pages is not None:
        try:
            max_pages = int(self.pdf_max_pages)
        except (TypeError, ValueError, OverflowError):
            # Best-effort: an unparseable limit means "no limit". Only the
            # errors int() itself raises are caught, so unrelated bugs are
            # no longer silently hidden.
            max_pages = 0
        if max_pages > 0:
            end_page = max_pages - 1  # 0-based inclusive

    yield from pdf_to_images_pil_iter(
        file_path,
        dpi=self.pdf_dpi,
        max_width_or_height=3500,
        start_page_id=0,
        end_page_id=end_page,
    )

def _load_source(self, source: str) -> List[Image.Image]:
"""Load a single source and return a list of pages.

Expand Down
13 changes: 8 additions & 5 deletions glmocr/pipeline/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,14 +305,17 @@ def process(

def data_loading_thread() -> None:
try:
pages, unit_indices = self.page_loader.load_pages_with_unit_indices(
img_idx = 0
unit_indices_list: List[int] = []
for page, unit_idx in self.page_loader.iter_pages_with_unit_indices(
image_urls
)
state.num_images_loaded[0] = len(pages)
state.unit_indices_holder[0] = unit_indices
for img_idx, page in enumerate(pages):
):
state.images_dict[img_idx] = page
state.page_queue.put(("image", img_idx, page))
unit_indices_list.append(unit_idx)
img_idx += 1
state.num_images_loaded[0] = img_idx
state.unit_indices_holder[0] = unit_indices_list
state.page_queue.put(("done", None, None))
except Exception as e:
logger.exception("Data loading thread error: %s", e)
Expand Down
81 changes: 67 additions & 14 deletions glmocr/utils/image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,18 +318,71 @@ def pdf_to_images_pil(
)
import pypdfium2 as pdfium

pdf = pdfium.PdfDocument(pdf_path)
page_count = len(pdf)
if end_page_id is None or end_page_id < 0:
end_page_id = page_count - 1
if end_page_id >= page_count:
end_page_id = page_count - 1
images = []
for i in range(start_page_id, end_page_id + 1):
page = pdf[i]
image, _ = _page_to_image(
page, dpi=dpi, max_width_or_height=max_width_or_height
try:
pdf = pdfium.PdfDocument(pdf_path)
page_count = len(pdf)
if end_page_id is None or end_page_id < 0:
end_page_id = page_count - 1
if end_page_id >= page_count:
end_page_id = page_count - 1
images = []
for i in range(start_page_id, end_page_id + 1):
page = pdf[i]
try:
image, _ = _page_to_image(
page, dpi=dpi, max_width_or_height=max_width_or_height
)
images.append(image)
finally:
page.close()
return images
finally:
pdf.close()


def pdf_to_images_pil_iter(
    pdf_path: str,
    dpi: int = 200,
    max_width_or_height: int = 3500,
    start_page_id: int = 0,
    end_page_id: int = None,
):
    """Convert PDF to PIL Images one page at a time (generator).

    Use for streaming: each page is rendered and yielded immediately so
    downstream can start processing before the whole PDF is loaded.

    Because this is a generator, nothing (including the availability
    check) runs until the first item is requested.

    Args:
        pdf_path: PDF file path.
        dpi: Render DPI.
        max_width_or_height: Max width or height.
        start_page_id: Start page index (0-based).
        end_page_id: End page index (inclusive). ``None``, a negative
            value, or a value past the last page all mean "last page".

    Yields:
        PIL.Image per page.

    Raises:
        ImportError: If ``PYPDFIUM2_AVAILABLE`` is false.
    """
    if not PYPDFIUM2_AVAILABLE:
        raise ImportError(
            "PDF support requires pypdfium2. Install with: pip install pypdfium2"
        )
    import pypdfium2 as pdfium

    # Open the document *before* entering the try block: if opening fails
    # there is nothing to close, and the real error must propagate instead
    # of being masked by an UnboundLocalError from ``pdf.close()``.
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        page_count = len(pdf)
        if end_page_id is None or end_page_id < 0 or end_page_id >= page_count:
            end_page_id = page_count - 1
        for i in range(start_page_id, end_page_id + 1):
            page = pdf[i]
            try:
                image, _ = _page_to_image(
                    page, dpi=dpi, max_width_or_height=max_width_or_height
                )
            finally:
                # Release the page handle before yielding so it is not kept
                # open while the consumer processes the rendered image.
                page.close()
            yield image
    finally:
        # Also runs when the consumer abandons the generator early.
        pdf.close()