Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 37 additions & 22 deletions docling/backend/docling_parse_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
from PIL import Image
from pypdfium2 import PdfPage

from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.backend.managed_pdfium_backend import (
ManagedPdfiumDocumentBackend,
ManagedPdfiumPageBackend,
)
from docling.datamodel.backend_options import PdfBackendOptions
from docling.datamodel.base_models import Size
from docling.utils.locks import pypdfium2_lock
Expand All @@ -23,10 +26,11 @@
_log = logging.getLogger(__name__)


class DoclingParsePageBackend(PdfPageBackend):
class DoclingParsePageBackend(ManagedPdfiumPageBackend):
def __init__(
self,
*,
owner: "DoclingParseDocumentBackend",
dp_doc: PdfDocument,
page_obj: PdfPage,
page_no: int,
Expand All @@ -36,8 +40,9 @@ def __init__(
keep_lines: bool = False,
keep_images: bool = True,
):
super().__init__(owner)
self._ppage = page_obj
self._dp_doc = dp_doc
self._dp_doc: Optional[PdfDocument] = dp_doc
self._page_no = page_no

self._create_words = create_words
Expand All @@ -51,6 +56,10 @@ def __init__(
self._unloaded = False
self.valid = (self._ppage is not None) and (self._dp_doc is not None)

def _require_page(self) -> PdfPage:
assert self._ppage is not None, "Page backend was unloaded."
return self._ppage

def _ensure_parsed(self) -> None:
if self._dpage is not None:
return
Expand All @@ -72,6 +81,7 @@ def _ensure_parsed(self) -> None:
config.create_line_cells = self._create_textlines
config.enforce_same_font = True

assert self._dp_doc is not None
seg_page = self._dp_doc.get_page(self._page_no + 1, config=config)

# In Docling, all TextCell instances are expected with top-left origin.
Expand Down Expand Up @@ -165,41 +175,46 @@ def get_page_image(
padbox.t = page_size.height - padbox.t

with pypdfium2_lock:
image = (
self._ppage.render(
scale=scale * 1.5,
rotation=0, # no additional rotation
crop=padbox.as_tuple(),
)
.to_pil()
.resize(
size=(round(cropbox.width * scale), round(cropbox.height * scale))
)
) # We resize the image from 1.5x the given scale to make it sharper.
bitmap = self._require_page().render(
scale=scale * 1.5,
rotation=0, # no additional rotation
crop=padbox.as_tuple(),
)
image = bitmap.to_pil().copy()
bitmap.close()
# We resize the image from 1.5x the given scale to make it sharper.
image = image.resize(
size=(round(cropbox.width * scale), round(cropbox.height * scale))
)

return image

def get_size(self) -> Size:
with pypdfium2_lock:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
page = self._require_page()
return Size(width=page.get_width(), height=page.get_height())

# TODO: Take width and height from docling-parse.
# return Size(
# width=self._dpage.dimension.width,
# height=self._dpage.dimension.height,
# )

def unload(self):
def _close_native_page(self) -> None:
if not self._unloaded and self._dp_doc is not None:
self._dp_doc.unload_pages((self._page_no + 1, self._page_no + 2))
self._unloaded = True

with pypdfium2_lock:
if self._ppage is not None:
self._ppage.close()

self._ppage = None
self._dpage = None
self._dp_doc = None


class DoclingParseDocumentBackend(PdfDocumentBackend):
class DoclingParseDocumentBackend(ManagedPdfiumDocumentBackend):
def __init__(
self,
in_doc: "InputDocument",
Expand All @@ -215,7 +230,7 @@ def __init__(
self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
self.parser = DoclingPdfParser(loglevel="fatal")

self.dp_doc: PdfDocument = self.parser.load(
self.dp_doc: Optional[PdfDocument] = self.parser.load(
path_or_stream=self.path_or_stream, password=password
)
success = self.dp_doc is not None
Expand All @@ -229,6 +244,7 @@ def page_count(self) -> int:
# return len(self._pdoc) # To be replaced with docling-parse API

len_1 = len(self._pdoc)
assert self.dp_doc is not None
len_2 = self.dp_doc.number_of_pages()

if len_1 != len_2:
Expand All @@ -239,10 +255,12 @@ def page_count(self) -> int:
def load_page(
self, page_no: int, create_words: bool = True, create_textlines: bool = True
) -> DoclingParsePageBackend:
assert self.dp_doc is not None
with pypdfium2_lock:
ppage = self._pdoc[page_no]

return DoclingParsePageBackend(
owner=self,
dp_doc=self.dp_doc,
page_obj=ppage,
page_no=page_no,
Expand All @@ -253,14 +271,11 @@ def load_page(
def is_valid(self) -> bool:
    """Report whether the document exposes at least one page.

    Returns:
        ``True`` when ``page_count()`` is greater than zero, else ``False``.
    """
    total_pages = self.page_count()
    return total_pages > 0

def unload(self):
super().unload()
# Unload docling-parse document first
def _close_native_document(self) -> None:
if self.dp_doc is not None:
self.dp_doc.unload()
self.dp_doc = None

# Then close pypdfium2 document with proper locking
if self._pdoc is not None:
with pypdfium2_lock:
try:
Expand Down
100 changes: 100 additions & 0 deletions docling/backend/managed_pdfium_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from __future__ import annotations

import threading
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Union

from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.backend_options import PdfBackendOptions

if TYPE_CHECKING:
from docling.datamodel.document import InputDocument


class ManagedPdfiumDocumentBackend(PdfDocumentBackend, ABC):
    """Shared lifecycle management for PDFium-backed document backends.

    The document keeps a registry of the page backends created from it so
    that ``unload`` can close every live page before the native document is
    released. Subclasses provide the native teardown by implementing
    ``_close_native_document``.
    """

    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
        options: PdfBackendOptions = PdfBackendOptions(),
    ) -> None:
        """Initialise the base backend and an empty live-page registry.

        Args:
            in_doc: Input document, forwarded to the base class.
            path_or_stream: PDF source, forwarded to the base class.
            options: Backend options, forwarded to the base class.
        """
        super().__init__(in_doc, path_or_stream, options)
        # Pages that have registered and not yet been released.
        self._live_pages: set[ManagedPdfiumPageBackend] = set()
        self._live_pages_lock = threading.Lock()
        # The condition shares the registry lock; entering it acquires the lock.
        self._live_pages_cond = threading.Condition(self._live_pages_lock)
        # ``_closing`` is True while ``unload`` is running; ``_closed`` after.
        self._closing = False
        self._closed = False

    def _register_live_page(self, page_backend: ManagedPdfiumPageBackend) -> None:
        """Add ``page_backend`` to the set of live pages.

        Raises:
            RuntimeError: If the document is closing or already closed.
        """
        with self._live_pages_cond:
            if self._closing or self._closed:
                raise RuntimeError(
                    "Cannot register a page while the document is closing."
                )
            self._live_pages.add(page_backend)

    def _release_live_page(self, page_backend: ManagedPdfiumPageBackend) -> None:
        """Remove ``page_backend`` from the live set; a no-op if it is absent."""
        with self._live_pages_cond:
            self._live_pages.discard(page_backend)
            self._live_pages_cond.notify_all()

    def _close_live_pages(self) -> None:
        """Unload every registered page until the live set is empty.

        Pages are unloaded outside the registry lock because a page's
        ``unload`` calls back into ``_release_live_page``, which takes the
        same non-reentrant lock.
        """
        while True:
            with self._live_pages_cond:
                live_pages = list(self._live_pages)
            if not live_pages:
                return
            for page_backend in live_pages:
                try:
                    page_backend.unload()
                finally:
                    # Guarantee progress: a page whose ``unload`` returns (or
                    # raises) without deregistering itself would otherwise be
                    # snapshotted again on every pass of the outer loop.
                    self._release_live_page(page_backend)

    @abstractmethod
    def _close_native_document(self) -> None:
        """Release the native document resources owned by the subclass."""

    def unload(self) -> None:
        """Close all live pages, then the native document, then the base class.

        The call is idempotent. A call made after the document was closed, or
        while another ``unload`` is still in progress, returns immediately
        without waiting for the in-progress close to finish.
        """
        with self._live_pages_cond:
            # Checking ``_closing`` as well keeps a second caller from running
            # the native teardown again while the first is still inside it.
            if self._closed or self._closing:
                return
            self._closing = True

        try:
            try:
                self._close_live_pages()
                self._close_native_document()
            finally:
                with self._live_pages_cond:
                    self._closed = True
                    self._closing = False
                    self._live_pages.clear()
                    self._live_pages_cond.notify_all()
        finally:
            # ``_closed`` is now True, so later calls return early. The base
            # class cleanup therefore has to run here even when closing the
            # pages or the native document raised.
            super().unload()


class ManagedPdfiumPageBackend(PdfPageBackend, ABC):
    """Shared page lifecycle for PDFium-backed page backends.

    A page registers itself with its owning document on construction and
    releases itself from that document when it is unloaded. Subclasses
    implement ``_close_native_page`` to free their native resources.
    """

    def __init__(self, owner: ManagedPdfiumDocumentBackend) -> None:
        """Remember the owning document and register this page with it.

        Args:
            owner: Document backend that tracks this page. Registration is
                delegated to ``owner._register_live_page``.
        """
        self._owner: ManagedPdfiumDocumentBackend | None = owner
        self._closed = False
        owner._register_live_page(self)

    @abstractmethod
    def _close_native_page(self) -> None:
        """Release the native page resources owned by the subclass."""

    def unload(self) -> None:
        """Close the native page once and deregister from the owning document.

        Later calls are no-ops. The page is marked closed and released from
        its owner even when ``_close_native_page`` raises.
        """
        if not self._closed:
            # Capture the owner first; the reference is cleared in ``finally``.
            document = self._owner
            try:
                self._close_native_page()
            finally:
                self._owner = None
                self._closed = True
                if document is not None:
                    document._release_live_page(self)
Loading
Loading