Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 35 additions & 23 deletions docling/backend/docling_parse_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
from PIL import Image
from pypdfium2 import PdfPage

from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.backend.managed_pdfium_backend import (
ManagedPdfiumDocumentBackend,
ManagedPdfiumPageBackend,
)
from docling.datamodel.backend_options import PdfBackendOptions
from docling.datamodel.base_models import Size
from docling.utils.locks import pypdfium2_lock
Expand All @@ -23,7 +26,7 @@
_log = logging.getLogger(__name__)


class DoclingParsePageBackend(PdfPageBackend):
class DoclingParsePageBackend(ManagedPdfiumPageBackend):
def __init__(
self,
*,
Expand All @@ -36,8 +39,9 @@ def __init__(
keep_lines: bool = False,
keep_images: bool = True,
):
super().__init__()
self._ppage = page_obj
self._dp_doc = dp_doc
self._dp_doc: Optional[PdfDocument] = dp_doc
self._page_no = page_no

self._create_words = create_words
Expand All @@ -51,6 +55,10 @@ def __init__(
self._unloaded = False
self.valid = (self._ppage is not None) and (self._dp_doc is not None)

def _require_page(self) -> PdfPage:
assert self._ppage is not None, "Page backend was unloaded."
return self._ppage

def _ensure_parsed(self) -> None:
if self._dpage is not None:
return
Expand All @@ -72,6 +80,7 @@ def _ensure_parsed(self) -> None:
config.create_line_cells = self._create_textlines
config.enforce_same_font = True

assert self._dp_doc is not None
seg_page = self._dp_doc.get_page(self._page_no + 1, config=config)

# In Docling, all TextCell instances are expected with top-left origin.
Expand Down Expand Up @@ -165,41 +174,46 @@ def get_page_image(
padbox.t = page_size.height - padbox.t

with pypdfium2_lock:
image = (
self._ppage.render(
scale=scale * 1.5,
rotation=0, # no additional rotation
crop=padbox.as_tuple(),
)
.to_pil()
.resize(
size=(round(cropbox.width * scale), round(cropbox.height * scale))
)
) # We resize the image from 1.5x the given scale to make it sharper.
bitmap = self._ppage.render(
scale=scale * 1.5,
rotation=0, # no additional rotation
crop=padbox.as_tuple(),
)
image = bitmap.to_pil().copy()
bitmap.close()
# We resize the image from 1.5x the given scale to make it sharper.
image = image.resize(
size=(round(cropbox.width * scale), round(cropbox.height * scale))
)

return image

def get_size(self) -> Size:
with pypdfium2_lock:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
page = self._require_page()
return Size(width=page.get_width(), height=page.get_height())

# TODO: Take width and height from docling-parse.
# return Size(
# width=self._dpage.dimension.width,
# height=self._dpage.dimension.height,
# )

def unload(self):
def _close_native_page(self) -> None:
if not self._unloaded and self._dp_doc is not None:
self._dp_doc.unload_pages((self._page_no + 1, self._page_no + 2))
self._unloaded = True

with pypdfium2_lock:
if self._ppage is not None:
self._ppage.close()

self._ppage = None
self._dpage = None
self._dp_doc = None


class DoclingParseDocumentBackend(PdfDocumentBackend):
class DoclingParseDocumentBackend(ManagedPdfiumDocumentBackend):
def __init__(
self,
in_doc: "InputDocument",
Expand All @@ -215,7 +229,7 @@ def __init__(
self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
self.parser = DoclingPdfParser(loglevel="fatal")

self.dp_doc: PdfDocument = self.parser.load(
self.dp_doc: Optional[PdfDocument] = self.parser.load(
path_or_stream=self.path_or_stream, password=password
)
success = self.dp_doc is not None
Expand All @@ -229,6 +243,7 @@ def page_count(self) -> int:
# return len(self._pdoc) # To be replaced with docling-parse API

len_1 = len(self._pdoc)
assert self.dp_doc is not None
len_2 = self.dp_doc.number_of_pages()

if len_1 != len_2:
Expand All @@ -239,6 +254,7 @@ def page_count(self) -> int:
def load_page(
self, page_no: int, create_words: bool = True, create_textlines: bool = True
) -> DoclingParsePageBackend:
assert self.dp_doc is not None
with pypdfium2_lock:
ppage = self._pdoc[page_no]

Expand All @@ -253,19 +269,15 @@ def load_page(
def is_valid(self) -> bool:
return self.page_count() > 0

def unload(self):
super().unload()
# Unload docling-parse document first
def _close_native_document(self) -> None:
if self.dp_doc is not None:
self.dp_doc.unload()
self.dp_doc = None

# Then close pypdfium2 document with proper locking
if self._pdoc is not None:
with pypdfium2_lock:
try:
self._pdoc.close()
except Exception:
# Ignore cleanup errors
pass
self._pdoc = None
53 changes: 53 additions & 0 deletions docling/backend/managed_pdfium_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Union

from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.backend_options import PdfBackendOptions

if TYPE_CHECKING:
from docling.datamodel.document import InputDocument


class ManagedPdfiumDocumentBackend(PdfDocumentBackend, ABC):
    """Shared lifecycle management for PDFium-backed document backends.

    Subclasses implement ``_close_native_document`` to release whatever
    native handles they own. ``unload`` guarantees that hook runs at most
    once per instance and that the parent class's ``unload`` is always
    invoked afterwards, even when the hook raises.
    """

    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
        # NOTE: this default instance is created once, when the class body is
        # evaluated, and is shared by every call that omits ``options``.
        options: PdfBackendOptions = PdfBackendOptions(),
    ) -> None:
        """Forward all arguments to the parent backend and mark it as open."""
        super().__init__(in_doc, path_or_stream, options)
        # Set to True by the first ``unload`` call; later calls are no-ops.
        self._closed = False

    @abstractmethod
    def _close_native_document(self) -> None:
        """Subclass hook, called at most once by ``unload``."""

    def unload(self) -> None:
        """Run the native close hook once, then the parent's ``unload``.

        Repeated calls return immediately. Any exception raised by
        ``_close_native_document`` still propagates to the caller, but only
        after the parent's ``unload`` has been given the chance to run.
        """
        if self._closed:
            return
        # Flip the flag before closing so a re-entrant call cannot close twice.
        self._closed = True
        try:
            self._close_native_document()
        finally:
            # The flag is already set, so a retry would be a no-op; the parent
            # cleanup therefore has to happen now even if the hook failed.
            super().unload()


class ManagedPdfiumPageBackend(PdfPageBackend, ABC):
    """Base class giving PDFium-backed page backends a one-shot ``unload``."""

    def __init__(self) -> None:
        # Becomes True on the first ``unload`` call.
        self._closed = False

    @abstractmethod
    def _close_native_page(self) -> None:
        """Subclass hook, called at most once by ``unload``."""

    def unload(self) -> None:
        """Invoke ``_close_native_page`` on the first call; no-op afterwards."""
        if not self._closed:
            # Mark as closed before calling the hook, as the hook must not
            # run a second time.
            self._closed = True
            self._close_native_page()
77 changes: 48 additions & 29 deletions docling/backend/pypdfium2_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@
from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError

from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.backend.managed_pdfium_backend import (
ManagedPdfiumDocumentBackend,
ManagedPdfiumPageBackend,
)
from docling.datamodel.backend_options import PdfBackendOptions
from docling.utils.locks import pypdfium2_lock

Expand Down Expand Up @@ -105,14 +108,19 @@ def get_pdf_page_geometry(
_PYPDFIUM2_MAJOR_VERSION = int(version("pypdfium2").split(".")[0])


class PyPdfiumPageBackend(PdfPageBackend):
class PyPdfiumPageBackend(ManagedPdfiumPageBackend):
def __init__(
self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
self,
pdfium_doc: pdfium.PdfDocument,
document_hash: str,
page_no: int,
):
super().__init__()
# Note: lock applied by the caller
self.valid = True # No better way to tell from pypdfium.
self._ppage: pdfium.PdfPage | None = None
try:
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
self._ppage = pdfium_doc[page_no]
except PdfiumError:
_log.info(
f"An exception occurred when loading page {page_no} of document {document_hash}.",
Expand All @@ -124,11 +132,15 @@ def __init__(
def is_valid(self) -> bool:
return self.valid

def _require_page(self) -> pdfium.PdfPage:
assert self._ppage is not None, "Page backend was unloaded."
return self._ppage

def _compute_text_cells(self) -> List[TextCell]:
"""Compute text cells from pypdfium."""
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
self.text_page = self._require_page().get_textpage()

cells = []
cell_counter = 0
Expand Down Expand Up @@ -263,8 +275,9 @@ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
page_size = self.get_size()

with pypdfium2_lock:
rotation = self._ppage.get_rotation()
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
page = self._require_page()
rotation = page.get_rotation()
for obj in page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
if _PYPDFIUM2_MAJOR_VERSION >= 5:
pos = obj.get_bounds() # pypdfium2 >= 5.x
else:
Expand Down Expand Up @@ -301,7 +314,7 @@ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
def get_text_in_rect(self, bbox: BoundingBox) -> str:
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
self.text_page = self._require_page().get_textpage()

if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
bbox = bbox.to_bottom_left_origin(self.get_size().height)
Expand All @@ -318,7 +331,7 @@ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
text_cells = self._compute_text_cells()

# Get the PDF page geometry from pypdfium2
dimension = get_pdf_page_geometry(self._ppage)
dimension = get_pdf_page_geometry(self._require_page())

# Create SegmentedPdfPage
return SegmentedPdfPage(
Expand Down Expand Up @@ -356,31 +369,37 @@ def get_page_image(
padbox.t = page_size.height - padbox.t

with pypdfium2_lock:
image = (
self._ppage.render(
scale=scale * 1.5,
rotation=0, # no additional rotation
crop=padbox.as_tuple(),
)
.to_pil()
.resize(
size=(round(cropbox.width * scale), round(cropbox.height * scale))
)
) # We resize the image from 1.5x the given scale to make it sharper.
bitmap = self._require_page().render(
scale=scale * 1.5,
rotation=0, # no additional rotation
crop=padbox.as_tuple(),
)
image = bitmap.to_pil().copy()
bitmap.close()
# We resize the image from 1.5x the given scale to make it sharper.
image = image.resize(
size=(round(cropbox.width * scale), round(cropbox.height * scale))
)

return image

def get_size(self) -> Size:
with pypdfium2_lock:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
page = self._require_page()
return Size(width=page.get_width(), height=page.get_height())

def unload(self):
def _close_native_page(self) -> None:
with pypdfium2_lock:
self._ppage = None
self.text_page = None
if self.text_page is not None:
self.text_page.close()
if self._ppage is not None:
self._ppage.close()

self.text_page = None
self._ppage = None


class PyPdfiumDocumentBackend(PdfDocumentBackend):
class PyPdfiumDocumentBackend(ManagedPdfiumDocumentBackend):
def __init__(
self,
in_doc: "InputDocument",
Expand Down Expand Up @@ -411,8 +430,8 @@ def load_page(self, page_no: int) -> PyPdfiumPageBackend:
def is_valid(self) -> bool:
return self.page_count() > 0

def unload(self):
super().unload()
def _close_native_document(self) -> None:
with pypdfium2_lock:
self._pdoc.close()
self._pdoc = None
if self._pdoc is not None:
self._pdoc.close()
self._pdoc = None
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ dev = [
]
docs = [
"mkdocs-material~=9.5",
"mkdocs-jupyter~=0.25",
"mkdocs-jupyter>=0.25,<0.26",
"mkdocs-click~=0.8",
"mkdocs-redirects~=1.2",
"mkdocstrings[python]~=0.27",
Expand Down
Loading
Loading