Skip to content

Commit b389c82

Browse files
cau-git and claude committed
fix: managed PDFium backend lifecycle with explicit native close and live-page tracking
Introduces ManagedPdfiumDocumentBackend / ManagedPdfiumPageBackend base classes that both PDF backends now inherit from.

Key changes:
- Live pages are tracked in a set on the document; document unload first unloads every page that is still live, and only then tears down the native document handles.
- Page and document unload now call explicit .close() on native PDFium objects under the lock, rather than just nulling Python references. This makes teardown deterministic instead of relying on GC finalizers, which can fire from any thread without the lock.
- text_page is explicitly closed before _ppage to respect the PDFium parent/child handle hierarchy.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 79b1894 commit b389c82

3 files changed

Lines changed: 166 additions & 32 deletions

File tree

docling/backend/docling_parse_backend.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@
1212
from PIL import Image
1313
from pypdfium2 import PdfPage
1414

15-
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
15+
from docling.backend.managed_pdfium_backend import (
16+
ManagedPdfiumDocumentBackend,
17+
ManagedPdfiumPageBackend,
18+
)
1619
from docling.datamodel.backend_options import PdfBackendOptions
1720
from docling.datamodel.base_models import Size
1821
from docling.utils.locks import pypdfium2_lock
@@ -23,10 +26,11 @@
2326
_log = logging.getLogger(__name__)
2427

2528

26-
class DoclingParsePageBackend(PdfPageBackend):
29+
class DoclingParsePageBackend(ManagedPdfiumPageBackend):
2730
def __init__(
2831
self,
2932
*,
33+
owner: "DoclingParseDocumentBackend",
3034
dp_doc: PdfDocument,
3135
page_obj: PdfPage,
3236
page_no: int,
@@ -36,8 +40,9 @@ def __init__(
3640
keep_lines: bool = False,
3741
keep_images: bool = True,
3842
):
43+
super().__init__(owner)
3944
self._ppage = page_obj
40-
self._dp_doc = dp_doc
45+
self._dp_doc: Optional[PdfDocument] = dp_doc
4146
self._page_no = page_no
4247

4348
self._create_words = create_words
@@ -51,6 +56,10 @@ def __init__(
5156
self._unloaded = False
5257
self.valid = (self._ppage is not None) and (self._dp_doc is not None)
5358

59+
def _require_page(self) -> PdfPage:
60+
assert self._ppage is not None, "Page backend was unloaded."
61+
return self._ppage
62+
5463
def _ensure_parsed(self) -> None:
5564
if self._dpage is not None:
5665
return
@@ -72,6 +81,7 @@ def _ensure_parsed(self) -> None:
7281
config.create_line_cells = self._create_textlines
7382
config.enforce_same_font = True
7483

84+
assert self._dp_doc is not None
7585
seg_page = self._dp_doc.get_page(self._page_no + 1, config=config)
7686

7787
# In Docling, all TextCell instances are expected with top-left origin.
@@ -181,25 +191,30 @@ def get_page_image(
181191

182192
def get_size(self) -> Size:
183193
with pypdfium2_lock:
184-
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
194+
page = self._require_page()
195+
return Size(width=page.get_width(), height=page.get_height())
185196

186197
# TODO: Take width and height from docling-parse.
187198
# return Size(
188199
# width=self._dpage.dimension.width,
189200
# height=self._dpage.dimension.height,
190201
# )
191202

192-
def unload(self):
203+
def _close_native_page(self) -> None:
193204
if not self._unloaded and self._dp_doc is not None:
194205
self._dp_doc.unload_pages((self._page_no + 1, self._page_no + 2))
195206
self._unloaded = True
196207

208+
with pypdfium2_lock:
209+
if self._ppage is not None:
210+
self._ppage.close()
211+
197212
self._ppage = None
198213
self._dpage = None
199214
self._dp_doc = None
200215

201216

202-
class DoclingParseDocumentBackend(PdfDocumentBackend):
217+
class DoclingParseDocumentBackend(ManagedPdfiumDocumentBackend):
203218
def __init__(
204219
self,
205220
in_doc: "InputDocument",
@@ -215,7 +230,7 @@ def __init__(
215230
self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
216231
self.parser = DoclingPdfParser(loglevel="fatal")
217232

218-
self.dp_doc: PdfDocument = self.parser.load(
233+
self.dp_doc: Optional[PdfDocument] = self.parser.load(
219234
path_or_stream=self.path_or_stream, password=password
220235
)
221236
success = self.dp_doc is not None
@@ -229,6 +244,7 @@ def page_count(self) -> int:
229244
# return len(self._pdoc) # To be replaced with docling-parse API
230245

231246
len_1 = len(self._pdoc)
247+
assert self.dp_doc is not None
232248
len_2 = self.dp_doc.number_of_pages()
233249

234250
if len_1 != len_2:
@@ -239,10 +255,12 @@ def page_count(self) -> int:
239255
def load_page(
240256
self, page_no: int, create_words: bool = True, create_textlines: bool = True
241257
) -> DoclingParsePageBackend:
258+
assert self.dp_doc is not None
242259
with pypdfium2_lock:
243260
ppage = self._pdoc[page_no]
244261

245262
return DoclingParsePageBackend(
263+
owner=self,
246264
dp_doc=self.dp_doc,
247265
page_obj=ppage,
248266
page_no=page_no,
@@ -253,19 +271,15 @@ def load_page(
253271
def is_valid(self) -> bool:
254272
return self.page_count() > 0
255273

256-
def unload(self):
257-
super().unload()
258-
# Unload docling-parse document first
274+
def _close_native_document(self) -> None:
259275
if self.dp_doc is not None:
260276
self.dp_doc.unload()
261277
self.dp_doc = None
262278

263-
# Then close pypdfium2 document with proper locking
264279
if self._pdoc is not None:
265280
with pypdfium2_lock:
266281
try:
267282
self._pdoc.close()
268283
except Exception:
269-
# Ignore cleanup errors
270284
pass
271285
self._pdoc = None
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
from __future__ import annotations
2+
3+
import threading
4+
from abc import ABC, abstractmethod
5+
from io import BytesIO
6+
from pathlib import Path
7+
from typing import TYPE_CHECKING, Union
8+
9+
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
10+
from docling.datamodel.backend_options import PdfBackendOptions
11+
12+
if TYPE_CHECKING:
13+
from docling.datamodel.document import InputDocument
14+
15+
16+
class ManagedPdfiumDocumentBackend(PdfDocumentBackend, ABC):
    """Shared lifecycle management for PDFium-backed document backends.

    The document keeps a set of the page backends created from it that have
    not been unloaded yet. ``unload()`` unloads those pages first and then
    calls the subclass hook ``_close_native_document()``.
    """

    def __init__(
        self,
        in_doc: InputDocument,
        path_or_stream: Union[BytesIO, Path],
        options: PdfBackendOptions = PdfBackendOptions(),
    ) -> None:
        super().__init__(in_doc, path_or_stream, options)
        # Page backends registered with this document and not yet released.
        self._live_pages: set[ManagedPdfiumPageBackend] = set()
        # The condition wraps this very lock, so entering the condition
        # guards ``_live_pages``, ``_closing`` and ``_closed``.
        self._live_pages_lock = threading.Lock()
        self._live_pages_cond = threading.Condition(self._live_pages_lock)
        self._closing = False  # True while unload() is tearing things down.
        self._closed = False  # True once unload() has finished.

    def _register_live_page(self, page_backend: ManagedPdfiumPageBackend) -> None:
        """Record ``page_backend`` as live.

        Raises:
            RuntimeError: if the document is being unloaded or already is.
        """
        with self._live_pages_cond:
            if self._closing or self._closed:
                raise RuntimeError(
                    "Cannot register a page while the document is closing."
                )
            self._live_pages.add(page_backend)

    def _release_live_page(self, page_backend: ManagedPdfiumPageBackend) -> None:
        """Forget ``page_backend`` and wake any thread waiting on the condition."""
        with self._live_pages_cond:
            # discard() tolerates pages that were never (or are no longer) tracked.
            self._live_pages.discard(page_backend)
            self._live_pages_cond.notify_all()

    def _close_live_pages(self) -> None:
        """Unload pages until the live set is empty."""
        while True:
            # Snapshot under the lock, unload outside of it: page.unload()
            # calls back into _release_live_page(), which needs the same lock.
            with self._live_pages_cond:
                live_pages = list(self._live_pages)
            if not live_pages:
                return
            for page_backend in live_pages:
                page_backend.unload()

    @abstractmethod
    def _close_native_document(self) -> None:
        """Release the native document handles (subclass hook).

        Called by ``unload()`` after the live pages have been unloaded.
        """
        pass

    def unload(self) -> None:
        """Unload all live pages, close the native document, then the base class.

        The call is a no-op when the document was already unloaded or when an
        unload is currently in progress.
        """
        with self._live_pages_cond:
            # Also bail out while a teardown is *in progress*: a concurrent or
            # re-entrant second call must not run the native close twice. This
            # mirrors the ``_closing or _closed`` check in _register_live_page().
            if self._closed or self._closing:
                return
            self._closing = True

        try:
            self._close_live_pages()
            self._close_native_document()
        finally:
            # Mark the document closed even if teardown raised, so that no new
            # page can be registered against it afterwards.
            with self._live_pages_cond:
                self._closed = True
                self._closing = False
                self._live_pages.clear()
                self._live_pages_cond.notify_all()

        super().unload()
76+
77+
class ManagedPdfiumPageBackend(PdfPageBackend, ABC):
78+
"""Shared page lifecycle for PDFium-backed page backends."""
79+
80+
def __init__(self, owner: ManagedPdfiumDocumentBackend) -> None:
81+
self._owner: ManagedPdfiumDocumentBackend | None = owner
82+
self._closed = False
83+
owner._register_live_page(self)
84+
85+
@abstractmethod
86+
def _close_native_page(self) -> None:
87+
pass
88+
89+
def unload(self) -> None:
90+
if self._closed:
91+
return
92+
93+
owner = self._owner
94+
try:
95+
self._close_native_page()
96+
finally:
97+
self._closed = True
98+
self._owner = None
99+
if owner is not None:
100+
owner._release_live_page(self)

docling/backend/pypdfium2_backend.py

Lines changed: 40 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,10 @@
2020
from pypdfium2 import PdfTextPage
2121
from pypdfium2._helpers.misc import PdfiumError
2222

23-
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
23+
from docling.backend.managed_pdfium_backend import (
24+
ManagedPdfiumDocumentBackend,
25+
ManagedPdfiumPageBackend,
26+
)
2427
from docling.datamodel.backend_options import PdfBackendOptions
2528
from docling.utils.locks import pypdfium2_lock
2629

@@ -105,14 +108,20 @@ def get_pdf_page_geometry(
105108
_PYPDFIUM2_MAJOR_VERSION = int(version("pypdfium2").split(".")[0])
106109

107110

108-
class PyPdfiumPageBackend(PdfPageBackend):
111+
class PyPdfiumPageBackend(ManagedPdfiumPageBackend):
109112
def __init__(
110-
self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
113+
self,
114+
owner: "PyPdfiumDocumentBackend",
115+
pdfium_doc: pdfium.PdfDocument,
116+
document_hash: str,
117+
page_no: int,
111118
):
119+
super().__init__(owner)
112120
# Note: lock applied by the caller
113121
self.valid = True # No better way to tell from pypdfium.
122+
self._ppage: pdfium.PdfPage | None = None
114123
try:
115-
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
124+
self._ppage = pdfium_doc[page_no]
116125
except PdfiumError:
117126
_log.info(
118127
f"An exception occurred when loading page {page_no} of document {document_hash}.",
@@ -124,11 +133,15 @@ def __init__(
124133
def is_valid(self) -> bool:
125134
return self.valid
126135

136+
def _require_page(self) -> pdfium.PdfPage:
137+
assert self._ppage is not None, "Page backend was unloaded."
138+
return self._ppage
139+
127140
def _compute_text_cells(self) -> List[TextCell]:
128141
"""Compute text cells from pypdfium."""
129142
with pypdfium2_lock:
130143
if not self.text_page:
131-
self.text_page = self._ppage.get_textpage()
144+
self.text_page = self._require_page().get_textpage()
132145

133146
cells = []
134147
cell_counter = 0
@@ -263,8 +276,9 @@ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
263276
page_size = self.get_size()
264277

265278
with pypdfium2_lock:
266-
rotation = self._ppage.get_rotation()
267-
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
279+
page = self._require_page()
280+
rotation = page.get_rotation()
281+
for obj in page.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
268282
if _PYPDFIUM2_MAJOR_VERSION >= 5:
269283
pos = obj.get_bounds() # pypdfium2 >= 5.x
270284
else:
@@ -301,7 +315,7 @@ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
301315
def get_text_in_rect(self, bbox: BoundingBox) -> str:
302316
with pypdfium2_lock:
303317
if not self.text_page:
304-
self.text_page = self._ppage.get_textpage()
318+
self.text_page = self._require_page().get_textpage()
305319

306320
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
307321
bbox = bbox.to_bottom_left_origin(self.get_size().height)
@@ -318,7 +332,7 @@ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
318332
text_cells = self._compute_text_cells()
319333

320334
# Get the PDF page geometry from pypdfium2
321-
dimension = get_pdf_page_geometry(self._ppage)
335+
dimension = get_pdf_page_geometry(self._require_page())
322336

323337
# Create SegmentedPdfPage
324338
return SegmentedPdfPage(
@@ -356,7 +370,7 @@ def get_page_image(
356370
padbox.t = page_size.height - padbox.t
357371

358372
with pypdfium2_lock:
359-
bitmap = self._ppage.render(
373+
bitmap = self._require_page().render(
360374
scale=scale * 1.5,
361375
rotation=0, # no additional rotation
362376
crop=padbox.as_tuple(),
@@ -372,15 +386,21 @@ def get_page_image(
372386

373387
def get_size(self) -> Size:
374388
with pypdfium2_lock:
375-
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
389+
page = self._require_page()
390+
return Size(width=page.get_width(), height=page.get_height())
376391

377-
def unload(self):
392+
def _close_native_page(self) -> None:
378393
with pypdfium2_lock:
379-
self._ppage = None
380-
self.text_page = None
394+
if self.text_page is not None:
395+
self.text_page.close()
396+
if self._ppage is not None:
397+
self._ppage.close()
398+
399+
self.text_page = None
400+
self._ppage = None
381401

382402

383-
class PyPdfiumDocumentBackend(PdfDocumentBackend):
403+
class PyPdfiumDocumentBackend(ManagedPdfiumDocumentBackend):
384404
def __init__(
385405
self,
386406
in_doc: "InputDocument",
@@ -406,13 +426,13 @@ def page_count(self) -> int:
406426

407427
def load_page(self, page_no: int) -> PyPdfiumPageBackend:
408428
with pypdfium2_lock:
409-
return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
429+
return PyPdfiumPageBackend(self, self._pdoc, self.document_hash, page_no)
410430

411431
def is_valid(self) -> bool:
412432
return self.page_count() > 0
413433

414-
def unload(self):
415-
super().unload()
434+
def _close_native_document(self) -> None:
416435
with pypdfium2_lock:
417-
self._pdoc.close()
418-
self._pdoc = None
436+
if self._pdoc is not None:
437+
self._pdoc.close()
438+
self._pdoc = None

0 commit comments

Comments
 (0)