Skip to content

Commit 5e3510f

Browse files
cau-git and claude committed
refactor: strip dead live-page tracking from managed PDFium backend
The Condition, Lock, _live_pages set, _closing flag, and owner back-reference on pages were remnants of the Group-3b pipeline's defensive shutdown logic, which was not included here.

The pipeline always unloads page backends before calling document.unload(), so _close_live_pages() was always a no-op and notify_all() never had any waiters.

Reduced ManagedPdfiumDocumentBackend and ManagedPdfiumPageBackend to just a _closed guard and the abstract _close_native_* dispatch.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent b389c82 commit 5e3510f

3 files changed

Lines changed: 10 additions & 60 deletions

File tree

docling/backend/docling_parse_backend.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ class DoclingParsePageBackend(ManagedPdfiumPageBackend):
3030
def __init__(
3131
self,
3232
*,
33-
owner: "DoclingParseDocumentBackend",
3433
dp_doc: PdfDocument,
3534
page_obj: PdfPage,
3635
page_no: int,
@@ -40,7 +39,7 @@ def __init__(
4039
keep_lines: bool = False,
4140
keep_images: bool = True,
4241
):
43-
super().__init__(owner)
42+
super().__init__()
4443
self._ppage = page_obj
4544
self._dp_doc: Optional[PdfDocument] = dp_doc
4645
self._page_no = page_no
@@ -260,7 +259,6 @@ def load_page(
260259
ppage = self._pdoc[page_no]
261260

262261
return DoclingParsePageBackend(
263-
owner=self,
264262
dp_doc=self.dp_doc,
265263
page_obj=ppage,
266264
page_no=page_no,
Lines changed: 7 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
import threading
43
from abc import ABC, abstractmethod
54
from io import BytesIO
65
from pathlib import Path
@@ -23,64 +22,25 @@ def __init__(
2322
options: PdfBackendOptions = PdfBackendOptions(),
2423
) -> None:
2524
super().__init__(in_doc, path_or_stream, options)
26-
self._live_pages: set[ManagedPdfiumPageBackend] = set()
27-
self._live_pages_lock = threading.Lock()
28-
self._live_pages_cond = threading.Condition(self._live_pages_lock)
29-
self._closing = False
3025
self._closed = False
3126

32-
def _register_live_page(self, page_backend: ManagedPdfiumPageBackend) -> None:
33-
with self._live_pages_cond:
34-
if self._closing or self._closed:
35-
raise RuntimeError(
36-
"Cannot register a page while the document is closing."
37-
)
38-
self._live_pages.add(page_backend)
39-
40-
def _release_live_page(self, page_backend: ManagedPdfiumPageBackend) -> None:
41-
with self._live_pages_cond:
42-
self._live_pages.discard(page_backend)
43-
self._live_pages_cond.notify_all()
44-
45-
def _close_live_pages(self) -> None:
46-
while True:
47-
with self._live_pages_cond:
48-
live_pages = list(self._live_pages)
49-
if not live_pages:
50-
return
51-
for page_backend in live_pages:
52-
page_backend.unload()
53-
5427
@abstractmethod
5528
def _close_native_document(self) -> None:
5629
pass
5730

5831
def unload(self) -> None:
59-
with self._live_pages_cond:
60-
if self._closed:
61-
return
62-
self._closing = True
63-
64-
try:
65-
self._close_live_pages()
66-
self._close_native_document()
67-
finally:
68-
with self._live_pages_cond:
69-
self._closed = True
70-
self._closing = False
71-
self._live_pages.clear()
72-
self._live_pages_cond.notify_all()
73-
32+
if self._closed:
33+
return
34+
self._closed = True
35+
self._close_native_document()
7436
super().unload()
7537

7638

7739
class ManagedPdfiumPageBackend(PdfPageBackend, ABC):
7840
"""Shared page lifecycle for PDFium-backed page backends."""
7941

80-
def __init__(self, owner: ManagedPdfiumDocumentBackend) -> None:
81-
self._owner: ManagedPdfiumDocumentBackend | None = owner
42+
def __init__(self) -> None:
8243
self._closed = False
83-
owner._register_live_page(self)
8444

8545
@abstractmethod
8646
def _close_native_page(self) -> None:
@@ -89,12 +49,5 @@ def _close_native_page(self) -> None:
8949
def unload(self) -> None:
9050
if self._closed:
9151
return
92-
93-
owner = self._owner
94-
try:
95-
self._close_native_page()
96-
finally:
97-
self._closed = True
98-
self._owner = None
99-
if owner is not None:
100-
owner._release_live_page(self)
52+
self._closed = True
53+
self._close_native_page()

docling/backend/pypdfium2_backend.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,12 +111,11 @@ def get_pdf_page_geometry(
111111
class PyPdfiumPageBackend(ManagedPdfiumPageBackend):
112112
def __init__(
113113
self,
114-
owner: "PyPdfiumDocumentBackend",
115114
pdfium_doc: pdfium.PdfDocument,
116115
document_hash: str,
117116
page_no: int,
118117
):
119-
super().__init__(owner)
118+
super().__init__()
120119
# Note: lock applied by the caller
121120
self.valid = True # No better way to tell from pypdfium.
122121
self._ppage: pdfium.PdfPage | None = None
@@ -426,7 +425,7 @@ def page_count(self) -> int:
426425

427426
def load_page(self, page_no: int) -> PyPdfiumPageBackend:
428427
with pypdfium2_lock:
429-
return PyPdfiumPageBackend(self, self._pdoc, self.document_hash, page_no)
428+
return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
430429

431430
def is_valid(self) -> bool:
432431
return self.page_count() > 0

0 commit comments

Comments
 (0)