Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 9e5ff22

Browse files
authored Jan 24, 2025
fix: Correctly patch pdfminer to avoid unnecessarily and unsuccessfully repairing PDFs with long content streams, causing needless and endless OCR (#3822)
Fixes: #3815

Verified on my very large documents that it doesn't unnecessarily and unsuccessfully "repair" them.

You may or may not wish to keep the version check in `patch_psparser`. ~Since you're pinning the version of pdfminer.six and since it isn't guaranteed that the bug in question will be fixed in the next pdfminer.six release (but it is rather serious, so I should hope so), then perhaps you just want to unconditionally patch it.~ It seems that version pinning is only operative when running from Docker (good!), so never mind! Keep that version check!

Also corrected an import so that if you do feel like using a newer version of pdfminer.six, it won't break on you.

---------

Authored-by: David Huggins-Daines <dhdaines@logisphere.ca>
1 parent e230364 commit 9e5ff22

File tree

6 files changed

+87
-18
lines changed

6 files changed

+87
-18
lines changed
 

Diff for: ‎CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
## 0.16.16-dev1
1+
## 0.16.16-dev2
22

33
### Enhancements
44

55
### Features
66
- **Vectorize layout (inferred, extracted, and OCR) data structure** Using `np.ndarray` to store a group of layout elements or text regions instead of using a list of objects. This improves the memory efficiency and compute speed around layout merging and deduplication.
77

88
### Fixes
9+
- **Correctly patch pdfminer to avoid PDF repair**. The patch applied to pdfminer's parser caused it to occasionally split tokens in content streams, throwing `PDFSyntaxError`. Repairing these PDFs sometimes failed (since they were not actually invalid) resulting in unnecessary OCR fallback.
910

1011
* **Drop usage of ndjson dependency**
1112

Diff for: ‎test_unstructured/partition/pdf_image/test_pdf.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -1205,8 +1205,8 @@ def test_partition_pdf_with_fast_finds_headers_footers(
12051205
@pytest.mark.parametrize(
12061206
("filename", "expected_log"),
12071207
[
1208+
# This one is *actually* an invalid PDF document
12081209
("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
1209-
("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
12101210
],
12111211
)
12121212
def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
@@ -1215,6 +1215,20 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
12151215
assert expected_log in caplog.text
12161216

12171217

1218+
@pytest.mark.parametrize(
1219+
("filename", "expected_log"),
1220+
[
1221+
# This one is *not* an invalid PDF document, make sure we
1222+
# don't try to "repair" it unnecessarily
1223+
("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
1224+
],
1225+
)
1226+
def test_properly_patch_pdfminer(filename, expected_log, caplog):
1227+
caplog.set_level(logging.INFO)
1228+
assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}"))
1229+
assert expected_log not in caplog.text
1230+
1231+
12181232
def assert_element_extraction(
12191233
elements: list[Element],
12201234
extract_image_block_types: list[str],

Diff for: ‎unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.16.16-dev1" # pragma: no cover
1+
__version__ = "0.16.16-dev2" # pragma: no cover

Diff for: ‎unstructured/partition/pdf.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111

1212
import numpy as np
1313
import wrapt
14-
from pdfminer import psparser
1514
from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
1615
from pdfminer.utils import open_filename
1716
from pi_heif import register_heif_opener
@@ -96,15 +95,18 @@
9695
PartitionStrategy,
9796
)
9897
from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements
99-
from unstructured.patches.pdfminer import parse_keyword
98+
from unstructured.patches.pdfminer import patch_psparser
10099
from unstructured.utils import first, requires_dependencies
101100

102101
if TYPE_CHECKING:
103102
pass
104103

105-
# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
106-
# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
107-
psparser.PSBaseParser._parse_keyword = parse_keyword # type: ignore
104+
105+
# Correct a bug that was introduced by a previous patch to
106+
# pdfminer.six, causing needless and unsuccessful repairing of PDFs
107+
# which were not actually broken.
108+
patch_psparser()
109+
108110

109111
RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
110112

Diff for: ‎unstructured/partition/pdf_image/pdfminer_utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine
77
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
88
from pdfminer.pdfpage import PDFPage
9-
from pdfminer.pdfparser import PSSyntaxError
9+
from pdfminer.psparser import PSSyntaxError
1010

1111
from unstructured.logger import logger
1212
from unstructured.utils import requires_dependencies

Diff for: ‎unstructured/patches/pdfminer.py

+61-9
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,35 @@
1-
from typing import Union
1+
import functools
2+
from typing import Tuple, Union
23

3-
from pdfminer.psparser import END_KEYWORD, KWD, PSBaseParser, PSKeyword
4+
import pdfminer
5+
from pdfminer.psparser import (
6+
END_KEYWORD,
7+
KWD,
8+
PSEOF,
9+
PSBaseParser,
10+
PSBaseParserToken,
11+
PSKeyword,
12+
log,
13+
)
414

15+
factory_seek = PSBaseParser.seek
516

6-
def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
7-
"""Patch for pdfminer method _parse_keyword of PSBaseParser. Changes are identical to the PR
8-
https://github.com/pdfminer/pdfminer.six/pull/885."""
17+
18+
@functools.wraps(PSBaseParser.seek)
19+
def seek(self: PSBaseParser, pos: int) -> None:
20+
factory_seek(self, pos)
21+
self.eof = False
22+
23+
24+
@functools.wraps(PSBaseParser._parse_keyword)
25+
def _parse_keyword(self, s: bytes, i: int) -> int:
926
m = END_KEYWORD.search(s, i)
10-
if not m:
11-
j = len(s)
12-
self._curtoken += s[i:]
13-
else:
27+
if m:
1428
j = m.start(0)
1529
self._curtoken += s[i:j]
30+
else:
31+
self._curtoken += s[i:]
32+
return len(s)
1633
if self._curtoken == b"true":
1734
token: Union[bool, PSKeyword] = True
1835
elif self._curtoken == b"false":
@@ -22,3 +39,38 @@ def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
2239
self._add_token(token)
2340
self._parse1 = self._parse_main
2441
return j
42+
43+
44+
@functools.wraps(PSBaseParser.nexttoken)
45+
def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
46+
if self.eof:
47+
# It's not really unexpected, come on now...
48+
raise PSEOF("Unexpected EOF")
49+
while not self._tokens:
50+
try:
51+
self.fillbuf()
52+
self.charpos = self._parse1(self.buf, self.charpos)
53+
except PSEOF:
54+
# If we hit EOF in the middle of a token, try to parse
55+
# it by tacking on whitespace, and delay raising PSEOF
56+
# until next time around
57+
self.charpos = self._parse1(b"\n", 0)
58+
self.eof = True
59+
# Oh, so there wasn't actually a token there? OK.
60+
if not self._tokens:
61+
raise
62+
token = self._tokens.pop(0)
63+
log.debug("nexttoken: %r", token)
64+
return token
65+
66+
67+
def patch_psparser():
68+
"""Monkey-patch certain versions of pdfminer.six to avoid dropping
69+
tokens at EOF (before 20231228) and splitting tokens at buffer
70+
boundaries (20231228 and 20240706).
71+
"""
72+
# Presuming the bug will be fixed in the next release
73+
if pdfminer.__version__ <= "20240706":
74+
PSBaseParser.seek = seek
75+
PSBaseParser._parse_keyword = _parse_keyword
76+
PSBaseParser.nexttoken = nexttoken

0 commit comments

Comments
 (0)
Please sign in to comment.