fix: Correctly patch pdfminer to avoid unnecessarily and unsuccessfully repairing PDFs with long content streams, causing needless and endless OCR (#3822)

dhdaines · web-flow · commit 9e5ff225f656 · 2025-01-24T14:27:25.000-06:00
Fixes: #3815 Verified on my very large documents that it doesn't unnecessarily and unsuccessfully "repair" them. You may or may not wish to keep the version check in `patch_psparser`. Since ~you're pinning the version of pdfminer.six and since it isn't guaranteed that the bug in question will be fixed in the next pdfminer.six release (but it is rather serious, so I should hope so), then perhaps you just want to unconditionally patch it.~ it seems like pinning of versions is only operative when running from Docker (good!) so never mind! Keep that version check! Also corrected an import so that if you do feel like using a newer version of pdfminer.six, it won't break on you. --------- Authored-by: David Huggins-Daines <dhdaines@logisphere.ca>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,12 @@
-## 0.16.16-dev1
+## 0.16.16-dev2
 
 ### Enhancements
 
 ### Features
 - **Vectorize layout (inferred, extracted, and OCR) data structure** Using `np.ndarray` to store a group of layout elements or text regions instead of using a list of objects. This improves the memory efficiency and compute speed around layout merging and deduplication.
 
 ### Fixes
+- **Correctly patch pdfminer to avoid PDF repair**. The patch applied to pdfminer's parser caused it to occasionally split tokens in content streams, throwing `PDFSyntaxError`.  Repairing these PDFs sometimes failed (since they were not actually invalid) resulting in unnecessary OCR fallback.
 
 * **Drop usage of ndjson dependency**
 
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1205,8 +1205,8 @@ def test_partition_pdf_with_fast_finds_headers_footers(
 @pytest.mark.parametrize(
     ("filename", "expected_log"),
     [
+        # This one is *actually* an invalid PDF document
         ("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
-        ("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
     ],
 )
 def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
@@ -1215,6 +1215,20 @@ def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_lo
     assert expected_log in caplog.text
 
 
+@pytest.mark.parametrize(
+    ("filename", "expected_log"),
+    [
+        # This one is *not* an invalid PDF document, make sure we
+        # don't try to "repair" it unnecessarily
+        ("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
+    ],
+)
+def test_properly_patch_pdfminer(filename, expected_log, caplog):
+    caplog.set_level(logging.INFO)
+    assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}"))
+    assert expected_log not in caplog.text
+
+
 def assert_element_extraction(
     elements: list[Element],
     extract_image_block_types: list[str],
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.16-dev1"  # pragma: no cover
+__version__ = "0.16.16-dev2"  # pragma: no cover
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -11,7 +11,6 @@
 
 import numpy as np
 import wrapt
-from pdfminer import psparser
 from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
 from pdfminer.utils import open_filename
 from pi_heif import register_heif_opener
@@ -96,15 +95,18 @@
     PartitionStrategy,
 )
 from unstructured.partition.utils.sorting import coord_has_valid_points, sort_page_elements
-from unstructured.patches.pdfminer import parse_keyword
+from unstructured.patches.pdfminer import patch_psparser
 from unstructured.utils import first, requires_dependencies
 
 if TYPE_CHECKING:
     pass
 
-# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
-# the bug: https://github.com/pdfminer/pdfminer.six/pull/885
-psparser.PSBaseParser._parse_keyword = parse_keyword  # type: ignore
+
+# Correct a bug that was introduced by a previous patch to
+# pdfminer.six, causing needless and unsuccessful repairing of PDFs
+# which were not actually broken.
+patch_psparser()
+
 
 RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
 
diff --git a/unstructured/partition/pdf_image/pdfminer_utils.py b/unstructured/partition/pdf_image/pdfminer_utils.py
@@ -6,7 +6,7 @@
 from pdfminer.layout import LAParams, LTContainer, LTImage, LTItem, LTTextLine
 from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
 from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfparser import PSSyntaxError
+from pdfminer.psparser import PSSyntaxError
 
 from unstructured.logger import logger
 from unstructured.utils import requires_dependencies
diff --git a/unstructured/patches/pdfminer.py b/unstructured/patches/pdfminer.py
@@ -1,18 +1,35 @@
-from typing import Union
+import functools
+from typing import Tuple, Union
 
-from pdfminer.psparser import END_KEYWORD, KWD, PSBaseParser, PSKeyword
+import pdfminer
+from pdfminer.psparser import (
+    END_KEYWORD,
+    KWD,
+    PSEOF,
+    PSBaseParser,
+    PSBaseParserToken,
+    PSKeyword,
+    log,
+)
 
+factory_seek = PSBaseParser.seek
 
-def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
-    """Patch for pdfminer method _parse_keyword of PSBaseParser. Changes are identical to the PR
-    https://github.com/pdfminer/pdfminer.six/pull/885."""
+
+@functools.wraps(PSBaseParser.seek)
+def seek(self: PSBaseParser, pos: int) -> None:
+    factory_seek(self, pos)
+    self.eof = False
+
+
+@functools.wraps(PSBaseParser._parse_keyword)
+def _parse_keyword(self, s: bytes, i: int) -> int:
     m = END_KEYWORD.search(s, i)
-    if not m:
-        j = len(s)
-        self._curtoken += s[i:]
-    else:
+    if m:
         j = m.start(0)
         self._curtoken += s[i:j]
+    else:
+        self._curtoken += s[i:]
+        return len(s)
     if self._curtoken == b"true":
         token: Union[bool, PSKeyword] = True
     elif self._curtoken == b"false":
@@ -22,3 +39,38 @@ def parse_keyword(self: PSBaseParser, s: bytes, i: int) -> int:
     self._add_token(token)
     self._parse1 = self._parse_main
     return j
+
+
+@functools.wraps(PSBaseParser.nexttoken)
+def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
+    if self.eof:
+        # It's not really unexpected, come on now...
+        raise PSEOF("Unexpected EOF")
+    while not self._tokens:
+        try:
+            self.fillbuf()
+            self.charpos = self._parse1(self.buf, self.charpos)
+        except PSEOF:
+            # If we hit EOF in the middle of a token, try to parse
+            # it by tacking on whitespace, and delay raising PSEOF
+            # until next time around
+            self.charpos = self._parse1(b"\n", 0)
+            self.eof = True
+            # Oh, so there wasn't actually a token there? OK.
+            if not self._tokens:
+                raise
+    token = self._tokens.pop(0)
+    log.debug("nexttoken: %r", token)
+    return token
+
+
+def patch_psparser():
+    """Monkey-patch certain versions of pdfminer.six to avoid dropping
+    tokens at EOF (before 20231228) and splitting tokens at buffer
+    boundaries (20231228 and 20240706).
+    """
+    # Presuming the bug will be fixed in the next release
+    if pdfminer.__version__ <= "20240706":
+        PSBaseParser.seek = seek
+        PSBaseParser._parse_keyword = _parse_keyword
+        PSBaseParser.nexttoken = nexttoken

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.16.16-dev1" # pragma: no cover`
	`1`	`+__version__ = "0.16.16-dev2" # pragma: no cover`