
Commit d659a1e

refactor: cache inline image (#149)

* benchmark
* why top = None
* cache inline image

1 parent 068d913 commit d659a1e

File tree

8 files changed: +82 -16 lines changed


benchmarks/type3_charproc.py

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+"""
+Benchmark type3 charprocs.
+"""
+
+import logging
+import time
+from pathlib import Path
+
+import playa
+
+CONTRIB = Path(__file__).parent.parent / "samples" / "contrib"
+
+LOG = logging.getLogger("benchmark-text")
+PDFS = ["scp05.pdf"]
+
+
+def benchmark_type3_charprocs(path: Path):
+    with playa.open(path) as pdf:
+        for page in pdf.pages:
+            for glyph in page.glyphs:
+                for obj in glyph:
+                    _ = obj
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.ERROR)
+    niter = 1
+    t = 0.0
+    for iter in range(niter + 1):
+        for name in PDFS:
+            path = CONTRIB / name
+            start = time.time()
+            benchmark_type3_charprocs(path)
+            if iter != 0:
+                t += time.time() - start
+    print("charprocs took %d ms / iter" % (t / niter * 1000,))

playa/content.py

Lines changed: 2 additions & 2 deletions

@@ -521,7 +521,7 @@ def buffer(self) -> bytes:
     @property
     def tokens(self) -> Iterator[Token]:
         """Iterate over tokens in the XObject's content stream."""
-        parser = ContentParser([self.stream])
+        parser = ContentParser([self.stream], self.doc)
         while True:
             try:
                 pos, tok = parser.nexttoken()

@@ -532,7 +532,7 @@ def tokens(self) -> Iterator[Token]:
     @property
     def contents(self) -> Iterator[PDFObject]:
         """Iterator over PDF objects in the content stream."""
-        for pos, obj in ContentParser([self.stream]):
+        for pos, obj in ContentParser([self.stream], self.doc):
             yield obj

     def __iter__(self) -> Iterator["ContentObject"]:

playa/document.py

Lines changed: 4 additions & 0 deletions

@@ -53,6 +53,7 @@
 from playa.pdftypes import (
     ContentStream,
     DecipherCallable,
+    InlineImage,
     ObjRef,
     dict_value,
     int_value,

@@ -204,6 +205,9 @@ def __init__(
         self._cached_objs: Dict[int, PDFObject] = {}
         self._parsed_objs: Dict[int, Tuple[List[PDFObject], int]] = {}
         self._cached_fonts: Dict[int, Font] = {}
+        self._cached_inline_images: Dict[
+            Tuple[int, int], Tuple[int, Optional[InlineImage]]
+        ] = {}
         if isinstance(fp, io.TextIOBase):
             raise TypeError("fp is not a binary file")
         self.pdf_version, self.offset, self.buffer = _open_input(fp)
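The new mapping gives each Document a single shared cache for inline images: the key pairs the object id of the content stream with the byte offset at which the image's BI keyword starts, and the value records the offset where parsing ended together with the parsed InlineImage (or None when parsing failed). A minimal sketch of the shape, using hypothetical values:

from typing import Dict, Optional, Tuple

class InlineImage:  # stand-in for playa.pdftypes.InlineImage, to keep the sketch self-contained
    ...

# (stream object id, offset of BI) -> (offset just past the image, parsed image or None)
_cached_inline_images: Dict[Tuple[int, int], Tuple[int, Optional[InlineImage]]] = {}

# Hypothetical entry: the image starting at offset 132 of stream object 7 was parsed
# once, parsing ended at offset 4096, and every later pass reuses the same object.
_cached_inline_images[(7, 132)] = (4096, InlineImage())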

playa/interp.py

Lines changed: 1 addition & 1 deletion

@@ -227,7 +227,7 @@ def pop(self, n: int) -> List[PDFObject]:
         return x

     def __iter__(self) -> Iterator[ContentObject]:
-        parser = ContentParser(self.contents)
+        parser = ContentParser(self.contents, self.page.doc)
         for _, obj in parser:
             # These are handled inside the parser as they don't obey
             # the normal syntax rules (PDF 1.7 sec 8.9.7)

playa/page.py

Lines changed: 2 additions & 2 deletions

@@ -237,7 +237,7 @@ def height(self) -> float:
     @property
     def contents(self) -> Iterator[PDFObject]:
         """Iterator over PDF objects in the content streams."""
-        for _, obj in ContentParser(self._contents):
+        for _, obj in ContentParser(self._contents, self.doc):
             yield obj

     def __iter__(self) -> Iterator["ContentObject"]:

@@ -301,7 +301,7 @@ def xobjects_one(
     @property
     def tokens(self) -> Iterator[Token]:
         """Iterator over tokens in the content streams."""
-        parser = ContentParser(self._contents)
+        parser = ContentParser(self._contents, self.doc)
         while True:
             try:
                 pos, tok = parser.nexttoken()

playa/parser.py

Lines changed: 27 additions & 11 deletions

@@ -302,11 +302,13 @@ def __init__(
         doc: Union["Document", None] = None,
         pos: int = 0,
         strict: bool = False,
+        streamid: Union[int, None] = None,
     ) -> None:
         self._lexer = Lexer(data, pos)
         self.stack: List[StackEntry] = []
         self.docref = None if doc is None else _ref_document(doc)
         self.strict = strict
+        self.streamid = streamid

     @property
     def doc(self) -> Union["Document", None]:

@@ -315,9 +317,12 @@ def doc(self) -> Union["Document", None]:
             return None
         return _deref_document(self.docref)

-    def newstream(self, data: Union[bytes, mmap.mmap]) -> None:
+    def newstream(
+        self, data: Union[bytes, mmap.mmap], streamid: Union[int, None] = None
+    ) -> None:
         """Continue parsing from a new data stream."""
         self._lexer = Lexer(data)
+        self.streamid = streamid

     def reset(self) -> None:
         """Clear internal parser state."""

@@ -347,7 +352,6 @@ def __next__(self) -> StackEntry:
                         raise e
                     log.warning("When constructing array from %r: %s", obj, e)
                 if pos == top:
-                    top = None
                     return pos, obj
                 self.stack.append((pos, obj))
             elif token is KEYWORD_DICT_BEGIN:

@@ -372,7 +376,6 @@ def __next__(self) -> StackEntry:
                         raise e
                     log.warning("When constructing dict from %r: %s", self.stack, e)
                 if pos == top:
-                    top = None
                     return pos, obj
                 self.stack.append((pos, obj))
             elif token is KEYWORD_PROC_BEGIN:

@@ -387,7 +390,6 @@ def __next__(self) -> StackEntry:
                         raise e
                     log.warning("When constructing proc from %r: %s", obj, e)
                 if pos == top:
-                    top = None
                     return pos, obj
                 self.stack.append((pos, obj))
             elif token is KEYWORD_NULL:

@@ -409,13 +411,27 @@ def __next__(self) -> StackEntry:
                         "Inline image not at top level of stream "
                         f"({pos} != {top}, {self.stack})"
                     )
-                top = pos
-                self.stack.append((pos, token))
+                if (
+                    self.doc is not None
+                    and self.streamid is not None
+                    and (inline_image_id := (self.streamid, pos))
+                    in self.doc._cached_inline_images
+                ):
+                    end, obj = self.doc._cached_inline_images[inline_image_id]
+                    self.seek(end)
+                    if obj is not None:
+                        return pos, obj
+                else:
+                    top = pos
+                    self.stack.append((pos, token))
             elif token is KEYWORD_ID:
                 obj = self.get_inline_image(pos, token)
+                assert top is not None
+                if self.doc is not None and self.streamid is not None:
+                    inline_image_id = (self.streamid, top)
+                    self.doc._cached_inline_images[inline_image_id] = self.tell(), obj
                 if obj is not None:
-                    top = None
-                    return pos, obj
+                    return top, obj
             else:
                 # Literally anything else, including any other keyword
                 # (will be returned above if top is None, or later if

@@ -902,11 +918,11 @@ class ContentParser(ObjectParser):
     the page’s logical content or organization.
     """

-    def __init__(self, streams: Iterable[PDFObject]) -> None:
+    def __init__(self, streams: Iterable[PDFObject], doc: "Document") -> None:
         self.streamiter = iter(streams)
         try:
             stream = stream_value(next(self.streamiter))
-            super().__init__(stream.buffer)
+            super().__init__(stream.buffer, doc, streamid=stream.objid)
         except StopIteration:
             super().__init__(b"")
         except TypeError:

@@ -928,6 +944,6 @@ def nexttoken(self) -> Tuple[int, Token]:
             try:
                 ref = next(self.streamiter)
                 stream = stream_value(ref)
-                self.newstream(stream.buffer)
+                self.newstream(stream.buffer, streamid=stream.objid)
             except TypeError:
                 log.warning("Found non-stream in contents: %r", ref)

samples/contrib/scp05.pdf

1.56 MB
Binary file not shown.

tests/test_object_parser.py

Lines changed: 10 additions & 0 deletions

@@ -3,6 +3,7 @@

 import pytest

+from playa.document import Document
 from playa.parser import (
     KEYWORD_DICT_BEGIN,
     KEYWORD_DICT_END,

@@ -417,6 +418,15 @@ def test_inline_images():
     assert img.buffer == b"OLDMACDONALDEIEIO"


+def test_cached_inline_images():
+    doc = Document(b"")
+    first = list(ObjectParser(INLINEDATA1, doc, streamid=0))
+    second = list(ObjectParser(INLINEDATA1, doc, streamid=0))
+    assert first == second
+    third = list(ObjectParser(INLINEDATA1, doc, streamid=1))
+    assert first != third
+
+
 def test_reverse_solidus():
     """Test the handling of useless backslashes that are not escapes."""
     parser = Lexer(rb"(OMG\ WTF \W \T\ F)")
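At the library level the cache is transparent: a second traversal of the same page reuses the (streamid, offset) entries populated by the first. An end-to-end sketch, assuming the scp05.pdf sample added by this commit is available locally:

from pathlib import Path

import playa

with playa.open(Path("samples/contrib/scp05.pdf")) as pdf:
    page = next(iter(pdf.pages))
    # The first pass parses inline images and fills Document._cached_inline_images
    # (internal attribute name per this commit); the second pass reuses those entries.
    first = [type(obj).__name__ for obj in page]
    second = [type(obj).__name__ for obj in page]
    assert first == second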
