
Commit d659a1e

refactor: cache inline image (#149)

* benchmark
* why top = None
* cache inline image

1 parent 068d913 commit d659a1e

File tree

8 files changed: +82 -16 lines changed


benchmarks/type3_charproc.py

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+"""
+Benchmark type3 charprocs.
+"""
+
+import logging
+import time
+from pathlib import Path
+
+import playa
+
+CONTRIB = Path(__file__).parent.parent / "samples" / "contrib"
+
+LOG = logging.getLogger("benchmark-text")
+PDFS = ["scp05.pdf"]
+
+
+def benchmark_type3_charprocs(path: Path):
+    with playa.open(path) as pdf:
+        for page in pdf.pages:
+            for glyph in page.glyphs:
+                for obj in glyph:
+                    _ = obj
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.ERROR)
+    niter = 1
+    t = 0.0
+    for iter in range(niter + 1):
+        for name in PDFS:
+            path = CONTRIB / name
+            start = time.time()
+            benchmark_type3_charprocs(path)
+            if iter != 0:
+                t += time.time() - start
+    print("charprocs took %d ms / iter" % (t / niter * 1000,))

playa/content.py

Lines changed: 2 additions & 2 deletions

@@ -521,7 +521,7 @@ def buffer(self) -> bytes:
     @property
     def tokens(self) -> Iterator[Token]:
         """Iterate over tokens in the XObject's content stream."""
-        parser = ContentParser([self.stream])
+        parser = ContentParser([self.stream], self.doc)
         while True:
             try:
                 pos, tok = parser.nexttoken()

@@ -532,7 +532,7 @@ def tokens(self) -> Iterator[Token]:
     @property
     def contents(self) -> Iterator[PDFObject]:
         """Iterator over PDF objects in the content stream."""
-        for pos, obj in ContentParser([self.stream]):
+        for pos, obj in ContentParser([self.stream], self.doc):
             yield obj

     def __iter__(self) -> Iterator["ContentObject"]:

playa/document.py

Lines changed: 4 additions & 0 deletions

@@ -53,6 +53,7 @@
 from playa.pdftypes import (
     ContentStream,
     DecipherCallable,
+    InlineImage,
     ObjRef,
     dict_value,
     int_value,

@@ -204,6 +205,9 @@ def __init__(
         self._cached_objs: Dict[int, PDFObject] = {}
         self._parsed_objs: Dict[int, Tuple[List[PDFObject], int]] = {}
         self._cached_fonts: Dict[int, Font] = {}
+        self._cached_inline_images: Dict[
+            Tuple[int, int], Tuple[int, Optional[InlineImage]]
+        ] = {}
         if isinstance(fp, io.TextIOBase):
             raise TypeError("fp is not a binary file")
         self.pdf_version, self.offset, self.buffer = _open_input(fp)
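The new mapping gives each Document a single shared cache for inline images: the key pairs the object id of the content stream with the byte offset at which the image's BI keyword starts, and the value records the offset where parsing ended together with the parsed InlineImage (or None when parsing failed). A minimal sketch of the shape, using hypothetical values:

from typing import Dict, Optional, Tuple

class InlineImage:  # stand-in for playa.pdftypes.InlineImage, to keep the sketch self-contained
    ...

# (stream object id, offset of BI) -> (offset just past the image, parsed image or None)
_cached_inline_images: Dict[Tuple[int, int], Tuple[int, Optional[InlineImage]]] = {}

# Hypothetical entry: the image starting at offset 132 of stream object 7 was parsed
# once, parsing ended at offset 4096, and every later pass reuses the same object.
_cached_inline_images[(7, 132)] = (4096, InlineImage())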

playa/interp.py

Lines changed: 1 addition & 1 deletion

@@ -227,7 +227,7 @@ def pop(self, n: int) -> List[PDFObject]:
         return x

     def __iter__(self) -> Iterator[ContentObject]:
-        parser = ContentParser(self.contents)
+        parser = ContentParser(self.contents, self.page.doc)
         for _, obj in parser:
             # These are handled inside the parser as they don't obey
             # the normal syntax rules (PDF 1.7 sec 8.9.7)

playa/page.py

Lines changed: 2 additions & 2 deletions

@@ -237,7 +237,7 @@ def height(self) -> float:
     @property
     def contents(self) -> Iterator[PDFObject]:
         """Iterator over PDF objects in the content streams."""
-        for _, obj in ContentParser(self._contents):
+        for _, obj in ContentParser(self._contents, self.doc):
             yield obj

     def __iter__(self) -> Iterator["ContentObject"]:

@@ -301,7 +301,7 @@ def xobjects_one(
     @property
     def tokens(self) -> Iterator[Token]:
         """Iterator over tokens in the content streams."""
-        parser = ContentParser(self._contents)
+        parser = ContentParser(self._contents, self.doc)
         while True:
             try:
                 pos, tok = parser.nexttoken()

playa/parser.py

Lines changed: 27 additions & 11 deletions

@@ -302,11 +302,13 @@ def __init__(
         doc: Union["Document", None] = None,
         pos: int = 0,
         strict: bool = False,
+        streamid: Union[int, None] = None,
     ) -> None:
         self._lexer = Lexer(data, pos)
         self.stack: List[StackEntry] = []
         self.docref = None if doc is None else _ref_document(doc)
         self.strict = strict
+        self.streamid = streamid

     @property
     def doc(self) -> Union["Document", None]:

@@ -315,9 +317,12 @@ def doc(self) -> Union["Document", None]:
             return None
         return _deref_document(self.docref)

-    def newstream(self, data: Union[bytes, mmap.mmap]) -> None:
+    def newstream(
+        self, data: Union[bytes, mmap.mmap], streamid: Union[int, None] = None
+    ) -> None:
         """Continue parsing from a new data stream."""
         self._lexer = Lexer(data)
+        self.streamid = streamid

     def reset(self) -> None:
         """Clear internal parser state."""

@@ -347,7 +352,6 @@ def __next__(self) -> StackEntry:
                         raise e
                     log.warning("When constructing array from %r: %s", obj, e)
                 if pos == top:
-                    top = None
                     return pos, obj
                 self.stack.append((pos, obj))
             elif token is KEYWORD_DICT_BEGIN:

@@ -372,7 +376,6 @@ def __next__(self) -> StackEntry:
                         raise e
                     log.warning("When constructing dict from %r: %s", self.stack, e)
                 if pos == top:
-                    top = None
                     return pos, obj
                 self.stack.append((pos, obj))
             elif token is KEYWORD_PROC_BEGIN:

@@ -387,7 +390,6 @@ def __next__(self) -> StackEntry:
                         raise e
                     log.warning("When constructing proc from %r: %s", obj, e)
                 if pos == top:
-                    top = None
                     return pos, obj
                 self.stack.append((pos, obj))
             elif token is KEYWORD_NULL:

@@ -409,13 +411,27 @@ def __next__(self) -> StackEntry:
                         "Inline image not at top level of stream "
                         f"({pos} != {top}, {self.stack})"
                     )
-                top = pos
-                self.stack.append((pos, token))
+                if (
+                    self.doc is not None
+                    and self.streamid is not None
+                    and (inline_image_id := (self.streamid, pos))
+                    in self.doc._cached_inline_images
+                ):
+                    end, obj = self.doc._cached_inline_images[inline_image_id]
+                    self.seek(end)
+                    if obj is not None:
+                        return pos, obj
+                else:
+                    top = pos
+                    self.stack.append((pos, token))
             elif token is KEYWORD_ID:
                 obj = self.get_inline_image(pos, token)
+                assert top is not None
+                if self.doc is not None and self.streamid is not None:
+                    inline_image_id = (self.streamid, top)
+                    self.doc._cached_inline_images[inline_image_id] = self.tell(), obj
                 if obj is not None:
-                    top = None
-                    return pos, obj
+                    return top, obj
             else:
                 # Literally anything else, including any other keyword
                 # (will be returned above if top is None, or later if

@@ -902,11 +918,11 @@ class ContentParser(ObjectParser):
     the page’s logical content or organization.
     """

-    def __init__(self, streams: Iterable[PDFObject]) -> None:
+    def __init__(self, streams: Iterable[PDFObject], doc: "Document") -> None:
         self.streamiter = iter(streams)
         try:
             stream = stream_value(next(self.streamiter))
-            super().__init__(stream.buffer)
+            super().__init__(stream.buffer, doc, streamid=stream.objid)
         except StopIteration:
             super().__init__(b"")
         except TypeError:

@@ -928,6 +944,6 @@ def nexttoken(self) -> Tuple[int, Token]:
             try:
                 ref = next(self.streamiter)
                 stream = stream_value(ref)
-                self.newstream(stream.buffer)
+                self.newstream(stream.buffer, streamid=stream.objid)
             except TypeError:
                 log.warning("Found non-stream in contents: %r", ref)

samples/contrib/scp05.pdf

1.56 MB
Binary file not shown.

tests/test_object_parser.py

Lines changed: 10 additions & 0 deletions

@@ -3,6 +3,7 @@

 import pytest

+from playa.document import Document
 from playa.parser import (
     KEYWORD_DICT_BEGIN,
     KEYWORD_DICT_END,

@@ -417,6 +418,15 @@ def test_inline_images():
     assert img.buffer == b"OLDMACDONALDEIEIO"


+def test_cached_inline_images():
+    doc = Document(b"")
+    first = list(ObjectParser(INLINEDATA1, doc, streamid=0))
+    second = list(ObjectParser(INLINEDATA1, doc, streamid=0))
+    assert first == second
+    third = list(ObjectParser(INLINEDATA1, doc, streamid=1))
+    assert first != third
+
+
 def test_reverse_solidus():
     """Test the handling of useless backslashes that are not escapes."""
     parser = Lexer(rb"(OMG\ WTF \W \T\ F)")
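At the library level the cache is transparent: a second traversal of the same page reuses the (streamid, offset) entries populated by the first. An end-to-end sketch, assuming the scp05.pdf sample added by this commit is available locally:

from pathlib import Path

import playa

with playa.open(Path("samples/contrib/scp05.pdf")) as pdf:
    page = next(iter(pdf.pages))
    # The first pass parses inline images and fills Document._cached_inline_images
    # (internal attribute name per this commit); the second pass reuses those entries.
    first = [type(obj).__name__ for obj in page]
    second = [type(obj).__name__ for obj in page]
    assert first == second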
