@@ -302,11 +302,13 @@ def __init__(
302302 doc : Union ["Document" , None ] = None ,
303303 pos : int = 0 ,
304304 strict : bool = False ,
305+ streamid : Union [int , None ] = None ,
305306 ) -> None :
306307 self ._lexer = Lexer (data , pos )
307308 self .stack : List [StackEntry ] = []
308309 self .docref = None if doc is None else _ref_document (doc )
309310 self .strict = strict
311+ self .streamid = streamid
310312
311313 @property
312314 def doc (self ) -> Union ["Document" , None ]:
@@ -315,9 +317,12 @@ def doc(self) -> Union["Document", None]:
315317 return None
316318 return _deref_document (self .docref )
317319
318- def newstream (self , data : Union [bytes , mmap .mmap ]) -> None :
320+ def newstream (
321+ self , data : Union [bytes , mmap .mmap ], streamid : Union [int , None ] = None
322+ ) -> None :
319323 """Continue parsing from a new data stream."""
320324 self ._lexer = Lexer (data )
325+ self .streamid = streamid
321326
322327 def reset (self ) -> None :
323328 """Clear internal parser state."""
@@ -347,7 +352,6 @@ def __next__(self) -> StackEntry:
347352 raise e
348353 log .warning ("When constructing array from %r: %s" , obj , e )
349354 if pos == top :
350- top = None
351355 return pos , obj
352356 self .stack .append ((pos , obj ))
353357 elif token is KEYWORD_DICT_BEGIN :
@@ -372,7 +376,6 @@ def __next__(self) -> StackEntry:
372376 raise e
373377 log .warning ("When constructing dict from %r: %s" , self .stack , e )
374378 if pos == top :
375- top = None
376379 return pos , obj
377380 self .stack .append ((pos , obj ))
378381 elif token is KEYWORD_PROC_BEGIN :
@@ -387,7 +390,6 @@ def __next__(self) -> StackEntry:
387390 raise e
388391 log .warning ("When constructing proc from %r: %s" , obj , e )
389392 if pos == top :
390- top = None
391393 return pos , obj
392394 self .stack .append ((pos , obj ))
393395 elif token is KEYWORD_NULL :
@@ -409,13 +411,27 @@ def __next__(self) -> StackEntry:
409411 "Inline image not at top level of stream "
410412 f"({ pos } != { top } , { self .stack } )"
411413 )
412- top = pos
413- self .stack .append ((pos , token ))
414+ if (
415+ self .doc is not None
416+ and self .streamid is not None
417+ and (inline_image_id := (self .streamid , pos ))
418+ in self .doc ._cached_inline_images
419+ ):
420+ end , obj = self .doc ._cached_inline_images [inline_image_id ]
421+ self .seek (end )
422+ if obj is not None :
423+ return pos , obj
424+ else :
425+ top = pos
426+ self .stack .append ((pos , token ))
414427 elif token is KEYWORD_ID :
415428 obj = self .get_inline_image (pos , token )
429+ assert top is not None
430+ if self .doc is not None and self .streamid is not None :
431+ inline_image_id = (self .streamid , top )
432+ self .doc ._cached_inline_images [inline_image_id ] = self .tell (), obj
416433 if obj is not None :
417- top = None
418- return pos , obj
434+ return top , obj
419435 else :
420436 # Literally anything else, including any other keyword
421437 # (will be returned above if top is None, or later if
@@ -902,11 +918,11 @@ class ContentParser(ObjectParser):
902918 the page’s logical content or organization.
903919 """
904920
905- def __init__ (self , streams : Iterable [PDFObject ]) -> None :
921+ def __init__ (self , streams : Iterable [PDFObject ], doc : "Document" ) -> None :
906922 self .streamiter = iter (streams )
907923 try :
908924 stream = stream_value (next (self .streamiter ))
909- super ().__init__ (stream .buffer )
925+ super ().__init__ (stream .buffer , doc , streamid = stream . objid )
910926 except StopIteration :
911927 super ().__init__ (b"" )
912928 except TypeError :
@@ -928,6 +944,6 @@ def nexttoken(self) -> Tuple[int, Token]:
928944 try :
929945 ref = next (self .streamiter )
930946 stream = stream_value (ref )
931- self .newstream (stream .buffer )
947+ self .newstream (stream .buffer , streamid = stream . objid )
932948 except TypeError :
933949 log .warning ("Found non-stream in contents: %r" , ref )
0 commit comments