Skip to content

Commit 586c7ae

Browse files
committed
ROB: Accept inline image EI marker at the end of a content stream
When an inline image's EI marker was the very last token of a content stream, with no trailing whitespace or operator, parsing raised PdfReadError. Two places assumed a byte always follows EI: the caller in _read_inline_image rejected an empty trailing byte (and the unconditional one-byte rewind after reading the marker stepped back into EI when only two bytes were available), and extract_inline_default treated end-of-stream after EI as a failed match and kept reading. An EI at end-of-stream is unambiguously the end of the image since no binary data can follow, so accept it in both places (mirroring the existing end-of-stream handling in _check_end_image_marker).
1 parent 52545c5 commit 586c7ae

3 files changed

Lines changed: 32 additions & 4 deletions

File tree

pypdf/generic/_data_structures.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1429,14 +1429,20 @@ def _read_inline_image(self, stream: StreamType) -> dict[str, Any]:
14291429
data = extract_inline_default(stream)
14301430

14311431
ei = stream.read(3)
1432-
stream.seek(-1, 1)
1433-
if ei[:2] != b"EI" or ei[2:3] not in WHITESPACES:
1432+
# An `EI` at the very end of the stream yields only two bytes; rewinding
1433+
# unconditionally would step back into the marker (#3468).
1434+
if len(ei) == 3:
1435+
stream.seek(-1, 1)
1436+
if ei[:2] != b"EI" or (ei[2:3] != b"" and ei[2:3] not in WHITESPACES):
14341437
# Deal with wrong/missing `EI` tags. Example: Wrong dimensions specified above.
14351438
stream.seek(savpos, 0)
14361439
data = extract_inline_default(stream)
14371440
ei = stream.read(3)
1438-
stream.seek(-1, 1)
1439-
if ei[:2] != b"EI" or ei[2:3] not in WHITESPACES: # pragma: no cover
1441+
if len(ei) == 3:
1442+
stream.seek(-1, 1)
1443+
if ei[:2] != b"EI" or (
1444+
ei[2:3] != b"" and ei[2:3] not in WHITESPACES
1445+
): # pragma: no cover
14401446
# Check the same condition again. This should never fail as
14411447
# edge cases are covered by `extract_inline_default` above,
14421448
# but check this to make sure that we are behind the `EI` afterwards.

pypdf/generic/_image_inline.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,13 @@ def extract_inline_default(stream: StreamType) -> bytes:
233233
stream.seek(saved_pos, 0)
234234
continue
235235
tok3 = stream.read(1) # possible space after "EI"
236+
if tok3 == b"":
237+
# The `EI` marker is at the very end of the stream. There can be
238+
# no trailing binary data, so this is unambiguously the end of the
239+
# inline image (#3468).
240+
stream.seek(saved_pos - 1, 0)
241+
stream_out.truncate(sav_pos_ei)
242+
break
236243
if tok3 not in WHITESPACES:
237244
stream.seek(saved_pos, 0)
238245
continue

tests/test_generic.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,6 +1280,21 @@ def test_unitary_extract_inline():
12801280
assert co.operations[7][0]["data"] == b"abcdefghijklmnop"
12811281

12821282

1283+
def test_inline_image_at_end_of_stream():
1284+
# An inline image whose `EI` marker is at the very end of the content stream
1285+
# (no trailing whitespace or operator) must not raise (#3468).
1286+
image = b"abcdefghijklmnop" # 4 * 4 * 1 byte
1287+
for tail in (b"", b"\n", b"\nQ\n"):
1288+
b = b"q 100 0 0 100 100 100 cm\nBI\n/W 4 /H 4 /CS /G\nID\n" + image + b"\nEI" + tail
1289+
ec = DecodedStreamObject()
1290+
ec.set_data(b)
1291+
co = ContentStream(ec, None)
1292+
operations = co.operations
1293+
inline = [op for op in operations if op[1] == b"INLINE IMAGE"]
1294+
assert len(inline) == 1
1295+
assert inline[0][0]["data"] == image
1296+
1297+
12831298
def test_missing_hashbin():
12841299
assert NullObject().hash_bin() == hash((NullObject,))
12851300
assert hash(NullObject()) == NullObject().hash_bin()

0 commit comments

Comments
 (0)