ROB: Accept inline image EI marker at the end of a content stream

gaoflow · gaoflow · commit 586c7ae342aa · 2026-06-02T12:02:10.000+02:00
When an inline image's EI marker was the very last token of a content
stream, with no trailing whitespace or operator, parsing raised
PdfReadError. Two places assumed that a byte always follows EI. The
caller, _read_inline_image, rejected an empty trailing byte, and its
unconditional one-byte rewind after reading the marker stepped back
into EI when only two bytes were available. extract_inline_default, in
turn, treated end-of-stream after EI as a failed match and kept reading.

An EI at end-of-stream is unambiguously the end of the image since no
binary data can follow, so accept it in both places (mirroring the
existing end-of-stream handling in _check_end_image_marker).
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
@@ -1429,14 +1429,20 @@ def _read_inline_image(self, stream: StreamType) -> dict[str, Any]:
             data = extract_inline_default(stream)
 
         ei = stream.read(3)
-        stream.seek(-1, 1)
-        if ei[:2] != b"EI" or ei[2:3] not in WHITESPACES:
+        # An `EI` at the very end of the stream yields only two bytes; rewinding
+        # unconditionally would step back into the marker (#3468).
+        if len(ei) == 3:
+            stream.seek(-1, 1)
+        if ei[:2] != b"EI" or (ei[2:3] != b"" and ei[2:3] not in WHITESPACES):
             # Deal with wrong/missing `EI` tags. Example: Wrong dimensions specified above.
             stream.seek(savpos, 0)
             data = extract_inline_default(stream)
             ei = stream.read(3)
-            stream.seek(-1, 1)
-            if ei[:2] != b"EI" or ei[2:3] not in WHITESPACES:  # pragma: no cover
+            if len(ei) == 3:
+                stream.seek(-1, 1)
+            if ei[:2] != b"EI" or (
+                ei[2:3] != b"" and ei[2:3] not in WHITESPACES
+            ):  # pragma: no cover
                 # Check the same condition again. This should never fail as
                 # edge cases are covered by `extract_inline_default` above,
                 # but check this ot make sure that we are behind the `EI` afterwards.
diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py
@@ -233,6 +233,13 @@ def extract_inline_default(stream: StreamType) -> bytes:
                 stream.seek(saved_pos, 0)
                 continue
             tok3 = stream.read(1)  # possible space after "EI"
+            if tok3 == b"":
+                # The `EI` marker is at the very end of the stream. There can be
+                # no trailing binary data, so this is unambiguously the end of the
+                # inline image (#3468).
+                stream.seek(saved_pos - 1, 0)
+                stream_out.truncate(sav_pos_ei)
+                break
             if tok3 not in WHITESPACES:
                 stream.seek(saved_pos, 0)
                 continue
diff --git a/tests/test_generic.py b/tests/test_generic.py
@@ -1280,6 +1280,21 @@ def test_unitary_extract_inline():
     assert co.operations[7][0]["data"] == b"abcdefghijklmnop"
 
 
+def test_inline_image_at_end_of_stream():
+    # An inline image whose `EI` marker is the very end of the content stream
+    # (no trailing whitespace or operator) must not raise (#3468).
+    image = b"abcdefghijklmnop"  # 4 * 4 * 1 byte
+    for tail in (b"", b"\n", b"\nQ\n"):
+        b = b"q 100 0 0 100 100 100 cm\nBI\n/W 4 /H 4 /CS /G\nID\n" + image + b"\nEI" + tail
+        ec = DecodedStreamObject()
+        ec.set_data(b)
+        co = ContentStream(ec, None)
+        operations = co.operations
+        inline = [op for op in operations if op[1] == b"INLINE IMAGE"]
+        assert len(inline) == 1
+        assert inline[0][0]["data"] == image
+
+
 def test_missing_hashbin():
     assert NullObject().hash_bin() == hash((NullObject,))
     assert hash(NullObject()) == NullObject().hash_bin()