ENH: ASCIIHexDecode.decode now returns bytes instead of str (#1994)

pubpub-zz · web-flow · commit 11ee6480a3f7 · 2023-07-25T22:09:06.000+02:00
Please note that this change is potentially backwards-incompatible: ASCIIHexDecode.decode now returns bytes instead of str. This also fixes a bug. Closes #1983
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -245,10 +245,10 @@ class ASCIIHexDecode:
 
     @staticmethod
     def decode(
-        data: str,
+        data: Union[str, bytes],
         decode_parms: Union[None, ArrayObject, DictionaryObject] = None,
         **kwargs: Any,
-    ) -> str:
+    ) -> bytes:
         """
         Decode an ASCII-Hex encoded data stream.
 
@@ -268,24 +268,26 @@ def decode(
         if "decodeParms" in kwargs:  # deprecated
             deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
             decode_parms = kwargs["decodeParms"]  # noqa: F841
-        retval = ""
-        hex_pair = ""
+        if isinstance(data, str):
+            data = data.encode()
+        retval = b""
+        hex_pair = b""
         index = 0
         while True:
             if index >= len(data):
                 raise PdfStreamError("Unexpected EOD in ASCIIHexDecode")
-            char = data[index]
-            if char == ">":
+            char = data[index : index + 1]
+            if char == b">":
                 break
             elif char.isspace():
                 index += 1
                 continue
             hex_pair += char
             if len(hex_pair) == 2:
-                retval += chr(int(hex_pair, base=16))
-                hex_pair = ""
+                retval += bytes((int(hex_pair, base=16),))
+                hex_pair = b""
             index += 1
-        assert hex_pair == ""
+        assert hex_pair == b""
         return retval
 
 
@@ -854,6 +856,8 @@ def _handle_jpx(
 
     size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT])
     data = x_object_obj.get_data()  # type: ignore
+    if isinstance(data, str):  # pragma: no cover
+        data = data.encode()
     colors = x_object_obj.get("/Colors", 1)
     color_space: Any = x_object_obj.get("/ColorSpace", NullObject()).get_object()
     if (
@@ -914,7 +918,7 @@ def _handle_jpx(
             "TIFF",
             ".tiff",
         )
-    elif lfilters is None:
+    else:
         img, image_format, extension = Image.frombytes(mode, size, data), "PNG", ".png"
 
     # CMYK image without decode requires reverting scale (cf p243,2§ last sentence)
diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -84,26 +84,26 @@ def test_flate_decode_decompress_with_array_params(params):
 @pytest.mark.parametrize(
     ("data", "expected"),
     [
-        (">", ""),
+        (">", b""),
         (
             "6162636465666768696a6b6c6d6e6f707172737475767778797a>",
-            string.ascii_lowercase,
+            string.ascii_lowercase.encode(),
         ),
         (
             "4142434445464748494a4b4c4d4e4f505152535455565758595a>",
-            string.ascii_uppercase,
+            string.ascii_uppercase.encode(),
         ),
         (
             "6162636465666768696a6b6c6d6e6f707172737475767778797a4142434445464748494a4b4c4d4e4f505152535455565758595a>",
-            string.ascii_letters,
+            string.ascii_letters.encode(),
         ),
-        ("30313233343536373839>", string.digits),
+        ("30313233343536373839>", string.digits.encode()),
         (
             "3  031323334353637   3839>",
-            string.digits,
+            string.digits.encode(),
         ),  # Same as previous, but whitespaced
-        ("30313233343536373839616263646566414243444546>", string.hexdigits),
-        ("20090a0d0b0c>", string.whitespace),
+        ("30313233343536373839616263646566414243444546>", string.hexdigits.encode()),
+        ("20090a0d0b0c>", string.whitespace.encode()),
     ],
     ids=[
         "empty",
@@ -135,6 +135,19 @@ def test_ascii_hex_decode_missing_eod():
     assert exc.value.args[0] == "Unexpected EOD in ASCIIHexDecode"
 
 
+@pytest.mark.enable_socket()
+def test_decode_ahx():
+    """
+    See #1979
+    Gray Image in CMYK : requiring reverse
+    """
+    url = "https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf"
+    name = "NewJersey.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    for p in reader.pages:
+        _ = list(p.images.keys())
+
+
 @pytest.mark.xfail()
 def test_ascii85decode_with_overflow():
     inputs = (