Skip to content

Commit 11ee648

Browse files
authored
ENH: ASCIIHexDecode.decode now returns bytes instead of str (#1994)
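Please note that this is potentially backwards-incompatible: ASCIIHexDecode.decode now returns bytes, so callers that relied on the previous str return value must be updated. This also fixes a bug. Closes #1983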
1 parent 890c93a commit 11ee648

File tree

2 files changed

+35
-18
lines changed

2 files changed

+35
-18
lines changed

Diff for: pypdf/filters.py

+14-10
Original file line numberDiff line numberDiff line change
@@ -245,10 +245,10 @@ class ASCIIHexDecode:
245245

246246
@staticmethod
247247
def decode(
248-
data: str,
248+
data: Union[str, bytes],
249249
decode_parms: Union[None, ArrayObject, DictionaryObject] = None,
250250
**kwargs: Any,
251-
) -> str:
251+
) -> bytes:
252252
"""
253253
Decode an ASCII-Hex encoded data stream.
254254
@@ -268,24 +268,26 @@ def decode(
268268
if "decodeParms" in kwargs: # deprecated
269269
deprecate_with_replacement("decodeParms", "parameters", "4.0.0")
270270
decode_parms = kwargs["decodeParms"] # noqa: F841
271-
retval = ""
272-
hex_pair = ""
271+
if isinstance(data, str):
272+
data = data.encode()
273+
retval = b""
274+
hex_pair = b""
273275
index = 0
274276
while True:
275277
if index >= len(data):
276278
raise PdfStreamError("Unexpected EOD in ASCIIHexDecode")
277-
char = data[index]
278-
if char == ">":
279+
char = data[index : index + 1]
280+
if char == b">":
279281
break
280282
elif char.isspace():
281283
index += 1
282284
continue
283285
hex_pair += char
284286
if len(hex_pair) == 2:
285-
retval += chr(int(hex_pair, base=16))
286-
hex_pair = ""
287+
retval += bytes((int(hex_pair, base=16),))
288+
hex_pair = b""
287289
index += 1
288-
assert hex_pair == ""
290+
assert hex_pair == b""
289291
return retval
290292

291293

@@ -854,6 +856,8 @@ def _handle_jpx(
854856

855857
size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT])
856858
data = x_object_obj.get_data() # type: ignore
859+
if isinstance(data, str): # pragma: no cover
860+
data = data.encode()
857861
colors = x_object_obj.get("/Colors", 1)
858862
color_space: Any = x_object_obj.get("/ColorSpace", NullObject()).get_object()
859863
if (
@@ -914,7 +918,7 @@ def _handle_jpx(
914918
"TIFF",
915919
".tiff",
916920
)
917-
elif lfilters is None:
921+
else:
918922
img, image_format, extension = Image.frombytes(mode, size, data), "PNG", ".png"
919923

920924
# CMYK image without decode requires reverting scale (cf p243,2§ last sentence)

Diff for: tests/test_filters.py

+21-8
Original file line numberDiff line numberDiff line change
@@ -84,26 +84,26 @@ def test_flate_decode_decompress_with_array_params(params):
8484
@pytest.mark.parametrize(
8585
("data", "expected"),
8686
[
87-
(">", ""),
87+
(">", b""),
8888
(
8989
"6162636465666768696a6b6c6d6e6f707172737475767778797a>",
90-
string.ascii_lowercase,
90+
string.ascii_lowercase.encode(),
9191
),
9292
(
9393
"4142434445464748494a4b4c4d4e4f505152535455565758595a>",
94-
string.ascii_uppercase,
94+
string.ascii_uppercase.encode(),
9595
),
9696
(
9797
"6162636465666768696a6b6c6d6e6f707172737475767778797a4142434445464748494a4b4c4d4e4f505152535455565758595a>",
98-
string.ascii_letters,
98+
string.ascii_letters.encode(),
9999
),
100-
("30313233343536373839>", string.digits),
100+
("30313233343536373839>", string.digits.encode()),
101101
(
102102
"3 031323334353637 3839>",
103-
string.digits,
103+
string.digits.encode(),
104104
), # Same as previous, but whitespaced
105-
("30313233343536373839616263646566414243444546>", string.hexdigits),
106-
("20090a0d0b0c>", string.whitespace),
105+
("30313233343536373839616263646566414243444546>", string.hexdigits.encode()),
106+
("20090a0d0b0c>", string.whitespace.encode()),
107107
],
108108
ids=[
109109
"empty",
@@ -135,6 +135,19 @@ def test_ascii_hex_decode_missing_eod():
135135
assert exc.value.args[0] == "Unexpected EOD in ASCIIHexDecode"
136136

137137

138+
@pytest.mark.enable_socket()
139+
def test_decode_ahx():
140+
"""
141+
See #1979
142+
Gray Image in CMYK : requiring reverse
143+
"""
144+
url = "https://github.com/py-pdf/pypdf/files/12090692/New.Jersey.Coinbase.staking.securities.charges.2023-0606_Coinbase-Penalty-and-C-D.pdf"
145+
name = "NewJersey.pdf"
146+
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
147+
for p in reader.pages:
148+
_ = list(p.images.keys())
149+
150+
138151
@pytest.mark.xfail()
139152
def test_ascii85decode_with_overflow():
140153
inputs = (

0 commit comments

Comments
 (0)