Skip to content

Commit 79ec8d7

Browse files
committed
Simplify our unpickling logic and add tests for Python 2 pickles
1 parent f6b8933 commit 79ec8d7

File tree

4 files changed

+170
-17
lines changed

4 files changed

+170
-17
lines changed

python/arcticdb/version_store/_normalization.py

+4-17
Original file line numberDiff line numberDiff line change
@@ -1050,11 +1050,11 @@ def _ext_hook(self, code, data):
10501050
# If stored in Python2 we want to use raw while unpacking.
10511051
# https://github.com/msgpack/msgpack-python/blob/master/msgpack/_unpacker.pyx#L230
10521052
data = unpackb(data, raw=True)
1053-
return Pickler.read(data, pickled_in_python2=True)
1053+
return Pickler.read(data)
10541054

10551055
if code == MsgPackSerialization.PY_PICKLE_3:
10561056
data = unpackb(data, raw=False)
1057-
return Pickler.read(data, pickled_in_python2=False)
1057+
return Pickler.read(data)
10581058

10591059
return ExtType(code, data)
10601060

@@ -1070,21 +1070,8 @@ def _msgpack_unpackb(self, buff, raw=False):
10701070

10711071
class Pickler(object):
10721072
@staticmethod
1073-
def read(data, pickled_in_python2=False):
1074-
if isinstance(data, str):
1075-
return pickle.loads(data.encode("ascii"), encoding="bytes")
1076-
elif isinstance(data, str):
1077-
if not pickled_in_python2:
1078-
# Use the default encoding for python2 pickled objects similar to what's being done for PY2.
1079-
return pickle.loads(data, encoding="bytes")
1080-
1081-
try:
1082-
# This tries normal pickle.loads first then falls back to special Pandas unpickling. Pandas unpickling
1083-
# handles Pandas 1 vs Pandas 2 API breaks better.
1084-
return pd.read_pickle(io.BytesIO(data))
1085-
except UnicodeDecodeError as exc:
1086-
log.debug("Failed decoding with ascii, using latin-1.")
1087-
return pickle.loads(data, encoding="latin-1")
1073+
def read(data):
1074+
return pd.read_pickle(io.BytesIO(data))
10881075

10891076
@staticmethod
10901077
def write(obj):

python/tests/integration/arcticdb/version_store/test_metadata_support.py

+18
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,24 @@ def test_rt_df_with_small_meta(object_and_mem_and_lmdb_version_store):
3333
assert meta == vit.metadata
3434

3535

36+
class A:
37+
def __init__(self, attrib):
38+
self.attrib = attrib
39+
40+
def __eq__(self, other):
41+
return self.attrib == other.attrib
42+
43+
44+
def test_rt_df_with_custom_meta(object_and_mem_and_lmdb_version_store):
45+
lib = object_and_mem_and_lmdb_version_store
46+
47+
df = DataFrame(data=["A", "B", "C"])
48+
meta = {"a_key": A("bananabread")}
49+
lib.write("pandas", df, metadata=meta)
50+
vit = lib.read("pandas")
51+
assert meta == vit.metadata
52+
53+
3654
@pytest.mark.parametrize("log_level", ("error", "warn", "debug", "info", "ERROR", "eRror", "", None))
3755
def test_pickled_metadata_warning(lmdb_version_store_v1, log_level):
3856
import arcticdb.version_store._normalization as norm
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"""How some of the Python 2 pickles in test_normalization.py were created.
2+
3+
Executed from a Python 2 env with msgpack 0.6.2
4+
"""
5+
from email import errors # arbitrary module with some custom types to pickle
6+
import pickle
7+
import msgpack
8+
import sys
9+
10+
major_version = sys.version[0]
11+
12+
13+
def custom_pack(obj):
14+
# 102 is our extension code for pickled in Python 2
15+
return msgpack.ExtType(102, msgpack.packb(pickle.dumps(obj)))
16+
17+
18+
def msgpack_packb(obj):
19+
return msgpack.packb(obj, use_bin_type=True, strict_types=True, default=custom_pack)
20+
21+
22+
obj = errors.BoundaryError("bananas")
23+
title = "py" + major_version + "_obj.bin"
24+
with open(title, "wb") as f:
25+
msgpack.dump(obj, f, default=custom_pack)
26+
27+
obj = {"dict_key": errors.BoundaryError("bananas")}
28+
title = "py" + major_version + "_dict.bin"
29+
with open(title, "wb") as f:
30+
msgpack.dump(obj, f, default=custom_pack)
31+
32+
obj = "my_string"
33+
title = "py" + major_version + "_str.bin"
34+
with open(title, "wb") as f:
35+
msgpack.dump(obj, f, default=custom_pack)
36+
37+
obj = b"my_bytes"
38+
title = "py" + major_version + "_str_bytes.bin"
39+
with open(title, "wb") as f:
40+
msgpack.dump(obj, f, default=custom_pack)

python/tests/unit/arcticdb/version_store/test_normalization.py

+108
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
77
"""
88
import datetime
9+
from email import errors
910
import inspect
1011
import itertools
1112
import sys
@@ -92,6 +93,113 @@ def test_msg_pack_legacy_2():
9293
assert data == loc_dt
9394

9495

96+
def test_decode_python2_pickle_in_msgpack_dict():
97+
"""See python2_pickles.py for the generation steps. This is the py2_dict.bin case.
98+
99+
This is to check that we can still deserialize pickles that were written with Python 2 correctly.
100+
"""
101+
norm = test_msgpack_normalizer
102+
packed = b"\x81\xa8dict_key\xc7:f\xda\x007cemail.errors\nBoundaryError\np0\n(S'bananas'\np1\ntp2\nRp3\n."
103+
data = norm._msgpack_unpackb(packed)
104+
assert list(data.keys()) == ["dict_key"]
105+
assert isinstance(data["dict_key"], errors.BoundaryError)
106+
assert data["dict_key"].args[0] == "bananas"
107+
108+
109+
def test_decode_python2_pickle_in_msgpack_obj():
110+
"""See python2_pickles.py for the generation steps. This is the py2_obj.bin case.
111+
112+
This is to check that we can still deserialize pickles that were written with Python 2 correctly.
113+
"""
114+
norm = test_msgpack_normalizer
115+
packed = b"\xc7:f\xda\x007cemail.errors\nBoundaryError\np0\n(S'bananas'\np1\ntp2\nRp3\n."
116+
data = norm._msgpack_unpackb(packed)
117+
assert isinstance(data, errors.BoundaryError)
118+
assert data.args[0] == "bananas"
119+
120+
121+
def test_decode_python2_str_in_msgpack():
122+
"""See python2_pickles.py for the generation steps. This is the py2_str.bin case.
123+
124+
This is to check that we can still deserialize strings that were written with Python 2 correctly.
125+
"""
126+
norm = test_msgpack_normalizer
127+
packed = b'\xa9my_string'
128+
data = norm._msgpack_unpackb(packed)
129+
assert data == "my_string"
130+
assert isinstance(data, str)
131+
132+
133+
def test_decode_python2_bytes_in_old_msgpack():
134+
"""See python2_pickles.py for the generation steps. This is the py2_str_bytes.bin case.
135+
136+
This is to check that we can still deserialize bytes that were written with Python 2 correctly.
137+
"""
138+
norm = test_msgpack_normalizer
139+
packed = b'\xa8my_bytes'
140+
data = norm._msgpack_unpackb(packed)
141+
142+
# We claim it's `str` upon decoding because the `xa8` leading bytes tells us this is a fixed string type.
143+
assert data == "my_bytes"
144+
assert isinstance(data, str)
145+
146+
147+
def test_decode_python2_bytes_in_newer_msgpack():
148+
"""See python2_pickles.py for the generation steps. This is the py2_str_bytes.bin case.
149+
150+
This was written with msgpack 1.0.5 not 0.6.2 like the other examples. In this version, msgpack has
151+
a dedicated type for bytes.
152+
153+
This is to check that we can still deserialize bytes that were written with Python 2 correctly.
154+
"""
155+
norm = test_msgpack_normalizer
156+
packed = b'\xc4\x08my_bytes'
157+
data = norm._msgpack_unpackb(packed)
158+
assert data == b"my_bytes"
159+
assert isinstance(data, bytes)
160+
161+
162+
def test_decode_python3_pickle_in_msgpack_dict():
163+
norm = test_msgpack_normalizer
164+
obj = {"dict_key": errors.BoundaryError("bananas")}
165+
packed = norm._msgpack_packb(obj)
166+
167+
data = norm._msgpack_unpackb(packed)
168+
assert list(data.keys()) == ["dict_key"]
169+
assert isinstance(data["dict_key"], errors.BoundaryError)
170+
assert data["dict_key"].args[0] == "bananas"
171+
172+
173+
def test_decode_python3_pickle_in_msgpack_obj():
174+
norm = test_msgpack_normalizer
175+
obj = errors.BoundaryError("bananas")
176+
packed = norm._msgpack_packb(obj)
177+
178+
data = norm._msgpack_unpackb(packed)
179+
assert isinstance(data, errors.BoundaryError)
180+
assert data.args[0] == "bananas"
181+
182+
183+
def test_decode_python3_pickle_in_msgpack_str():
184+
norm = test_msgpack_normalizer
185+
obj = "bananas"
186+
packed = norm._msgpack_packb(obj)
187+
188+
data = norm._msgpack_unpackb(packed)
189+
assert isinstance(data, str)
190+
assert data == "bananas"
191+
192+
193+
def test_decode_python3_pickle_in_msgpack_bytes():
194+
norm = test_msgpack_normalizer
195+
obj = b"bananas"
196+
packed = norm._msgpack_packb(obj)
197+
198+
data = norm._msgpack_unpackb(packed)
199+
assert isinstance(data, bytes)
200+
assert data == b"bananas"
201+
202+
95203
@param_dict("d", params)
96204
def test_user_meta_and_msg_pack(d):
97205
n = normalize_metadata(d)

0 commit comments

Comments
 (0)