Skip to content

Commit e08ac09

Browse files
author
Saeid Barati
committed
Fix IOType.__hash__ TypeError for unhashable wrapped values
The base ``IOType.__hash__`` does ``hash((type(self), self._value))``, which raises ``TypeError`` when ``_value`` is inherently unhashable — a dict wrapped by ``Json({...})`` or a dataclass with mutable fields (``list``, ``dict``) wrapped by ``Struct(...)``. Both are the primary ways users create instances of these types, so the wrapper ends up mis-behaving in any set/dict or cache keyed by artifacts. Override ``__hash__`` on both subclasses to hash the canonical JSON representation (already produced by the wire/storage path). This is: - Stable — ``sort_keys=True`` makes the JSON deterministic across equal dicts/dataclasses regardless of key insertion order. - Contract-safe — ``__eq__`` compares raw values; equal values flatten to identical sorted-key JSON, so ``a == b`` implies ``hash(a) == hash(b)``. - Descriptor-aware — when ``_value is _UNSET`` (no value, pure type descriptor), fall back to hashing the sentinel directly. Adds regression tests covering ``Json({...})``, ``Json([...])``, ``Struct(dc_with_list)``, plain ``Struct({...})``, and the ``Struct()`` descriptor.
1 parent 1ba4525 commit e08ac09

4 files changed

Lines changed: 85 additions & 1 deletion

File tree

metaflow/io_types/json_type.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import json
22

33
from ..datastore.artifacts.serializer import SerializedBlob
4-
from .base import IOType
4+
from .base import IOType, _UNSET
55

66

77
class Json(IOType):
@@ -25,3 +25,13 @@ def _storage_serialize(self):
2525
@classmethod
2626
def _storage_deserialize(cls, blobs, **kwargs):
2727
return cls(json.loads(blobs[0].decode("utf-8")))
28+
29+
def __hash__(self):
30+
# ``_value`` is typically a dict or list (unhashable), so the base
31+
# class ``hash((type, _value))`` raises TypeError. Hash the canonical
32+
# JSON representation instead — stable across equal values and
33+
# consistent with ``__eq__`` (which compares ``_value`` directly:
34+
# equal dicts/lists produce identical sorted-key JSON).
35+
if self._value is _UNSET:
36+
return hash((type(self), _UNSET))
37+
return hash((type(self), self._wire_serialize()))

metaflow/io_types/struct_type.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,3 +140,19 @@ def _storage_deserialize(cls, blobs, **kwargs):
140140
return cls(_reconstruct(dc_type, data), dataclass_type=dc_type)
141141
# Fallback: return as plain dict wrapped in Struct
142142
return cls(data)
143+
144+
def __hash__(self):
145+
# ``_value`` is typically a dataclass instance or dict (often
146+
# unhashable), so the base class ``hash((type, _value))`` raises
147+
# TypeError. Hash the canonical JSON representation of the flattened
148+
# value instead — stable across equal values and consistent with
149+
# ``__eq__`` (equal dataclasses/dicts flatten to identical sorted-key
150+
# JSON).
151+
if self._value is _UNSET:
152+
return hash((type(self), _UNSET))
153+
return hash(
154+
(
155+
type(self),
156+
json.dumps(self._to_dict(), separators=(",", ":"), sort_keys=True),
157+
)
158+
)

test/unit/io_types/test_json_type.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,29 @@ def test_json_deeply_nested():
4747
s = j.serialize(format="wire")
4848
j2 = Json.deserialize(s, format="wire")
4949
assert j2.value == data
50+
51+
52+
def test_json_hashable_with_dict_value():
53+
"""Json wrapping a dict must be hashable. The base class default
54+
``hash((type(self), self._value))`` raises TypeError for unhashable
55+
values, so Json overrides it with a wire-format-based hash.
56+
"""
57+
j = Json({"a": 1, "b": [2, 3]})
58+
# No TypeError.
59+
h = hash(j)
60+
assert isinstance(h, int)
61+
62+
# Equal values must hash equal — even when insertion order differs.
63+
j2 = Json({"b": [2, 3], "a": 1})
64+
assert j == j2
65+
assert hash(j) == hash(j2)
66+
67+
# Works inside a set.
68+
assert {Json({"x": 1}), Json({"x": 1})} == {Json({"x": 1})}
69+
70+
71+
def test_json_hashable_with_list_value():
72+
j = Json([1, 2, 3])
73+
h = hash(j)
74+
assert isinstance(h, int)
75+
assert hash(Json([1, 2, 3])) == hash(j)

test/unit/io_types/test_struct_type.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,38 @@ def test_struct_plain_dict_value():
179179
assert s2.value == {"x": 1, "y": "hello"}
180180

181181

182+
def test_struct_hashable_with_unhashable_dataclass():
183+
"""Struct wrapping a dataclass with mutable fields (list, dict) must be
184+
hashable. The base ``hash((type, _value))`` raises TypeError because
185+
dataclasses with mutable fields are unhashable; Struct overrides
186+
``__hash__`` with a canonical-JSON-based hash.
187+
"""
188+
189+
@dataclass
190+
class WithList:
191+
x: int
192+
items: list = dataclasses.field(default_factory=list)
193+
194+
s = Struct(WithList(x=1, items=[1, 2, 3]))
195+
h = hash(s)
196+
assert isinstance(h, int)
197+
198+
# Hash/eq contract: equal values hash equal.
199+
s2 = Struct(WithList(x=1, items=[1, 2, 3]))
200+
assert s == s2
201+
assert hash(s) == hash(s2)
202+
203+
# Wrapping a plain dict also hashes without TypeError.
204+
s3 = Struct({"a": [1, 2], "b": {"nested": True}})
205+
assert isinstance(hash(s3), int)
206+
207+
208+
def test_struct_descriptor_is_hashable():
209+
"""Struct() (no value) must still be hashable via the _UNSET sentinel."""
210+
s = Struct()
211+
assert isinstance(hash(s), int)
212+
213+
182214
def test_struct_descriptor_uses_unset_sentinel():
183215
"""Struct() (no value) must behave as a pure descriptor via the same
184216
_UNSET sentinel the base IOType uses. Two empty descriptors should be

0 commit comments

Comments
 (0)