Skip to content

Commit 4e1dfb8

Browse files
author
Saeid Barati
committed
Harden P1 fixes: frozen dataclasses + numeric hash equivalence
Self-review before pushing caught two subtle regressions in the prior two commits: 1. ``_reconstruct`` used plain ``setattr`` to restore ``init=False`` fields after construction, which raises ``FrozenInstanceError`` on ``@dataclass(frozen=True)``. Switch to ``object.__setattr__`` — the same bypass frozen dataclasses themselves use in ``__post_init__``. 2. ``Json.__hash__`` / ``Struct.__hash__`` hashed the canonical JSON string. That renders ``1`` and ``1.0`` as distinct strings, violating the ``__eq__`` / ``__hash__`` contract when users mix numeric types (``1 == 1.0 == True`` in Python, but ``hash('1') != hash('1.0')``). Replace with a recursive ``_make_hashable`` that converts dicts to ``frozenset`` of items and lists to ``tuple``s. Python's built-in hashing then collapses ``1 == 1.0 == True`` to the same bucket, and equal wrapped values produce equal hashes. ``_make_hashable`` lives in ``base.py`` so any future IOType wrapping unhashable values can reuse it. Regression tests added for: - Frozen dataclass with ``init=False`` field round-trip. - ``Json({'x': 1}) == Json({'x': 1.0})`` and ``hash()`` equality. - Same for ``Struct({'x': 1})`` / ``Struct({'x': 1.0})``. - ``Json({'x': True}) == Json({'x': 1})`` hash equality. - ``Json([1, 2, 3]) == Json([1.0, 2.0, 3.0])`` hash equality.
1 parent e08ac09 commit 4e1dfb8

5 files changed

Lines changed: 100 additions & 20 deletions

File tree

metaflow/io_types/base.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,27 @@ def get_iotype_by_name(type_name):
3939
return _TYPE_REGISTRY.get(type_name)
4040

4141

42+
def _make_hashable(value):
43+
"""
44+
Recursively convert a JSON-like value to a hashable form.
45+
46+
dicts -> frozenset of (key, hashable(value)) pairs.
47+
lists -> tuple of hashable elements.
48+
Everything else assumed hashable (int, float, bool, str, None, ...).
49+
50+
Using ``frozenset``/``tuple`` preserves Python's numeric equivalence
51+
(``1 == 1.0 == True`` -> equal hashes) that ``json.dumps`` would
52+
otherwise break by rendering each as a distinct string. This keeps the
53+
``__eq__`` / ``__hash__`` contract intact when IOType subclasses delegate
54+
value equality to the wrapped Python object.
55+
"""
56+
if isinstance(value, dict):
57+
return frozenset((k, _make_hashable(v)) for k, v in value.items())
58+
if isinstance(value, list):
59+
return tuple(_make_hashable(v) for v in value)
60+
return value
61+
62+
4263
class IOType(object, metaclass=ABCMeta):
4364
"""
4465
Base class for typed Metaflow artifacts.

metaflow/io_types/json_type.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import json
22

33
from ..datastore.artifacts.serializer import SerializedBlob
4-
from .base import IOType, _UNSET
4+
from .base import IOType, _UNSET, _make_hashable
55

66

77
class Json(IOType):
@@ -28,10 +28,10 @@ def _storage_deserialize(cls, blobs, **kwargs):
2828

2929
def __hash__(self):
3030
# ``_value`` is typically a dict or list (unhashable), so the base
31-
# class ``hash((type, _value))`` raises TypeError. Hash the canonical
32-
# JSON representation instead — stable across equal values and
33-
# consistent with ``__eq__`` (which compares ``_value`` directly:
34-
# equal dicts/lists produce identical sorted-key JSON).
31+
# class ``hash((type, _value))`` raises TypeError. Convert to a
32+
# frozenset/tuple form that preserves Python's numeric equivalence
33+
# (``1 == 1.0 == True`` hash identically), so ``__eq__`` and
34+
# ``__hash__`` stay consistent even when users mix int/float/bool.
3535
if self._value is _UNSET:
3636
return hash((type(self), _UNSET))
37-
return hash((type(self), self._wire_serialize()))
37+
return hash((type(self), _make_hashable(self._value)))

metaflow/io_types/struct_type.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import typing
55

66
from ..datastore.artifacts.serializer import SerializedBlob
7-
from .base import IOType, _UNSET
7+
from .base import IOType, _UNSET, _make_hashable
88

99

1010
def _reconstruct(dc_type, data):
@@ -18,7 +18,8 @@ def _reconstruct(dc_type, data):
1818
1919
Fields declared with ``field(init=False, ...)`` are not accepted by the
2020
generated ``__init__``, but ``dataclasses.asdict`` emits them. Pass only
21-
init-eligible fields as kwargs, then ``setattr`` the remainder so the
21+
init-eligible fields as kwargs, then assign the remainder via
22+
``object.__setattr__`` (which works on frozen dataclasses too) so the
2223
serialized values are not lost to defaults or ``__post_init__``.
2324
"""
2425
try:
@@ -45,8 +46,11 @@ def _reconstruct(dc_type, data):
4546
else:
4647
post_init_fields.append((f.name, value))
4748
instance = dc_type(**kwargs)
49+
# ``object.__setattr__`` bypasses ``@dataclass(frozen=True)``'s lock,
50+
# matching the pattern frozen dataclasses themselves use to populate
51+
# ``init=False`` fields inside ``__post_init__``.
4852
for name, value in post_init_fields:
49-
setattr(instance, name, value)
53+
object.__setattr__(instance, name, value)
5054
return instance
5155

5256

@@ -144,15 +148,10 @@ def _storage_deserialize(cls, blobs, **kwargs):
144148
def __hash__(self):
145149
# ``_value`` is typically a dataclass instance or dict (often
146150
# unhashable), so the base class ``hash((type, _value))`` raises
147-
# TypeError. Hash the canonical JSON representation of the flattened
148-
# value instead — stable across equal values and consistent with
149-
# ``__eq__`` (equal dataclasses/dicts flatten to identical sorted-key
150-
# JSON).
151+
# TypeError. Flatten to a dict via ``_to_dict`` then convert to a
152+
# frozenset/tuple form that preserves Python's numeric equivalence
153+
# (``1 == 1.0 == True`` hash identically), so ``__eq__`` and
154+
# ``__hash__`` stay consistent for mixed-type fields.
151155
if self._value is _UNSET:
152156
return hash((type(self), _UNSET))
153-
return hash(
154-
(
155-
type(self),
156-
json.dumps(self._to_dict(), separators=(",", ":"), sort_keys=True),
157-
)
158-
)
157+
return hash((type(self), _make_hashable(self._to_dict())))

test/unit/io_types/test_json_type.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,24 @@ def test_json_hashable_with_list_value():
7373
h = hash(j)
7474
assert isinstance(h, int)
7575
assert hash(Json([1, 2, 3])) == hash(j)
76+
77+
78+
def test_json_hash_preserves_numeric_equivalence():
79+
"""``1 == 1.0 == True`` in Python — equal dicts must hash equal even
80+
when they mix numeric types. A naive JSON-string hash renders ``1``
81+
and ``1.0`` as distinct strings and violates the hash/eq contract.
82+
"""
83+
a = Json({"x": 1})
84+
b = Json({"x": 1.0})
85+
assert a == b
86+
assert hash(a) == hash(b)
87+
88+
c = Json({"x": True})
89+
d = Json({"x": 1})
90+
assert c == d
91+
assert hash(c) == hash(d)
92+
93+
e = Json([1, 2, 3])
94+
f = Json([1.0, 2.0, 3.0])
95+
assert e == f
96+
assert hash(e) == hash(f)

test/unit/io_types/test_struct_type.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,16 @@ def __post_init__(self):
4141
self.computed = self.a * 10
4242

4343

44+
@dataclass(frozen=True)
45+
class FrozenWithInitFalse:
46+
a: int
47+
computed: int = dataclasses.field(init=False, default=0)
48+
49+
def __post_init__(self):
50+
# Frozen dataclasses must use object.__setattr__ to touch fields.
51+
object.__setattr__(self, "computed", self.a * 10)
52+
53+
4454
def test_struct_wire_round_trip():
4555
s = Struct(SimpleData(name="test", count=5, score=3.14, active=True))
4656
wire = s.serialize(format="wire")
@@ -179,11 +189,28 @@ def test_struct_plain_dict_value():
179189
assert s2.value == {"x": 1, "y": "hello"}
180190

181191

192+
def test_struct_frozen_dataclass_with_init_false_field_round_trip():
193+
"""Frozen dataclasses reject plain ``setattr``. Reconstructing one with
194+
an ``init=False`` field must use ``object.__setattr__`` so the
195+
serialized value survives, matching how frozen dataclasses initialize
196+
such fields in their own ``__post_init__``.
197+
"""
198+
original = FrozenWithInitFalse(a=7)
199+
assert original.computed == 70
200+
201+
s = Struct(original)
202+
blobs, meta = s.serialize(format="storage")
203+
s2 = Struct.deserialize([b.value for b in blobs], format="storage", metadata=meta)
204+
assert type(s2.value) is FrozenWithInitFalse
205+
assert s2.value.a == 7
206+
assert s2.value.computed == 70
207+
208+
182209
def test_struct_hashable_with_unhashable_dataclass():
183210
"""Struct wrapping a dataclass with mutable fields (list, dict) must be
184211
hashable. The base ``hash((type, _value))`` raises TypeError because
185212
dataclasses with mutable fields are unhashable; Struct overrides
186-
``__hash__`` with a canonical-JSON-based hash.
213+
``__hash__`` via ``_make_hashable``.
187214
"""
188215

189216
@dataclass
@@ -205,6 +232,18 @@ class WithList:
205232
assert isinstance(hash(s3), int)
206233

207234

235+
def test_struct_hash_preserves_numeric_equivalence():
236+
"""Same contract as Json: equal dicts with int/float/bool members must
237+
hash equal. Confirms the frozenset/tuple based ``_make_hashable`` is
238+
used (not ``json.dumps``, which would render ``1`` and ``1.0`` as
239+
distinct strings).
240+
"""
241+
s_int = Struct({"x": 1})
242+
s_float = Struct({"x": 1.0})
243+
assert s_int == s_float
244+
assert hash(s_int) == hash(s_float)
245+
246+
208247
def test_struct_descriptor_is_hashable():
209248
"""Struct() (no value) must still be hashable via the _UNSET sentinel."""
210249
s = Struct()

0 commit comments

Comments
 (0)