Skip to content

Commit cc44c1f

Browse files
author
Saeid Barati
committed
Add IOType ABC (typed-artifact contract)
This PR, on top of Netflix#3117, adds the minimal contract that extension authors target when they want to ship typed artifacts. Only the abstract base lives in OSS; concrete scalar/tensor/struct/etc. types and the bridge serializer belong downstream, where deployment-specific opinions about encoding, byte order, enum wire representation, and dataclass inference can live without being forced on every Metaflow user. Standard Python primitives (``int``, ``str``, ``list``, ``dict``, ...) continue to flow through ``PickleSerializer`` unchanged. Wrapping is opt-in, for types that need metadata or invariants attached. IOType contract - ``serialize(format=...)`` / ``deserialize(data, format=..., **kw)`` mirror the signature that Netflix#3117 added on ``ArtifactSerializer``. The same ``WIRE`` / ``STORAGE`` constants govern dispatch so a single subclass owns both representations: - ``STORAGE`` → ``(List[SerializedBlob], metadata_dict)`` for the datastore save path. - ``WIRE`` → ``str`` for CLI args, protobuf payloads, and cross-process IPC. - Subclasses implement four hooks: ``_wire_serialize``, ``_wire_deserialize``, ``_storage_serialize``, ``_storage_deserialize``. - ``type_name`` + ``to_spec()`` support JSON schema generation. - ``IOType`` itself is abstract; instantiating without implementing the four hooks raises ``TypeError``. Tests - ``test/unit/io_types/test_base.py`` — covers abstract instantiation, WIRE and STORAGE round-trips, default format, invalid format rejection, descriptor-mode spec output, and equality/hash.
1 parent 79ed3ef commit cc44c1f

4 files changed

Lines changed: 206 additions & 0 deletions

File tree

metaflow/io_types/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .base import IOType

metaflow/io_types/base.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
"""Typed-artifact contract for Metaflow.
2+
3+
This module defines the minimal :class:`IOType` abstract base class. OSS
4+
Metaflow ships the contract; concrete types (scalars, tensors, enums,
5+
dataclass-backed structs, etc.) live in extensions — they embody
6+
deployment-specific opinions about encoding, byte order, and dataclass
7+
inference that do not belong in core.
8+
9+
:class:`IOType` mirrors the ``format`` argument introduced on
10+
:class:`metaflow.datastore.artifacts.serializer.ArtifactSerializer` so a
11+
single subclass can own both representations:
12+
13+
- ``STORAGE`` — blob-based, persisted through the datastore.
14+
- ``WIRE`` — string-based, for CLI args, protobuf payloads, and
15+
cross-process IPC.
16+
17+
Subclasses implement four hooks (``_wire_serialize``, ``_wire_deserialize``,
18+
``_storage_serialize``, ``_storage_deserialize``); callers use the public
19+
``serialize(format=...)`` / ``deserialize(data, format=...)`` methods.
20+
"""
21+
22+
from abc import ABCMeta, abstractmethod
23+
24+
from metaflow.datastore.artifacts.serializer import STORAGE, WIRE
25+
26+
27+
_UNSET = object()
28+
29+
30+
class IOType(object, metaclass=ABCMeta):
31+
"""
32+
Base class for typed Metaflow artifacts.
33+
34+
An :class:`IOType` instance plays two roles:
35+
36+
- **Descriptor** (no value): ``Int64`` in a spec describes an int64
37+
field.
38+
- **Wrapper** (with value): ``Int64(42)`` wraps a value for typed
39+
serialization.
40+
41+
Subclasses implement four internal operations, dispatched by the
42+
``format`` argument of the public :meth:`serialize` / :meth:`deserialize`
43+
methods.
44+
"""
45+
46+
type_name = None # e.g. "text", "json", "int64" — set by subclasses.
47+
48+
def __init__(self, value=_UNSET):
49+
self._value = value
50+
51+
@property
52+
def value(self):
53+
"""The wrapped Python value, or ``_UNSET`` if this is a pure descriptor."""
54+
return self._value
55+
56+
# -- Public API --------------------------------------------------------
57+
58+
def serialize(self, format=STORAGE):
59+
"""
60+
Serialize the wrapped value. Must be side-effect-free.
61+
62+
Parameters
63+
----------
64+
format : str
65+
``STORAGE`` (default) returns ``(List[SerializedBlob], dict)``.
66+
``WIRE`` returns a ``str``.
67+
"""
68+
if format == WIRE:
69+
return self._wire_serialize()
70+
if format == STORAGE:
71+
return self._storage_serialize()
72+
raise ValueError("format must be %r or %r, got %r" % (STORAGE, WIRE, format))
73+
74+
@classmethod
75+
def deserialize(cls, data, format=STORAGE, **kwargs):
76+
"""
77+
Reconstruct an :class:`IOType` from serialized data.
78+
79+
Parameters
80+
----------
81+
data : Union[str, List[bytes]]
82+
``str`` when ``format=WIRE``; ``List[bytes]`` when ``format=STORAGE``.
83+
format : str
84+
``STORAGE`` (default) or ``WIRE``.
85+
**kwargs
86+
Forwarded to the underlying ``_storage_deserialize`` hook
87+
(e.g. metadata the datastore produced at save time).
88+
"""
89+
if format == WIRE:
90+
return cls._wire_deserialize(data)
91+
if format == STORAGE:
92+
return cls._storage_deserialize(data, **kwargs)
93+
raise ValueError("format must be %r or %r, got %r" % (STORAGE, WIRE, format))
94+
95+
# -- Subclass hooks ----------------------------------------------------
96+
97+
@abstractmethod
98+
def _wire_serialize(self):
99+
"""Value -> string (for CLI args, protobuf, external APIs)."""
100+
raise NotImplementedError
101+
102+
@classmethod
103+
@abstractmethod
104+
def _wire_deserialize(cls, s):
105+
"""String -> :class:`IOType` instance."""
106+
raise NotImplementedError
107+
108+
@abstractmethod
109+
def _storage_serialize(self):
110+
"""Value -> ``(List[SerializedBlob], metadata_dict)``. Side-effect-free."""
111+
raise NotImplementedError
112+
113+
@classmethod
114+
@abstractmethod
115+
def _storage_deserialize(cls, blobs, **kwargs):
116+
"""``(List[bytes], metadata)`` -> :class:`IOType` instance."""
117+
raise NotImplementedError
118+
119+
# -- Spec generation ---------------------------------------------------
120+
121+
def to_spec(self):
122+
"""JSON type spec. Works with or without a wrapped value."""
123+
return {"type": self.type_name}
124+
125+
# -- Dunder ------------------------------------------------------------
126+
127+
def __repr__(self):
128+
if self._value is _UNSET:
129+
return "%s()" % self.__class__.__name__
130+
return "%s(%r)" % (self.__class__.__name__, self._value)
131+
132+
def __eq__(self, other):
133+
if type(self) is not type(other):
134+
return NotImplemented
135+
return self._value == other._value
136+
137+
def __hash__(self):
138+
return hash((type(self), self._value))

test/unit/io_types/__init__.py

Whitespace-only changes.

test/unit/io_types/test_base.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""Contract tests for the IOType ABC."""
2+
3+
import pytest
4+
5+
from metaflow.datastore.artifacts.serializer import STORAGE, WIRE, SerializedBlob
6+
from metaflow.io_types import IOType
7+
8+
9+
class _TextIOType(IOType):
10+
"""Minimal concrete IOType used only to exercise the base-class dispatch."""
11+
12+
type_name = "test_text"
13+
14+
def _wire_serialize(self):
15+
return self._value
16+
17+
@classmethod
18+
def _wire_deserialize(cls, s):
19+
return cls(s)
20+
21+
def _storage_serialize(self):
22+
blob = self._value.encode("utf-8")
23+
return [SerializedBlob(blob)], {"length": len(blob)}
24+
25+
@classmethod
26+
def _storage_deserialize(cls, blobs, **kwargs):
27+
return cls(blobs[0].decode("utf-8"))
28+
29+
30+
def test_cannot_instantiate_abstract_base():
31+
with pytest.raises(TypeError):
32+
IOType() # missing hook implementations
33+
34+
35+
def test_wire_roundtrip():
36+
wire = _TextIOType("hi").serialize(format=WIRE)
37+
assert wire == "hi"
38+
assert _TextIOType.deserialize(wire, format=WIRE) == _TextIOType("hi")
39+
40+
41+
def test_storage_roundtrip():
42+
blobs, meta = _TextIOType("hi").serialize(format=STORAGE)
43+
assert meta["length"] == 2
44+
raw = [b.value for b in blobs]
45+
assert _TextIOType.deserialize(raw, format=STORAGE) == _TextIOType("hi")
46+
47+
48+
def test_default_format_is_storage():
49+
out = _TextIOType("hi").serialize()
50+
assert isinstance(out, tuple) # (blobs, metadata)
51+
52+
53+
def test_invalid_format_raises():
54+
with pytest.raises(ValueError):
55+
_TextIOType("hi").serialize(format="bogus")
56+
with pytest.raises(ValueError):
57+
_TextIOType.deserialize("hi", format="bogus")
58+
59+
60+
def test_descriptor_has_no_value():
61+
assert _TextIOType().to_spec() == {"type": "test_text"}
62+
63+
64+
def test_eq_and_hash():
65+
assert _TextIOType("x") == _TextIOType("x")
66+
assert _TextIOType("x") != _TextIOType("y")
67+
assert hash(_TextIOType("x")) == hash(_TextIOType("x"))

0 commit comments

Comments
 (0)