diff --git a/src/boost_histogram/serialization/__init__.py b/src/boost_histogram/serialization/__init__.py index 759d6058..8c4b0cbe 100644 --- a/src/boost_histogram/serialization/__init__.py +++ b/src/boost_histogram/serialization/__init__.py @@ -6,10 +6,15 @@ import numpy as np # pylint: disable-next=import-error -from .. import histogram, version +from .. import histogram, storage, version from ._axis import _axis_from_dict, _axis_to_dict from ._common import serialize_metadata -from ._storage import _data_from_dict, _storage_from_dict, _storage_to_dict +from ._storage import ( + _data_from_dict, + _storage_from_dict, + _storage_to_dict, + _storage_type_to_str, +) __all__ = ["from_uhi", "remove_writer_info", "to_uhi"] @@ -18,16 +23,66 @@ def __dir__() -> list[str]: return __all__ -def to_uhi(h: histogram.Histogram[Any], /) -> dict[str, Any]: +def _storage_has_data_keys(storage_data: dict[str, Any], storage_type: str) -> bool: + """ + Check if storage data dict has the required keys for the given storage type. + + Returns True if all required data keys are present, False if it's structure-only. + Raises ValueError if required keys are missing (malformed/partial data). + """ + match storage_type: + case "int" | "double": + required_keys = {"values"} + case "weighted": + required_keys = {"values", "variances"} + case "mean": + required_keys = {"counts", "values", "variances"} + case "weighted_mean": + required_keys = { + "sum_of_weights", + "sum_of_weights_squared", + "values", + "variances", + } + case _: + msg = f"Unknown storage type: {storage_type}" + raise ValueError(msg) + + present_keys = required_keys & set(storage_data.keys()) + + if not present_keys: + return False + + if present_keys != required_keys: + missing = required_keys - present_keys + msg = f"{storage_type.capitalize()} storage missing required keys: {missing}" + raise ValueError(msg) + + return True + + +def to_uhi( + h: histogram.Histogram[Any], /, *, keep_storage: bool = True +) -> dict[str, Any]: """Convert an Histogram to a dictionary.""" # Convert the histogram to a dictionary + writer_info = {"boost-histogram": {"version": version.version}} + + # Store storage type info for AtomicInt64 and Unlimited (they serialize as int/double) + storage_type_str = _storage_type_to_str(h.storage_type()) + if isinstance(h.storage_type(), (storage.AtomicInt64, storage.Unlimited)): + writer_info["boost-histogram"]["storage_type"] = type(h.storage_type()).__name__ + data = { "uhi_schema": 1, - "writer_info": {"boost-histogram": {"version": version.version}}, + "writer_info": writer_info, "axes": [_axis_to_dict(axis) for axis in h.axes], - "storage": _storage_to_dict(h.storage_type(), h.view(flow=True)), } + if keep_storage: + data["storage"] = _storage_to_dict(h.storage_type(), h.view(flow=True)) + else: + data["storage"] = {"type": storage_type_str} data["metadata"] = serialize_metadata(h.__dict__) return data @@ -35,24 +90,34 @@ def to_uhi(h: histogram.Histogram[Any], /) -> dict[str, Any]: def from_uhi(data: dict[str, Any], /) -> histogram.Histogram[Any]: """Convert a dictionary to an Histogram.""" + # One time use + axis = (_axis_from_dict(ax) for ax in data["axes"]) - h = histogram.Histogram( - *(_axis_from_dict(ax) for ax in data["axes"]), - storage=_storage_from_dict(data["storage"]), - ) - raw_data = _data_from_dict(data["storage"]) + storage_data = data["storage"] + storage_ = _storage_from_dict(storage_data, data.get("writer_info", {})) + h = histogram.Histogram[Any](*axis, storage=storage_) + h.__dict__ = data.get("metadata", {}) + + # Check if storage has data (if not, it's a structure-only histogram) + # Validate required keys per storage type before deciding to skip data loading + storage_type = storage_data["type"] + has_data_keys = _storage_has_data_keys(storage_data, storage_type) + + if not has_data_keys: + return h + + raw_data = _data_from_dict(storage_data) view_shape = h.view(flow=True).shape # Reshape raw_data to the expected shape. This is necessary because JSON # serialization can collapse empty dimensions (e.g. (5, 0, 0) -> (5, 0)), # so we must restore the correct number of dimensions. - storage_type = data["storage"]["type"] + storage_type = storage_data["type"] if storage_type in {"weighted", "mean", "weighted_mean"}: raw_data = np.asarray(raw_data) raw_data = raw_data.reshape(view_shape + raw_data.shape[-1:]) else: raw_data = np.reshape(raw_data, view_shape) h[...] = raw_data - h.__dict__ = data.get("metadata", {}) return h @@ -70,7 +135,7 @@ def remove_writer_info(obj: T, /, *, library: str | None = "boost-histogram") -> obj = copy.copy(obj) if library is None: - obj.pop("writer_info") + obj.pop("writer_info", None) elif library in obj.get("writer_info", {}): obj["writer_info"] = copy.copy(obj["writer_info"]) del obj["writer_info"][library] diff --git a/src/boost_histogram/serialization/_storage.py b/src/boost_histogram/serialization/_storage.py index f484184c..116dd4bd 100644 --- a/src/boost_histogram/serialization/_storage.py +++ b/src/boost_histogram/serialization/_storage.py @@ -7,39 +7,60 @@ from .. import storage -__all__ = ["_data_from_dict", "_storage_from_dict", "_storage_to_dict"] +__all__ = [ + "_data_from_dict", + "_storage_from_dict", + "_storage_to_dict", + "_storage_type_to_str", +] def __dir__() -> list[str]: return __all__ +def _storage_type_to_str(_storage: storage.Storage, /) -> str: + """Return the canonical storage type string for a storage object.""" + match _storage: + case storage.Int64(): + return "int" + case storage.Double(): + return "double" + case storage.AtomicInt64(): + return "int" + case storage.Unlimited(): + return "double" + case storage.Weight(): + return "weighted" + case storage.Mean(): + return "mean" + case storage.WeightedMean(): + return "weighted_mean" + case _: + raise TypeError(f"Unsupported storage type: {_storage}") + + @functools.singledispatch -def _storage_to_dict(_storage: Any, /, data: Any) -> dict[str, Any]: # noqa: ARG001 +def _storage_to_dict(_storage: storage.Storage, /, data: Any) -> dict[str, Any]: # noqa: ARG001 """Convert a storage to a dictionary.""" msg = f"Unsupported storage type: {_storage}" raise TypeError(msg) @_storage_to_dict.register(storage.Int64) -def _(_storage: storage.Int64, /, data: Any) -> dict[str, Any]: - return {"type": "int", "values": data} - - @_storage_to_dict.register(storage.Double) -def _(_storage: storage.Double, /, data: Any) -> dict[str, Any]: - return {"type": "double", "values": data} +def _(_storage: storage.Int64 | storage.Double, /, data: Any) -> dict[str, Any]: + return {"type": _storage_type_to_str(_storage), "values": data} @_storage_to_dict.register(storage.AtomicInt64) @_storage_to_dict.register(storage.Unlimited) def _( - storage_: storage.AtomicInt64 | storage.Unlimited, + _storage: storage.AtomicInt64 | storage.Unlimited, /, data: Any, ) -> dict[str, Any]: return { - "writer_info": {"boost-histogram": {"orig_type": type(storage_).__name__}}, "type": "int" if np.issubdtype(data.dtype, np.integer) else "double", "values": data, } @@ -48,7 +69,7 @@ def _( @_storage_to_dict.register(storage.Weight) def _(_storage: storage.Weight, /, data: Any) -> dict[str, Any]: return { - "type": "weighted", + "type": _storage_type_to_str(_storage), "values": data.value, "variances": data.variance, } @@ -57,7 +78,7 @@ def _(_storage: storage.Weight, /, data: Any) -> dict[str, Any]: @_storage_to_dict.register(storage.Mean) def _(_storage: storage.Mean, /, data: Any) -> dict[str, Any]: return { - "type": "mean", + "type": _storage_type_to_str(_storage), "counts": data.count, "values": data.value, "variances": data.variance, @@ -67,7 +88,7 @@ def _(_storage: storage.Mean, /, data: Any) -> dict[str, Any]: @_storage_to_dict.register(storage.WeightedMean) def _(_storage: storage.WeightedMean, /, data: Any) -> dict[str, Any]: return { - "type": "weighted_mean", + "type": _storage_type_to_str(_storage), "sum_of_weights": data.sum_of_weights, "sum_of_weights_squared": data.sum_of_weights_squared, "values": data.value, @@ -75,12 +96,20 @@ def _(_storage: storage.WeightedMean, /, data: Any) -> dict[str, Any]: } -def _storage_from_dict(data: dict[str, Any], /) -> storage.Storage: +def _storage_from_dict( + data: dict[str, Any], writer_info: dict[str, Any] | None = None, / +) -> storage.Storage: """Convert a dictionary to a storage object.""" # If loading a boost-histogram, we can load the exact original type - orig_type = ( - data.get("writer_info", {}).get("boost-histogram", {}).get("orig_type", "") - ) + # Check both the main writer_info (new location) and storage writer_info (old location) + orig_type = "" + if writer_info: + orig_type = writer_info.get("boost-histogram", {}).get("storage_type", "") + if not orig_type: + orig_type = ( + data.get("writer_info", {}).get("boost-histogram", {}).get("orig_type", "") + ) + if orig_type == "AtomicInt64": return storage.AtomicInt64() if orig_type == "Unlimited": diff --git a/tests/test_serialization_uhi.py b/tests/test_serialization_uhi.py index 7f591a62..b541d23a 100644 --- a/tests/test_serialization_uhi.py +++ b/tests/test_serialization_uhi.py @@ -216,9 +216,97 @@ def test_round_trip_native() -> None: assert h == h2 - assert isinstance(h2.axes[0], bh.axis.Integer) - assert h2.storage_type is bh.storage.AtomicInt64 - assert h2.axes[0].traits.growth == h.axes[0].traits.growth + +@pytest.mark.parametrize( + ("storage_type", "expected_type"), + [ + pytest.param(bh.storage.Int64(), "int", id="int64"), + pytest.param(bh.storage.AtomicInt64(), "int", id="atomic_int64"), + pytest.param(bh.storage.Double(), "double", id="double"), + pytest.param(bh.storage.Weight(), "weighted", id="weight"), + pytest.param(bh.storage.Mean(), "mean", id="mean"), + ], +) +def test_to_uhi_keep_storage_option( + storage_type: bh.storage.Storage, expected_type: str +) -> None: + h = bh.Histogram( + bh.axis.Regular(3, 0, 1), + storage=storage_type, + ) + data_with = to_uhi(h) + data_without = to_uhi(h, keep_storage=False) + + assert "storage" in data_with + assert "storage" in data_without + # Storage with data has "values" key + assert "values" in data_with["storage"] + # Storage without data has only type information + assert "values" not in data_without["storage"] + assert data_without["storage"]["type"] == expected_type + + +@pytest.mark.parametrize( + "storage_type", + [ + pytest.param(bh.storage.Int64(), id="int64"), + pytest.param(bh.storage.AtomicInt64(), id="atomic_int64"), + pytest.param(bh.storage.Double(), id="double"), + pytest.param(bh.storage.Weight(), id="weight"), + pytest.param(bh.storage.Mean(), id="mean"), + ], +) +def test_from_uhi_missing_storage_data(storage_type: bh.storage.Storage) -> None: + h = bh.Histogram( + bh.axis.Regular(4, 0.0, 1.0), + storage=storage_type, + ) + # produce a UHI dict with storage type but no data + data = to_uhi(h, keep_storage=False) + + h2 = from_uhi(data) + + # axes and storage type should round-trip, data should be zeros + assert pytest.approx(np.array(h.axes[0])) == np.array(h2.axes[0]) + assert h2.storage_type is type(storage_type) + assert np.asarray(h2) == pytest.approx(np.zeros_like(np.asarray(h2))) + + +@pytest.mark.parametrize( + ("storage_type", "expected_type"), + [ + pytest.param(bh.storage.AtomicInt64(), "int", id="atomic_int64"), + pytest.param(bh.storage.Unlimited(), "double", id="unlimited"), + ], +) +def test_from_uhi_old_style_writer_info( + storage_type: bh.storage.Storage, expected_type: str +) -> None: + h = bh.Histogram( + bh.axis.Regular(4, 0.0, 1.0), + storage=storage_type, + ) + h.fill([0, 0, 1, 2]) + + data = to_uhi(h) + + old_style_data = { + "uhi_schema": 1, + "axes": data["axes"], + "storage": { + "type": expected_type, + "values": data["storage"]["values"], + "writer_info": { + "boost-histogram": {"orig_type": type(storage_type).__name__} + }, + }, + "metadata": data.get("metadata", {}), + } + + h2 = from_uhi(old_style_data) + + assert h == h2 + assert h2.storage_type is type(storage_type) @pytest.mark.parametrize( @@ -414,3 +502,109 @@ def test_round_trip_3d_histogram_json_constructor() -> None: assert h.ndim == h2.ndim assert h == h2 + + +def test_from_uhi_malformed_weight_storage() -> None: + """Test that malformed Weight storage ( missing required keys) raises ValueError.""" + data = { + "uhi_schema": 1, + "axes": [ + { + "type": "regular", + "lower": 0, + "upper": 10, + "bins": 5, + "underflow": True, + "overflow": True, + "circular": False, + } + ], + "storage": { + "type": "weighted", + "values": [1, 2, 3, 4, 5], + }, + "metadata": {}, + } + + with pytest.raises(ValueError, match="Weighted storage missing required keys"): + from_uhi(data) + + +def test_from_uhi_malformed_mean_storage() -> None: + """Test that malformed Mean storage (missing required keys) raises ValueError.""" + data = { + "uhi_schema": 1, + "axes": [ + { + "type": "regular", + "lower": 0, + "upper": 10, + "bins": 5, + "underflow": True, + "overflow": True, + "circular": False, + } + ], + "storage": { + "type": "mean", + "counts": [1, 2, 3, 4, 5], + "values": [1, 2, 3, 4, 5], + }, + "metadata": {}, + } + + with pytest.raises(ValueError, match="Mean storage missing required keys"): + from_uhi(data) + + +def test_from_uhi_malformed_weighted_mean_storage() -> None: + """Test that malformed WeightedMean storage (missing required keys) raises ValueError.""" + data = { + "uhi_schema": 1, + "axes": [ + { + "type": "regular", + "lower": 0, + "upper": 10, + "bins": 5, + "underflow": True, + "overflow": True, + "circular": False, + } + ], + "storage": { + "type": "weighted_mean", + "sum_of_weights": [1, 2, 3, 4, 5], + "values": [1, 2, 3, 4, 5], + }, + "metadata": {}, + } + + with pytest.raises(ValueError, match="Weighted_mean storage missing required keys"): + from_uhi(data) + + +def test_from_uhi_structure_only_no_error() -> None: + """Test that structure-only (no data keys) histograms load correctly.""" + data = { + "uhi_schema": 1, + "axes": [ + { + "type": "regular", + "lower": 0, + "upper": 10, + "bins": 5, + "underflow": True, + "overflow": True, + "circular": False, + } + ], + "storage": { + "type": "double", + }, + "metadata": {}, + } + + h = from_uhi(data) + assert h.storage_type is bh.storage.Double + assert np.asarray(h) == pytest.approx(np.zeros(5))