diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 088ffcc76dc..b952804025d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,7 +21,15 @@ v2025.03.1 (unreleased) New Features ~~~~~~~~~~~~ - +- By default xarray now encodes :py:class:`numpy.timedelta64` values by + converting to :py:class:`numpy.int64` values and storing ``"dtype"`` and + ``"units"`` attributes consistent with the dtype of the in-memory + :py:class:`numpy.timedelta64` values, e.g. ``"timedelta64[s]"`` and + ``"seconds"`` for second-resolution timedeltas. These values will always be + decoded to timedeltas without a warning moving forward. Timedeltas encoded + via the previous approach can still be roundtripped exactly, but in the + future will not be decoded by default (:issue:`1621`, :issue:`10099`, + :pull:`10101`). By `Spencer Clark `_. Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/coding/times.py b/xarray/coding/times.py index fdecfe77ede..60d9f307339 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +import typing import warnings from collections.abc import Callable, Hashable from datetime import datetime, timedelta @@ -92,6 +93,14 @@ ) +_INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS = [ + "_FillValue", + "missing_value", + "add_offset", + "scale_factor", +] + + def _is_standard_calendar(calendar: str) -> bool: return calendar.lower() in _STANDARD_CALENDARS @@ -1394,62 +1403,140 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: return variable +def has_timedelta64_encoding_dtype(attrs_or_encoding: dict) -> bool: + dtype = attrs_or_encoding.get("dtype", None) + return isinstance(dtype, str) and dtype.startswith("timedelta64") + + class CFTimedeltaCoder(VariableCoder): """Coder for CF Timedelta coding. Parameters ---------- time_unit : PDDatetimeUnitOptions - Target resolution when decoding timedeltas. Defaults to "ns". + Target resolution when decoding timedeltas via units. Defaults to "ns". + When decoding via dtype, the resolution is specified in the dtype + attribute, so this parameter is ignored. + decode_via_units : bool + Whether to decode timedeltas based on the presence of a timedelta-like + units attribute, e.g. "seconds". Defaults to True, but in the future + will default to False. + decode_via_dtype : bool + Whether to decode timedeltas based on the presence of a np.timedelta64 + dtype attribute, e.g. "timedelta64[s]". Defaults to True. """ def __init__( self, time_unit: PDDatetimeUnitOptions = "ns", + decode_via_units: bool = True, + decode_via_dtype: bool = True, ) -> None: self.time_unit = time_unit + self.decode_via_units = decode_via_units + self.decode_via_dtype = decode_via_dtype self._emit_decode_timedelta_future_warning = False def encode(self, variable: Variable, name: T_Name = None) -> Variable: if np.issubdtype(variable.data.dtype, np.timedelta64): dims, data, attrs, encoding = unpack_for_encoding(variable) + has_timedelta_dtype = has_timedelta64_encoding_dtype(encoding) + if ("units" in encoding or "dtype" in encoding) and not has_timedelta_dtype: + dtype = encoding.get("dtype", None) + units = encoding.pop("units", None) - dtype = encoding.get("dtype", None) + # in the case of packed data we need to encode into + # float first, the correct dtype will be established + # via CFScaleOffsetCoder/CFMaskCoder + if "add_offset" in encoding or "scale_factor" in encoding: + dtype = data.dtype if data.dtype.kind == "f" else "float64" - # in the case of packed data we need to encode into - # float first, the correct dtype will be established - # via CFScaleOffsetCoder/CFMaskCoder - if "add_offset" in encoding or "scale_factor" in encoding: - dtype = data.dtype if data.dtype.kind == "f" else "float64" - - data, units = encode_cf_timedelta(data, encoding.pop("units", None), dtype) + else: + resolution, _ = np.datetime_data(variable.dtype) + dtype = np.int64 + attrs_dtype = f"timedelta64[{resolution}]" + units = _numpy_dtype_to_netcdf_timeunit(variable.dtype) + safe_setitem(attrs, "dtype", attrs_dtype, name=name) + # Remove dtype encoding if it exists to prevent it from + # interfering downstream in NonStringCoder. + encoding.pop("dtype", None) + + if any( + k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS + ): + raise ValueError( + f"Specifying '_FillValue', 'missing_value', " + f"'add_offset', or 'scale_factor' is not supported " + f"when literally encoding the np.timedelta64 values " + f"of variable {name!r}. To encode {name!r} with such " + f"encoding parameters, additionally set " + f"encoding['units'] to a unit of time, e.g. " + f"'seconds'. To proceed with literal np.timedelta64 " + f"encoding of {name!r}, remove any encoding entries " + f"for '_FillValue', 'missing_value', 'add_offset', " + f"or 'scale_factor'." + ) + data, units = encode_cf_timedelta(data, units, dtype) safe_setitem(attrs, "units", units, name=name) - return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable def decode(self, variable: Variable, name: T_Name = None) -> Variable: units = variable.attrs.get("units", None) - if isinstance(units, str) and units in TIME_UNITS: - if self._emit_decode_timedelta_future_warning: - emit_user_level_warning( - "In a future version of xarray decode_timedelta will " - "default to False rather than None. To silence this " - "warning, set decode_timedelta to True, False, or a " - "'CFTimedeltaCoder' instance.", - FutureWarning, - ) + has_timedelta_units = isinstance(units, str) and units in TIME_UNITS + has_timedelta_dtype = has_timedelta64_encoding_dtype(variable.attrs) + is_dtype_decodable = has_timedelta_units and has_timedelta_dtype + is_units_decodable = has_timedelta_units + if (is_dtype_decodable and self.decode_via_dtype) or ( + is_units_decodable and self.decode_via_units + ): dims, data, attrs, encoding = unpack_for_decoding(variable) - units = pop_to(attrs, encoding, "units") - dtype = np.dtype(f"timedelta64[{self.time_unit}]") - transform = partial( - decode_cf_timedelta, units=units, time_unit=self.time_unit - ) + if is_dtype_decodable and self.decode_via_dtype: + if any( + k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS + ): + raise ValueError( + "Decoding np.timedelta64 values via dtype is not " + "supported when '_FillValue', 'missing_value', " + "'add_offset', or 'scale_factor' are present in " + "encoding." + ) + dtype = pop_to(attrs, encoding, "dtype", name=name) + dtype = np.dtype(dtype) + resolution, _ = np.datetime_data(dtype) + if resolution not in typing.get_args(PDDatetimeUnitOptions): + raise ValueError( + f"Following pandas, xarray only supports decoding to " + f"timedelta64 values with a resolution of 's', 'ms', " + f"'us', or 'ns'. Encoded values have a resolution of " + f"{resolution!r}." + ) + time_unit = cast(PDDatetimeUnitOptions, resolution) + elif self.decode_via_units: + if self._emit_decode_timedelta_future_warning: + emit_user_level_warning( + "In a future version, xarray will not decode " + "timedelta values based on the presence of a " + "timedelta-like units attribute by default. Instead " + "it will rely on the presence of a np.timedelta64 " + "dtype attribute, which is now xarray's default way " + "of encoding np.timedelta64 values. To continue " + "decoding timedeltas based on the presence of a " + "timedelta-like units attribute, users will need to " + "explicitly opt-in by passing True or " + "CFTimedeltaCoder(decode_via_units=True) to " + "decode_timedelta. To silence this warning, set " + "decode_timedelta to True, False, or a " + "'CFTimedeltaCoder' instance.", + FutureWarning, + ) + dtype = np.dtype(f"timedelta64[{self.time_unit}]") + time_unit = self.time_unit + transform = partial(decode_cf_timedelta, units=units, time_unit=time_unit) data = lazy_elemwise_func(data, transform, dtype=dtype) - return Variable(dims, data, attrs, encoding, fastpath=True) else: return variable diff --git a/xarray/conventions.py b/xarray/conventions.py index 071dab43c28..ab41804fd62 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -204,8 +204,10 @@ def decode_cf_variable( var = coder.decode(var, name=name) if decode_timedelta: - if not isinstance(decode_timedelta, CFTimedeltaCoder): - decode_timedelta = CFTimedeltaCoder() + if isinstance(decode_timedelta, bool): + decode_timedelta = CFTimedeltaCoder( + decode_via_units=decode_timedelta, decode_via_dtype=decode_timedelta + ) decode_timedelta._emit_decode_timedelta_future_warning = ( decode_timedelta_was_none ) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b2841e0da48..33b451be7f8 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -635,7 +635,10 @@ def test_roundtrip_timedelta_data(self) -> None: # though we cannot test that until we fix the timedelta decoding # to support large ranges time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit("s") # type: ignore[arg-type, unused-ignore] + encoding = {"units": "seconds"} expected = Dataset({"td": ("td", time_deltas), "td0": time_deltas[0]}) + expected["td"].encoding = encoding + expected["td0"].encoding = encoding with self.roundtrip( expected, open_kwargs={"decode_timedelta": CFTimedeltaCoder(time_unit="ns")} ) as actual: diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index e4541bad7e6..7e1d114ece6 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -20,6 +20,7 @@ ) from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder from xarray.coding.times import ( + _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS, _encode_datetime_with_cftime, _netcdf_to_numpy_timeunit, _numpy_to_netcdf_timeunit, @@ -1515,7 +1516,7 @@ def test_roundtrip_timedelta64_nanosecond_precision( timedelta_values[2] = nat timedelta_values[4] = nat - encoding = dict(dtype=dtype, _FillValue=fill_value) + encoding = dict(dtype=dtype, _FillValue=fill_value, units="nanoseconds") var = Variable(["time"], timedelta_values, encoding=encoding) encoded_var = conventions.encode_cf_variable(var) @@ -1867,7 +1868,8 @@ def test_decode_timedelta( decode_times, decode_timedelta, expected_dtype, warns ) -> None: timedeltas = pd.timedelta_range(0, freq="D", periods=3) - var = Variable(["time"], timedeltas) + encoding = {"units": "days"} + var = Variable(["time"], timedeltas, encoding=encoding) encoded = conventions.encode_cf_variable(var) if warns: with pytest.warns(FutureWarning, match="decode_timedelta"): @@ -1959,3 +1961,125 @@ def test_decode_floating_point_timedelta_no_serialization_warning() -> None: decoded = conventions.decode_cf_variable("foo", encoded, decode_timedelta=True) with assert_no_warnings(): decoded.load() + + +def test_literal_timedelta64_coding(time_unit: PDDatetimeUnitOptions) -> None: + timedeltas = pd.timedelta_range(0, freq="D", periods=3, unit=time_unit) # type: ignore[call-arg] + variable = Variable(["time"], timedeltas) + expected_dtype = f"timedelta64[{time_unit}]" + expected_units = _numpy_to_netcdf_timeunit(time_unit) + + encoded = conventions.encode_cf_variable(variable) + assert encoded.attrs["dtype"] == expected_dtype + assert encoded.attrs["units"] == expected_units + + decoded = conventions.decode_cf_variable("timedeltas", encoded) + assert decoded.encoding["dtype"] == expected_dtype + assert decoded.encoding["units"] == expected_units + + assert_identical(decoded, variable) + assert decoded.dtype == variable.dtype + + reencoded = conventions.encode_cf_variable(decoded) + assert_identical(reencoded, encoded) + assert reencoded.dtype == encoded.dtype + + +def test_literal_timedelta_coding_resolution_error() -> None: + attrs = {"dtype": "timedelta64[D]", "units": "days"} + encoded = Variable(["time"], [0, 1, 2], attrs=attrs) + with pytest.raises(ValueError, match="xarray only supports"): + conventions.decode_cf_variable("timedeltas", encoded) + + +@pytest.mark.parametrize("attribute", ["dtype", "units"]) +def test_literal_timedelta_decode_invalid_encoding(attribute) -> None: + attrs = {"dtype": "timedelta64[s]", "units": "seconds"} + encoding = {attribute: "foo"} + encoded = Variable(["time"], [0, 1, 2], attrs=attrs, encoding=encoding) + with pytest.raises(ValueError, match="failed to prevent"): + conventions.decode_cf_variable("timedeltas", encoded) + + +@pytest.mark.parametrize("attribute", ["dtype", "units"]) +def test_literal_timedelta_encode_invalid_attribute(attribute) -> None: + timedeltas = pd.timedelta_range(0, freq="D", periods=3) + attrs = {attribute: "foo"} + variable = Variable(["time"], timedeltas, attrs=attrs) + with pytest.raises(ValueError, match="failed to prevent"): + conventions.encode_cf_variable(variable) + + +@pytest.mark.parametrize("invalid_key", _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS) +def test_literal_timedelta_encoding_mask_and_scale_error(invalid_key) -> None: + encoding = {invalid_key: 1.0} + timedeltas = pd.timedelta_range(0, freq="D", periods=3) + variable = Variable(["time"], timedeltas, encoding=encoding) + with pytest.raises(ValueError, match=invalid_key): + conventions.encode_cf_variable(variable) + + +@pytest.mark.parametrize("invalid_key", _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS) +def test_literal_timedelta_decoding_mask_and_scale_error(invalid_key) -> None: + attrs = {invalid_key: 1.0, "dtype": "timedelta64[s]", "units": "seconds"} + variable = Variable(["time"], [0, 1, 2], attrs=attrs) + with pytest.raises(ValueError, match=invalid_key): + conventions.decode_cf_variable("foo", variable) + + +@pytest.mark.parametrize( + ("decode_via_units", "decode_via_dtype", "attrs", "expect_timedelta64"), + [ + (True, True, {"units": "seconds"}, True), + (True, False, {"units": "seconds"}, True), + (False, True, {"units": "seconds"}, False), + (False, False, {"units": "seconds"}, False), + (True, True, {"dtype": "timedelta64[s]", "units": "seconds"}, True), + (True, False, {"dtype": "timedelta64[s]", "units": "seconds"}, True), + (False, True, {"dtype": "timedelta64[s]", "units": "seconds"}, True), + (False, False, {"dtype": "timedelta64[s]", "units": "seconds"}, False), + ], + ids=lambda x: f"{x!r}", +) +def test_timedelta_decoding_options( + decode_via_units, decode_via_dtype, attrs, expect_timedelta64 +) -> None: + array = np.array([0, 1, 2], dtype=np.dtype("int64")) + encoded = Variable(["time"], array, attrs=attrs) + + # Confirm we decode to the expected dtype. + decode_timedelta = CFTimedeltaCoder( + time_unit="s", + decode_via_units=decode_via_units, + decode_via_dtype=decode_via_dtype, + ) + decoded = conventions.decode_cf_variable( + "foo", encoded, decode_timedelta=decode_timedelta + ) + if expect_timedelta64: + assert decoded.dtype == np.dtype("timedelta64[s]") + else: + assert decoded.dtype == np.dtype("int64") + + # Confirm we exactly roundtrip. + reencoded = conventions.encode_cf_variable(decoded) + assert_identical(reencoded, encoded) + + +def test_timedelta_encoding_explicit_non_timedelta64_dtype() -> None: + encoding = {"dtype": np.dtype("int32")} + timedeltas = pd.timedelta_range(0, freq="D", periods=3) + variable = Variable(["time"], timedeltas, encoding=encoding) + + encoded = conventions.encode_cf_variable(variable) + assert encoded.attrs["units"] == "days" + assert encoded.dtype == np.dtype("int32") + + with pytest.warns(FutureWarning, match="timedelta"): + decoded = conventions.decode_cf_variable("foo", encoded) + assert_identical(decoded, variable) + + reencoded = conventions.encode_cf_variable(decoded) + assert_identical(reencoded, encoded) + assert encoded.attrs["units"] == "days" + assert encoded.dtype == np.dtype("int32")