Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement literal np.timedelta64 coding #10101

Draft
wants to merge 30 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
063437b
Proof of concept literal timedelta64 coding
spencerkclark Mar 6, 2025
03f2988
Ensure test_roundtrip_timedelta_data test uses old encoding pathway
spencerkclark Mar 6, 2025
bdb53d7
Remove no longer relevant test
spencerkclark Mar 7, 2025
05c3ce6
Merge branch 'main' into timedelta64-encoding
spencerkclark Mar 7, 2025
00d9eaa
Include units attribute
spencerkclark Mar 8, 2025
b043b45
Move coder to times.py
spencerkclark Mar 8, 2025
6f4e6e4
Merge branch 'main' into timedelta64-encoding
spencerkclark Mar 8, 2025
7f73753
Add what's new entry
spencerkclark Mar 8, 2025
4a8e111
Merge branch 'timedelta64-encoding' of https://github.com/spencerkcla…
spencerkclark Mar 8, 2025
9ce2a24
Restore test and reduce diff
spencerkclark Mar 8, 2025
eb6e19a
Fix typing
spencerkclark Mar 8, 2025
436e588
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 8, 2025
a305238
Fix doctests
spencerkclark Mar 8, 2025
b406c64
Restore original order of encoders
spencerkclark Mar 8, 2025
a21b137
Add return types to tests
spencerkclark Mar 8, 2025
5108b02
Move everything to CFTimedeltaCoder; reuse code where possible
spencerkclark Mar 8, 2025
452968c
Fix mypy
spencerkclark Mar 9, 2025
503db4a
Use Kai's offset and scale_factor logic for all encoding
spencerkclark Mar 9, 2025
9aee097
Merge branch 'main' into timedelta64-encoding
spencerkclark Mar 22, 2025
56f55e2
Fix bad merge
spencerkclark Mar 22, 2025
c5e7de9
Forbid mixing other encoding with literal timedelta64 encoding
spencerkclark Mar 22, 2025
d1744af
Expose fine-grained control over decoding pathways
spencerkclark Mar 22, 2025
7c7b071
Rename test
spencerkclark Mar 22, 2025
da1edc4
Use consistent dtype spelling
spencerkclark Mar 22, 2025
2bb4b99
Continue supporting non-timedelta dtype-only encoding
spencerkclark Mar 22, 2025
0220ed5
Fix example attribute in docstring
spencerkclark Mar 22, 2025
c83fcb3
Update what's new
spencerkclark Mar 22, 2025
d1e8a5e
Fix typo
spencerkclark Mar 22, 2025
7b94d35
Complete test
spencerkclark Mar 22, 2025
f269e68
Fix docstring
spencerkclark Mar 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -1368,13 +1368,14 @@ def __init__(
def encode(self, variable: Variable, name: T_Name = None) -> Variable:
if np.issubdtype(variable.data.dtype, np.timedelta64):
dims, data, attrs, encoding = unpack_for_encoding(variable)

data, units = encode_cf_timedelta(
data, encoding.pop("units", None), encoding.get("dtype", None)
)
safe_setitem(attrs, "units", units, name=name)

return Variable(dims, data, attrs, encoding, fastpath=True)
if "units" in encoding:
data, units = encode_cf_timedelta(
data, encoding.pop("units"), encoding.get("dtype", None)
)
safe_setitem(attrs, "units", units, name=name)
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable
else:
return variable

Expand Down
75 changes: 75 additions & 0 deletions xarray/coding/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import typing
import warnings
from collections.abc import Callable, Hashable, MutableMapping
from functools import partial
Expand All @@ -11,6 +12,7 @@
import pandas as pd

from xarray.core import dtypes, duck_array_ops, indexing
from xarray.core.types import PDDatetimeUnitOptions
from xarray.core.variable import Variable
from xarray.namedarray.parallelcompat import get_chunked_array_type
from xarray.namedarray.pycompat import is_chunked_array
Expand Down Expand Up @@ -161,6 +163,45 @@
return np.asarray(self.array[key], dtype=self.dtype)


class Timedelta64TypeArray(indexing.ExplicitlyIndexedNDArrayMixin):
    """Lazily reinterpret integer-typed arrays as np.timedelta64 values.

    Wraps an indexable array and casts every indexing result to the target
    timedelta64 dtype on access.  This is useful for decoding timedelta64
    arrays stored in integer-typed netCDF variables.

    >>> x = np.array([1, 0, 1, 1, 0], dtype="int64")

    >>> x.dtype
    dtype('int64')

    >>> Timedelta64TypeArray(x, np.dtype("timedelta64[ns]")).dtype
    dtype('timedelta64[ns]')

    >>> indexer = indexing.BasicIndexer((slice(None),))
    >>> Timedelta64TypeArray(x, np.dtype("timedelta64[ns]"))[indexer].dtype
    dtype('timedelta64[ns]')
    """

    __slots__ = ("_dtype", "array")

    def __init__(self, array, dtype: np.typing.DTypeLike) -> None:
        # Wrap so basic / outer / vectorized indexing are all available.
        self.array = indexing.as_indexable(array)
        self._dtype = dtype

    @property
    def dtype(self):
        # Normalize whatever dtype-like object was supplied at construction.
        return np.dtype(self._dtype)

    def __getitem__(self, key) -> np.ndarray:
        selected = self.array[key]
        return np.asarray(selected, dtype=self.dtype)

    def _oindex_get(self, key):
        selected = self.array.oindex[key]
        return np.asarray(selected, dtype=self.dtype)

    def _vindex_get(self, key):
        selected = self.array.vindex[key]
        return np.asarray(selected, dtype=self.dtype)


def lazy_elemwise_func(array, func: Callable, dtype: np.typing.DTypeLike):
"""Lazily apply an element-wise function to an array.
Parameters
Expand Down Expand Up @@ -345,7 +386,7 @@
# otherwise numpy unsigned ints will silently cast to the signed counterpart
fill_value = fill_value.item()
# passes if provided fill value fits in encoded on-disk type
new_fill = encoded_dtype.type(fill_value)

Check warning on line 389 in xarray/coding/variables.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

NumPy will stop allowing conversion of out-of-bound Python integers to integer arrays. The conversion of 255 to int8 will fail in the future. For the old behavior, usually: `np.array(value).astype(dtype)` will give the desired result (the cast overflows).

Check warning on line 389 in xarray/coding/variables.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

NumPy will stop allowing conversion of out-of-bound Python integers to integer arrays. The conversion of 255 to int8 will fail in the future. For the old behavior, usually: np.array(value).astype(dtype)` will give the desired result (the cast overflows).

Check warning on line 389 in xarray/coding/variables.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

NumPy will stop allowing conversion of out-of-bound Python integers to integer arrays. The conversion of 255 to int8 will fail in the future. For the old behavior, usually: np.array(value).astype(dtype)` will give the desired result (the cast overflows).

Check warning on line 389 in xarray/coding/variables.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

NumPy will stop allowing conversion of out-of-bound Python integers to integer arrays. The conversion of 255 to int8 will fail in the future. For the old behavior, usually: np.array(value).astype(dtype)` will give the desired result (the cast overflows).

Check warning on line 389 in xarray/coding/variables.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

NumPy will stop allowing conversion of out-of-bound Python integers to integer arrays. The conversion of 255 to int8 will fail in the future. For the old behavior, usually: np.array(value).astype(dtype)` will give the desired result (the cast overflows).

Check warning on line 389 in xarray/coding/variables.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

NumPy will stop allowing conversion of out-of-bound Python integers to integer arrays. The conversion of 255 to int8 will fail in the future. For the old behavior, usually: np.array(value).astype(dtype)` will give the desired result (the cast overflows).

Check warning on line 389 in xarray/coding/variables.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

NumPy will stop allowing conversion of out-of-bound Python integers to integer arrays. The conversion of 255 to int8 will fail in the future. For the old behavior, usually: np.array(value).astype(dtype)` will give the desired result (the cast overflows).

Check warning on line 389 in xarray/coding/variables.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

NumPy will stop allowing conversion of out-of-bound Python integers to integer arrays. The conversion of 255 to int8 will fail in the future. For the old behavior, usually: np.array(value).astype(dtype)` will give the desired result (the cast overflows).

Check warning on line 389 in xarray/coding/variables.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

NumPy will stop allowing conversion of out-of-bound Python integers to integer arrays. The conversion of 255 to int8 will fail in the future. For the old behavior, usually: np.array(value).astype(dtype)` will give the desired result (the cast overflows).

Check warning on line 389 in xarray/coding/variables.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10 min-all-deps

NumPy will stop allowing conversion of out-of-bound Python integers to integer arrays. The conversion of 255 to int8 will fail in the future. For the old behavior, usually: `np.array(value).astype(dtype)` will give the desired result (the cast overflows).

::warning file=/home/runner/work/xarray/xarray/xarray/coding/variables.py,line=389::NumPy will stop allowing conversion of out-of-bound Python integers to integer arrays. The conversion of 255 to int8 will fail in the future. For the old behavior, usually: `np.array(value).astype(dtype)` will give the desired result (the cast overflows).
except OverflowError:
encoded_kind_str = "signed" if encoded_dtype.kind == "i" else "unsigned"
warnings.warn(
Expand Down Expand Up @@ -738,3 +779,37 @@

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
raise NotImplementedError()


class LiteralTimedelta64Coder(VariableCoder):
"""Code np.timedelta64 values."""

def encode(self, variable: Variable, name: T_Name = None) -> Variable:
if np.issubdtype(variable.data.dtype, np.timedelta64):
dims, data, attrs, encoding = unpack_for_encoding(variable)
resolution, _ = np.datetime_data(variable.dtype)
attrs["dtype"] = f"timedelta64[{resolution}]"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about also including units in attrs?

That would make timedelta64 encoding still specify units in the style of CF conventions, which could make us a little more compatible with non-Xarray tools.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be possible, but this would need an additional check inside CFTimedeltaCoder to prevent premature encoding and decoding if both attributes are attached.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree this is a good idea, though does increase the complexity a little. I gave it a try in my latest push.

data = duck_array_ops.astype(data, dtype=np.int64, copy=True)
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
if variable.attrs.get("dtype", "").startswith("timedelta64"):
dims, data, attrs, encoding = unpack_for_decoding(variable)
# overwrite (!) dtype in encoding, and remove from attrs
# needed for correct subsequent encoding
encoding["dtype"] = attrs.pop("dtype")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typically we use the pop_to() helper to do this safely, e.g., dtype = pop_to(encoding, attrs, "dtype", name=name)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the heads up! I somewhat blindly inherited this from the BooleanCoder. I added some better tests around this issue.

dtype = np.dtype(encoding["dtype"])
resolution, _ = np.datetime_data(dtype)
if resolution not in typing.get_args(PDDatetimeUnitOptions):
raise ValueError(
f"Following pandas, xarray only supports decoding to "
f"timedelta64 values with a resolution of 's', 'ms', "
f"'us', or 'ns'. Encoded values have a resolution of "
f"{resolution!r}."
)
data = Timedelta64TypeArray(data, dtype)
return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable
2 changes: 2 additions & 0 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def encode_cf_variable(
for coder in [
CFDatetimeCoder(),
CFTimedeltaCoder(),
variables.LiteralTimedelta64Coder(),
variables.CFScaleOffsetCoder(),
variables.CFMaskCoder(),
variables.NativeEnumCoder(),
Expand Down Expand Up @@ -238,6 +239,7 @@ def decode_cf_variable(
original_dtype = var.dtype

var = variables.BooleanCoder().decode(var)
var = variables.LiteralTimedelta64Coder().decode(var)

dimensions, data, attributes, encoding = variables.unpack_for_decoding(var)

Expand Down
3 changes: 3 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,7 +635,10 @@
# though we cannot test that until we fix the timedelta decoding
# to support large ranges
time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit("s") # type: ignore[arg-type, unused-ignore]
encoding = {"units": "seconds"}
expected = Dataset({"td": ("td", time_deltas), "td0": time_deltas[0]})
expected["td"].encoding = encoding
expected["td0"].encoding = encoding
with self.roundtrip(
expected, open_kwargs={"decode_timedelta": CFTimedeltaCoder(time_unit="ns")}
) as actual:
Expand Down Expand Up @@ -4185,7 +4188,7 @@
fx.create_dataset(k, data=v)
with pytest.warns(UserWarning, match="The 'phony_dims' kwarg"):
with xr.open_dataset(tmp_file, engine="h5netcdf", group="bar") as ds:
assert ds.dims == {

Check warning on line 4191 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4191 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4191 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4191 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4191 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4191 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / windows-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4191 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4191 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4191 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.

Check warning on line 4191 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.10

The return type of `Dataset.dims` will be changed to return a set of dimension names in future, in order to be more consistent with `DataArray.dims`. To access a mapping from dimension names to lengths, please use `Dataset.sizes`.
"phony_dim_0": 5,
"phony_dim_1": 5,
"phony_dim_2": 5,
Expand Down
21 changes: 19 additions & 2 deletions xarray/tests/test_coding_times.py
Original file line number Diff line number Diff line change
Expand Up @@ -1512,7 +1512,7 @@ def test_roundtrip_timedelta64_nanosecond_precision(
timedelta_values[2] = nat
timedelta_values[4] = nat

encoding = dict(dtype=dtype, _FillValue=fill_value)
encoding = dict(dtype=dtype, _FillValue=fill_value, units="nanoseconds")
var = Variable(["time"], timedelta_values, encoding=encoding)

encoded_var = conventions.encode_cf_variable(var)
Expand Down Expand Up @@ -1863,7 +1863,8 @@ def test_decode_timedelta(
decode_times, decode_timedelta, expected_dtype, warns
) -> None:
timedeltas = pd.timedelta_range(0, freq="D", periods=3)
var = Variable(["time"], timedeltas)
encoding = {"units": "days"}
var = Variable(["time"], timedeltas, encoding=encoding)
encoded = conventions.encode_cf_variable(var)
if warns:
with pytest.warns(FutureWarning, match="decode_timedelta"):
Expand Down Expand Up @@ -1907,3 +1908,19 @@ def test_lazy_decode_timedelta_error() -> None:
)
with pytest.raises(OutOfBoundsTimedelta, match="overflow"):
decoded.load()


def test_literal_timedelta64_coding(time_unit: PDDatetimeUnitOptions) -> None:
    """Round-trip timedelta64 data through literal (dtype-attribute) coding.

    Encoding stores integers plus a ``dtype`` attribute; decoding should
    restore both the values and the original timedelta64 resolution.
    """
    timedeltas = pd.timedelta_range(0, freq="D", periods=3, unit=time_unit)
    variable = Variable(["time"], timedeltas)
    encoded = conventions.encode_cf_variable(variable)
    decoded = conventions.decode_cf_variable("timedeltas", encoded)
    assert_identical(decoded, variable)
    # assert_identical checks values; also pin the exact dtype/resolution.
    assert decoded.dtype == variable.dtype


def test_literal_timedelta_coding_resolution_error() -> None:
    """Decoding rejects timedelta64 resolutions pandas does not support."""
    # 'D' is outside the supported set {'s', 'ms', 'us', 'ns'}.
    attrs = {"dtype": "timedelta64[D]"}
    encoded = Variable(["time"], [0, 1, 2], attrs=attrs)
    with pytest.raises(ValueError, match="xarray only supports"):
        conventions.decode_cf_variable("timedeltas", encoded)
1 change: 0 additions & 1 deletion xarray/tests/test_conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ def test_incompatible_attributes(self) -> None:
Variable(
["t"], pd.date_range("2000-01-01", periods=3), {"units": "foobar"}
),
Variable(["t"], pd.to_timedelta(["1 day"]), {"units": "foobar"}), # type: ignore[arg-type, unused-ignore]
Variable(["t"], [0, 1, 2], {"add_offset": 0}, {"add_offset": 2}),
Variable(["t"], [0, 1, 2], {"_FillValue": 0}, {"_FillValue": 2}),
]
Expand Down
Loading