Commit b9a34ca (forked from pydata/xarray)

Merge branch 'main' into custom-groupers

* main:
  Refactoring/fixing zarr-python v3 incompatibilities in xarray datatrees (pydata#10020)
  Refactor calendar fixtures (pydata#10150)
  Use flox for grouped first, last. (pydata#10148)
  Update flaky pydap test (pydata#10149)

2 parents: b068e94 + f0809e4

10 files changed, +408 −173 lines

doc/whats-new.rst (+10 −2)

@@ -41,6 +41,11 @@ New Features
 - Support reading to `GPU memory with Zarr <https://zarr.readthedocs.io/en/stable/user-guide/gpu.html>`_ (:pull:`10078`).
   By `Deepak Cherian <https://github.com/dcherian>`_.

+Performance
+~~~~~~~~~~~
+- :py:meth:`DatasetGroupBy.first` and :py:meth:`DatasetGroupBy.last` can now use ``flox`` if available. (:issue:`9647`)
+  By `Deepak Cherian <https://github.com/dcherian>`_.
+
 Breaking changes
 ~~~~~~~~~~~~~~~~
 - Rolled back code that would attempt to catch integer overflow when encoding

@@ -61,6 +66,10 @@ Deprecations

 Bug fixes
 ~~~~~~~~~
+
+- Fix ``open_datatree`` incompatibilities with Zarr-Python V3 and refactor
+  ``TestZarrDatatreeIO`` accordingly (:issue:`9960`, :pull:`10020`).
+  By `Alfonso Ladino-Rincon <https://github.com/aladinor>`_.
 - Default to resolution-dependent optimal integer encoding units when saving
   chunked non-nanosecond :py:class:`numpy.datetime64` or
   :py:class:`numpy.timedelta64` arrays to disk. Previously units of

@@ -92,6 +101,7 @@ Bug fixes
   datetimes and timedeltas (:issue:`8957`, :pull:`10050`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

+
 Documentation
 ~~~~~~~~~~~~~
 - Better expose the :py:class:`Coordinates` class in API reference (:pull:`10000`)

@@ -174,8 +184,6 @@ New Features
   :py:class:`pandas.DatetimeIndex` (:pull:`9965`). By `Spencer Clark
   <https://github.com/spencerkclark>`_ and `Kai Mühlbauer
   <https://github.com/kmuehlbauer>`_.
-- :py:meth:`DatasetGroupBy.first` and :py:meth:`DatasetGroupBy.last` can now use ``flox`` if available. (:issue:`9647`)
-  By `Deepak Cherian <https://github.com/dcherian>`_.

 Breaking changes
 ~~~~~~~~~~~~~~~~

xarray/backends/zarr.py (+56 −36)

@@ -666,10 +666,21 @@ def open_store(
             use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask,
             zarr_format=zarr_format,
         )
+
+        from zarr import Group
+
+        group_members: dict[str, Group] = {}
         group_paths = list(_iter_zarr_groups(zarr_group, parent=group))
-        return {
+        for path in group_paths:
+            if path == group:
+                group_members[path] = zarr_group
+            else:
+                rel_path = path.removeprefix(f"{group}/")
+                group_members[path] = zarr_group[rel_path.removeprefix("/")]
+
+        out = {
             group: cls(
-                zarr_group.get(group),
+                group_store,
                 mode,
                 consolidate_on_close,
                 append_dim,

@@ -680,8 +691,9 @@ def open_store(
                 use_zarr_fill_value_as_mask,
                 cache_members=cache_members,
             )
-            for group in group_paths
+            for group, group_store in group_members.items()
         }
+        return out

     @classmethod
     def open_group(
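The refactor above opens the root group once and resolves every child by a path relative to the requested group, instead of calling zarr_group.get(group) per path. A minimal, hedged sketch of that resolution (the nested paths and the _iter_zarr_groups output are stand-ins for illustration, not the xarray code itself):

    import zarr

    root = zarr.group()                  # in-memory store
    root.create_group("a").create_group("b")

    requested = "/"                      # the group the caller asked for
    paths = ["/", "/a", "/a/b"]          # stand-in for _iter_zarr_groups output

    members = {}
    for path in paths:
        if path == requested:
            members[path] = root
        else:
            rel = path.removeprefix(f"{requested}/").removeprefix("/")
            members[path] = root[rel]    # index the already-open root group

    print({p: type(g).__name__ for p, g in members.items()})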
@@ -1034,8 +1046,6 @@ def store(
         if self._consolidate_on_close:
             kwargs = {}
             if _zarr_v3():
-                # https://github.com/zarr-developers/zarr-python/pull/2113#issuecomment-2386718323
-                kwargs["path"] = self.zarr_group.name.lstrip("/")
                 kwargs["zarr_format"] = self.zarr_group.metadata.zarr_format
             zarr.consolidate_metadata(self.zarr_group.store, **kwargs)
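With the per-group "path" keyword dropped, metadata is now consolidated at the store root under zarr-python v3. A hedged sketch of the resulting call (requires zarr-python v3; the group g is hypothetical, standing in for self.zarr_group):

    import zarr

    g = zarr.group()  # hypothetical stand-in for self.zarr_group
    g.create_array("x", shape=(3,), dtype="int32")
    # v3 code path above: pass only the zarr format, no "path" keyword
    zarr.consolidate_metadata(g.store, zarr_format=g.metadata.zarr_format)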

@@ -1662,8 +1672,6 @@ def open_groups_as_dict(
         zarr_version=None,
         zarr_format=None,
     ) -> dict[str, Dataset]:
-        from xarray.core.treenode import NodePath
-
         filename_or_obj = _normalize_path(filename_or_obj)

         # Check for a group and make it a parent if it exists

@@ -1686,7 +1694,6 @@ def open_groups_as_dict(
         )

         groups_dict = {}
-
         for path_group, store in stores.items():
             store_entrypoint = StoreBackendEntrypoint()
@@ -1762,44 +1769,57 @@ def _get_open_params(
         consolidated = False

     if _zarr_v3():
-        missing_exc = ValueError
+        # TODO: replace AssertionError after https://github.com/zarr-developers/zarr-python/issues/2821 is resolved
+        missing_exc = AssertionError
     else:
         missing_exc = zarr.errors.GroupNotFoundError

-    if consolidated is None:
-        try:
-            zarr_group = zarr.open_consolidated(store, **open_kwargs)
-        except (ValueError, KeyError):
-            # ValueError in zarr-python 3.x, KeyError in 2.x.
+    if consolidated in [None, True]:
+        # open the root of the store, in case there is metadata consolidated there
+        group = open_kwargs.pop("path")
+
+        if consolidated:
+            # TODO: an option to pass the metadata_key keyword
+            zarr_root_group = zarr.open_consolidated(store, **open_kwargs)
+        elif consolidated is None:
+            # same but with more error handling in case no consolidated metadata found
             try:
-                zarr_group = zarr.open_group(store, **open_kwargs)
-                emit_user_level_warning(
-                    "Failed to open Zarr store with consolidated metadata, "
-                    "but successfully read with non-consolidated metadata. "
-                    "This is typically much slower for opening a dataset. "
-                    "To silence this warning, consider:\n"
-                    "1. Consolidating metadata in this existing store with "
-                    "zarr.consolidate_metadata().\n"
-                    "2. Explicitly setting consolidated=False, to avoid trying "
-                    "to read consolidate metadata, or\n"
-                    "3. Explicitly setting consolidated=True, to raise an "
-                    "error in this case instead of falling back to try "
-                    "reading non-consolidated metadata.",
-                    RuntimeWarning,
-                )
-            except missing_exc as err:
-                raise FileNotFoundError(
-                    f"No such file or directory: '{store}'"
-                ) from err
-    elif consolidated:
-        # TODO: an option to pass the metadata_key keyword
-        zarr_group = zarr.open_consolidated(store, **open_kwargs)
+                zarr_root_group = zarr.open_consolidated(store, **open_kwargs)
+            except (ValueError, KeyError):
+                # ValueError in zarr-python 3.x, KeyError in 2.x.
+                try:
+                    zarr_root_group = zarr.open_group(store, **open_kwargs)
+                    emit_user_level_warning(
+                        "Failed to open Zarr store with consolidated metadata, "
+                        "but successfully read with non-consolidated metadata. "
+                        "This is typically much slower for opening a dataset. "
+                        "To silence this warning, consider:\n"
+                        "1. Consolidating metadata in this existing store with "
+                        "zarr.consolidate_metadata().\n"
+                        "2. Explicitly setting consolidated=False, to avoid trying "
+                        "to read consolidate metadata, or\n"
+                        "3. Explicitly setting consolidated=True, to raise an "
+                        "error in this case instead of falling back to try "
+                        "reading non-consolidated metadata.",
+                        RuntimeWarning,
+                    )
+                except missing_exc as err:
+                    raise FileNotFoundError(
+                        f"No such file or directory: '{store}'"
+                    ) from err
+
+        # but the user should still receive a DataTree whose root is the group they asked for
+        if group and group != "/":
+            zarr_group = zarr_root_group[group.removeprefix("/")]
+        else:
+            zarr_group = zarr_root_group
     else:
         if _zarr_v3():
            # we have determined that we don't want to use consolidated metadata
            # so we set that to False to avoid trying to read it
            open_kwargs["use_consolidated"] = False
        zarr_group = zarr.open_group(store, **open_kwargs)
+
    close_store_on_close = zarr_group.store is not store

    # we use this to determine how to handle fill_value
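Net effect: consolidated metadata is only ever read from the store root, and the group the user asked for is resolved afterwards by indexing into the root group. A standalone, hedged illustration of that order (paths hypothetical):

    import zarr

    root = zarr.group()                  # in-memory store
    root.create_group("child")
    zarr.consolidate_metadata(root.store)

    requested = "/child"
    zarr_root_group = zarr.open_consolidated(root.store)
    zarr_group = (
        zarr_root_group[requested.removeprefix("/")]
        if requested and requested != "/"
        else zarr_root_group
    )
    print(zarr_group)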

xarray/core/datatree.py (+1 −2)

@@ -16,7 +16,6 @@
     TYPE_CHECKING,
     Any,
     Concatenate,
-    Literal,
     NoReturn,
     ParamSpec,
     TypeVar,

@@ -1741,7 +1740,7 @@ def to_zarr(
         consolidated: bool = True,
         group: str | None = None,
         write_inherited_coords: bool = False,
-        compute: Literal[True] = True,
+        compute: bool = True,
         **kwargs,
    ):
        """

xarray/core/groupby.py (+66 −11)

@@ -998,7 +998,7 @@ def _flox_reduce(
         dim: Dims,
         keep_attrs: bool | None = None,
         **kwargs: Any,
-    ):
+    ) -> T_Xarray:
         """Adaptor function that translates our groupby API to that of flox."""
         import flox
         from flox.xarray import xarray_reduce

@@ -1121,6 +1121,8 @@ def _flox_reduce(
                     # flox always assigns an index so we must drop it here if we don't need it.
                     to_drop.append(grouper.name)
                     continue
+                # TODO: We can't simply use `self.encoded.coords` here because it corresponds to `unique_coord`,
+                # NOT `full_index`. We would need to construct a new Coordinates object, that corresponds to `full_index`.
                 new_coords.append(
                     # Using IndexVariable here ensures we reconstruct PandasMultiIndex with
                     # all associated levels properly.
@@ -1363,7 +1365,12 @@ def where(self, cond, other=dtypes.NA) -> T_Xarray:
         """
         return ops.where_method(self, cond, other)

-    def _first_or_last(self, op, skipna, keep_attrs):
+    def _first_or_last(
+        self,
+        op: Literal["first", "last"],
+        skipna: bool | None,
+        keep_attrs: bool | None,
+    ):
         if all(
             isinstance(maybe_slice, slice)
             and (maybe_slice.stop == maybe_slice.start + 1)

@@ -1374,17 +1381,65 @@ def _first_or_last(self, op, skipna, keep_attrs):
             return self._obj
         if keep_attrs is None:
             keep_attrs = _get_keep_attrs(default=True)
-        return self.reduce(
-            op, dim=[self._group_dim], skipna=skipna, keep_attrs=keep_attrs
-        )
+        if (
+            module_available("flox", minversion="0.10.0")
+            and OPTIONS["use_flox"]
+            and contains_only_chunked_or_numpy(self._obj)
+        ):
+            result = self._flox_reduce(
+                dim=None, func=op, skipna=skipna, keep_attrs=keep_attrs
+            )
+        else:
+            result = self.reduce(
+                getattr(duck_array_ops, op),
+                dim=[self._group_dim],
+                skipna=skipna,
+                keep_attrs=keep_attrs,
+            )
+        return result

-    def first(self, skipna: bool | None = None, keep_attrs: bool | None = None):
-        """Return the first element of each group along the group dimension"""
-        return self._first_or_last(duck_array_ops.first, skipna, keep_attrs)
+    def first(
+        self, skipna: bool | None = None, keep_attrs: bool | None = None
+    ) -> T_Xarray:
+        """
+        Return the first element of each group along the group dimension
+
+        Parameters
+        ----------
+        skipna : bool or None, optional
+            If True, skip missing values (as marked by NaN). By default, only
+            skips missing values for float dtypes; other dtypes either do not
+            have a sentinel missing value (int) or ``skipna=True`` has not been
+            implemented (object, datetime64 or timedelta64).
+        keep_attrs : bool or None, optional
+            If True, ``attrs`` will be copied from the original
+            object to the new one. If False, the new object will be
+            returned without attributes.
+
+        """
+        return self._first_or_last("first", skipna, keep_attrs)

-    def last(self, skipna: bool | None = None, keep_attrs: bool | None = None):
-        """Return the last element of each group along the group dimension"""
-        return self._first_or_last(duck_array_ops.last, skipna, keep_attrs)
+    def last(
+        self, skipna: bool | None = None, keep_attrs: bool | None = None
+    ) -> T_Xarray:
+        """
+        Return the last element of each group along the group dimension
+
+        Parameters
+        ----------
+        skipna : bool or None, optional
+            If True, skip missing values (as marked by NaN). By default, only
+            skips missing values for float dtypes; other dtypes either do not
+            have a sentinel missing value (int) or ``skipna=True`` has not been
+            implemented (object, datetime64 or timedelta64).
+        keep_attrs : bool or None, optional
+            If True, ``attrs`` will be copied from the original
+            object to the new one. If False, the new object will be
+            returned without attributes.
+
+        """
+        return self._first_or_last("last", skipna, keep_attrs)

     def assign_coords(self, coords=None, **coords_kwargs):
         """Assign coordinates by group.

xarray/core/resample.py (+21 −3)

@@ -2,7 +2,7 @@

 import warnings
 from collections.abc import Callable, Hashable, Iterable, Sequence
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Literal

 from xarray.core._aggregations import (
     DataArrayResampleAggregations,

@@ -55,8 +55,11 @@ def _flox_reduce(
         keep_attrs: bool | None = None,
         **kwargs,
     ) -> T_Xarray:
-        result = super()._flox_reduce(dim=dim, keep_attrs=keep_attrs, **kwargs)
-        result = result.rename({RESAMPLE_DIM: self._group_dim})
+        result: T_Xarray = (
+            super()
+            ._flox_reduce(dim=dim, keep_attrs=keep_attrs, **kwargs)
+            .rename({RESAMPLE_DIM: self._group_dim})  # type: ignore[assignment]
+        )
         return result

     def shuffle_to_chunks(self, chunks: T_Chunks = None):

@@ -103,6 +106,21 @@ def shuffle_to_chunks(self, chunks: T_Chunks = None):
         (grouper,) = self.groupers
         return self._shuffle_obj(chunks).drop_vars(RESAMPLE_DIM)

+    def _first_or_last(
+        self, op: Literal["first", "last"], skipna: bool | None, keep_attrs: bool | None
+    ) -> T_Xarray:
+        from xarray.core.dataset import Dataset
+
+        result = super()._first_or_last(op=op, skipna=skipna, keep_attrs=keep_attrs)
+        if isinstance(result, Dataset):
+            # Can't do this in the base class because group_dim is RESAMPLE_DIM
+            # which is not present in the original object
+            for var in result.data_vars:
+                result._variables[var] = result._variables[var].transpose(
+                    *self._obj._variables[var].dims
+                )
+        return result
+
     def _drop_coords(self) -> T_Xarray:
         """Drop non-dimension coordinates along the resampled dimension."""
         obj = self._obj
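A small, hedged check of the dimension-order restoration above, on synthetic data: after resampling, each variable should keep its original dimension order rather than come back with the resample dimension moved first.

    import numpy as np
    import pandas as pd
    import xarray as xr

    ds = xr.Dataset(
        {"a": (("y", "time"), np.arange(8.0).reshape(2, 4))},
        coords={"time": pd.date_range("2000-01-01", periods=4, freq="12h")},
    )
    out = ds.resample(time="1D").first()
    print(out["a"].dims)  # expected ("y", "time"), matching the input order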

xarray/groupers.py (+5 −1)

@@ -254,7 +254,11 @@ def _factorize_unique(self) -> EncodedGroups:
         unique_coord = Variable(
             dims=codes.name, data=unique_values, attrs=self.group.attrs
         )
-        full_index = pd.Index(unique_values)
+        full_index = (
+            unique_values
+            if isinstance(unique_values, pd.MultiIndex)
+            else pd.Index(unique_values)
+        )

         return EncodedGroups(
             codes=codes,
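The guard keeps an existing pd.MultiIndex intact instead of re-wrapping it. The same pattern as a standalone sketch (pure pandas, mirroring the change above):

    import pandas as pd

    def as_full_index(unique_values):
        # Pass a MultiIndex through untouched; coerce anything else
        # (arrays, lists) to a flat pd.Index.
        if isinstance(unique_values, pd.MultiIndex):
            return unique_values
        return pd.Index(unique_values)

    mi = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["letter", "num"])
    assert as_full_index(mi) is mi        # levels and names preserved
    print(as_full_index(["x", "y"]))      # plain values become a pd.Index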
