small edits

pydata · Nov 12, 2024 · 77dc5e0 · 77dc5e0
1 parent 96ae241
commit 77dc5e0
Show file tree

Hide file tree

Showing 3 changed files with 190 additions and 88 deletions.
diff --git a/properties/test_properties.py b/properties/test_properties.py
@@ -1,3 +1,5 @@
+import itertools
+
 import pytest
 
 pytest.importorskip("hypothesis")
@@ -37,12 +39,10 @@ def test_property_season_month_tuple(roll, breaks):
     if breaks[-1] != 12:
         breaks = breaks + [12]
     seasons = tuple(
-        "".join(rolled_chars[start:stop])
-        for start, stop in zip(breaks[:-1], breaks[1:], strict=False)
+        "".join(rolled_chars[start:stop]) for start, stop in itertools.pairwise(breaks)
     )
     actual = season_to_month_tuple(seasons)
     expected = tuple(
-        rolled_months[start:stop]
-        for start, stop in zip(breaks[:-1], breaks[1:], strict=False)
+        rolled_months[start:stop] for start, stop in itertools.pairwise(breaks)
     )
     assert expected == actual
diff --git a/xarray/groupers.py b/xarray/groupers.py
@@ -14,8 +14,7 @@
 from collections import defaultdict
 from collections.abc import Mapping, Sequence
 from dataclasses import dataclass, field
-from itertools import pairwise
-from itertools import chain
+from itertools import chain, pairwise
 from typing import TYPE_CHECKING, Any, Literal, cast
 
 import numpy as np
@@ -25,16 +24,12 @@
 from xarray.coding.cftime_offsets import BaseCFTimeOffset, _new_to_legacy_freq
 from xarray.coding.cftimeindex import CFTimeIndex
 from xarray.core import duck_array_ops
-from xarray.core.computation import apply_ufunc
-from xarray.core.coordinates import Coordinates, _coordinates_from_variable
-from xarray.core.coordinates import Coordinates
-from xarray.core.common import _contains_datetime_like_objects
-from xarray.core.common import _contains_datetime_like_objects
 from xarray.core.common import (
     _contains_cftime_datetimes,
     _contains_datetime_like_objects,
 )
-from xarray.core.coordinates import Coordinates
+from xarray.core.computation import apply_ufunc
+from xarray.core.coordinates import Coordinates, _coordinates_from_variable
 from xarray.core.dataarray import DataArray
 from xarray.core.duck_array_ops import isnull
 from xarray.core.formatting import first_n_items
@@ -751,14 +746,16 @@ def __post_init__(self):
             )
         self.season_tuples = dict(zip(self.seasons, self.season_inds, strict=True))
 
-    def factorize(self, group):
+    def factorize(self, group: T_Group) -> EncodedGroups:
         if group.ndim != 1:
             raise ValueError(
                 "SeasonResampler can only be used to resample by 1D arrays."
             )
-        if not _contains_datetime_like_objects(group.variable):
+        if not isinstance(group, DataArray) or not _contains_datetime_like_objects(
+            group.variable
+        ):
             raise ValueError(
-                "SeasonResampler can only be used to group by datetime-like arrays."
+                "SeasonResampler can only be used to group by datetime-like DataArrays."
             )
 
         seasons = self.seasons
@@ -775,13 +772,14 @@ def factorize(self, group):
             season_label[month.isin(season_ind)] = season_str
             if "DJ" in season_str:
                 after_dec = season_ind[season_str.index("D") + 1 :]
-                # important this is assuming non-overlapping seasons
+                # important: this is assuming non-overlapping seasons
                 year[month.isin(after_dec)] -= 1
 
         # Allow users to skip one or more months?
-        # present_seasons is a mask that is True for months that are requestsed in the output
+        # present_seasons is a mask that is True for months that are requested in the output
         present_seasons = season_label != ""
         if present_seasons.all():
+            # avoid copies if we can.
             present_seasons = slice(None)
         frame = pd.DataFrame(
             data={
@@ -794,10 +792,13 @@ def factorize(self, group):
             ),
         )
 
-        series = frame["index"]
-        g = series.groupby(["year", "season"], sort=False)
-        first_items = g.first()
-        counts = g.count()
+        agged = (
+            frame["index"]
+            .groupby(["year", "season"], sort=False)
+            .agg(["first", "count"])
+        )
+        first_items = agged["first"]
+        counts = agged["count"]
 
         if _contains_cftime_datetimes(group.data):
             index_class = CFTimeIndex
@@ -814,32 +815,18 @@ def factorize(self, group):
             ]
         )
 
-        sbins = first_items.values.astype(int)
-        group_indices = [
-            slice(i, j) for i, j in zip(sbins[:-1], sbins[1:], strict=True)
-        ]
-        group_indices += [slice(sbins[-1], None)]
-
-        # Make sure the first and last timestamps
-        # are for the correct months,if not we have incomplete seasons
-        unique_codes = np.arange(len(unique_coord))
-        if self.drop_incomplete:
-            for idx, slicer in zip([0, -1], (slice(1, None), slice(-1)), strict=True):
-                stamp_year, stamp_season = frame.index[idx]
-                code = seasons.index(stamp_season)
-                stamp_month = season_inds[code][idx]
-                if stamp_month != month[present_seasons][idx].item():
-                    # we have an incomplete season!
-                    group_indices = group_indices[slicer]
-                    unique_coord = unique_coord[slicer]
-                    if idx == 0:
-                        unique_codes -= 1
-                    unique_codes[idx] = -1
-
-        # all years and seasons
+        # sbins = first_items.values.astype(int)
+        # group_indices = [
+        #     slice(i, j) for i, j in zip(sbins[:-1], sbins[1:], strict=True)
+        # ]
+        # group_indices += [slice(sbins[-1], None)]
+
+        # This sorted call is a hack. It's hard to figure out how
+        # to start the iteration for an arbitrary season ordering
+        # (for example, "DJF" as the first entry or the last entry),
+        # so we construct the largest possible index and slice it to the
+        # range present in the data.
         complete_index = index_class(
-            # This sorted call is a hack. It's hard to figure out how
-            # to start the iteration
             sorted(
                 [
                     datetime_class(year=y, month=m, day=1)
@@ -850,22 +837,56 @@ def factorize(self, group):
                 ]
             )
         )
-        # only keep that included in data
-        range_ = complete_index.get_indexer(unique_coord[[0, -1]])
-        full_index = complete_index[slice(range_[0], range_[-1] + 1)]
+
+        # all years and seasons
+        def get_label(year, season):
+            month = season_tuples[season][0]
+            return f"{year}-{month}-01"
+
+        unique_codes = np.arange(len(unique_coord))
+        first_valid_season = season_label[0]
+        last_valid_season = season_label[-1]
+        first_year, last_year = year.data[[0, -1]]
+        if self.drop_incomplete:
+            if month.data[0] != season_tuples[first_valid_season][0]:
+                if "DJ" in first_valid_season:
+                    first_year += 1
+                first_valid_season = seasons[
+                    (seasons.index(first_valid_season) + 1) % len(seasons)
+                ]
+                # group_indices = group_indices[slice(1, None)]
+                unique_codes -= 1
+
+            if month.data[-1] != season_tuples[last_valid_season][-1]:
+                last_valid_season = seasons[seasons.index(last_valid_season) - 1]
+                if "DJ" in last_valid_season:
+                    last_year -= 1
+                # group_indices = group_indices[slice(-1)]
+                unique_codes[-1] = -1
+
+        first_label = get_label(first_year, first_valid_season)
+        last_label = get_label(last_year, last_valid_season)
+
+        slicer = complete_index.slice_indexer(first_label, last_label)
+        full_index = complete_index[slicer]
+        # TODO: group must be sorted
+        # codes = np.searchsorted(edges, group.data, side="left")
+        # codes -= 1
+        # codes[~present_seasons | group.data >= edges[-1]] = -1
+        # codes[isnull(group.data)] = -1
+        # import ipdb; ipdb.set_trace()
         # check that there are no "missing" seasons in the middle
-        # print(full_index, unique_coord)
-        if not full_index.equals(unique_coord):
-            raise ValueError("Are there seasons missing in the middle of the dataset?")
+        # if not full_index.equals(unique_coord):
+        # raise ValueError("Are there seasons missing in the middle of the dataset?")
 
         final_codes = np.full(group.data.size, -1)
         final_codes[present_seasons] = np.repeat(unique_codes, counts)
         codes = group.copy(data=final_codes, deep=False)
-        unique_coord_var = Variable(group.name, unique_coord, group.attrs)
+        # unique_coord_var = Variable(group.name, unique_coord, group.attrs)
 
         return EncodedGroups(
             codes=codes,
-            group_indices=group_indices,
-            unique_coord=unique_coord_var,
+            # group_indices=group_indices,
+            # unique_coord=unique_coord_var,
             full_index=full_index,
         )
diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
@@ -12,14 +12,15 @@
 from packaging.version import Version
 
 import xarray as xr
-from xarray import DataArray, Dataset, Variable, cftime_range
+from xarray import DataArray, Dataset, Variable, cftime_range, date_range
 from xarray.core.alignment import broadcast
 from xarray.core.groupby import _consolidate_slices
 from xarray.core.types import InterpOptions, ResampleCompatible
 from xarray.groupers import (
     BinGrouper,
     EncodedGroups,
     Grouper,
+    SeasonGrouper,
     SeasonResampler,
     TimeResampler,
     UniqueGrouper,
@@ -44,6 +45,7 @@
     requires_pandas_ge_2_2,
     requires_scipy,
 )
+from xarray.tests.test_coding_times import _ALL_CALENDARS
 
 
 @pytest.fixture
@@ -3144,48 +3146,127 @@ def test_groupby_dask_eager_load_warnings():
     ds.groupby_bins("x", bins=[1, 2, 3], eagerly_compute_group=False)
 
 
-# TODO: Possible property tests to add to this module
-# 1. lambda x: x
-# 2. grouped-reduce on unique coords is identical to array
-# 3. group_over == groupby-reduce along other dimensions
-# 4. result is equivalent for transposed input
-def test_season_to_month_tuple():
-    assert season_to_month_tuple(["JF", "MAM", "JJAS", "OND"]) == (
-        (1, 2),
-        (3, 4, 5),
-        (6, 7, 8, 9),
-        (10, 11, 12),
-    )
-    assert season_to_month_tuple(["DJFM", "AM", "JJAS", "ON"]) == (
-        (12, 1, 2, 3),
-        (4, 5),
-        (6, 7, 8, 9),
-        (10, 11),
+class TestSeasonGrouperAndResampler:
+    def test_season_to_month_tuple(self):
+        assert season_to_month_tuple(["JF", "MAM", "JJAS", "OND"]) == (
+            (1, 2),
+            (3, 4, 5),
+            (6, 7, 8, 9),
+            (10, 11, 12),
+        )
+        assert season_to_month_tuple(["DJFM", "AM", "JJAS", "ON"]) == (
+            (12, 1, 2, 3),
+            (4, 5),
+            (6, 7, 8, 9),
+            (10, 11),
+        )
+
+    @pytest.mark.parametrize("calendar", _ALL_CALENDARS)
+    def test_season_grouper_simple(self, calendar) -> None:
+        time = cftime_range("2001-01-01", "2002-12-30", freq="D", calendar=calendar)
+        da = DataArray(np.ones(time.size), dims="time", coords={"time": time})
+        expected = da.groupby("time.season").mean()
+        # note season order matches expected
+        actual = da.groupby(
+            time=SeasonGrouper(
+                ["DJF", "JJA", "MAM", "SON"],  # drop_incomplete=False
+            )
+        ).mean()
+        assert_identical(expected, actual)
+
+    # TODO: drop_incomplete
+    @requires_cftime
+    @pytest.mark.parametrize("drop_incomplete", [True, False])
+    @pytest.mark.parametrize(
+        "seasons",
+        [
+            pytest.param(["DJF", "MAM", "JJA", "SON"], id="standard"),
+            pytest.param(["MAM", "JJA", "SON", "DJF"], id="standard-diff-order"),
+            pytest.param(["JFM", "AMJ", "JAS", "OND"], id="december-same-year"),
+            pytest.param(["DJF", "MAM", "JJA", "ON"], id="skip-september"),
+            pytest.param(["JJAS"], id="jjas-only"),
+            pytest.param(["MAM", "JJA", "SON", "DJF"], id="different-order"),
+            pytest.param(["JJA", "MAM", "SON", "DJF"], id="out-of-order"),
+        ],
     )
+    def test_season_resampler(self, seasons: list[str], drop_incomplete: bool) -> None:
+        calendar = "standard"
+        time = date_range("2001-01-01", "2002-12-30", freq="D", calendar=calendar)
+        da = DataArray(np.ones(time.size), dims="time", coords={"time": time})
+        counts = da.resample(time="ME").count()
+
+        seasons_as_ints = season_to_month_tuple(seasons)
+        month = counts.time.dt.month.data
+        year = counts.time.dt.year.data
+        for season, as_ints in zip(seasons, seasons_as_ints, strict=True):
+            if "DJ" in season:
+                for imonth in as_ints[season.index("D") + 1 :]:
+                    year[month == imonth] -= 1
+        counts["time"] = (
+            "time",
+            [pd.Timestamp(f"{y}-{m}-01") for y, m in zip(year, month, strict=True)],
+        )
+        counts = counts.convert_calendar(calendar, "time", align_on="date")
+
+        expected_vals = []
+        expected_time = []
+        for year in [2001, 2002]:
+            for season, as_ints in zip(seasons, seasons_as_ints, strict=True):
+                out_year = year
+                if "DJ" in season:
+                    out_year = year - 1
+                available = [
+                    counts.sel(time=f"{out_year}-{month:02d}").data for month in as_ints
+                ]
+                if any(len(a) == 0 for a in available) and drop_incomplete:
+                    continue
+                output_label = pd.Timestamp(f"{out_year}-{as_ints[0]:02d}-01")
+                expected_time.append(output_label)
+                # use concatenate to handle empty arrays when December data is missing
+                expected_vals.append(np.concatenate(available).sum())
 
+        expected = xr.DataArray(
+            expected_vals, dims="time", coords={"time": expected_time}
+        ).convert_calendar(calendar, align_on="date")
+        rs = SeasonResampler(seasons, drop_incomplete=drop_incomplete)
+        # through resample
+        actual = da.resample(time=rs).sum()
+        assert_identical(actual, expected)
 
-def test_season_resampler():
-    time = cftime_range("2001-01-01", "2002-12-30", freq="D", calendar="360_day")
-    da = DataArray(np.ones(time.size), dims="time", coords={"time": time})
+    def test_season_resampler_errors(self):
+        time = cftime_range("2001-01-01", "2002-12-30", freq="D", calendar="360_day")
+        da = DataArray(np.ones(time.size), dims="time", coords={"time": time})
 
-    # through resample
-    da.resample(time=SeasonResampler(["DJF", "MAM", "JJA", "SON"])).sum()
+        # non-datetime array
+        with pytest.raises(ValueError):
+            DataArray(np.ones(5), dims="time").groupby(time=SeasonResampler(["DJF"]))
 
-    # through groupby
-    da.groupby(time=SeasonResampler(["DJF", "MAM", "JJA", "SON"])).sum()
+        # ndim > 1 array
+        with pytest.raises(ValueError):
+            DataArray(
+                np.ones((5, 5)), dims=("t", "x"), coords={"x": np.arange(5)}
+            ).groupby(x=SeasonResampler(["DJF"]))
 
-    # skip september
-    da.groupby(time=SeasonResampler(["DJF", "MAM", "JJA", "ON"])).sum()
+        # overlapping seasons
+        with pytest.raises(ValueError):
+            da.groupby(time=SeasonResampler(["DJFM", "MAMJ", "JJAS", "SOND"])).sum()
 
-    # "subsampling"
-    da.groupby(time=SeasonResampler(["JJAS"])).sum()
+    @requires_cftime
+    def test_season_resampler_groupby_identical(self):
+        time = date_range("2001-01-01", "2002-12-30", freq="D")
+        da = DataArray(np.ones(time.size), dims="time", coords={"time": time})
 
-    # overlapping
-    with pytest.raises(ValueError):
-        da.groupby(time=SeasonResampler(["DJFM", "MAMJ", "JJAS", "SOND"])).sum()
+        # through resample
+        resampler = SeasonResampler(["DJF", "MAM", "JJA", "SON"])
+        rs = da.resample(time=resampler).sum()
 
+        # through groupby
+        gb = da.groupby(time=resampler).sum()
+        assert_identical(rs, gb)
 
-# Possible property tests
+
+# TODO: Possible property tests to add to this module
 # 1. lambda x: x
 # 2. grouped-reduce on unique coords is identical to array
 # 3. group_over == groupby-reduce along other dimensions
+# 4. result is equivalent for transposed input