
Commit e2aa2be

Single layer for cohorts. (#415)
Does so by vendoring in the tree reduction code and modifying it to work purely at the Graph level, with no array creation. For kicks, I fused in the concatenation step. Benchmarks are looking great! I don't understand the `ERA5MonthHourRechunked` regression, but it's quite minor.

```
| Before [df81] | After [761367d] | Ratio | Benchmark (Parameter)                                     |
|---------------|-----------------|-------|-----------------------------------------------------------|
| 3672          | 4266            | 1.16  | cohorts.ERA5MonthHourRechunked.track_num_tasks_optimized  |
| 3834          | 3469            | 0.9   | cohorts.ERA5DayOfYear.track_num_tasks                     |
| 999±20ms      | 822±60ms        | 0.82  | cohorts.ERA5Resampling.time_graph_construct               |
| 11            | 6               | 0.55  | cohorts.ERA5MonthHourRechunked.track_num_layers           |
| 10            | 5               | 0.5   | cohorts.ERA5MonthHour.track_num_layers                    |
| 17            | 5               | 0.29  | cohorts.PerfectMonthly.track_num_layers                   |
| 128           | 5               | 0.04  | cohorts.ERA5Google.track_num_layers                       |
| 266           | 6               | 0.02  | cohorts.NWMMidwest.track_num_layers                       |
| 735           | 5               | 0.01  | cohorts.ERA5DayOfYear.track_num_layers                    |
| 490           | 6               | 0.01  | cohorts.OISST.track_num_layers                            |
| 7305          | 5               | 0     | cohorts.ERA5Resampling.track_num_layers                   |
```
1 parent cb5d203 commit e2aa2be
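A minimal sketch of the "single layer" idea described in the commit message, using hypothetical names and toy tasks rather than the commit's actual helpers: instead of building one dask array per cohort and concatenating them at the end, every cohort writes its final reduction key straight into one shared graph at its own block index along the reduced axis, so the trailing concatenate layer disappears.

```python
# Hypothetical illustration only (toy names and toy tasks, not flox's helpers):
# each cohort writes its final reduction key straight into one shared graph,
# keyed by its position along the reduced axis, so no per-cohort Array objects
# and no trailing `concatenate` layer are needed.
out_name = "cohorts-reduce-sketch"
dsk = {}  # the single shared layer

chunks_cohorts = {(0, 2): ["jan", "feb"], (1, 3): ["mar", "apr"]}  # blocks -> groups
for icohort, (blocks, cohort) in enumerate(chunks_cohorts.items()):
    # `sum` stands in for the vendored tree reduction over this cohort's blocks
    dsk[(out_name, 0, icohort)] = (sum, list(blocks))

print(dsk)  # one layer, one final key per cohort, nothing left to concatenate
```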

File tree

9 files changed, +198 -45 lines changed

.github/workflows/benchmarks.yml

+3-1
```diff
@@ -26,9 +26,11 @@ jobs:
       with:
         environment-name: flox-bench
         create-args: >-
-          python=3.10
+          python=3.12
           asv
           mamba
+          libmambapy<2.0
+          conda-build
         init-shell: bash
         cache-environment: true
```

asv_bench/benchmarks/cohorts.py

+16
```diff
@@ -113,6 +113,22 @@ def rechunk(self):
         )


+class ERA5Resampling(Cohorts):
+    def setup(self, *args, **kwargs):
+        super().__init__()
+        # nyears is number of years, adjust to make bigger,
+        # full dataset is 60-ish years.
+        nyears = 5
+        shape = (37, 721, 1440, nyears * 365 * 24)
+        chunks = (-1, -1, -1, 1)
+        time = pd.date_range("2001-01-01", periods=shape[-1], freq="h")
+
+        self.array = dask.array.random.random(shape, chunks=chunks)
+        self.by = codes_for_resampling(time, "D")
+        self.axis = (-1,)
+        self.expected = np.unique(self.by)
+
+
 class ERA5DayOfYear(ERA5Dataset, Cohorts):
     def setup(self, *args, **kwargs):
         super().__init__()
```

ci/benchmark.yml

+1-2
```diff
@@ -6,10 +6,9 @@ dependencies:
   - build
   - cachey
   - dask-core
-  - numpy<2
+  - numpy<2.1
   - mamba
   - pip
-  - python=3.10
   - xarray
   - numpy_groupies>=0.9.19
   - numbagg>=0.3
```

flox/core.py

+31-29
```diff
@@ -44,6 +44,7 @@
     quantile_new_dims_func,
 )
 from .cache import memoize
+from .lib import ArrayLayer
 from .xrutils import (
     _contains_cftime_datetimes,
     _to_pytimedelta,
@@ -72,10 +73,7 @@
         from typing import Unpack
     except (ModuleNotFoundError, ImportError):
         Unpack: Any  # type: ignore[no-redef]
-
-    import cubed.Array as CubedArray
-    import dask.array.Array as DaskArray
-    from dask.typing import Graph
+    from .types import CubedArray, DaskArray, Graph

     T_DuckArray: TypeAlias = np.ndarray | DaskArray | CubedArray  # Any ?
     T_By: TypeAlias = T_DuckArray
@@ -1191,7 +1189,7 @@ def _aggregate(
     agg: Aggregation,
     expected_groups: pd.Index | None,
     axis: T_Axes,
-    keepdims,
+    keepdims: bool,
     fill_value: Any,
     reindex: bool,
 ) -> FinalResultsDict:
@@ -1511,7 +1509,7 @@ def subset_to_blocks(
     blkshape: tuple[int, ...] | None = None,
     reindexer=identity,
     chunks_as_array: tuple[np.ndarray, ...] | None = None,
-) -> DaskArray:
+) -> ArrayLayer:
     """
     Advanced indexing of .blocks such that we always get a regular array back.

@@ -1525,10 +1523,8 @@ def subset_to_blocks(
     -------
     dask.array
     """
-    import dask.array
     from dask.array.slicing import normalize_index
     from dask.base import tokenize
-    from dask.highlevelgraph import HighLevelGraph

     if blkshape is None:
         blkshape = array.blocks.shape
@@ -1538,9 +1534,6 @@ def subset_to_blocks(

     index = _normalize_indexes(array, flatblocks, blkshape)

-    if all(not isinstance(i, np.ndarray) and i == slice(None) for i in index):
-        return dask.array.map_blocks(reindexer, array, meta=array._meta)
-
     # These rest is copied from dask.array.core.py with slight modifications
     index = normalize_index(index, array.numblocks)
     index = tuple(slice(k, k + 1) if isinstance(k, Integral) else k for k in index)
@@ -1553,10 +1546,7 @@ def subset_to_blocks(

     keys = itertools.product(*(range(len(c)) for c in chunks))
     layer: Graph = {(name,) + key: (reindexer, tuple(new_keys[key].tolist())) for key in keys}
-
-    graph = HighLevelGraph.from_collections(name, layer, dependencies=[array])
-
-    return dask.array.Array(graph, name, chunks, meta=array)
+    return ArrayLayer(layer=layer, chunks=chunks, name=name)


 def _extract_unknown_groups(reduced, dtype) -> tuple[DaskArray]:
@@ -1613,6 +1603,9 @@ def dask_groupby_agg(
 ) -> tuple[DaskArray, tuple[np.ndarray | DaskArray]]:
     import dask.array
     from dask.array.core import slices_from_chunks
+    from dask.highlevelgraph import HighLevelGraph
+
+    from .dask_array_ops import _tree_reduce

     # I think _tree_reduce expects this
     assert isinstance(axis, Sequence)
@@ -1742,35 +1735,44 @@ def dask_groupby_agg(
         assert chunks_cohorts
         block_shape = array.blocks.shape[-len(axis) :]

-        reduced_ = []
+        out_name = f"{name}-reduce-{method}-{token}"
         groups_ = []
         chunks_as_array = tuple(np.array(c) for c in array.chunks)
-        for blks, cohort in chunks_cohorts.items():
+        dsk: Graph = {}
+        for icohort, (blks, cohort) in enumerate(chunks_cohorts.items()):
             cohort_index = pd.Index(cohort)
             reindexer = (
                 partial(reindex_intermediates, agg=agg, unique_groups=cohort_index)
                 if do_simple_combine
                 else identity
             )
-            reindexed = subset_to_blocks(intermediate, blks, block_shape, reindexer, chunks_as_array)
+            subset = subset_to_blocks(intermediate, blks, block_shape, reindexer, chunks_as_array)
+            dsk |= subset.layer  # type: ignore[operator]
             # now that we have reindexed, we can set reindex=True explicitlly
-            reduced_.append(
-                tree_reduce(
-                    reindexed,
-                    combine=partial(combine, agg=agg, reindex=do_simple_combine),
-                    aggregate=partial(
-                        aggregate,
-                        expected_groups=cohort_index,
-                        reindex=do_simple_combine,
-                    ),
-                )
+            _tree_reduce(
+                subset,
+                out_dsk=dsk,
+                name=out_name,
+                block_index=icohort,
+                axis=axis,
+                combine=partial(combine, agg=agg, reindex=do_simple_combine, keepdims=True),
+                aggregate=partial(
+                    aggregate, expected_groups=cohort_index, reindex=do_simple_combine, keepdims=True
+                ),
             )
             # This is done because pandas promotes to 64-bit types when an Index is created
             # So we use the index to generate the return value for consistency with "map-reduce"
             # This is important on windows
             groups_.append(cohort_index.values)

-        reduced = dask.array.concatenate(reduced_, axis=-1)
+        graph = HighLevelGraph.from_collections(out_name, dsk, dependencies=[intermediate])
+
+        out_chunks = list(array.chunks)
+        out_chunks[axis[-1]] = tuple(len(c) for c in chunks_cohorts.values())
+        for ax in axis[:-1]:
+            out_chunks[ax] = (1,)
+        reduced = dask.array.Array(graph, out_name, out_chunks, meta=array._meta)

         groups = (np.concatenate(groups_),)
         group_chunks = (tuple(len(cohort) for cohort in groups_),)
```

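For concreteness, the new output-chunk bookkeeping in the last hunk can be traced with a toy cohort mapping (hypothetical numbers, not taken from the benchmarks): each cohort contributes one output chunk along the reduced axis, sized by the number of groups it holds, and any other reduced axes collapse to a single chunk.

```python
# toy illustration of the out_chunks computation in the hunk above
chunks = ((10, 10), (4, 4, 4))                   # input chunks; reduce over axis -1
chunks_cohorts = {(0, 2): [1, 2, 3], (1,): [4]}  # blocks -> group labels per cohort
axis = (-1,)

out_chunks = list(chunks)
out_chunks[axis[-1]] = tuple(len(c) for c in chunks_cohorts.values())
for ax in axis[:-1]:
    out_chunks[ax] = (1,)

print(out_chunks)  # [(10, 10), (3, 1)]: one output chunk per cohort, sized by its group count
```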
flox/dask_array_ops.py

+103
```diff
@@ -0,0 +1,103 @@
+import builtins
+import math
+from functools import partial
+from itertools import product
+from numbers import Integral
+
+from dask import config
+from dask.blockwise import lol_tuples
+from toolz import partition_all
+
+from .lib import ArrayLayer
+from .types import Graph
+
+
+# _tree_reduce and partial_reduce are copied from dask.array.reductions
+# They have been modified to work purely with graphs, and without creating new Array layers
+# in the graph. The `block_index` kwarg is new and avoids a concatenation by simply setting the right
+# key initially
+def _tree_reduce(
+    x: ArrayLayer,
+    *,
+    name: str,
+    out_dsk: Graph,
+    aggregate,
+    axis: tuple[int, ...],
+    block_index: int,
+    split_every=None,
+    combine=None,
+):
+    # Normalize split_every
+    split_every = split_every or config.get("split_every", 4)
+    if isinstance(split_every, dict):
+        split_every = {k: split_every.get(k, 2) for k in axis}
+    elif isinstance(split_every, Integral):
+        n = builtins.max(int(split_every ** (1 / (len(axis) or 1))), 2)
+        split_every = dict.fromkeys(axis, n)
+    else:
+        raise ValueError("split_every must be a int or a dict")
+
+    numblocks = tuple(len(c) for c in x.chunks)
+    out_chunks = x.chunks
+
+    # Reduce across intermediates
+    depth = 1
+    for i, n in enumerate(numblocks):
+        if i in split_every and split_every[i] != 1:
+            depth = int(builtins.max(depth, math.ceil(math.log(n, split_every[i]))))
+    func = partial(combine or aggregate, axis=axis)
+
+    agg_dep_name = x.name
+    for level in range(depth - 1):
+        newname = name + f"-{block_index}-partial-{level}"
+        out_dsk, out_chunks = partial_reduce(
+            func,
+            out_dsk,
+            chunks=out_chunks,
+            split_every=split_every,
+            name=newname,
+            dep_name=agg_dep_name,
+            axis=axis,
+        )
+        agg_dep_name = newname
+    func = partial(aggregate, axis=axis)
+    return partial_reduce(
+        func,
+        out_dsk,
+        chunks=out_chunks,
+        split_every=split_every,
+        name=name,
+        dep_name=agg_dep_name,
+        axis=axis,
+        block_index=block_index,
+    )
+
+
+def partial_reduce(
+    func,
+    dsk,
+    *,
+    chunks: tuple[tuple[int, ...], ...],
+    name: str,
+    dep_name: str,
+    split_every: dict[int, int],
+    axis: tuple[int, ...],
+    block_index: int | None = None,
+):
+    numblocks = tuple(len(c) for c in chunks)
+    ndim = len(numblocks)
+    parts = [list(partition_all(split_every.get(i, 1), range(n))) for (i, n) in enumerate(numblocks)]
+    keys = product(*map(range, map(len, parts)))
+    out_chunks = [
+        tuple(1 for p in partition_all(split_every[i], c)) if i in split_every else c
+        for (i, c) in enumerate(chunks)
+    ]
+    for k, p in zip(keys, product(*parts)):
+        free = {i: j[0] for (i, j) in enumerate(p) if len(j) == 1 and i not in split_every}
+        dummy = dict(i for i in enumerate(p) if i[0] in split_every)
+        g = lol_tuples((dep_name,), range(ndim), free, dummy)
+        assert dep_name != name
+        if block_index is not None:
+            k = (*k[:-1], block_index)
+        dsk[(name,) + k] = (func, g)
+    return dsk, out_chunks
```
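As a point of reference for the vendored `partial_reduce` above: `lol_tuples` (imported from `dask.blockwise`, as in the diff) is what expands the "dummy" reduced axes into the list of input keys each `combine`/`aggregate` call receives. A small standalone illustration, with a made-up layer name:

```python
from dask.blockwise import lol_tuples

# free axes keep a fixed block index; the dummy (reduced) axis expands into
# every block index being merged into this output key
keys = lol_tuples(("made-up-layer",), range(2), {0: 3}, {1: [0, 1, 2]})
print(keys)
# [('made-up-layer', 3, 0), ('made-up-layer', 3, 1), ('made-up-layer', 3, 2)]
```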

flox/lib.py

+17
```diff
@@ -0,0 +1,17 @@
+from dataclasses import dataclass
+
+from .types import DaskArray, Graph
+
+
+@dataclass
+class ArrayLayer:
+    name: str
+    layer: Graph
+    chunks: tuple[tuple[int, ...], ...]
+
+    def to_array(self, dep: DaskArray) -> DaskArray:
+        from dask.array import Array
+        from dask.highlevelgraph import HighLevelGraph
+
+        graph = HighLevelGraph.from_collections(self.name, self.layer, dependencies=[dep])
+        return Array(graph, self.name, self.chunks, meta=dep._meta)
```
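A short usage sketch of the new `ArrayLayer` (it mirrors the updated `test_subset_blocks` in the test changes below; the exact call pattern is an assumption drawn from this diff): `subset_to_blocks` now hands back a graph layer plus chunk metadata, and an actual dask Array is only materialized when the caller asks for one.

```python
import dask.array
from flox.core import subset_to_blocks

array = dask.array.random.random((120,), chunks=(4,))
# returns an ArrayLayer (name + layer + chunks); no dask Array is built yet
layer = subset_to_blocks(array, (0, 3, 6, 9))
# materialize only when an Array is actually needed, as the tests do
subset = layer.to_array(array)
assert subset.blocks.shape == (4,)
```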

flox/types.py

+13
```diff
@@ -0,0 +1,13 @@
+from typing import Any, TypeAlias
+
+try:
+    import cubed.Array as CubedArray
+except ImportError:
+    CubedArray = Any
+
+try:
+    import dask.array.Array as DaskArray
+    from dask.typing import Graph
+except ImportError:
+    DaskArray = Any
+    Graph: TypeAlias = Any  # type: ignore[no-redef,misc]
```

pyproject.toml

+1-1
```diff
@@ -133,7 +133,7 @@ module=[
     "pandas",
     "setuptools",
     "scipy.*",
-    "toolz",
+    "toolz.*",
 ]
 ignore_missing_imports = true
```

tests/test_core.py

+13-12
```diff
@@ -1524,15 +1524,6 @@ def test_dtype(func, dtype, engine):
     assert actual.dtype == np.dtype("float64")


-@requires_dask
-def test_subset_blocks():
-    array = dask.array.random.random((120,), chunks=(4,))
-
-    blockid = (0, 3, 6, 9, 12, 15, 18, 21, 24, 27)
-    subset = subset_to_blocks(array, blockid)
-    assert subset.blocks.shape == (len(blockid),)
-
-
 @requires_dask
 @pytest.mark.parametrize(
     "flatblocks, expected",
@@ -1573,19 +1564,29 @@ def test_normalize_block_indexing_2d(flatblocks, expected):
     assert_equal_tuple(expected, actual)


+@requires_dask
+def test_subset_blocks():
+    array = dask.array.random.random((120,), chunks=(4,))
+
+    blockid = (0, 3, 6, 9, 12, 15, 18, 21, 24, 27)
+    subset = subset_to_blocks(array, blockid).to_array(array)
+    assert subset.blocks.shape == (len(blockid),)
+
+
+@pytest.mark.skip("temporarily removed this optimization")
 @requires_dask
 def test_subset_block_passthrough():
     from flox.core import identity

     # full slice pass through
     array = dask.array.ones((5,), chunks=(1,))
     expected = dask.array.map_blocks(identity, array)
-    subset = subset_to_blocks(array, np.arange(5))
+    subset = subset_to_blocks(array, np.arange(5)).to_array(array)
     assert subset.name == expected.name

     array = dask.array.ones((5, 5), chunks=1)
     expected = dask.array.map_blocks(identity, array)
-    subset = subset_to_blocks(array, np.arange(25))
+    subset = subset_to_blocks(array, np.arange(25)).to_array(array)
     assert subset.name == expected.name


@@ -1604,7 +1605,7 @@ def test_subset_block_passthrough():
 )
 def test_subset_block_2d(flatblocks, expectidx):
     array = dask.array.from_array(np.arange(25).reshape((5, 5)), chunks=1)
-    subset = subset_to_blocks(array, flatblocks)
+    subset = subset_to_blocks(array, flatblocks).to_array(array)
     assert len(subset.dask.layers) == 2
     assert_equal(subset, array.compute()[expectidx])
```