@@ -9,6 +9,7 @@
 from collections import namedtuple
 from collections.abc import Sequence
 from functools import partial, reduce
+from itertools import product
 from numbers import Integral
 from typing import (
     TYPE_CHECKING,
@@ -23,6 +24,7 @@
 import numpy_groupies as npg
 import pandas as pd
 import toolz as tlz
+from scipy.sparse import csc_array

 from . import xrdtypes
 from .aggregate_flox import _prepare_for_flox
@@ -203,6 +205,16 @@ def _unique(a: np.ndarray) -> np.ndarray:
     return np.sort(pd.unique(a.reshape(-1)))


+def slices_from_chunks(chunks):
+    """slightly modified from dask.array.core.slices_from_chunks to be lazy"""
+    cumdims = [tlz.accumulate(operator.add, bds, 0) for bds in chunks]
+    slices = (
+        (slice(s, s + dim) for s, dim in zip(starts, shapes))
+        for starts, shapes in zip(cumdims, chunks)
+    )
+    return product(*slices)
+
+
 @memoize
 def find_group_cohorts(labels, chunks, merge: bool = True) -> dict:
     """
@@ -215,9 +227,10 @@ def find_group_cohorts(labels, chunks, merge: bool = True) -> dict:
     Parameters
     ----------
     labels : np.ndarray
-        mD Array of group labels
+        mD Array of integer group codes, factorized so that -1
+        represents NaNs.
     chunks : tuple
-        nD array that is being reduced
+        chunks of the array being reduced
     merge : bool, optional
         Attempt to merge cohorts when one cohort's chunks are a subset
         of another cohort's chunks.
@@ -227,33 +240,59 @@ def find_group_cohorts(labels, chunks, merge: bool = True) -> dict:
     cohorts: dict_values
         Iterable of cohorts
     """
-    import dask
-
     # To do this, we must have values in memory so casting to numpy should be safe
     labels = np.asarray(labels)

-    # Build an array with the shape of labels, but where every element is the "chunk number"
-    # 1. First subset the array appropriately
-    axis = range(-labels.ndim, 0)
-    # Easier to create a dask array and use the .blocks property
-    array = dask.array.empty(tuple(sum(c) for c in chunks), chunks=chunks)
-    labels = np.broadcast_to(labels, array.shape[-labels.ndim :])
-
-    # Iterate over each block and create a new block of same shape with "chunk number"
-    shape = tuple(array.blocks.shape[ax] for ax in axis)
-    # Use a numpy object array to enable assignment in the loop
-    # TODO: is it possible to just use a nested list?
-    # That is what we need for `np.block`
-    blocks = np.empty(shape, dtype=object)
-    array_chunks = tuple(np.array(c) for c in array.chunks)
-    for idx, blockindex in enumerate(np.ndindex(array.numblocks)):
-        chunkshape = tuple(c[i] for c, i in zip(array_chunks, blockindex))
-        blocks[blockindex] = np.full(chunkshape, idx)
-    which_chunk = np.block(blocks.tolist()).reshape(-1)
-
-    raveled = labels.reshape(-1)
-    # these are chunks where a label is present
-    label_chunks = pd.Series(which_chunk).groupby(raveled).unique()
+    shape = tuple(sum(c) for c in chunks)
+    nchunks = math.prod(len(c) for c in chunks)
+
+    # assumes that `labels` are factorized
+    nlabels = labels.max() + 1
+
+    labels = np.broadcast_to(labels, shape[-labels.ndim :])
+
+    rows = []
+    cols = []
+    # Add one to handle the -1 sentinel value
+    label_is_present = np.zeros((nlabels + 1,), dtype=bool)
+    ilabels = np.arange(nlabels)
+    for idx, region in enumerate(slices_from_chunks(chunks)):
+        # This is a quite fast way to find unique integers, when we know how many there are
+        # inspired by a similar idea in numpy_groupies for first, last
+        # instead of explicitly finding uniques, repeatedly write True to the same location
+        subset = labels[region]
+        # The reshape is not strictly necessary but is about 100ms faster on a test problem.
+        label_is_present[subset.reshape(-1)] = True
+        # skip the -1 sentinel by slicing
+        uniques = ilabels[label_is_present[:-1]]
+        rows.append([idx] * len(uniques))
+        cols.append(uniques)
+        label_is_present[:] = False
+    rows_array = np.concatenate(rows)
+    cols_array = np.concatenate(cols)
+    data = np.broadcast_to(np.array(1, dtype=np.uint8), rows_array.shape)
+    bitmask = csc_array((data, (rows_array, cols_array)), dtype=bool, shape=(nchunks, nlabels))
+    label_chunks = {
+        lab: bitmask.indices[slice(bitmask.indptr[lab], bitmask.indptr[lab + 1])]
+        for lab in range(nlabels)
+    }
+
+    ## numpy bitmask approach, faster than finding uniques, but lots of memory
+    # bitmask = np.zeros((nchunks, nlabels), dtype=bool)
+    # for idx, region in enumerate(slices_from_chunks(chunks)):
+    #     bitmask[idx, labels[region]] = True
+    # bitmask = bitmask[:, :-1]
+    # chunk = np.arange(nchunks)  # [:, np.newaxis] * bitmask
+    # label_chunks = {lab: chunk[bitmask[:, lab]] for lab in range(nlabels - 1)}
+
+    ## Pandas GroupBy approach, quite slow!
+    # which_chunk = np.empty(shape, dtype=np.int64)
+    # for idx, region in enumerate(slices_from_chunks(chunks)):
+    #     which_chunk[region] = idx
+    # which_chunk = which_chunk.reshape(-1)
+    # raveled = labels.reshape(-1)
+    # # these are chunks where a label is present
+    # label_chunks = pd.Series(which_chunk).groupby(raveled).unique()

     # These invert the label_chunks mapping so we know which labels occur together.
     def invert(x) -> tuple[np.ndarray, ...]:
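The loop above collects (chunk, label) pairs and packs them into a chunk-by-label boolean matrix; storing it as a csc_array means each label's chunks are exactly the row indices of one column, read straight from indptr/indices. A toy sketch of that lookup, with made-up rows/cols values for illustration:

# Toy illustration: a chunk-by-label boolean matrix in CSC form; each label's
# chunks are the row indices stored for that label's column.
import numpy as np
from scipy.sparse import csc_array

nchunks, nlabels = 3, 4
# hypothetical (chunk, label) pairs collected while scanning the blocks
rows = np.array([0, 0, 1, 1, 2, 2])
cols = np.array([0, 1, 1, 2, 2, 3])
data = np.ones_like(rows, dtype=np.uint8)

bitmask = csc_array((data, (rows, cols)), dtype=bool, shape=(nchunks, nlabels))

# CSC stores column `lab`'s row indices in indices[indptr[lab]:indptr[lab + 1]]
label_chunks = {
    lab: bitmask.indices[bitmask.indptr[lab] : bitmask.indptr[lab + 1]]
    for lab in range(nlabels)
}
print(label_chunks)
# {0: array([0]), 1: array([0, 1]), 2: array([1, 2]), 3: array([2])}
# (the integer dtype of the index arrays may vary by platform)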
@@ -264,33 +303,31 @@ def invert(x) -> tuple[np.ndarray, ...]:

     # If our dataset has chunksize one along the axis,
     # then no merging is possible.
-    single_chunks = all((ac == 1).all() for ac in array_chunks)
+    single_chunks = all(all(a == 1 for a in ac) for ac in chunks)

-    if merge and not single_chunks:
+    if not single_chunks and merge:
         # First sort by number of chunks occupied by cohort
         sorted_chunks_cohorts = dict(
             sorted(chunks_cohorts.items(), key=lambda kv: len(kv[0]), reverse=True)
         )

-        items = tuple(sorted_chunks_cohorts.items())
+        items = tuple((k, set(k), v) for k, v in sorted_chunks_cohorts.items() if k)

         merged_cohorts = {}
-        merged_keys = []
+        merged_keys = set()

         # Now we iterate starting with the longest number of chunks,
         # and then merge in cohorts that are present in a subset of those chunks
         # I think this is suboptimal and must fail at some point.
         # But it might work for most cases. There must be a better way...
-        for idx, (k1, v1) in enumerate(items):
+        for idx, (k1, set_k1, v1) in enumerate(items):
             if k1 in merged_keys:
                 continue
             merged_cohorts[k1] = copy.deepcopy(v1)
-            for k2, v2 in items[idx + 1 :]:
-                if k2 in merged_keys:
-                    continue
-                if set(k2).issubset(set(k1)):
+            for k2, set_k2, v2 in items[idx + 1 :]:
+                if k2 not in merged_keys and set_k2.issubset(set_k1):
                     merged_cohorts[k1].extend(v2)
-                    merged_keys.append(k2)
+                    merged_keys.update((k2,))

         # make sure each cohort is sorted after merging
         sorted_merged_cohorts = {k: sorted(v) for k, v in merged_cohorts.items()}
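The merge step keeps the cohorts that span the most chunks and folds in any later cohort whose chunk set is a subset of one already kept. A simplified sketch with a hypothetical chunks_cohorts input, not the library's exact code:

# Simplified sketch of the merge loop with made-up cohorts: chunk tuples that
# are subsets of a larger cohort's chunks get folded into that cohort.
import copy

# hypothetical mapping: tuple of chunk indices -> labels found in exactly those chunks
chunks_cohorts = {
    (0, 1, 2): [1, 4],
    (0, 1): [2],
    (2,): [3],
    (3,): [5],
}

# sort by how many chunks a cohort touches, largest first, and precompute sets
items = tuple(
    (k, set(k), v)
    for k, v in sorted(chunks_cohorts.items(), key=lambda kv: len(kv[0]), reverse=True)
    if k
)

merged_cohorts = {}
merged_keys = set()
for idx, (k1, set_k1, v1) in enumerate(items):
    if k1 in merged_keys:
        continue
    merged_cohorts[k1] = copy.deepcopy(v1)
    for k2, set_k2, v2 in items[idx + 1 :]:
        if k2 not in merged_keys and set_k2.issubset(set_k1):
            merged_cohorts[k1].extend(v2)
            merged_keys.add(k2)

print({k: sorted(v) for k, v in merged_cohorts.items()})
# {(0, 1, 2): [1, 2, 3, 4], (3,): [5]}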
@@ -1373,7 +1410,6 @@ def dask_groupby_agg(

     inds = tuple(range(array.ndim))
     name = f"groupby_{agg.name}"
-    token = dask.base.tokenize(array, by, agg, expected_groups, axis)

     if expected_groups is None and reindex:
         expected_groups = _get_expected_groups(by, sort=sort)
@@ -1394,6 +1430,9 @@ def dask_groupby_agg(
         by = dask.array.from_array(by, chunks=chunks)
     _, (array, by) = dask.array.unify_chunks(array, inds, by, inds[-by.ndim :])

+    # tokenize here since by has already been hashed if it's numpy
+    token = dask.base.tokenize(array, by, agg, expected_groups, axis)
+
     # preprocess the array:
     # - for argreductions, this zips the index together with the array block
     # - not necessary for blockwise with argreductions
@@ -1510,7 +1549,7 @@ def dask_groupby_agg(
             index = pd.Index(cohort)
             subset = subset_to_blocks(intermediate, blks, array.blocks.shape[-len(axis) :])
             reindexed = dask.array.map_blocks(
-                reindex_intermediates, subset, agg=agg, unique_groups=index, meta=subset._meta
+                reindex_intermediates, subset, agg, index, meta=subset._meta
             )
             # now that we have reindexed, we can set reindex=True explicitly
             reduced_.append(