Skip to content

Commit df0da40

Browse files
authored
Fix blockwise sort optimization (#181)
1 parent c2c4e1d commit df0da40

File tree

2 files changed

+21
-1
lines changed

2 files changed

+21
-1
lines changed

flox/core.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1760,7 +1760,7 @@ def groupby_reduce(
17601760
assert len(groups) == 1
17611761
sorted_idx = np.argsort(groups[0])
17621762
# This optimization helps specifically with resampling
1763-
if not (sorted_idx[1:] <= sorted_idx[:-1]).all():
1763+
if not (sorted_idx[:-1] <= sorted_idx[1:]).all():
17641764
result = result[..., sorted_idx]
17651765
groups = (groups[0][sorted_idx],)
17661766

tests/test_core.py

+20
Original file line numberDiff line numberDiff line change
@@ -1183,3 +1183,23 @@ def test_validate_reindex():
11831183
for func in ["sum", "argmax"]:
11841184
actual = _validate_reindex(None, func, method, expected_groups=None, by_is_dask=False)
11851185
assert actual is False
1186+
1187+
1188+
@requires_dask
def test_1d_blockwise_sort_optimization():
    """Check that a 1D blockwise reduction only reorders its result when needed.

    Three cases are exercised with ``func="count"`` and ``method="blockwise"``:

    1. group labels already in ascending order -> no ``getitem`` layer;
    2. reversed labels with ``sort=True`` -> a ``getitem`` layer is present;
    3. reversed labels with ``sort=False`` -> no ``getitem`` layer.

    The presence of a ``getitem`` layer is used as the signal that the
    result was indexed after the reduction (presumably the
    ``result[..., sorted_idx]`` step in ``groupby_reduce`` -- confirm there).
    """
    # Make sure sorting isn't done for resampling problems.
    time = pd.Series(pd.date_range("2020-09-01", "2020-12-31 23:59", freq="3H"))
    array = dask.array.ones((len(time),), chunks=(224,))

    # Case 1: day-of-year labels in their natural (ascending) order.
    actual, _ = groupby_reduce(array, time.dt.dayofyear.values, method="blockwise", func="count")
    # Inspect layer names (strings), as the other two cases do. Iterating the
    # graph itself yields task keys; for array chunks these are tuples, so
    # `"getitem" not in key` would be a tuple-membership test that can never
    # fail, making the assertion vacuous.
    assert all("getitem" not in k for k in actual.dask.layers)

    # Case 2: reversed labels, sorting requested -> result must be reordered.
    actual, _ = groupby_reduce(
        array, time.dt.dayofyear.values[::-1], sort=True, method="blockwise", func="count"
    )
    assert any("getitem" in k for k in actual.dask.layers)

    # Case 3: reversed labels, sorting disabled -> no reordering step.
    actual, _ = groupby_reduce(
        array, time.dt.dayofyear.values[::-1], sort=False, method="blockwise", func="count"
    )
    assert all("getitem" not in k for k in actual.dask.layers)

0 commit comments

Comments
 (0)