Skip to content

Commit cdaaba5

Browse files
committed
Fix datetime binning
1 parent 731fa05 commit cdaaba5

File tree

2 files changed

+26
-5
lines changed

2 files changed

+26
-5
lines changed

flox/core.py

+9-5
Original file line number | Diff line number | Diff line change
@@ -400,10 +400,12 @@ def factorize_(by: tuple, axis, expected_groups: tuple[pd.Index, ...] = None):
400400
# this makes the reindexing logic simpler.
401401
if expect is None:
402402
raise ValueError("Please pass bin edges in expected_groups.")
403-
# idx = np.digitize(groupvar.ravel(), expect) - 1
404-
idx = pd.cut(groupvar.ravel(), bins=expect, labels=False).codes.copy()
405-
# same sentinel value as factorize
403+
# TODO: fix for binning
406404
found_groups.append(expect)
405+
# pd.cut with bins = IntervalIndex[datetime64] doesn't work...
406+
if groupvar.dtype.kind == "M":
407+
expect = np.concatenate([expect.left.to_numpy(), [expect.right[-1].to_numpy()]])
408+
idx = pd.cut(groupvar.ravel(), bins=expect).codes.copy()
407409
else:
408410
idx, groups = pd.factorize(groupvar.ravel())
409411
found_groups.append(np.array(groups))
@@ -1246,13 +1248,15 @@ def _assert_by_is_aligned(shape, by):
12461248

12471249

12481250
def _convert_expected_groups_to_index(expected_groups, isbin: bool) -> pd.Index | None:
1249-
if isinstance(expected_groups, pd.Index):
1251+
if isinstance(expected_groups, pd.IntervalIndex) or (
1252+
isinstance(expected_groups, pd.Index) and not isbin
1253+
):
12501254
return expected_groups
12511255
if isbin:
12521256
return pd.IntervalIndex.from_arrays(expected_groups[:-1], expected_groups[1:])
12531257
elif expected_groups is not None:
12541258
return pd.Index(expected_groups)
1255-
return None
1259+
return expected_groups
12561260

12571261

12581262
def groupby_reduce(

tests/test_core.py

+17
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,9 @@
44
from numpy_groupies.aggregate_numpy import aggregate
55

66
from flox.core import (
7+
_convert_expected_groups_to_index,
78
_get_optimal_chunks_for_groups,
9+
factorize_,
810
find_group_cohorts,
911
groupby_reduce,
1012
rechunk_for_cohorts,
@@ -758,3 +760,18 @@ def test_empty_bins(func, engine):
758760
)
759761
expected = np.array([1.0, 1.0, np.nan])
760762
assert_equal(actual, expected)
763+
764+
765+
def test_datetime_binning():
766+
time_bins = pd.date_range(start="2010-08-01", end="2010-08-15", freq="24H")
767+
by = pd.date_range("2010-08-01", "2010-08-15", freq="15min")
768+
769+
actual = _convert_expected_groups_to_index(time_bins, isbin=True)
770+
expected = pd.IntervalIndex.from_arrays(time_bins[:-1], time_bins[1:])
771+
assert_equal(actual, expected)
772+
773+
ret = factorize_((by.to_numpy(),), axis=0, expected_groups=(actual,))
774+
group_idx = ret[0]
775+
expected = pd.cut(by, time_bins).codes.copy()
776+
expected[0] = 14 # factorize doesn't return -1 for nans
777+
assert_equal(group_idx, expected)

0 commit comments

Comments
 (0)