Skip to content

Commit 16cf978

Browse files
committed
✨ Add support for RollingGroupby, ExpandingGroupby
1 parent 28074c1 commit 16cf978

4 files changed

Lines changed: 273 additions & 0 deletions

File tree

src/mapply/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,3 +95,7 @@ def init(
9595
)
9696

9797
setattr(PandasObject, apply_name, apply)
98+
99+
from pandas.core.window.rolling import BaseWindowGroupby
100+
101+
setattr(BaseWindowGroupby, apply_name, apply)

src/mapply/_window_groupby.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# BSD 3-Clause License
2+
#
3+
# Copyright (c) 2024, ddelange, <ddelange@delange.dev>
4+
#
5+
# All rights reserved.
6+
#
7+
# Redistribution and use in source and binary forms, with or without
8+
# modification, are permitted provided that the following conditions are met:
9+
#
10+
# 1. Redistributions of source code must retain the above copyright notice, this
11+
# list of conditions and the following disclaimer.
12+
#
13+
# 2. Redistributions in binary form must reproduce the above copyright notice,
14+
# this list of conditions and the following disclaimer in the documentation
15+
# and/or other materials provided with the distribution.
16+
#
17+
# 3. Neither the name of the copyright holder nor the names of its
18+
# contributors may be used to endorse or promote products derived from
19+
# this software without specific prior written permission.
20+
#
21+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
25+
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26+
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27+
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28+
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29+
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31+
#
32+
# SPDX-License-Identifier: BSD-3-Clause
33+
import logging
from collections.abc import Callable
from typing import Any

from mapply.parallel import multiprocessing_imap, tqdm

logger = logging.getLogger(__name__)


def run_window_groupby_apply(
    window_groupby: Any,
    func: Callable,
    *,
    n_workers: int,
    progressbar: bool,
    args: tuple[Any, ...] = (),
    **kwargs: Any,
):
    """Apply func to each group's window in parallel using multiprocessing_imap.

    Each group is sliced out of the underlying frame/series, an equivalent
    (non-grouped) rolling/expanding window is re-created on the slice with the
    same parameters, and ``window.apply(func)`` runs per group in a worker.

    Args:
        window_groupby: RollingGroupby or ExpandingGroupby instance.
        func: Function to apply to each window of each group.
        n_workers: Amount of workers (processes) to spawn.
        progressbar: Whether to display a progressbar.
        args: Additional positional arguments to pass to func.
        **kwargs: Additional keyword arguments to pass to the window's apply
            (e.g. ``raw``).

    Returns:
        Series or DataFrame from concatenating all per-group apply results.

    Raises:
        TypeError: If window_groupby is neither RollingGroupby nor
            ExpandingGroupby (e.g. an EWM groupby).
    """
    # deferred imports: keep pandas out of this module's import time
    from pandas import concat
    from pandas.core.window.expanding import ExpandingGroupby
    from pandas.core.window.rolling import RollingGroupby

    # Capture just enough of the window spec to rebuild an equivalent window
    # on a plain (non-grouped) slice of the data.
    if isinstance(window_groupby, ExpandingGroupby):
        window_kwargs = {
            "min_periods": window_groupby.min_periods,
        }
        window_method = "expanding"
    elif isinstance(window_groupby, RollingGroupby):
        window_kwargs = {
            "window": window_groupby.window,
            "min_periods": window_groupby.min_periods,
            "center": window_groupby.center,
            "on": window_groupby.on,
            "closed": window_groupby.closed,
        }
        window_method = "rolling"
    else:
        # anything else (e.g. ExponentialMovingWindowGroupby) is unsupported
        msg = f"Unsupported window groupby type: {type(window_groupby).__name__}"
        raise TypeError(msg)

    # NOTE(review): _grouper and _as_index are private pandas attributes and
    # may shift between pandas versions — confirm against the supported range.
    grouper = window_groupby._grouper  # noqa: SLF001
    indices = grouper.indices  # group key -> positional row indices
    result_index = grouper.result_index  # one entry per group key
    obj = window_groupby.obj  # the underlying DataFrame/Series
    as_index = window_groupby._as_index  # noqa: SLF001
    groupby_names = grouper.names  # names of the grouping level(s)

    # lazy generator: yield (key, group_slice) without materializing all groups
    def _group_iter():
        for key in result_index:
            yield key, obj.iloc[indices[key]]

    def _process_group(key_and_data):
        # runs in a worker: rebuild the window on the slice and apply func
        key, group_data = key_and_data
        window_obj = getattr(group_data, window_method)(**window_kwargs)
        result = window_obj.apply(func, args=args, **kwargs)
        return key, result

    # generator with length defined (for progressbar)
    groups = tqdm(_group_iter(), disable=True, total=len(result_index))
    processed = multiprocessing_imap(
        _process_group,
        groups,
        n_workers=n_workers,
        progressbar=progressbar,
    )

    # consume lazily from the multiprocessing_imap generator
    keys = []
    parts = []
    for key, part in processed:
        keys.append(key)
        parts.append(part)

    if not parts:
        # delegate to native pandas for the empty case to preserve index dtypes
        return window_groupby.apply(func, args=args, **kwargs)

    # prepend the group key level(s) to each part's original index levels
    result = concat(parts, keys=keys, names=groupby_names + list(obj.index.names))

    if not as_index:
        # mirror groupby(..., as_index=False): move key level(s) into columns
        result = result.reset_index(level=list(range(len(groupby_names))))

    return result

src/mapply/mapply.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
from typing import Any
5151

5252
from mapply._groupby import run_groupwise_apply
53+
from mapply._window_groupby import run_window_groupby_apply
5354
from mapply.parallel import N_CORES, multiprocessing_imap
5455

5556
DEFAULT_CHUNK_SIZE = 100
@@ -120,6 +121,17 @@ def mapply( # noqa: PLR0913
120121
from numpy import arange, array_split
121122
from pandas import Series, concat
122123
from pandas.core.groupby import GroupBy
124+
from pandas.core.window.rolling import BaseWindowGroupby
125+
126+
if isinstance(df_or_series, BaseWindowGroupby):
127+
return run_window_groupby_apply(
128+
df_or_series,
129+
func,
130+
n_workers=n_workers,
131+
progressbar=progressbar,
132+
args=args,
133+
**kwargs,
134+
)
123135

124136
if isinstance(df_or_series, GroupBy):
125137
return run_groupwise_apply(

tests/test_mapply.py

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,3 +154,143 @@ def fn(x):
154154
series = pd.Series({"a": list(range(100))})
155155

156156
assert isinstance(series.mapply(sum).iloc[0], np.int64)
157+
158+
159+
def test_rolling_groupby_mapply():
    """Assert RollingGroupby behaviour is equivalent."""
    mapply.init(progressbar=False, chunk_size=1)

    np.random.seed(42)  # noqa: NPY002
    frame = pd.DataFrame(
        {
            "A": np.random.randint(0, 100, 200),  # noqa: NPY002
            "B": np.random.randint(0, 100, 200),  # noqa: NPY002
            "group": [i // 100 for i in range(200)],
        },
    )

    # plain rolling window with a custom func
    pd.testing.assert_frame_equal(
        frame.groupby("group").rolling(3).apply(lambda x: x.sum()),
        frame.groupby("group").rolling(3).mapply(lambda x: x.sum()),
    )

    # min_periods
    pd.testing.assert_frame_equal(
        frame.groupby("group").rolling(5, min_periods=2).apply(lambda x: x.mean()),
        frame.groupby("group").rolling(5, min_periods=2).mapply(lambda x: x.mean()),
    )

    # centered window
    pd.testing.assert_frame_equal(
        frame.groupby("group").rolling(3, center=True).apply(lambda x: x.max()),
        frame.groupby("group").rolling(3, center=True).mapply(lambda x: x.max()),
    )

    # single-column selection yields a Series
    pd.testing.assert_series_equal(
        frame.groupby("group")["A"].rolling(3).apply(lambda x: x.sum()),
        frame.groupby("group")["A"].rolling(3).mapply(lambda x: x.sum()),
    )

    # group keys as columns instead of index levels
    pd.testing.assert_frame_equal(
        frame.groupby("group", as_index=False).rolling(3).apply(lambda x: x.sum()),
        frame.groupby("group", as_index=False).rolling(3).mapply(lambda x: x.sum()),
    )

    # grouping on two keys
    frame["group2"] = [0, 1] * 100
    pd.testing.assert_frame_equal(
        frame.groupby(["group", "group2"]).rolling(3).apply(lambda x: x.sum()),
        frame.groupby(["group", "group2"]).rolling(3).mapply(lambda x: x.sum()),
    )

    # time-based window keyed on the 'on' column
    ts_frame = pd.DataFrame(
        {
            "A": np.random.randint(0, 100, 200),  # noqa: NPY002
            "dt": pd.date_range("2020-01-01", periods=200, freq="D"),
            "group": [i // 100 for i in range(200)],
        },
    )
    pd.testing.assert_frame_equal(
        ts_frame.groupby("group").rolling("3D", on="dt").apply(lambda x: x.sum()),
        ts_frame.groupby("group").rolling("3D", on="dt").mapply(lambda x: x.sum()),
    )

    # zero rows: delegates to native pandas
    pd.testing.assert_frame_equal(
        frame.iloc[:0].groupby("group").rolling(3).apply(lambda x: x.sum()),
        frame.iloc[:0].groupby("group").rolling(3).mapply(lambda x: x.sum()),
    )

    # single worker: no pool is spawned
    mapply.init(progressbar=False, chunk_size=1, n_workers=1)
    pd.testing.assert_frame_equal(
        frame.groupby("group").rolling(3).apply(lambda x: x.sum()),
        frame.groupby("group").rolling(3).mapply(lambda x: x.sum()),
    )
234+
235+
236+
def test_expanding_groupby_mapply():
    """Assert ExpandingGroupby behaviour is equivalent."""
    mapply.init(progressbar=False, chunk_size=1)

    np.random.seed(42)  # noqa: NPY002
    frame = pd.DataFrame(
        {
            "A": np.random.randint(0, 100, 200),  # noqa: NPY002
            "B": np.random.randint(0, 100, 200),  # noqa: NPY002
            "group": [i // 100 for i in range(200)],
        },
    )

    # plain expanding window with a custom func
    pd.testing.assert_frame_equal(
        frame.groupby("group").expanding().apply(lambda x: x.sum()),
        frame.groupby("group").expanding().mapply(lambda x: x.sum()),
    )

    # min_periods
    pd.testing.assert_frame_equal(
        frame.groupby("group").expanding(min_periods=3).apply(lambda x: x.mean()),
        frame.groupby("group").expanding(min_periods=3).mapply(lambda x: x.mean()),
    )

    # single-column selection yields a Series
    pd.testing.assert_series_equal(
        frame.groupby("group")["A"].expanding().apply(lambda x: x.sum()),
        frame.groupby("group")["A"].expanding().mapply(lambda x: x.sum()),
    )

    # group keys as columns instead of index levels
    pd.testing.assert_frame_equal(
        frame.groupby("group", as_index=False).expanding().apply(lambda x: x.sum()),
        frame.groupby("group", as_index=False).expanding().mapply(lambda x: x.sum()),
    )

    # grouping on two keys
    frame["group2"] = [0, 1] * 100
    pd.testing.assert_frame_equal(
        frame.groupby(["group", "group2"]).expanding().apply(lambda x: x.sum()),
        frame.groupby(["group", "group2"]).expanding().mapply(lambda x: x.sum()),
    )

    # zero rows: delegates to native pandas
    pd.testing.assert_frame_equal(
        frame.iloc[:0].groupby("group").expanding().apply(lambda x: x.sum()),
        frame.iloc[:0].groupby("group").expanding().mapply(lambda x: x.sum()),
    )

    # single worker: no pool is spawned
    mapply.init(progressbar=False, chunk_size=1, n_workers=1)
    pd.testing.assert_frame_equal(
        frame.groupby("group").expanding().apply(lambda x: x.sum()),
        frame.groupby("group").expanding().mapply(lambda x: x.sum()),
    )

    # window groupby types other than rolling/expanding (e.g. EWM) are rejected
    mapply.init(progressbar=False, chunk_size=1)
    with pytest.raises(TypeError, match="Unsupported window groupby type"):
        frame.groupby("group").ewm(span=3).mapply(lambda x: x.sum())

0 commit comments

Comments
 (0)