Backport PR #60938: ENH(string dtype): Implement cumsum for Python-backed strings (#60984)

rhshadrach · JakeTT404 · web-flow · commit 684a1a3056a9 · 2025-02-22T10:51:16.000-08:00
* ENH: Improved error message and raise new error for small-string NaN edge case in HDFStore.append (#60829) * Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep too large for pytable column. Add and modify applicable test code. * Fix missed tests and correct mistake in error message. * Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function. (cherry picked from commit 57340ec) * TST(string dtype): Resolve xfails in pytables (#60795) (cherry picked from commit 4511251) * BUG(string dtype): Resolve pytables xfail when reading with condition (#60943) (cherry picked from commit 0ec5f26) * Backport PR #60940: ENH: Add dtype argument to str.decode * Backport PR #60938: ENH(string dtype): Implement cumsum for Python-backed strings --------- Co-authored-by: Jake Thomas Trevallion <136272202+JakeTT404@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -38,7 +38,7 @@ Other enhancements
 - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
 - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype``  (:issue:`60663`)
 - The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
-- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
+- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`)
 - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -46,6 +46,7 @@
 )
 
 from pandas.core import (
+    missing,
     nanops,
     ops,
 )
@@ -865,6 +866,88 @@ def _reduce(
             return result
         raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
 
+    def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArray:
+        """
+        Return a new array holding the cumulative ``name`` of this array.
+
+        The result is built with ``type(self)``, so the array type is kept.
+
+        Parameters
+        ----------
+        name : str
+            Name of the accumulation to perform:
+            - cummin
+            - cummax
+            - cumsum
+            - cumprod (rejected with TypeError for this array)
+        skipna : bool, default True
+            If True, skip NA values; if False, results are NA from the first NA on.
+        **kwargs
+            Additional keyword arguments.
+            None of them is read by this implementation.
+
+        Returns
+        -------
+        StringArray
+
+        Raises
+        ------
+        TypeError : if ``name`` is "cumprod"
+        """
+        if name == "cumprod":
+            msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
+            raise TypeError(msg)
+
+        # ``tail``: all-NA suffix appended to the result when skipna=False
+        tail: np.ndarray | None = None
+        na_mask: np.ndarray | None = None
+        ndarray = self._ndarray
+        np_func = {
+            "cumsum": np.cumsum,
+            "cummin": np.minimum.accumulate,
+            "cummax": np.maximum.accumulate,
+        }[name]
+
+        if self._hasna:
+            na_mask = cast("npt.NDArray[np.bool_]", isna(ndarray))
+            if np.all(na_mask):
+                return type(self)(ndarray)  # all NA: no values to accumulate
+            if skipna:
+                if name == "cumsum":
+                    ndarray = np.where(na_mask, "", ndarray)
+                else:
+                    # We can retain the running min/max by forward/backward filling.
+                    ndarray = ndarray.copy()
+                    missing.pad_or_backfill_inplace(
+                        ndarray,
+                        method="pad",
+                        axis=0,
+                    )
+                    missing.pad_or_backfill_inplace(
+                        ndarray,
+                        method="backfill",
+                        axis=0,
+                    )
+            else:
+                # When not skipping NA values, the result should be null from
+                # the first NA value onward.
+                idx = np.argmax(na_mask)
+                tail = np.empty(len(ndarray) - idx, dtype="object")
+                tail[:] = self.dtype.na_value
+                ndarray = ndarray[:idx]
+
+        # mypy: Cannot call function of unknown type
+        np_result = np_func(ndarray)  # type: ignore[operator]
+
+        if tail is not None:
+            np_result = np.hstack((np_result, tail))
+        elif na_mask is not None:
+            # mypy: Argument 2 to "where" has incompatible type "NAType | float"
+            np_result = np.where(na_mask, self.dtype.na_value, np_result)  # type: ignore[arg-type]
+
+        result = type(self)(np_result)
+        return result
+
     def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
         if self.dtype.na_value is np.nan and result is libmissing.NA:
             # the masked_reductions use pd.NA -> convert to np.nan
diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas.compat import HAS_PYARROW
-
 from pandas.core.dtypes.common import is_number
 
 from pandas import (
@@ -168,21 +166,10 @@ def test_agg_cython_table_series(series, func, expected):
         ),
     ),
 )
-def test_agg_cython_table_transform_series(request, series, func, expected):
+def test_agg_cython_table_transform_series(series, func, expected):
     # GH21224
     # test transforming functions in
     # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
-    if (
-        series.dtype == "string"
-        and func in ("cumsum", np.cumsum, np.nancumsum)
-        and not HAS_PYARROW
-    ):
-        request.applymarker(
-            pytest.mark.xfail(
-                raises=NotImplementedError,
-                reason="TODO(infer_string) cumsum not yet implemented for string",
-            )
-        )
     warn = None if isinstance(func, str) else FutureWarning
     with tm.assert_produces_warning(warn, match="is currently using Series.*"):
         result = series.agg(func)
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -200,11 +200,7 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
 
     def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
         assert isinstance(ser.dtype, StorageExtensionDtype)
-        return ser.dtype.storage == "pyarrow" and op_name in [
-            "cummin",
-            "cummax",
-            "cumsum",
-        ]
+        return op_name in ["cummin", "cummax", "cumsum"]
 
     def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         dtype = cast(StringDtype, tm.get_dtype(obj))
diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py
@@ -193,13 +193,14 @@ def test_cumprod_timedelta(self):
             ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]),
         ],
     )
-    def test_cum_methods_pyarrow_strings(
-        self, pyarrow_string_dtype, data, op, skipna, expected_data
+    def test_cum_methods_ea_strings(
+        self, string_dtype_no_object, data, op, skipna, expected_data
     ):
-        # https://github.com/pandas-dev/pandas/pull/60633
-        ser = pd.Series(data, dtype=pyarrow_string_dtype)
+        # https://github.com/pandas-dev/pandas/pull/60633 - pyarrow
+        # https://github.com/pandas-dev/pandas/pull/60938 - Python
+        ser = pd.Series(data, dtype=string_dtype_no_object)
         method = getattr(ser, op)
-        expected = pd.Series(expected_data, dtype=pyarrow_string_dtype)
+        expected = pd.Series(expected_data, dtype=string_dtype_no_object)
         result = method(skipna=skipna)
         tm.assert_series_equal(result, expected)