Skip to content

Commit 684a1a3

Browse files
Backport PR #60938: ENH(string dtype): Implement cumsum for Python-backed strings (#60984)
* ENH: Improved error message and raise new error for small-string NaN edge case in HDFStore.append (#60829) * Add clearer error messages for datatype mismatch in HDFStore.append. Raise ValueError when nan_rep too large for pytable column. Add and modify applicable test code. * Fix missed tests and correct mistake in error message. * Remove excess comments. Reverse error type change to avoid api changes. Move nan_rep tests into separate function. (cherry picked from commit 57340ec) * TST(string dtype): Resolve xfails in pytables (#60795) (cherry picked from commit 4511251) * BUG(string dtype): Resolve pytables xfail when reading with condition (#60943) (cherry picked from commit 0ec5f26) * Backport PR #60940: ENH: Add dtype argument to str.decode * Backport PR #60938: ENH(string dtype): Implement cumsum for Python-backed strings --------- Co-authored-by: Jake Thomas Trevallion <[email protected]>
1 parent 81229e6 commit 684a1a3

File tree

5 files changed

+92
-25
lines changed

5 files changed

+92
-25
lines changed

doc/source/whatsnew/v2.3.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ Other enhancements
3838
- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
3939
- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
4040
- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
41-
- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
41+
- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`)
4242
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
4343

4444
.. ---------------------------------------------------------------------------

pandas/core/arrays/string_.py

+83
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
)
4747

4848
from pandas.core import (
49+
missing,
4950
nanops,
5051
ops,
5152
)
@@ -865,6 +866,88 @@ def _reduce(
865866
return result
866867
raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
867868

869+
def _accumulate(self, name: str, *, skipna: bool = True, **kwargs) -> StringArray:
    """
    Return a new array holding the cumulative result of an operation.

    Parameters
    ----------
    name : str
        Name of the accumulation, supported values are:
        - cummin
        - cummax
        - cumsum
        ``cumprod`` is rejected because it is not defined for strings.
    skipna : bool, default True
        If True, NA entries are skipped: they stay NA in the result but do
        not affect the running value. If False, the result is NA from the
        first NA entry onward.
    **kwargs
        Additional keyword arguments passed to the accumulation function.
        Currently, there is no supported kwarg.

    Returns
    -------
    array
        A new array of the same type as ``self``.

    Raises
    ------
    TypeError
        If ``name`` is ``"cumprod"``.
    KeyError
        If ``name`` is any other value without an entry in the dispatch
        table below.
    """
    if name == "cumprod":
        msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
        raise TypeError(msg)

    # We may need to strip out trailing NA values
    tail: np.ndarray | None = None
    na_mask: np.ndarray | None = None
    ndarray = self._ndarray
    np_func = {
        "cumsum": np.cumsum,
        "cummin": np.minimum.accumulate,
        "cummax": np.maximum.accumulate,
    }[name]

    if self._hasna:
        na_mask = cast("npt.NDArray[np.bool_]", isna(ndarray))
        if np.all(na_mask):
            # Every entry is NA, so the result equals the input. Return a
            # copy so the result never shares its buffer with ``self``,
            # matching every other code path which allocates a new array.
            return self.copy()
        if skipna:
            if name == "cumsum":
                # The empty string is the identity for concatenation, so NA
                # slots contribute nothing to the running sum.
                ndarray = np.where(na_mask, "", ndarray)
            else:
                # We can retain the running min/max by forward/backward filling.
                # Forward fill repeats the previous value (which cannot change
                # the running min/max); backward fill then covers leading NAs.
                ndarray = ndarray.copy()
                missing.pad_or_backfill_inplace(
                    ndarray,
                    method="pad",
                    axis=0,
                )
                missing.pad_or_backfill_inplace(
                    ndarray,
                    method="backfill",
                    axis=0,
                )
        else:
            # When not skipping NA values, the result should be null from
            # the first NA value onward.
            idx = np.argmax(na_mask)
            tail = np.empty(len(ndarray) - idx, dtype="object")
            tail[:] = self.dtype.na_value
            ndarray = ndarray[:idx]

    # mypy: Cannot call function of unknown type
    np_result = np_func(ndarray)  # type: ignore[operator]

    if tail is not None:
        # skipna=False: re-attach the all-NA tail after the accumulated head.
        np_result = np.hstack((np_result, tail))
    elif na_mask is not None:
        # skipna=True: put NA back at the positions that were originally NA.
        # Argument 2 to "where" has incompatible type "NAType | float"
        np_result = np.where(na_mask, self.dtype.na_value, np_result)  # type: ignore[arg-type]

    result = type(self)(np_result)
    return result
950+
868951
def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
869952
if self.dtype.na_value is np.nan and result is libmissing.NA:
870953
# the masked_reductions use pd.NA -> convert to np.nan

pandas/tests/apply/test_str.py

+1-14
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
import numpy as np
55
import pytest
66

7-
from pandas.compat import HAS_PYARROW
8-
97
from pandas.core.dtypes.common import is_number
108

119
from pandas import (
@@ -168,21 +166,10 @@ def test_agg_cython_table_series(series, func, expected):
168166
),
169167
),
170168
)
171-
def test_agg_cython_table_transform_series(request, series, func, expected):
169+
def test_agg_cython_table_transform_series(series, func, expected):
172170
# GH21224
173171
# test transforming functions in
174172
# pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
175-
if (
176-
series.dtype == "string"
177-
and func in ("cumsum", np.cumsum, np.nancumsum)
178-
and not HAS_PYARROW
179-
):
180-
request.applymarker(
181-
pytest.mark.xfail(
182-
raises=NotImplementedError,
183-
reason="TODO(infer_string) cumsum not yet implemented for string",
184-
)
185-
)
186173
warn = None if isinstance(func, str) else FutureWarning
187174
with tm.assert_produces_warning(warn, match="is currently using Series.*"):
188175
result = series.agg(func)

pandas/tests/extension/test_string.py

+1-5
Original file line numberDiff line numberDiff line change
@@ -200,11 +200,7 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
200200

201201
def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
202202
assert isinstance(ser.dtype, StorageExtensionDtype)
203-
return ser.dtype.storage == "pyarrow" and op_name in [
204-
"cummin",
205-
"cummax",
206-
"cumsum",
207-
]
203+
return op_name in ["cummin", "cummax", "cumsum"]
208204

209205
def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
210206
dtype = cast(StringDtype, tm.get_dtype(obj))

pandas/tests/series/test_cumulative.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -193,13 +193,14 @@ def test_cumprod_timedelta(self):
193193
([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]),
194194
],
195195
)
196-
def test_cum_methods_pyarrow_strings(
197-
self, pyarrow_string_dtype, data, op, skipna, expected_data
196+
def test_cum_methods_ea_strings(
197+
self, string_dtype_no_object, data, op, skipna, expected_data
198198
):
199-
# https://github.com/pandas-dev/pandas/pull/60633
200-
ser = pd.Series(data, dtype=pyarrow_string_dtype)
199+
# https://github.com/pandas-dev/pandas/pull/60633 - pyarrow
200+
# https://github.com/pandas-dev/pandas/pull/60938 - Python
201+
ser = pd.Series(data, dtype=string_dtype_no_object)
201202
method = getattr(ser, op)
202-
expected = pd.Series(expected_data, dtype=pyarrow_string_dtype)
203+
expected = pd.Series(expected_data, dtype=string_dtype_no_object)
203204
result = method(skipna=skipna)
204205
tm.assert_series_equal(result, expected)
205206

0 commit comments

Comments
 (0)