Skip to content

Commit dab1b88

Browse files
authored
BUG(string dtype): Empty sum produces incorrect result (#60936)
1 parent f1b00b8 commit dab1b88

File tree

6 files changed

+79
-1
lines changed

6 files changed

+79
-1
lines changed

doc/source/whatsnew/v2.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ Conversion
118118

119119
Strings
120120
^^^^^^^
121+
- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` on :class:`StringDtype` with all NA values incorrectly returning ``0`` instead of the empty string ``""`` (:issue:`60229`)
121122
- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
122123
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
123124
- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)

pandas/core/arrays/base.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -2628,7 +2628,15 @@ def _groupby_op(
26282628
if op.how not in ["any", "all"]:
26292629
# Fail early to avoid conversion to object
26302630
op._get_cython_function(op.kind, op.how, np.dtype(object), False)
2631-
npvalues = self.to_numpy(object, na_value=np.nan)
2631+
2632+
arr = self
2633+
if op.how == "sum":
2634+
# https://github.com/pandas-dev/pandas/issues/60229
2635+
# All NA should result in the empty string.
2636+
assert "skipna" in kwargs
2637+
if kwargs["skipna"] and min_count == 0:
2638+
arr = arr.fillna("")
2639+
npvalues = arr.to_numpy(object, na_value=np.nan)
26322640
else:
26332641
raise NotImplementedError(
26342642
f"function is not implemented for this dtype: {self.dtype}"

pandas/tests/frame/test_reductions.py

+10
Original file line numberDiff line numberDiff line change
@@ -835,6 +835,16 @@ def test_axis_1_empty(self, all_reductions, index):
835835
expected = Series([], index=index, dtype=expected_dtype)
836836
tm.assert_series_equal(result, expected)
837837

838+
@pytest.mark.parametrize("min_count", [0, 1])
839+
def test_axis_1_sum_na(self, string_dtype_no_object, skipna, min_count):
840+
# https://github.com/pandas-dev/pandas/issues/60229
841+
dtype = string_dtype_no_object
842+
df = DataFrame({"a": [pd.NA]}, dtype=dtype)
843+
result = df.sum(axis=1, skipna=skipna, min_count=min_count)
844+
value = "" if skipna and min_count == 0 else pd.NA
845+
expected = Series([value], dtype=dtype)
846+
tm.assert_series_equal(result, expected)
847+
838848
@pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
839849
@pytest.mark.parametrize("numeric_only", [None, True, False])
840850
def test_sum_prod_nanops(self, method, unit, numeric_only):

pandas/tests/groupby/test_reductions.py

+14
Original file line numberDiff line numberDiff line change
@@ -955,6 +955,20 @@ def test_min_empty_string_dtype(func, string_dtype_no_object):
955955
tm.assert_frame_equal(result, expected)
956956

957957

958+
@pytest.mark.parametrize("min_count", [0, 1])
959+
def test_string_dtype_empty_sum(string_dtype_no_object, skipna, min_count):
960+
# https://github.com/pandas-dev/pandas/issues/60229
961+
dtype = string_dtype_no_object
962+
df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
963+
gb = df.groupby("a")
964+
result = gb.sum(skipna=skipna, min_count=min_count)
965+
value = "" if skipna and min_count == 0 else pd.NA
966+
expected = DataFrame(
967+
{"b": value}, index=pd.Index(["x"], name="a", dtype=dtype), dtype=dtype
968+
)
969+
tm.assert_frame_equal(result, expected)
970+
971+
958972
def test_max_nan_bug():
959973
df = DataFrame(
960974
{

pandas/tests/resample/test_base.py

+25
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,31 @@ def test_resample_empty_series(freq, index, resample_method):
223223
assert result.index.freq == expected.index.freq
224224

225225

226+
@pytest.mark.parametrize("min_count", [0, 1])
227+
def test_resample_empty_sum_string(string_dtype_no_object, min_count):
228+
# https://github.com/pandas-dev/pandas/issues/60229
229+
dtype = string_dtype_no_object
230+
ser = Series(
231+
pd.NA,
232+
index=DatetimeIndex(
233+
[
234+
"2000-01-01 00:00:00",
235+
"2000-01-01 00:00:10",
236+
"2000-01-01 00:00:20",
237+
"2000-01-01 00:00:30",
238+
]
239+
),
240+
dtype=dtype,
241+
)
242+
rs = ser.resample("20s")
243+
result = rs.sum(min_count=min_count)
244+
245+
value = "" if min_count == 0 else pd.NA
246+
index = date_range(start="2000-01-01", freq="20s", periods=2, unit="s")
247+
expected = Series(value, index=index, dtype=dtype)
248+
tm.assert_series_equal(result, expected)
249+
250+
226251
@pytest.mark.parametrize(
227252
"freq",
228253
[

pandas/tests/resample/test_resampler_grouper.py

+20
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,26 @@ def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
494494
tm.assert_frame_equal(result, expected)
495495

496496

497+
@pytest.mark.parametrize("min_count", [0, 1])
498+
def test_groupby_resample_empty_sum_string(
499+
string_dtype_no_object, test_frame, min_count
500+
):
501+
# https://github.com/pandas-dev/pandas/issues/60229
502+
dtype = string_dtype_no_object
503+
test_frame = test_frame.assign(B=pd.array([pd.NA] * len(test_frame), dtype=dtype))
504+
gbrs = test_frame.groupby("A").resample("40s")
505+
result = gbrs.sum(min_count=min_count)
506+
507+
index = pd.MultiIndex(
508+
levels=[[1, 2, 3], [pd.to_datetime("2000-01-01", unit="ns")]],
509+
codes=[[0, 1, 2], [0, 0, 0]],
510+
names=["A", None],
511+
)
512+
value = "" if min_count == 0 else pd.NA
513+
expected = DataFrame({"B": value}, index=index, dtype=dtype)
514+
tm.assert_frame_equal(result, expected)
515+
516+
497517
def test_groupby_resample_with_list_of_keys():
498518
# GH 47362
499519
df = DataFrame(

0 commit comments

Comments
 (0)