Skip to content

Commit 1e899af

Browse files
authored
BUG: .mode(dropna=False) doesn't work with nullable integers (#61132)
* Fix dropna bug in mode * Fix test cases * Fix incompatible data type
1 parent 5d9cf43 commit 1e899af

File tree

10 files changed

+71
-33
lines changed

10 files changed

+71
-33
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -838,6 +838,7 @@ Other
838838
- Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
839839
- Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
840840
- Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`)
841+
- Bug in :meth:`Series.mode` where an exception was raised when taking the mode with ``dropna=False`` on nullable dtypes when the series contained no null values. (:issue:`58926`)
841842
- Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
842843
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
843844
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` throwing ``ValueError`` when ``regex=True`` and all NA values. (:issue:`60688`)

pandas/_libs/hashtable_func_helper.pxi.in

+1-1
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,7 @@ def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
430430

431431
if na_counter > 0:
432432
res_mask = np.zeros(j+1, dtype=np.bool_)
433-
res_mask[j] = True
433+
res_mask[j] = (na_counter == max_count)
434434
return modes[:j + 1], res_mask
435435

436436

pandas/core/algorithms.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -987,7 +987,7 @@ def duplicated(
987987

988988
def mode(
989989
values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
990-
) -> ArrayLike:
990+
) -> tuple[np.ndarray, npt.NDArray[np.bool_]] | ExtensionArray:
991991
"""
992992
Returns the mode(s) of an array.
993993
@@ -1000,7 +1000,7 @@ def mode(
10001000
10011001
Returns
10021002
-------
1003-
np.ndarray or ExtensionArray
1003+
Union[Tuple[np.ndarray, npt.NDArray[np.bool_]], ExtensionArray]
10041004
"""
10051005
values = _ensure_arraylike(values, func_name="mode")
10061006
original = values
@@ -1014,8 +1014,10 @@ def mode(
10141014
values = _ensure_data(values)
10151015

10161016
npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask)
1017-
if res_mask is not None:
1018-
return npresult, res_mask # type: ignore[return-value]
1017+
if res_mask is None:
1018+
res_mask = np.zeros(npresult.shape, dtype=np.bool_)
1019+
else:
1020+
return npresult, res_mask
10191021

10201022
try:
10211023
npresult = safe_sort(npresult)
@@ -1026,7 +1028,7 @@ def mode(
10261028
)
10271029

10281030
result = _reconstruct_data(npresult, original.dtype, original)
1029-
return result
1031+
return result, res_mask
10301032

10311033

10321034
def rank(

pandas/core/arrays/base.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -2511,8 +2511,9 @@ def _mode(self, dropna: bool = True) -> Self:
25112511
Sorted, if possible.
25122512
"""
25132513
# error: Incompatible return value type (got "Union[ExtensionArray,
2514-
# ndarray[Any, Any]]", expected "Self")
2515-
return mode(self, dropna=dropna) # type: ignore[return-value]
2514+
# Tuple[np.ndarray, npt.NDArray[np.bool_]]", expected "Self")
2515+
result, _ = mode(self, dropna=dropna)
2516+
return result # type: ignore[return-value]
25162517

25172518
def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
25182519
if any(

pandas/core/arrays/categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2477,7 +2477,7 @@ def _mode(self, dropna: bool = True) -> Categorical:
24772477
if dropna:
24782478
mask = self.isna()
24792479

2480-
res_codes = algorithms.mode(codes, mask=mask)
2480+
res_codes, _ = algorithms.mode(codes, mask=mask)
24812481
res_codes = cast(np.ndarray, res_codes)
24822482
assert res_codes.dtype == codes.dtype
24832483
res = self._from_backing_data(res_codes)

pandas/core/arrays/datetimelike.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1635,7 +1635,7 @@ def _mode(self, dropna: bool = True):
16351635
if dropna:
16361636
mask = self.isna()
16371637

1638-
i8modes = algorithms.mode(self.view("i8"), mask=mask)
1638+
i8modes, _ = algorithms.mode(self.view("i8"), mask=mask)
16391639
npmodes = i8modes.view(self._ndarray.dtype)
16401640
npmodes = cast(np.ndarray, npmodes)
16411641
return self._from_backing_data(npmodes)

pandas/core/arrays/masked.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -1099,12 +1099,8 @@ def value_counts(self, dropna: bool = True) -> Series:
10991099
return Series(arr, index=index, name="count", copy=False)
11001100

11011101
def _mode(self, dropna: bool = True) -> Self:
1102-
if dropna:
1103-
result = mode(self._data, dropna=dropna, mask=self._mask)
1104-
res_mask = np.zeros(result.shape, dtype=np.bool_)
1105-
else:
1106-
result, res_mask = mode(self._data, dropna=dropna, mask=self._mask)
1107-
result = type(self)(result, res_mask) # type: ignore[arg-type]
1102+
result, res_mask = mode(self._data, dropna=dropna, mask=self._mask)
1103+
result = type(self)(result, res_mask)
11081104
return result[result.argsort()]
11091105

11101106
@doc(ExtensionArray.equals)

pandas/core/series.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2071,7 +2071,7 @@ def mode(self, dropna: bool = True) -> Series:
20712071
# TODO: Add option for bins like value_counts()
20722072
values = self._values
20732073
if isinstance(values, np.ndarray):
2074-
res_values = algorithms.mode(values, dropna=dropna)
2074+
res_values, _ = algorithms.mode(values, dropna=dropna)
20752075
else:
20762076
res_values = values._mode(dropna=dropna)
20772077

pandas/tests/series/test_reductions.py

+23
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,29 @@ def test_mode_nullable_dtype(any_numeric_ea_dtype):
5151
tm.assert_series_equal(result, expected)
5252

5353

54+
def test_mode_nullable_dtype_edge_case(any_numeric_ea_dtype):
55+
# GH#58926
56+
ser = Series([1, 2, 3, 1], dtype=any_numeric_ea_dtype)
57+
result = ser.mode(dropna=False)
58+
expected = Series([1], dtype=any_numeric_ea_dtype)
59+
tm.assert_series_equal(result, expected)
60+
61+
ser2 = Series([1, 1, 2, 3, pd.NA], dtype=any_numeric_ea_dtype)
62+
result = ser2.mode(dropna=False)
63+
expected = Series([1], dtype=any_numeric_ea_dtype)
64+
tm.assert_series_equal(result, expected)
65+
66+
ser3 = Series([1, pd.NA, pd.NA], dtype=any_numeric_ea_dtype)
67+
result = ser3.mode(dropna=False)
68+
expected = Series([pd.NA], dtype=any_numeric_ea_dtype)
69+
tm.assert_series_equal(result, expected)
70+
71+
ser4 = Series([1, 1, pd.NA, pd.NA], dtype=any_numeric_ea_dtype)
72+
result = ser4.mode(dropna=False)
73+
expected = Series([1, pd.NA], dtype=any_numeric_ea_dtype)
74+
tm.assert_series_equal(result, expected)
75+
76+
5477
def test_mode_infer_string():
5578
# GH#56183
5679
pytest.importorskip("pyarrow")

pandas/tests/test_algos.py

+31-16
Original file line numberDiff line numberDiff line change
@@ -1831,7 +1831,8 @@ def test_pct_max_many_rows(self):
18311831
class TestMode:
18321832
def test_no_mode(self):
18331833
exp = Series([], dtype=np.float64, index=Index([], dtype=int))
1834-
tm.assert_numpy_array_equal(algos.mode(np.array([])), exp.values)
1834+
result, _ = algos.mode(np.array([]))
1835+
tm.assert_numpy_array_equal(result, exp.values)
18351836

18361837
def test_mode_single(self, any_real_numpy_dtype):
18371838
# GH 15714
@@ -1843,20 +1844,24 @@ def test_mode_single(self, any_real_numpy_dtype):
18431844

18441845
ser = Series(data_single, dtype=any_real_numpy_dtype)
18451846
exp = Series(exp_single, dtype=any_real_numpy_dtype)
1846-
tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
1847+
result, _ = algos.mode(ser.values)
1848+
tm.assert_numpy_array_equal(result, exp.values)
18471849
tm.assert_series_equal(ser.mode(), exp)
18481850

18491851
ser = Series(data_multi, dtype=any_real_numpy_dtype)
18501852
exp = Series(exp_multi, dtype=any_real_numpy_dtype)
1851-
tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
1853+
result, _ = algos.mode(ser.values)
1854+
tm.assert_numpy_array_equal(result, exp.values)
18521855
tm.assert_series_equal(ser.mode(), exp)
18531856

18541857
def test_mode_obj_int(self):
18551858
exp = Series([1], dtype=int)
1856-
tm.assert_numpy_array_equal(algos.mode(exp.values), exp.values)
1859+
result, _ = algos.mode(exp.values)
1860+
tm.assert_numpy_array_equal(result, exp.values)
18571861

18581862
exp = Series(["a", "b", "c"], dtype=object)
1859-
tm.assert_numpy_array_equal(algos.mode(exp.values), exp.values)
1863+
result, _ = algos.mode(exp.values)
1864+
tm.assert_numpy_array_equal(result, exp.values)
18601865

18611866
def test_number_mode(self, any_real_numpy_dtype):
18621867
exp_single = [1]
@@ -1867,12 +1872,14 @@ def test_number_mode(self, any_real_numpy_dtype):
18671872

18681873
ser = Series(data_single, dtype=any_real_numpy_dtype)
18691874
exp = Series(exp_single, dtype=any_real_numpy_dtype)
1870-
tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
1875+
result, _ = algos.mode(ser.values)
1876+
tm.assert_numpy_array_equal(result, exp.values)
18711877
tm.assert_series_equal(ser.mode(), exp)
18721878

18731879
ser = Series(data_multi, dtype=any_real_numpy_dtype)
18741880
exp = Series(exp_multi, dtype=any_real_numpy_dtype)
1875-
tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
1881+
result, _ = algos.mode(ser.values)
1882+
tm.assert_numpy_array_equal(result, exp.values)
18761883
tm.assert_series_equal(ser.mode(), exp)
18771884

18781885
def test_strobj_mode(self):
@@ -1881,7 +1888,8 @@ def test_strobj_mode(self):
18811888

18821889
ser = Series(data, dtype="c")
18831890
exp = Series(exp, dtype="c")
1884-
tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
1891+
result, _ = algos.mode(ser.values)
1892+
tm.assert_numpy_array_equal(result, exp.values)
18851893
tm.assert_series_equal(ser.mode(), exp)
18861894

18871895
@pytest.mark.parametrize("dt", [str, object])
@@ -1891,10 +1899,11 @@ def test_strobj_multi_char(self, dt, using_infer_string):
18911899

18921900
ser = Series(data, dtype=dt)
18931901
exp = Series(exp, dtype=dt)
1902+
result, _ = algos.mode(ser.values)
18941903
if using_infer_string and dt is str:
1895-
tm.assert_extension_array_equal(algos.mode(ser.values), exp.values)
1904+
tm.assert_extension_array_equal(result, exp.values)
18961905
else:
1897-
tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
1906+
tm.assert_numpy_array_equal(result, exp.values)
18981907
tm.assert_series_equal(ser.mode(), exp)
18991908

19001909
def test_datelike_mode(self):
@@ -1928,18 +1937,21 @@ def test_timedelta_mode(self):
19281937
def test_mixed_dtype(self):
19291938
exp = Series(["foo"], dtype=object)
19301939
ser = Series([1, "foo", "foo"])
1931-
tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
1940+
result, _ = algos.mode(ser.values)
1941+
tm.assert_numpy_array_equal(result, exp.values)
19321942
tm.assert_series_equal(ser.mode(), exp)
19331943

19341944
def test_uint64_overflow(self):
19351945
exp = Series([2**63], dtype=np.uint64)
19361946
ser = Series([1, 2**63, 2**63], dtype=np.uint64)
1937-
tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
1947+
result, _ = algos.mode(ser.values)
1948+
tm.assert_numpy_array_equal(result, exp.values)
19381949
tm.assert_series_equal(ser.mode(), exp)
19391950

19401951
exp = Series([1, 2**63], dtype=np.uint64)
19411952
ser = Series([1, 2**63], dtype=np.uint64)
1942-
tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
1953+
result, _ = algos.mode(ser.values)
1954+
tm.assert_numpy_array_equal(result, exp.values)
19431955
tm.assert_series_equal(ser.mode(), exp)
19441956

19451957
def test_categorical(self):
@@ -1961,15 +1973,18 @@ def test_categorical(self):
19611973
def test_index(self):
19621974
idx = Index([1, 2, 3])
19631975
exp = Series([1, 2, 3], dtype=np.int64)
1964-
tm.assert_numpy_array_equal(algos.mode(idx), exp.values)
1976+
result, _ = algos.mode(idx)
1977+
tm.assert_numpy_array_equal(result, exp.values)
19651978

19661979
idx = Index([1, "a", "a"])
19671980
exp = Series(["a"], dtype=object)
1968-
tm.assert_numpy_array_equal(algos.mode(idx), exp.values)
1981+
result, _ = algos.mode(idx)
1982+
tm.assert_numpy_array_equal(result, exp.values)
19691983

19701984
idx = Index([1, 1, 2, 3, 3])
19711985
exp = Series([1, 3], dtype=np.int64)
1972-
tm.assert_numpy_array_equal(algos.mode(idx), exp.values)
1986+
result, _ = algos.mode(idx)
1987+
tm.assert_numpy_array_equal(result, exp.values)
19731988

19741989
idx = Index(
19751990
["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],

0 commit comments

Comments
 (0)