Skip to content

Commit dc8401a

Browse files
Fix bug in Series.describe where the median is included any time the percentiles argument is not None (#61158)
* Fix bug in `~Series.describe` where median percentile is included when the `percentiles` argument is passed
* Refine docstrings
* Update test_describe in groupby
* Minor fixes
* Update doc/source/whatsnew/v3.0.0.rst

Co-authored-by: Matthew Roeschke <[email protected]>

* Refactor expected df to avoid transpose

---------

Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 9bd352d commit dc8401a

File tree

6 files changed

+56
-11
lines changed

6 files changed

+56
-11
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -838,6 +838,7 @@ Other
838838
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
839839
- Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
840840
- Bug in :meth:`MultiIndex.fillna` error message was referring to ``isna`` instead of ``fillna`` (:issue:`60974`)
841+
- Bug in :meth:`Series.describe` where the median (``50%``) was always included in the output when the ``percentiles`` argument was passed, even if ``0.5`` was not among the requested percentiles (:issue:`60550`).
841842
- Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
842843
- Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
843844
- Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`)

pandas/core/generic.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -10818,9 +10818,8 @@ def describe(
1081810818
----------
1081910819
percentiles : list-like of numbers, optional
1082010820
The percentiles to include in the output. All should
10821-
fall between 0 and 1. The default is
10822-
``[.25, .5, .75]``, which returns the 25th, 50th, and
10823-
75th percentiles.
10821+
fall between 0 and 1. The default, ``None``, will automatically
10822+
return the 25th, 50th, and 75th percentiles.
1082410823
include : 'all', list-like of dtypes or None (default), optional
1082510824
A white list of data types to include in the result. Ignored
1082610825
for ``Series``. Here are the options:

pandas/core/methods/describe.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -229,10 +229,15 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
229229

230230
formatted_percentiles = format_percentiles(percentiles)
231231

232+
if len(percentiles) == 0:
233+
quantiles = []
234+
else:
235+
quantiles = series.quantile(percentiles).tolist()
236+
232237
stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
233238
d = (
234239
[series.count(), series.mean(), series.std(), series.min()]
235-
+ series.quantile(percentiles).tolist()
240+
+ quantiles
236241
+ [series.max()]
237242
)
238243
# GH#48340 - always return float on non-complex numeric data
@@ -354,10 +359,6 @@ def _refine_percentiles(
354359
# get them all to be in [0, 1]
355360
validate_percentile(percentiles)
356361

357-
# median should always be included
358-
if 0.5 not in percentiles:
359-
percentiles.append(0.5)
360-
361362
percentiles = np.asarray(percentiles)
362363

363364
# sort and check for duplicates

pandas/io/formats/format.py

+3
Original file line numberDiff line numberDiff line change
@@ -1565,6 +1565,9 @@ def format_percentiles(
15651565
>>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
15661566
['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']
15671567
"""
1568+
if len(percentiles) == 0:
1569+
return []
1570+
15681571
percentiles = np.asarray(percentiles)
15691572

15701573
# It checks for np.nan as well

pandas/tests/frame/methods/test_describe.py

+41
Original file line numberDiff line numberDiff line change
@@ -413,3 +413,44 @@ def test_describe_exclude_pa_dtype(self):
413413
dtype=pd.ArrowDtype(pa.float64()),
414414
)
415415
tm.assert_frame_equal(result, expected)
416+
417+
@pytest.mark.parametrize("percentiles", [None, [], [0.2]])
418+
def test_refine_percentiles(self, percentiles):
419+
"""
420+
Test that the percentiles are returned correctly depending on the `percentiles`
421+
argument.
422+
- The default behavior is to return the 25th, 50th, and 75th percentiles
423+
- If `percentiles` is an empty list, no percentiles are returned
424+
- If `percentiles` is a non-empty list, only those percentiles are returned
425+
"""
426+
# GH#60550
427+
df = DataFrame({"a": np.arange(0, 10, 1)})
428+
429+
result = df.describe(percentiles=percentiles)
430+
431+
if percentiles is None:
432+
percentiles = [0.25, 0.5, 0.75]
433+
434+
expected = DataFrame(
435+
[
436+
len(df.a),
437+
df.a.mean(),
438+
df.a.std(),
439+
df.a.min(),
440+
*[df.a.quantile(p) for p in percentiles],
441+
df.a.max(),
442+
],
443+
index=pd.Index(
444+
[
445+
"count",
446+
"mean",
447+
"std",
448+
"min",
449+
*[f"{p:.0%}" for p in percentiles],
450+
"max",
451+
]
452+
),
453+
columns=["a"],
454+
)
455+
456+
tm.assert_frame_equal(result, expected)

pandas/tests/groupby/methods/test_describe.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -202,15 +202,15 @@ def test_describe_duplicate_columns():
202202
gb = df.groupby(df[1])
203203
result = gb.describe(percentiles=[])
204204

205-
columns = ["count", "mean", "std", "min", "50%", "max"]
205+
columns = ["count", "mean", "std", "min", "max"]
206206
frames = [
207-
DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns)
207+
DataFrame([[1.0, val, np.nan, val, val]], index=[1], columns=columns)
208208
for val in (0.0, 2.0, 3.0)
209209
]
210210
expected = pd.concat(frames, axis=1)
211211
expected.columns = MultiIndex(
212212
levels=[[0, 2], columns],
213-
codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
213+
codes=[5 * [0] + 5 * [1] + 5 * [0], 3 * list(range(5))],
214214
)
215215
expected.index.names = [1]
216216
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)