Fix bug in Series.describe where the median is included any time the percentiles argument is not None (#61158)

MartinBraquet · mroeschke · web-flow · commit dc8401afea46 · 2025-03-21T14:13:04.000-07:00
* Fix bug in `~Series.describe` where median percentile is included when the `percentiles` argument is passed

* Refine docstrings

* Update test_describe in groupby

* Minor fixes

* Update doc/source/whatsnew/v3.0.0.rst

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

* Refactor expected df to avoid transpose

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -838,6 +838,7 @@ Other
 - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
 - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
 - Bug in :meth:`MultiIndex.fillna` error message was referring to ``isna`` instead of ``fillna`` (:issue:`60974`)
+- Bug in :meth:`Series.describe` where median percentile was always included when the ``percentiles`` argument was passed (:issue:`60550`).
 - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
 - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
 - Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -10818,9 +10818,8 @@ def describe(
         ----------
         percentiles : list-like of numbers, optional
             The percentiles to include in the output. All should
-            fall between 0 and 1. The default is
-            ``[.25, .5, .75]``, which returns the 25th, 50th, and
-            75th percentiles.
+            fall between 0 and 1. The default, ``None``, will automatically
+            return the 25th, 50th, and 75th percentiles.
         include : 'all', list-like of dtypes or None (default), optional
             A white list of data types to include in the result. Ignored
             for ``Series``. Here are the options:
diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py
@@ -229,10 +229,15 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
 
     formatted_percentiles = format_percentiles(percentiles)
 
+    if len(percentiles) == 0:
+        quantiles = []
+    else:
+        quantiles = series.quantile(percentiles).tolist()
+
     stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
     d = (
         [series.count(), series.mean(), series.std(), series.min()]
-        + series.quantile(percentiles).tolist()
+        + quantiles
         + [series.max()]
     )
     # GH#48340 - always return float on non-complex numeric data
@@ -354,10 +359,6 @@ def _refine_percentiles(
     # get them all to be in [0, 1]
     validate_percentile(percentiles)
 
-    # median should always be included
-    if 0.5 not in percentiles:
-        percentiles.append(0.5)
-
     percentiles = np.asarray(percentiles)
 
     # sort and check for duplicates
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -1565,6 +1565,9 @@ def format_percentiles(
     >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
     ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']
     """
+    if len(percentiles) == 0:
+        return []
+
     percentiles = np.asarray(percentiles)
 
     # It checks for np.nan as well
diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py
@@ -413,3 +413,44 @@ def test_describe_exclude_pa_dtype(self):
             dtype=pd.ArrowDtype(pa.float64()),
         )
         tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("percentiles", [None, [], [0.2]])
+    def test_refine_percentiles(self, percentiles):
+        """
+        Test that the percentiles are returned correctly depending on the `percentiles`
+        argument.
+        - The default behavior is to return the 25th, 50th, and 75th percentiles
+        - If `percentiles` is an empty list, no percentiles are returned
+        - If `percentiles` is a non-empty list, only those percentiles are returned
+        """
+        # GH#60550
+        df = DataFrame({"a": np.arange(0, 10, 1)})
+
+        result = df.describe(percentiles=percentiles)
+
+        if percentiles is None:
+            percentiles = [0.25, 0.5, 0.75]
+
+        expected = DataFrame(
+            [
+                len(df.a),
+                df.a.mean(),
+                df.a.std(),
+                df.a.min(),
+                *[df.a.quantile(p) for p in percentiles],
+                df.a.max(),
+            ],
+            index=pd.Index(
+                [
+                    "count",
+                    "mean",
+                    "std",
+                    "min",
+                    *[f"{p:.0%}" for p in percentiles],
+                    "max",
+                ]
+            ),
+            columns=["a"],
+        )
+
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py
@@ -202,15 +202,15 @@ def test_describe_duplicate_columns():
     gb = df.groupby(df[1])
     result = gb.describe(percentiles=[])
 
-    columns = ["count", "mean", "std", "min", "50%", "max"]
+    columns = ["count", "mean", "std", "min", "max"]
     frames = [
-        DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns)
+        DataFrame([[1.0, val, np.nan, val, val]], index=[1], columns=columns)
         for val in (0.0, 2.0, 3.0)
     ]
     expected = pd.concat(frames, axis=1)
     expected.columns = MultiIndex(
         levels=[[0, 2], columns],
-        codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
+        codes=[5 * [0] + 5 * [1] + 5 * [0], 3 * list(range(5))],
     )
     expected.index.names = [1]
     tm.assert_frame_equal(result, expected)