
Commit e3b0b5d

Fix bug in ~Series.describe where the median percentile is included when the percentiles argument is passed

1 parent: c0371ce

4 files changed (+40, -5 lines)


doc/source/whatsnew/v3.0.0.rst (+1)

@@ -838,6 +838,7 @@ Other
 - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
 - Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
 - Bug in :meth:`MultiIndex.fillna` error message was referring to ``isna`` instead of ``fillna`` (:issue:`60974`)
+- Bug in :meth:`Series.describe` where the median percentile is included when the ``percentiles`` argument is passed (:issue:`60550`)
 - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
 - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
 - Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`)
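
The whatsnew entry above corresponds to a user-visible behaviour change in describe. A minimal sketch of the difference, based on the removed median-forcing code and the test added below (exact statistic values depend on the data and are omitted here):

import pandas as pd

s = pd.Series(range(10))

# Before this commit, _refine_percentiles forced 0.5 into the requested
# percentiles, so a 50% row appeared even when it was not asked for.
# With the fix, only the requested percentiles are reported.
s.describe(percentiles=[0.2])  # index: count, mean, std, min, 20%, max

# An empty list now yields a summary with no percentile rows at all.
s.describe(percentiles=[])     # index: count, mean, std, min, max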

pandas/core/methods/describe.py (+6, -5)

@@ -229,10 +229,15 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
 
     formatted_percentiles = format_percentiles(percentiles)
 
+    if len(percentiles) == 0:
+        quantiles = []
+    else:
+        quantiles = series.quantile(percentiles).tolist()
+
     stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
     d = (
         [series.count(), series.mean(), series.std(), series.min()]
-        + series.quantile(percentiles).tolist()
+        + quantiles
         + [series.max()]
     )
     # GH#48340 - always return float on non-complex numeric data
@@ -354,10 +359,6 @@ def _refine_percentiles(
     # get them all to be in [0, 1]
     validate_percentile(percentiles)
 
-    # median should always be included
-    if 0.5 not in percentiles:
-        percentiles.append(0.5)
-
     percentiles = np.asarray(percentiles)
 
     # sort and check for duplicates
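
For readers who want to trace the new control flow without the surrounding pandas internals, here is a simplified, standalone sketch of the patched describe_numeric_1d logic. The percentile formatting is replaced by a plain f-string stand-in, so this is an approximation of the diff above, not the actual pandas implementation:

import pandas as pd

def describe_numeric_1d_sketch(series: pd.Series, percentiles: list) -> pd.Series:
    # Simplified stand-in for pandas.io.formats.format.format_percentiles.
    formatted_percentiles = [f"{p:.0%}" for p in percentiles]

    # The guard added in this commit: skip Series.quantile entirely when
    # no percentiles were requested.
    if len(percentiles) == 0:
        quantiles = []
    else:
        quantiles = series.quantile(percentiles).tolist()

    stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
    d = (
        [series.count(), series.mean(), series.std(), series.min()]
        + quantiles
        + [series.max()]
    )
    return pd.Series(d, index=stat_index, name=series.name)

s = pd.Series(range(10), name="a")
describe_numeric_1d_sketch(s, [])     # count, mean, std, min, max
describe_numeric_1d_sketch(s, [0.2])  # count, mean, std, min, 20%, max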

pandas/io/formats/format.py (+3)

@@ -1565,6 +1565,9 @@ def format_percentiles(
     >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999])
     ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%']
     """
+    if len(percentiles) == 0:
+        return []
+
     percentiles = np.asarray(percentiles)
 
     # It checks for np.nan as well
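
A quick way to see the effect of the new early return. Note that format_percentiles is a private helper, so the import path below is an internal detail that may move between pandas versions:

from pandas.io.formats.format import format_percentiles

format_percentiles([0.25, 0.5, 0.75])  # ['25%', '50%', '75%']
# With the guard added above, an empty input returns immediately instead of
# reaching the array-based formatting further down in the function:
format_percentiles([])  # []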

pandas/tests/frame/methods/test_describe.py (+30)

@@ -413,3 +413,33 @@ def test_describe_exclude_pa_dtype(self):
             dtype=pd.ArrowDtype(pa.float64()),
         )
         tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("percentiles", [None, [], [0.2]])
+    def test_refine_percentiles(self, percentiles):
+        """
+        Test that the percentiles are returned correctly depending on the `percentiles`
+        argument.
+        - The default behavior is to return the 25th, 50th, and 75th percentiles
+        - If `percentiles` is an empty list, no percentiles are returned
+        - If `percentiles` is a non-empty list, only those percentiles are returned
+        """
+        # GH#60550
+        df = DataFrame({"a": np.arange(0, 10, 1)})
+
+        result = df.describe(percentiles=percentiles)
+
+        if percentiles is None:
+            percentiles = [0.25, 0.5, 0.75]
+
+        expected = Series(
+            {
+                "count": len(df.a),
+                "mean": df.a.mean(),
+                "std": df.a.std(),
+                "min": df.a.min(),
+                **{f"{p:.0%}": df.a.quantile(p) for p in percentiles},
+                "max": df.a.max(),
+            },
+        ).to_frame(name="a")
+
+        tm.assert_frame_equal(result, expected)
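
To exercise just this case in a pandas development checkout, something like python -m pytest pandas/tests/frame/methods/test_describe.py -k test_refine_percentiles should run only the three parametrized variants of the new test.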
