Use custom rounding rules for summary (#294)

aloctavodia · web-flow · commit 1fadd98a3944 · 2026-02-03T10:07:25.000+02:00
* Use custom rounding rules for summary

* use scientific

* fix tests

* add/fix tests

* fix test for pandas 3.0

* clarify round_to

* clarify round_to

* improve format docstring and wording

* fix indentation
diff --git a/src/arviz_stats/base/stats_utils.py b/src/arviz_stats/base/stats_utils.py
@@ -378,3 +378,10 @@ def round_num(value, precision):
         return round(value, sig_digits - int(np.floor(np.log10(abs(value)))) - 1)
 
     return value
+
+
+def get_decimal_places_from_se(se_val):
+    """Get number of decimal places from standard error value."""
+    two_se = 2 * se_val
+    se_magnitude = np.floor(np.log10(np.abs(two_se))) if two_se != 0 else 0
+    return -int(se_magnitude)
diff --git a/src/arviz_stats/summary.py b/src/arviz_stats/summary.py
@@ -1,10 +1,12 @@
 """Summaries for various statistics and diagnostics."""
 
 import numpy as np
+import pandas as pd
 import xarray as xr
 from arviz_base import dataset_to_dataframe, extract, rcParams, references_to_dataset
 from xarray_einstats import stats
 
+from arviz_stats.base.stats_utils import get_decimal_places_from_se, round_num
 from arviz_stats.utils import _apply_multi_input_function
 from arviz_stats.validate import validate_dims
 
@@ -22,7 +24,7 @@ def summary(
     fmt="wide",
     ci_prob=None,
     ci_kind=None,
-    round_to=2,
+    round_to="auto",
     skipna=False,
 ):
     """
@@ -60,8 +62,22 @@ def summary(
     ci_kind : {"hdi", "eti"}, optional
         Type of credible interval. Defaults to ``rcParams["stats.ci_kind"]``.
         If `kind` is stats_median or all_median, `ci_kind` is forced to "eti".
-    round_to : int
-        Number of decimals used to round results. Defaults to 2. Use "none" to return raw numbers.
+    round_to : int or {"auto", "none"}, optional
+        Rounding specification. Defaults to "auto". If integer, number of decimal places to
+        round to. If "none", no rounding is applied. If "auto", and `fmt` is "xarray" defaults to
+        ``rcParams["stats.round_to"]``. If "auto" and `fmt` is in  {"wide", "long"}, applies the
+        following rounding rules:
+
+        * ESS values (ess_bulk, ess_tail, ess_mean, ess_median, min_ss) are rounded down to int
+        * R-hat always shows 2 digits after the decimal
+        * If a column *stat* and *mcse_stat* are both present then the mcse is shown to 2
+          significant figures, and *stat* is shown with precision based on 2*mcse.
+        * All other floating point numbers are shown following ``rcParams["stats.round_to"]``.
+        * For all floating point numbers except R-hat, trailing zeros are removed and values are
+          converted to string for consistent display.
+
+        Note: "auto" is intended for display purposes, using it is not recommended when the output
+        will be used for further numerical computations.
     skipna: bool
         If true ignores nan values when computing the summary statistics. Defaults to false.
 
@@ -127,6 +143,10 @@ def summary(
         ci_kind = rcParams["stats.ci_kind"]
     if sample_dims is None:
         sample_dims = rcParams["data.sample_dims"]
+    if round_to == "auto":
+        round_val = rcParams["stats.round_to"]
+    else:
+        round_val = round_to
 
     ci_perc = int(ci_prob * 100)
 
@@ -224,12 +244,101 @@ def summary(
         summary_result = summary_result.to_dataframe().reset_index().set_index("summary")
         summary_result.index = list(summary_result.index)
 
-    if (round_to is not None) and (round_to not in ("None", "none")):
-        summary_result = summary_result.round(round_to)
+    if fmt == "xarray":
+        if (round_to is not None) and (round_to not in ("None", "none")):
+            summary_result = xr.apply_ufunc(round_num, summary_result, round_val, vectorize=True)
+    else:
+        if round_to == "auto":
+            summary_result = _round_summary(summary_result, round_val)
+        else:
+            if (round_to is not None) and (round_to not in ("None", "none")):
+                summary_result = summary_result.map(lambda x: round_num(x, round_val))
 
     return summary_result
 
 
+def _round_summary(summary_result, round_val):
+    """Apply custom rounding rules to summary statistics.
+
+    Parameters
+    ----------
+    summary_result : pandas.DataFrame
+        The summary result to round
+    round_val : int or str
+        Number of decimals or significant figures to round to.
+
+    Returns
+    -------
+    pandas.DataFrame
+    """
+    result = summary_result.copy()
+    columns = result.columns
+    rounded_columns = set()
+    use_scientific = {}
+
+    # Rule 1: ESS columns and min_ss are rounded down to int
+    ess_cols = [col for col in columns if col.startswith("ess_") or col == "min_ss"]
+    for col in ess_cols:
+        result[col] = result[col].apply(lambda x: pd.NA if not np.isfinite(x) else np.floor(x))
+        result[col] = result[col].astype("Int64")
+        rounded_columns.add(col)
+
+    # Rule 2: R-hat always shows 2 digits after decimal
+    if "r_hat" in columns:
+        result["r_hat"] = result["r_hat"].round(2)
+        rounded_columns.add("r_hat")
+
+    # Rule 3: Handle stat/mcse pairs
+    stat_se_pairs = []
+    for col in columns:
+        if col.startswith("mcse_"):
+            stat_col = col[5:]
+            if stat_col in columns:
+                stat_se_pairs.append((stat_col, col))
+
+    for stat_col, se_col in stat_se_pairs:
+        result[se_col] = result[se_col].apply(lambda x: round_num(x, round_val))
+
+        for idx in result.index:
+            stat_val = result.loc[idx, stat_col]
+            se_val = result.loc[idx, se_col]
+            if not np.isfinite(se_val):
+                continue
+            decimal_places = get_decimal_places_from_se(se_val)
+            if decimal_places < 0:
+                use_scientific[(idx, stat_col)] = True
+            else:
+                result.loc[idx, stat_col] = round_num(stat_val, decimal_places)
+
+        rounded_columns.add(stat_col)
+        rounded_columns.add(se_col)
+
+    # Rule 4: Other floating point numbers to round_val significant figures
+    for col in columns:
+        if col not in rounded_columns:
+            if result[col].dtype.kind == "f":
+                result[col] = result[col].apply(lambda x: round_num(x, round_val))
+
+    # Rule 5: Format
+    for col in columns:
+        if result[col].dtype.kind == "f":
+            if col == "r_hat":
+                result[col] = result[col].apply(lambda x: f"{x:.2f}" if np.isfinite(x) else x)
+            else:
+                formatted_values = []
+                for idx, val in zip(result.index, result[col]):
+                    if not np.isfinite(val):
+                        formatted_values.append(val)
+                    elif use_scientific.get((idx, col), False):
+                        formatted_values.append(f"{val:.0e}")
+                    else:
+                        formatted_values.append(f"{val:g}")
+
+                result[col] = formatted_values
+
+    return result
+
+
 def ci_in_rope(
     data,
     rope,
diff --git a/tests/test_summary.py b/tests/test_summary.py
@@ -10,9 +10,11 @@
 from .helpers import importorskip
 
 azb = importorskip("arviz_base")
+pd = importorskip("pandas")
 xr = importorskip("xarray")
 
 from arviz_stats import ci_in_rope, eti, hdi, mode, qds, summary
+from arviz_stats.summary import _round_summary
 
 
 def test_summary_ndarray():
@@ -396,7 +398,7 @@ def test_mode_single_value_array():
 def test_summary_zero_variance():
     array = np.ones((4, 100, 2))
     summary_df = summary(array)
-    assert summary_df["sd"].iloc[0] == 0.0
+    assert summary_df["sd"].iloc[0] == "0"
 
 
 @pytest.mark.parametrize("prob", [0.5, 0.89, 0.95])
@@ -423,3 +425,36 @@ def test_summary_fmt(datatree, fmt):
     else:
         assert isinstance(result, xr.Dataset)
         assert "summary" in result.dims
+
+
+def test_round_summary():
+    labels = ["a", "bb", "ccc", "d", "e"]
+    data = {
+        "mean": [111.11, 1.2345e-6, 5.4321e8, np.inf, np.nan],
+        "mcse_mean": [0.0012345, 5.432e-5, 2.1234e5, np.inf, np.nan],
+        "sd": [0.0012345, 5.432e-5, 2.1234e5, np.inf, np.nan],
+        "r_hat": [1.009, 1.011, 0.99, np.inf, np.nan],
+        "ess_bulk": [312.45, 23.32, 1011.98, np.inf, np.nan],
+        "ess_tail": [9.2345, 876.321, 999.99, np.inf, np.nan],
+    }
+    df = pd.DataFrame(data, index=labels)
+    result = _round_summary(df, round_val=2)
+
+    assert result["ess_bulk"].dtype == "Int64"
+    assert result["ess_tail"].dtype == "Int64"
+    expected_ess_bulk = pd.Series(
+        [312, 23, 1011, pd.NA, pd.NA], index=labels, dtype="Int64", name="ess_bulk"
+    )
+    pd.testing.assert_series_equal(result["ess_bulk"], expected_ess_bulk)
+    expected_ess_tail = pd.Series(
+        [9, 876, 999, pd.NA, pd.NA], index=labels, dtype="Int64", name="ess_tail"
+    )
+    pd.testing.assert_series_equal(result["ess_tail"], expected_ess_tail)
+    expected_r_hat = pd.Series(["1.01", "1.01", "0.99", np.inf, np.nan], index=labels, name="r_hat")
+    pd.testing.assert_series_equal(result["r_hat"], expected_r_hat, check_dtype=False)
+    expected_mcse = pd.Series(["0", "0", "212340", np.inf, np.nan], index=labels, name="mcse_mean")
+    pd.testing.assert_series_equal(result["mcse_mean"], expected_mcse, check_dtype=False)
+    expected_mean = pd.Series(["111", "0", "5e+08"], index=labels[:3], name="mean")
+    pd.testing.assert_series_equal(result.loc[labels[:3], "mean"], expected_mean, check_dtype=False)
+    expected_sd = pd.Series(["0", "0", "212340"], index=labels[:3], name="sd")
+    pd.testing.assert_series_equal(result.loc[labels[:3], "sd"], expected_sd, check_dtype=False)