Per-metric relativization for Summary with preference metrics (#5218)

ItsMrLin · meta-codesync[bot] · commit c40dfaed7750 · 2026-06-04T10:17:02.000-07:00
Summary: Pull Request resolved: #5218 ## Summary Supersedes D97533888. Addresses drfreund's review comment about deduping with `Data.relativize` by placing the per-metric scoping in the data layer rather than adding analysis-specific logic. When an experiment has both preference metrics (e.g., `pairwise_pref_query`) and standard tracking metrics, the Summary should relativize tracking metrics normally while skipping the preference metric (whose binary 0/1 labels have SQ mean near zero, causing "mean_control too small" crash). Previously D99037272 applied a blanket guard that skipped ALL relativization when any objective was a preference metric. This diff replaces that with per-metric scoping: non-preference metrics are relativized and %-formatted, while preference metrics are excluded from relativization and their columns are dropped from the summary table (binary 0/1 labels are not informative in a tabular summary). Labeling-only trial rows (with no tracking metric data) are also dropped. Changes: - `Data.relativize()` and `relativize_dataframe()` in `ax/core/data.py`: add `metric_names` parameter to scope which metrics get relativized. Unscoped metrics pass through with raw values. SEM zeroing for status quo rows is also scoped -- non-relativized metrics retain their original SEM. - `Experiment.to_df()` in `ax/core/experiment.py`: add `metric_names_to_relativize` parameter, threaded to `Data.relativize()`. Percentage formatting also scoped to only relativized metrics. - `Summary.compute()` in `ax/analysis/summary.py`: replace blanket `not has_preference_objective` guard with per-metric scoping. Builds a list of non-preference metric names, passes it as `metric_names_to_relativize`, then drops preference columns and labeling-only rows. Differential Revision: D99149923 fbshipit-source-id: dade9eb72af2d75b13bacf471eb0cd1df7bb95d7
diff --git a/ax/analysis/summary.py b/ax/analysis/summary.py
@@ -84,21 +84,44 @@ def compute(
         # (3) experiment data does not have has_step_column=True (data with a
         # progression doesn't support relativization due to time-series step
         # alignment complexities.)
-        # (4) no preference metric objectives -- preference metrics (e.g.,
-        # pairwise_pref_query) have binary 0/1 labels with SQ mean near zero,
-        # causing relativization to crash with "mean_control too small."
         data = experiment.lookup_data(trial_indices=self.trial_indices)
-        has_preference_objective = experiment.optimization_config is not None and any(
-            is_preference_metric(n)
-            for n in experiment.optimization_config.objective.metric_names
-        )
         should_relativize = (
             len(experiment.metrics) > 0
             and experiment.status_quo is not None
             and not data.has_step_column
-            and not has_preference_objective
         )
 
+        # When relativizing, scope to non-preference metrics only. Preference
+        # metrics (e.g., pairwise_pref_query) have binary 0/1 labels with SQ
+        # mean near zero, causing relativization to crash with "mean_control
+        # too small." Non-preference tracking metrics are relativized normally.
+        non_preference_metric_names: list[str] | None = None
+        if should_relativize:
+            non_preference_metric_names = [
+                name for name in experiment.metrics if not is_preference_metric(name)
+            ]
+
+        df = experiment.to_df(
+            trial_indices=self.trial_indices,
+            omit_empty_columns=self.omit_empty_columns,
+            trial_statuses=self.trial_statuses,
+            relativize=should_relativize,
+            metric_names_to_relativize=non_preference_metric_names,
+        )
+
+        # Drop preference metric columns (e.g., pairwise_pref_query) -- their
+        # binary 0/1 values are not informative in a summary table. Then drop
+        # rows where all remaining metric columns are empty, which removes
+        # labeling-only trials that have no tracking metric data.
+        preference_cols = [col for col in df.columns if is_preference_metric(col)]
+        if preference_cols:
+            df = df.drop(columns=preference_cols)
+            remaining_metric_cols = [
+                col for col in df.columns if col in experiment.metrics
+            ]
+            if remaining_metric_cols:
+                df = df.dropna(subset=remaining_metric_cols, how="all")
+
         return self._create_analysis_card(
             title=(
                 "Summary for "
@@ -112,10 +135,5 @@ def compute(
                     "Metric results are relativized against status quo."
                 )
             ),
-            df=experiment.to_df(
-                trial_indices=self.trial_indices,
-                omit_empty_columns=self.omit_empty_columns,
-                trial_statuses=self.trial_statuses,
-                relativize=should_relativize,
-            ),
+            df=df,
         )
diff --git a/ax/analysis/tests/test_summary.py b/ax/analysis/tests/test_summary.py
@@ -10,9 +10,9 @@
 from ax.analysis.summary import Summary
 from ax.api.client import Client
 from ax.api.configs import RangeParameterConfig
-from ax.core.arm import Arm
 from ax.core.base_trial import TrialStatus
 from ax.core.data import Data
+from ax.core.experiment import Experiment
 from ax.core.metric import Metric
 from ax.core.objective import MultiObjective, Objective
 from ax.core.optimization_config import MultiObjectiveOptimizationConfig
@@ -224,35 +224,97 @@ def test_trial_status_filter(self) -> None:
         self.assertIn(0, card.df["trial_index"].values)
         self.assertIn(1, card.df["trial_index"].values)
 
-    def test_compute_with_preference_objective_skips_relativization(self) -> None:
-        """Summary should skip relativization when a preference metric is an
-        objective, since binary 0/1 labels have SQ mean near zero which causes
-        'mean_control too small' errors."""
+    def _attach_binary_pairwise_data(
+        self, experiment: Experiment, pairwise_name: str
+    ) -> None:
+        """Attach binary 0/1 pairwise data to every trial, with the status-quo
+        (control) arm set to 0. A near-zero control mean is exactly what makes
+        relativizing this metric crash with 'mean_control too small', so this
+        deterministically reproduces the crash unless the preference metric is
+        scoped out of relativization."""
+        status_quo_name = none_throws(experiment.status_quo).name
+        for trial in experiment.trials.values():
+            arm_names = [arm.name for arm in trial.arms]
+            # Control arm gets 0.0 so |mean_control| is below the relativization
+            # epsilon; all other arms get 1.0.
+            means = [0.0 if name == status_quo_name else 1.0 for name in arm_names]
+            experiment.attach_data(
+                Data(
+                    df=pd.DataFrame(
+                        {
+                            "arm_name": arm_names,
+                            "metric_name": [pairwise_name] * len(arm_names),
+                            "mean": means,
+                            "sem": [0.0] * len(arm_names),
+                            "trial_index": [trial.index] * len(arm_names),
+                            "metric_signature": [pairwise_name] * len(arm_names),
+                        }
+                    )
+                )
+            )
+
+    def test_compute_with_preference_objective_per_metric_relativization(
+        self,
+    ) -> None:
+        """Summary with a preference metric objective should relativize only
+        non-preference metrics. The preference metric (pairwise_pref_query)
+        has binary 0/1 labels with SQ mean near zero -- relativizing it would
+        crash with 'mean_control too small'. Non-preference metrics should
+        be relativized normally."""
         pairwise_name = Keys.PAIRWISE_PREFERENCE_QUERY.value
 
-        # Use Client to set up experiment with SQ and data
-        client = self.client
-        client.configure_optimization(objective="foo")
-        experiment = client._experiment
-        experiment.status_quo = Arm(parameters={"x1": 0.5, "x2": 0.5})
+        # Use an experiment with BatchTrials and SQ data, which triggers
+        # relativization in the Summary. get_branin_experiment_with_status_quo_trials
+        # creates BatchTrials with a SQ arm, so data.relativize() has SQ data.
+        experiment = get_branin_experiment_with_status_quo_trials()
 
-        # Add pairwise_pref_query as an additional objective
+        # Add pairwise_pref_query as an additional objective alongside branin.
         experiment.add_tracking_metric(Metric(name=pairwise_name))
         experiment.optimization_config = MultiObjectiveOptimizationConfig(
             objective=MultiObjective(
                 objectives=[
-                    Objective(metric=Metric(name="foo"), minimize=True),
+                    Objective(metric=experiment.metrics["branin"], minimize=True),
                     Objective(metric=Metric(name=pairwise_name), minimize=False),
                 ]
             )
         )
 
-        client.get_next_trials(max_trials=1)
-        client.complete_trial(trial_index=0, raw_data={"foo": 1.0, pairwise_name: 0.0})
+        self._attach_binary_pairwise_data(experiment, pairwise_name)
 
-        # Should succeed without "mean_control too small" error
+        # Should succeed without "mean_control too small" crash
         card = Summary().compute(experiment=experiment)
-        self.assertNotIn("relativized", card.subtitle)
+
+        # Subtitle should indicate relativization (non-preference metrics)
+        self.assertIn("relativized", card.subtitle)
+
+        # Preference metric column should be dropped from the summary
+        self.assertNotIn(pairwise_name, card.df.columns)
+
+    def test_compute_with_preference_tracking_metric_and_no_optimization_config(
+        self,
+    ) -> None:
+        """A preference metric attached as a tracking metric (with a status quo
+        but no optimization_config) must still be scoped out of relativization.
+        Relativization is gated only on metrics/status_quo/step data, not on the
+        optimization_config, so without scoping the binary 0/1 preference metric
+        (SQ mean ~0) would crash with 'mean_control too small'."""
+        pairwise_name = Keys.PAIRWISE_PREFERENCE_QUERY.value
+
+        experiment = get_branin_experiment_with_status_quo_trials()
+        # Preference metric is tracking-only; there is no optimization_config.
+        experiment.add_tracking_metric(Metric(name=pairwise_name))
+        experiment._optimization_config = None
+
+        self._attach_binary_pairwise_data(experiment, pairwise_name)
+
+        # Should succeed without "mean_control too small" crash even though
+        # there is no optimization_config.
+        card = Summary().compute(experiment=experiment)
+
+        # branin is still relativized (non-preference metric).
+        self.assertIn("relativized", card.subtitle)
+        # Preference metric column should be dropped from the summary.
+        self.assertNotIn(pairwise_name, card.df.columns)
 
     def test_default_excludes_stale_trials(self) -> None:
         """Test that Summary defaults to excluding STALE trials."""
diff --git a/ax/analysis/tests/test_utils.py b/ax/analysis/tests/test_utils.py
@@ -721,6 +721,59 @@ def test_relativize_df_with_sq_multiple_trials(self) -> None:
             decimal=1,
         )
 
+    def test_relativize_df_with_sq_skips_metrics_missing_from_sq(self) -> None:
+        """When model predictions include metrics not in the status quo df
+        (e.g., pairwise_pref_query from a preference model), relativization
+        should skip those metrics and leave their columns untouched."""
+        df = pd.DataFrame(
+            {
+                "trial_index": [0, 0],
+                "arm_name": ["status_quo", "arm1"],
+                "foo_mean": [10.0, 12.0],
+                "foo_sem": [1.0, 1.2],
+                "pairwise_pref_query_mean": [0.5, 0.8],
+                "pairwise_pref_query_sem": [0.1, 0.2],
+            }
+        )
+        # Status quo df only has foo, not pairwise_pref_query
+        status_quo_df = pd.DataFrame(
+            {
+                "trial_index": [0],
+                "arm_name": ["status_quo"],
+                "foo_mean": [10.0],
+                "foo_sem": [1.0],
+            }
+        )
+
+        rel_df = _relativize_df_with_sq(
+            df=df,
+            status_quo_df=status_quo_df,
+            status_quo_name="status_quo",
+        )
+
+        with self.subTest("foo is relativized"):
+            np.testing.assert_almost_equal(
+                rel_df.loc[rel_df["arm_name"] == "arm1", "foo_mean"].iloc[0],
+                0.2,
+                decimal=1,
+            )
+
+        with self.subTest("pairwise_pref_query is untouched"):
+            np.testing.assert_almost_equal(
+                rel_df.loc[
+                    rel_df["arm_name"] == "arm1", "pairwise_pref_query_mean"
+                ].iloc[0],
+                0.8,
+                decimal=5,
+            )
+            np.testing.assert_almost_equal(
+                rel_df.loc[
+                    rel_df["arm_name"] == "arm1", "pairwise_pref_query_sem"
+                ].iloc[0],
+                0.2,
+                decimal=5,
+            )
+
     def test_is_preference_metric(self) -> None:
         self.assertTrue(is_preference_metric("pairwise_pref_query"))
         self.assertFalse(is_preference_metric("branin"))
diff --git a/ax/analysis/utils.py b/ax/analysis/utils.py
@@ -845,7 +845,14 @@ def _relativize_df_with_sq(
         and 'METRIC_NAME_sem' columns relativized to the status quo arm for each metric
         within each trial.
     """
-    metric_names = [name[:-5] for name in df.columns if name.endswith("_mean")]
+    # Only relativize metrics present in both the data df and the status quo
+    # df. Model predictions may include metrics (e.g., pairwise_pref_query)
+    # that the status quo df doesn't have columns for.
+    metric_names = [
+        name[:-5]
+        for name in df.columns
+        if name.endswith("_mean") and name in status_quo_df.columns
+    ]
 
     rel_df = df.copy()
 
diff --git a/ax/core/data.py b/ax/core/data.py
@@ -11,7 +11,7 @@
 import itertools
 import math
 from bisect import bisect_right
-from collections.abc import Iterable
+from collections.abc import Iterable, Sequence
 from copy import deepcopy
 from functools import cached_property
 from io import StringIO
@@ -439,6 +439,7 @@ def relativize(
         include_sq: bool = False,
         bias_correction: bool = True,
         control_as_constant: bool = False,
+        metric_names: Sequence[str] | None = None,
     ) -> Data:
         """Relativize a data object w.r.t. a status_quo arm.
 
@@ -453,6 +454,9 @@ def relativize(
                 ax.utils.stats.math_utils.relativize for more details.
             control_as_constant: If true, control is treated as a constant.
                 bias_correction is ignored when this is true.
+            metric_names: If provided, only relativize these metrics. Other
+                metrics are passed through with raw values. If None, all
+                metrics are relativized.
 
         Returns:
             The new data object with the relativized metrics (excluding the
@@ -471,6 +475,7 @@ def relativize(
             include_sq=include_sq,
             bias_correction=bias_correction,
             control_as_constant=control_as_constant,
+            metric_names=metric_names,
         )
         return self.__class__(df=df_rel)
 
@@ -698,6 +703,7 @@ def relativize_dataframe(
     include_sq: bool = False,
     bias_correction: bool = True,
     control_as_constant: bool = False,
+    metric_names: Sequence[str] | None = None,
 ) -> pd.DataFrame:
     """Relativize a dataframe w.r.t. a status_quo arm.
 
@@ -712,6 +718,9 @@ def relativize_dataframe(
             ax.utils.stats.math_utils.relativize for more details.
         control_as_constant: If true, control is treated as a constant.
             bias_correction is ignored when this is true.
+        metric_names: If provided, only relativize these metrics. Other
+            metrics are passed through with raw values. If None, all
+            metrics are relativized.
 
     Returns:
         The new dataframe with the relativized metrics (excluding the
@@ -721,11 +730,26 @@ def relativize_dataframe(
     grp_cols = list(
         {"trial_index", "metric_name", "random_split"}.intersection(df.columns.values)
     )
+    metric_names_set = set(metric_names) if metric_names is not None else None
 
     grouped_df = df.groupby(grp_cols)
     dfs = []
     for grp in grouped_df.groups.keys():
         subgroup_df = grouped_df.get_group(grp)
+
+        # If metric scoping is requested, skip relativization for excluded
+        # metrics and pass through raw data (with or without SQ row).
+        if metric_names_set is not None and "metric_name" in grp_cols:
+            grp_metric = (
+                grp if isinstance(grp, str) else grp[grp_cols.index("metric_name")]
+            )
+            if grp_metric not in metric_names_set:
+                if include_sq:
+                    dfs.append(subgroup_df)
+                else:
+                    dfs.append(subgroup_df[subgroup_df["arm_name"] != status_quo_name])
+                continue
+
         is_sq = subgroup_df["arm_name"] == status_quo_name
 
         # Check if status quo exists in this subgroup (trial)
@@ -758,7 +782,13 @@ def relativize_dataframe(
         dfs.append(subgroup_df.assign(mean=means_rel, sem=sems_rel))
     df_rel = pd.concat(dfs, axis=0)
     if include_sq:
-        df_rel.loc[df_rel["arm_name"] == status_quo_name, "sem"] = 0.0
+        # Zero SEM only for metrics that were actually relativized.
+        # Non-relativized metrics (excluded via metric_names scoping)
+        # should retain their original SEM values.
+        sq_mask = df_rel["arm_name"] == status_quo_name
+        if metric_names_set is not None and "metric_name" in df_rel.columns:
+            sq_mask = sq_mask & df_rel["metric_name"].isin(metric_names_set)
+        df_rel.loc[sq_mask, "sem"] = 0.0
     df_rel.reset_index(inplace=True, drop=True)
     # Reorder columns to match expected order (reuses Data class logic)
     df_rel = Data._get_df_with_cols_in_expected_order(df_rel)
diff --git a/ax/core/experiment.py b/ax/core/experiment.py
diff --git a/ax/core/tests/test_data.py b/ax/core/tests/test_data.py