option to use a particular trial for CV test set (#4996)

bletham · facebook-github-bot · commit 316efcd755c6 · 2026-03-10T13:25:05.000-07:00
Summary:

In situations where we have a target trial that is significantly different from the other trials on the experiment (for instance, if there was a system rebase between two trials), we care especially about our ability to predict the target trial, as that is what we expect to see moving forward. This adds a `test_trial_index` kwarg to DiagnosticAnalysis (and to CrossValidationPlot and compute_cross_validation_adhoc) to specify a particular trial for the diagnostics. Arms from all trials will still be used for training in each CV fold, but the test arms will be limited to arms in that trial.

Differential Revision: D95824210
diff --git a/ax/analysis/diagnostics.py b/ax/analysis/diagnostics.py
@@ -36,14 +36,23 @@ class DiagnosticAnalysis(Analysis):
     of leave-one-out cross validation.
     """
 
-    def __init__(self, include_tracking_metrics: bool = False) -> None:
+    def __init__(
+        self,
+        include_tracking_metrics: bool = False,
+        test_trial_index: int | None = None,
+    ) -> None:
         """Initialize the DiagnosticAnalysis.
 
         Args:
             include_tracking_metrics: Whether to include tracking metrics or just use
                 the optimization config metrics.
+            test_trial_index: If provided, limits cross validation to only evaluate
+                predictions for observations from this trial. Other trials'
+                observations will still be used for training but will not
+                appear as test points.
         """
         self.include_tracking_metrics = include_tracking_metrics
+        self.test_trial_index = test_trial_index
 
     @override
     def validate_applicable_state(
@@ -80,7 +89,9 @@ def compute(
         cross_validation_plots = []
         metric_r2_card = []
         if not is_bandit:
-            cv_analysis = CrossValidationPlot(metric_names=metric_names)
+            cv_analysis = CrossValidationPlot(
+                metric_names=metric_names, test_trial_index=self.test_trial_index
+            )
             cv_card = cv_analysis.compute_or_error_card(
                 experiment=experiment,
                 generation_strategy=generation_strategy,
diff --git a/ax/analysis/plotly/cross_validation.py b/ax/analysis/plotly/cross_validation.py
@@ -76,6 +76,7 @@ def __init__(
         untransform: bool = False,
         trial_index: int | None = None,
         labels: Mapping[str, str] | None = None,
+        test_trial_index: int | None = None,
     ) -> None:
         """
         Args:
@@ -99,13 +100,18 @@ def __init__(
                 trial.
             labels: Optional dictionary of labels for the plot. Useful for when metric
                 names are too long or otherwise challenging to read.
+            test_trial_index: If provided, limits cross validation to only evaluate
+                predictions for observations from this trial. Other trials'
+                observations will still be used for training but will not
+                appear as test points.
         """
 
         self.metric_names = metric_names
         self.folds = folds
         self.untransform = untransform
         self.trial_index = trial_index
         self.labels: dict[str, str] = {**labels} if labels is not None else {}
+        self.test_trial_index = test_trial_index
         self._r2s: dict[str, float] = {}
 
     @override
@@ -138,8 +144,17 @@ def compute(
         )
 
         cards = []
+
+        test_selector = (
+            (lambda obs: obs.features.trial_index == self.test_trial_index)
+            if self.test_trial_index is not None
+            else None
+        )
         cv_results = cross_validate(
-            adapter=relevant_adapter, folds=self.folds, untransform=self.untransform
+            adapter=relevant_adapter,
+            folds=self.folds,
+            untransform=self.untransform,
+            test_selector=test_selector,
         )
         relevant_adapter_metric_names = [
             relevant_adapter._experiment.signature_to_metric[signature].name
@@ -217,6 +232,7 @@ def compute_cross_validation_adhoc(
     folds: int = -1,
     untransform: bool = True,
     labels: Mapping[str, str] | None = None,
+    test_trial_index: int | None = None,
     experiment: Experiment | None = None,
     generation_strategy: GenerationStrategy | None = None,
     adapter: Adapter | None = None,
@@ -243,6 +259,10 @@ def compute_cross_validation_adhoc(
             is.
         labels: Optional dictionary of labels for the plot. Useful for when metric
             names are too long or otherwise challenging to read.
+        test_trial_index: If provided, limits cross validation to only evaluate
+            predictions for observations from this trial. Other trials'
+            observations will still be used for training but will not
+            appear as test points.
         experiment: Optional. The experiment to extract data from.
         generation_strategy: Optional. The generation strategy to extract the adapter
             from.
@@ -260,6 +280,7 @@ def compute_cross_validation_adhoc(
         folds=folds,
         untransform=untransform,
         labels=labels,
+        test_trial_index=test_trial_index,
     )
 
     return analysis.compute(
diff --git a/ax/analysis/plotly/tests/test_cross_validation.py b/ax/analysis/plotly/tests/test_cross_validation.py
@@ -139,6 +139,22 @@ def test_it_can_specify_trial_index_correctly(self) -> None:
                 card.df["arm_name"].unique(),
             )
 
+    def test_test_trial_index_filters_to_single_trial(self) -> None:
+        # test_trial_index filters CV to only evaluate predictions for observations
+        # from that trial. Use trial 0 which is in the model's training data.
+        analysis = CrossValidationPlot(metric_names=["bar"], test_trial_index=0)
+        (card,) = analysis.compute(
+            generation_strategy=self.client.generation_strategy
+        ).flatten()
+        # Only the arm from trial 0 should appear as a test point
+        trial_0_arm_name = none_throws(
+            assert_is_instance(self.client.experiment.trials[0], Trial).arm
+        ).name
+        self.assertEqual(
+            list(card.df["arm_name"].unique()),
+            [trial_0_arm_name],
+        )
+
     @mock.patch(
         "ax.analysis.plotly.cross_validation.cross_validate", wraps=cross_validate
     )
diff --git a/ax/analysis/plotly/tests/test_scatter.py b/ax/analysis/plotly/tests/test_scatter.py
@@ -236,6 +236,9 @@ def test_compute_adhoc(self) -> None:
             **kwargs,
         )
 
+        # Normalize timestamps since cards are computed at different times
+        for card, adhoc_card in zip(cards.flatten(), adhoc_cards.flatten()):
+            adhoc_card._timestamp = card._timestamp
         self.assertEqual(cards, adhoc_cards)
 
     @TestCase.ax_long_test(

Original file line number	Diff line number	Diff line change
`@@ -236,6 +236,9 @@ def test_compute_adhoc(self) -> None:`
`236`	`236`	`**kwargs,`
`237`	`237`	`)`
`238`	`238`
	`239`	`+ # Normalize timestamps since cards are computed at different times`
	`240`	`+ for card, adhoc_card in zip(cards.flatten(), adhoc_cards.flatten()):`
	`241`	`+ adhoc_card._timestamp = card._timestamp`
`239`	`242`	`self.assertEqual(cards, adhoc_cards)`
`240`	`243`
`241`	`244`	`@TestCase.ax_long_test(`