Add option to use a particular trial as the CV test set

bletham · facebook-github-bot · commit 40fe62ef4bc6 · 2026-03-09T11:46:42.000-07:00
Summary: In situations where a target trial differs significantly from the other trials in the experiment (for instance, if there was a system rebase between two trials), we care especially about our ability to predict the target trial, since that is what we expect to see moving forward. This adds a kwarg to DiagnosticAnalysis to specify a particular trial for the diagnostics. All trials will still be used as training arms in each CV fold, but the test arms will be limited to arms in that trial.

Differential Revision: D95824210
diff --git a/ax/analysis/diagnostics.py b/ax/analysis/diagnostics.py
@@ -36,14 +36,23 @@ class DiagnosticAnalysis(Analysis):
     of leave-one-out cross validation.
     """
 
-    def __init__(self, include_tracking_metrics: bool = False) -> None:
+    def __init__(
+        self,
+        include_tracking_metrics: bool = False,
+        trial_index: int | None = None,
+    ) -> None:
         """Initialize the DiagnosticAnalysis.
 
         Args:
             include_tracking_metrics: Whether to include tracking metrics or just use
                 the optimization config metrics.
+            trial_index: If provided, limits cross validation to only evaluate
+                predictions for observations from this trial. Other trials'
+                observations will still be used for training but will not
+                appear as test points.
         """
         self.include_tracking_metrics = include_tracking_metrics
+        self.trial_index = trial_index
 
     @override
     def validate_applicable_state(
@@ -80,7 +89,9 @@ def compute(
         cross_validation_plots = []
         metric_r2_card = []
         if not is_bandit:
-            cv_analysis = CrossValidationPlot(metric_names=metric_names)
+            cv_analysis = CrossValidationPlot(
+                metric_names=metric_names, cv_trial_index=self.trial_index
+            )
             cv_card = cv_analysis.compute_or_error_card(
                 experiment=experiment,
                 generation_strategy=generation_strategy,
diff --git a/ax/analysis/plotly/cross_validation.py b/ax/analysis/plotly/cross_validation.py
@@ -6,7 +6,7 @@
 # pyre-strict
 
 
-from collections.abc import Mapping, Sequence
+from collections.abc import Callable, Mapping, Sequence
 from typing import final
 
 import pandas as pd
@@ -19,6 +19,7 @@
 from ax.analysis.utils import extract_relevant_adapter, validate_adapter_can_predict
 from ax.core.analysis_card import AnalysisCardBase
 from ax.core.experiment import Experiment
+from ax.core.observation import Observation
 from ax.generation_strategy.generation_strategy import GenerationStrategy
 from ax.utils.stats.model_fit_stats import coefficient_of_determination
 from plotly import graph_objects as go
@@ -76,6 +77,7 @@ def __init__(
         untransform: bool = False,
         trial_index: int | None = None,
         labels: Mapping[str, str] | None = None,
+        cv_trial_index: int | None = None,
     ) -> None:
         """
         Args:
@@ -99,13 +101,18 @@ def __init__(
                 trial.
             labels: Optional dictionary of labels for the plot. Useful for when metric
                 names are too long or otherwise challenging to read.
+            cv_trial_index: If provided, limits cross validation to only evaluate
+                predictions for observations from this trial. Other trials'
+                observations will still be used for training but will not
+                appear as test points.
         """
 
         self.metric_names = metric_names
         self.folds = folds
         self.untransform = untransform
         self.trial_index = trial_index
         self.labels: dict[str, str] = {**labels} if labels is not None else {}
+        self.cv_trial_index = cv_trial_index
         self._r2s: dict[str, float] = {}
 
     @override
@@ -138,8 +145,25 @@ def compute(
         )
 
         cards = []
+
+        def _make_test_selector(
+            trial_index: int,
+        ) -> Callable[[Observation], bool]:
+            def test_selector(obs: Observation) -> bool:
+                return obs.features.trial_index == trial_index
+
+            return test_selector
+
+        test_selector = (
+            _make_test_selector(self.cv_trial_index)
+            if self.cv_trial_index is not None
+            else None
+        )
         cv_results = cross_validate(
-            adapter=relevant_adapter, folds=self.folds, untransform=self.untransform
+            adapter=relevant_adapter,
+            folds=self.folds,
+            untransform=self.untransform,
+            test_selector=test_selector,
         )
         relevant_adapter_metric_names = [
             relevant_adapter._experiment.signature_to_metric[signature].name
diff --git a/ax/analysis/plotly/tests/test_constraint_feasibility.py b/ax/analysis/plotly/tests/test_constraint_feasibility.py
@@ -218,6 +218,7 @@ def test_offline(self) -> None:
                             generation_strategy=generation_strategy,
                         )
 
+    @mock_botorch_optimize
     def test_online(self) -> None:
         for experiment in get_online_experiments():
             # Skip if no outcome constraints
diff --git a/ax/analysis/plotly/tests/test_cross_validation.py b/ax/analysis/plotly/tests/test_cross_validation.py
@@ -139,6 +139,22 @@ def test_it_can_specify_trial_index_correctly(self) -> None:
                 card.df["arm_name"].unique(),
             )
 
+    def test_cv_trial_index_filters_to_single_trial(self) -> None:
+        # cv_trial_index filters CV to only evaluate predictions for observations
+        # from that trial. Use trial 0 which is in the model's training data.
+        analysis = CrossValidationPlot(metric_names=["bar"], cv_trial_index=0)
+        (card,) = analysis.compute(
+            generation_strategy=self.client.generation_strategy
+        ).flatten()
+        # Only the arm from trial 0 should appear as a test point
+        trial_0_arm_name = none_throws(
+            assert_is_instance(self.client.experiment.trials[0], Trial).arm
+        ).name
+        self.assertEqual(
+            list(card.df["arm_name"].unique()),
+            [trial_0_arm_name],
+        )
+
     @mock.patch(
         "ax.analysis.plotly.cross_validation.cross_validate", wraps=cross_validate
     )
diff --git a/ax/analysis/plotly/tests/test_scatter.py b/ax/analysis/plotly/tests/test_scatter.py
@@ -236,6 +236,9 @@ def test_compute_adhoc(self) -> None:
             **kwargs,
         )
 
+        # Normalize timestamps since cards are computed at different times
+        for card, adhoc_card in zip(cards.flatten(), adhoc_cards.flatten()):
+            adhoc_card._timestamp = card._timestamp
         self.assertEqual(cards, adhoc_cards)
 
     @TestCase.ax_long_test(

Original file line number	Diff line number	Diff line change
`@@ -218,6 +218,7 @@ def test_offline(self) -> None:`
`218`	`218`	`generation_strategy=generation_strategy,`
`219`	`219`	`)`
`220`	`220`
	`221`	`+ @mock_botorch_optimize`
`221`	`222`	`def test_online(self) -> None:`
`222`	`223`	`for experiment in get_online_experiments():`
`223`	`224`	`# Skip if no outcome constraints`
Original file line number	Diff line number	Diff line change
`@@ -236,6 +236,9 @@ def test_compute_adhoc(self) -> None:`
`236`	`236`	`**kwargs,`
`237`	`237`	`)`
`238`	`238`
	`239`	`+ # Normalize timestamps since cards are computed at different times`
	`240`	`+ for card, adhoc_card in zip(cards.flatten(), adhoc_cards.flatten()):`
	`241`	`+ adhoc_card._timestamp = card._timestamp`
`239`	`242`	`self.assertEqual(cards, adhoc_cards)`
`240`	`243`
`241`	`244`	`@TestCase.ax_long_test(`