Add MinTrialsWithLILOInputHashCheck transition criterion (#4994)

ItsMrLin · meta-codesync[bot] · commit 9facfc49c704 · 2026-03-08T21:18:59.000-07:00
Summary: Pull Request resolved: #4994 Add a hash-aware transition criterion for LILO GS loops. Unlike plain MinTrials which counts all completed trials from a node, MinTrialsWithLILOInputHashCheck only counts trials whose LILO input hash matches the current experiment state. This ensures the GS correctly transitions from LILO labeling → MBG only when enough *fresh* labels exist (labels produced under the current experiment data + LLM messages). Trials without a LILO input hash (non-LILO trials) are always counted, preserving backward compatibility. Changes: - Add `MinTrialsWithLILOInputHashCheck` class to `transition_criterion.py` that delegates hash computation to `get_current_lilo_hash` from `hash_utils` (replacing a private `_compute_current_hash` static method) - Remove redundant pass-through `__init__` — the parent class handles all args - Register in JSON encoder/decoder registries for serialization support - Add tests verifying fresh/stale counting behavior Reviewed By: saitcakmak Differential Revision: D95284285
diff --git a/ax/generation_strategy/tests/test_transition_criterion.py b/ax/generation_strategy/tests/test_transition_criterion.py
@@ -7,11 +7,15 @@
 
 
 from logging import Logger
+from unittest.mock import MagicMock
 
 import pandas as pd
 from ax.adapter.registry import Generators
+from ax.core.arm import Arm
 from ax.core.auxiliary import AuxiliaryExperiment, AuxiliaryExperimentPurpose
 from ax.core.data import Data
+from ax.core.derived_metric import DerivedMetric
+from ax.core.experiment import Experiment
 from ax.core.trial_status import TrialStatus
 from ax.exceptions.core import DataRequiredError, UserInputError
 from ax.exceptions.generation_strategy import MaxParallelismReachedException
@@ -28,7 +32,10 @@
     MaxGenerationParallelism,
     MaxTrialsAwaitingData,
     MinTrials,
+    MinTrialsWithLILOInputHashCheck,
 )
+from ax.utils.common.constants import Keys
+from ax.utils.common.hash_utils import compute_lilo_input_hash
 from ax.utils.common.logger import get_logger
 from ax.utils.common.testutils import TestCase
 from ax.utils.testing.core_stubs import (
@@ -41,6 +48,13 @@
 logger: Logger = get_logger(__name__)
 
 
+def _mock_node(trials_from_node: set[int]) -> MagicMock:
+    """Return a ``MagicMock`` node stub whose ``trials_from_node`` attribute
+    is the given set of trial indices.
+    """
+    return MagicMock(trials_from_node=trials_from_node)
+
+
 class TestTransitionCriterion(TestCase):
     def setUp(self) -> None:
         super().setUp()
@@ -614,3 +628,93 @@ def test_max_generation_parallelism_block_error(self) -> None:
                 experiment=self.experiment,
                 trials_from_node={0, 1, 2},
             )
+
+    def test_min_trials_with_lilo_input_hash_check(self) -> None:
+        """Verify MinTrialsWithLILOInputHashCheck counts only hash-fresh trials
+        (plus trials that carry no hash at all).
+        """
+        exp = get_branin_experiment()
+
+        # Register a DerivedMetric with pairwise name.
+        pairwise_metric = DerivedMetric(
+            name=Keys.PAIRWISE_PREFERENCE_QUERY.value,
+            input_metric_names=["branin"],
+        )
+        exp.add_tracking_metric(pairwise_metric)
+
+        criterion = MinTrialsWithLILOInputHashCheck(
+            threshold=2,
+            transition_to="next_node",
+            only_in_statuses=[TrialStatus.COMPLETED],
+        )
+
+        # Helper to create and complete a trial with data.
+        def _add_trial(idx: int, exp: Experiment = exp) -> None:
+            trial = exp.new_batch_trial()
+            trial.add_arm(
+                Arm(name=f"{idx}_0", parameters={"x1": float(idx), "x2": 0.0})
+            )
+            trial.mark_running(no_runner_required=True)
+            trial.mark_completed()
+            row = {
+                "trial_index": idx,
+                "arm_name": f"{idx}_0",
+                "metric_name": "branin",
+                "metric_signature": "branin",
+                "mean": float(idx),
+                "sem": 0.1,
+            }
+            exp.attach_data(Data(df=pd.DataFrame([row])))
+
+        # Create 3 completed trials with data; none is stamped yet.
+        for i in range(3):
+            _add_trial(i)
+
+        current_hash = compute_lilo_input_hash(exp, ["branin"])
+        trials_from_node = {0, 1, 2}
+
+        with self.subTest("no_hashes_all_count"):
+            # No hash stamps → all counted (fallback behavior).
+            count = criterion.num_contributing_to_threshold(exp, trials_from_node)
+            self.assertEqual(count, 3)
+
+        # Stamp trials 0 and 1 with the current hash.
+        exp.trials[0]._properties[Keys.LILO_INPUT_HASH] = current_hash
+        exp.trials[1]._properties[Keys.LILO_INPUT_HASH] = current_hash
+
+        with self.subTest("fresh_hashes_count"):
+            count = criterion.num_contributing_to_threshold(exp, trials_from_node)
+            # Trials 0, 1 (fresh hash) + trial 2 (no hash → included).
+            self.assertEqual(count, 3)
+
+        # Make trial 1 stale.
+        exp.trials[1]._properties[Keys.LILO_INPUT_HASH] = "stale_hash"
+
+        with self.subTest("stale_hash_excluded"):
+            count = criterion.num_contributing_to_threshold(exp, trials_from_node)
+            # Trial 0 (fresh) + trial 2 (no hash) = 2. Trial 1 excluded.
+            self.assertEqual(count, 2)
+            self.assertTrue(criterion.is_met(exp, _mock_node(trials_from_node)))
+
+        # Make trial 0 stale too.
+        exp.trials[0]._properties[Keys.LILO_INPUT_HASH] = "another_stale"
+
+        with self.subTest("not_enough_fresh"):
+            count = criterion.num_contributing_to_threshold(exp, trials_from_node)
+            # Only trial 2 (no hash) counts.
+            self.assertEqual(count, 1)
+            self.assertFalse(criterion.is_met(exp, _mock_node(trials_from_node)))
+
+        with self.subTest("data_change_invalidates"):
+            # Re-stamp trials 0 and 1 with the still-current hash so that any
+            # staleness seen below can only come from the data change.
+            for i in (0, 1):
+                exp.trials[i]._properties[Keys.LILO_INPUT_HASH] = current_hash
+            count = criterion.num_contributing_to_threshold(exp, trials_from_node)
+            self.assertEqual(count, 3)
+            # New data changes the current hash, so every stamp goes stale.
+            _add_trial(3)
+            trials_from_node.add(3)
+            count = criterion.num_contributing_to_threshold(exp, trials_from_node)
+            # Trials 0, 1 stale. Trials 2, 3 have no hash → included.
+            self.assertEqual(count, 2)
diff --git a/ax/generation_strategy/transition_criterion.py b/ax/generation_strategy/transition_criterion.py
@@ -17,6 +17,8 @@
 from ax.core.utils import get_trial_indices_with_required_metrics
 from ax.exceptions.core import DataRequiredError, UserInputError
 from ax.exceptions.generation_strategy import MaxParallelismReachedException
+from ax.utils.common.constants import Keys
+from ax.utils.common.hash_utils import get_current_lilo_hash
 
 if TYPE_CHECKING:
     from ax.generation_strategy.generation_node import GenerationNode
@@ -644,6 +646,77 @@ def __init__(
         )
 
 
+class MinTrialsWithLILOInputHashCheck(TrialBasedCriterion):
+    """Like ``MinTrials``, but trials stamped with a stale LILO input hash
+    do not count toward the threshold.
+
+    LILO (Language-in-the-Loop) trials are stamped with a hash of the
+    experiment state (metric data + LLM messages) at labeling time.
+    When the experiment state changes (new data arrives, or the user updates
+    LLM messages), old labels become stale.  This criterion ensures that
+    the transition fires only when enough *fresh* labels exist — i.e.,
+    labels produced under the current experiment state.
+
+    Freshness is checked against the *current* experiment state (not the
+    most-recently-stamped LILO hash) because the LLM prompt includes a
+    full experiment summary, so any change to input metric data alters the
+    context under which labels would be produced and warrants relabeling.
+
+    Trials without a LILO input hash (e.g., Sobol or MBG trials) are always
+    counted, preserving backward compatibility with non-LILO workflows.
+
+    Args:
+        threshold: Minimum number of fresh trials required.
+        transition_to: The GenerationNode to transition to when met.
+        only_in_statuses: Only count trials with these statuses.
+        not_in_statuses: Exclude trials with these statuses.
+        use_all_trials_in_exp: Count all experiment trials, not just
+            those from the current node.
+        continue_trial_generation: Continue generating arms for the
+            same trial after transition.
+        count_only_trials_with_data: Only count trials that have data.
+    """
+
+    def num_contributing_to_threshold(
+        self,
+        experiment: Experiment,
+        trials_from_node: set[int],
+    ) -> int:
+        """Count trials toward the threshold, skipping stale-hash trials.
+
+        Applies the base-class status filtering (and, if configured, the
+        has-data and from-this-node restrictions), then drops every trial
+        whose stamped LILO input hash differs from the current one. Trials
+        with no stamped hash are kept.
+
+        Returns:
+            The number of candidate trials that are unstamped or fresh.
+        """
+        candidates = self.all_trials_to_check(experiment)
+        if self.count_only_trials_with_data:
+            data_trial_indices = get_trial_indices_with_required_metrics(
+                experiment=experiment,
+                df=experiment.lookup_data().df,
+                require_data_for_all_metrics=False,
+            )
+            candidates = candidates.intersection(data_trial_indices)
+        if not bool(self.use_all_trials_in_exp):
+            candidates = trials_from_node.intersection(candidates)
+        if not candidates:
+            # Nothing can count, so skip computing the experiment hash.
+            return 0
+
+        current_hash = get_current_lilo_hash(experiment)
+        if current_hash is None:
+            # No pairwise DerivedMetric found — fall back to plain count.
+            return len(candidates)
+
+        trials = experiment.trials  # Look the mapping up once, not per index.
+        # Unstamped (non-LILO) trials always count; stamped ones must be fresh.
+        stamped = (trials[i]._properties.get(Keys.LILO_INPUT_HASH) for i in candidates)
+        return sum(h is None or h == current_hash for h in stamped)
+
+
 class AuxiliaryExperimentCheck(TransitionCriterion):
     """A class to transition from one GenerationNode to another by checking if certain
     types of Auxiliary Experiment purposes exists.
diff --git a/ax/storage/json_store/registry.py b/ax/storage/json_store/registry.py
@@ -82,6 +82,7 @@
     MaxGenerationParallelism,
     MaxTrialsAwaitingData,
     MinTrials,
+    MinTrialsWithLILOInputHashCheck,
     TransitionCriterion,
 )
 from ax.generators.torch.botorch_modular.acquisition import Acquisition
@@ -222,6 +223,7 @@
     MaxTrialsAwaitingData: pausing_criterion_to_dict,
     Metric: metric_to_dict,
     MinTrials: transition_criterion_to_dict,
+    MinTrialsWithLILOInputHashCheck: transition_criterion_to_dict,
     AuxiliaryExperimentCheck: transition_criterion_to_dict,
     GeneratorSpec: generator_spec_to_dict,
     MultiObjective: multi_objective_to_dict,
@@ -350,6 +352,7 @@
     "MaxTrialsAwaitingData": MaxTrialsAwaitingData,
     "Metric": Metric,
     "MinTrials": MinTrials,
+    "MinTrialsWithLILOInputHashCheck": MinTrialsWithLILOInputHashCheck,
     # DEPRECATED; backward compatibility for MinimumTrialsInStatus -> MinTrials
     "MinimumTrialsInStatus": MinTrials,
     "GeneratorRegistryBase": GeneratorRegistryBase,