
Commit 8091386

esantorella authored and facebook-github-bot committed
Track cost; order oracle trace by completion order
Summary:

I am not sure if this is what we will want in the long run, but it will unblock benchmarking early stopping.

# What's wrong with the current behavior

**Ordering by start order vs. completion order:** Currently, the oracle trace is ordered by trial start order and has one entry for each trial. The inference trace has always been ordered by completion order because it is updated every time a trial ceases running. The order of completion (including early stopping) seems preferable for both, and it's a little weird for the oracle trace to have a different ordering than the inference trace. See here for discussion on this: https://fb.workplace.com/groups/1294299434097422/posts/2563368300523856

**Inability to compare more costly vs. less costly strategies:** Separately, tracking cost is necessary to fairly compare more aggressive vs. less aggressive early-stopping strategies, or to compare stopping early against not stopping at all. I am bundling these two changes (reordering the oracle trace and introducing cost) because the oracle trace should now only be compared against the cost; ordering by completion order doesn't make much sense without a notion of cost when multiple trials can complete at the same time.

# New behavior

| time | first trial running | second trial running | objective values | best point   |
| ---- | ------------------- | -------------------- | ---------------- | ------------ |
| 0    | 0                   | 1                    |                  |              |
| 1    | 0                   | 2                    | y_1              | y_a          |
| 2    | 0                   | 2                    | y_1              | not computed |
| 3    | 0                   | 2                    | y_1, y_0, y_2    | y_b          |

Assuming higher is better, this produces

```
BenchmarkResult:
    cost_trace: [1, 3]
    oracle_trace: [y_1, max(y_1, y_0, y_2)]
    inference_trace: [y_a, y_b]
```

Now traces are only updated when a trial completes, so there are 2 trace elements with 3 trials. (We could also just duplicate elements when multiple trials complete at the same time to preserve the length.) See docstrings for more detail.

# What's not ideal about this

I want to flag that a few things are not great about this setup.

* It makes plotting hard: if one replication produces a cost_trace of [3, 5] and another produces a cost_trace of [2, 6], how do we aggregate their optimization traces? We can do this by left-interpolating the optimization traces onto [2, 3, ..., 6] and then aggregating as usual, but it is clunky (a minimal sketch of this interpolation appears below).
* Even aside from the issue of different replications producing different cost traces, plotting is harder because plotting must now be against cost.
* People are typically interested in epoch-by-epoch results for early stopping, and those are not available here.

# Better long-term solution

Two alternatives are:

* Storing trace values for each time step, which would remove the need to track cost at all: element `i` of the trace would have happened at virtual second `i`.
* Storing cost/time information at each step in MapData, and then deriving a proper trace from there (we may already have this -- need to check).

# Internal:

Differential Revision: D69489720
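The aggregation issue flagged above can be made concrete. The helper below is a hypothetical sketch (not part of Ax): assuming numpy, it left-interpolates each replication's optimization trace onto the union of observed costs, carrying the last observed value forward, and then averages across replications.

```python
import numpy as np


def aggregate_traces_by_cost(
    cost_traces: list[np.ndarray],
    opt_traces: list[np.ndarray],
) -> tuple[np.ndarray, np.ndarray]:
    """Hypothetical helper: align per-replication traces on a shared cost grid.

    Each replication has a cumulative cost trace and an optimization trace of
    the same length. Build the union of all observed costs, carry each
    optimization trace's last observed value forward onto that grid (left
    interpolation), and average across replications. Grid points that precede
    a replication's first completion are NaN for that replication.
    """
    grid = np.unique(np.concatenate(cost_traces))
    aligned = []
    for cost, trace in zip(cost_traces, opt_traces):
        # Index of the last completion whose cumulative cost is <= the grid point.
        idx = np.searchsorted(cost, grid, side="right") - 1
        aligned.append(np.where(idx >= 0, trace[np.clip(idx, 0, None)], np.nan))
    return grid, np.nanmean(np.vstack(aligned), axis=0)


# The two replications from the example above, with made-up objective values:
grid, mean_trace = aggregate_traces_by_cost(
    cost_traces=[np.array([3.0, 5.0]), np.array([2.0, 6.0])],
    opt_traces=[np.array([0.4, 0.9]), np.array([0.3, 0.8])],
)
print(grid)        # [2. 3. 5. 6.]
print(mean_trace)  # [0.3  0.35 0.6  0.85]
```

An integer grid such as [2, 3, ..., 6], as described in the summary, would work the same way; the carry-forward logic is unchanged.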
1 parent 63a1eaf commit 8091386

File tree

4 files changed: +180 −133 lines changed


ax/benchmark/benchmark.py

Lines changed: 101 additions & 69 deletions
@@ -30,7 +30,7 @@
 from ax.benchmark.benchmark_method import BenchmarkMethod
 from ax.benchmark.benchmark_problem import BenchmarkProblem
 from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult
-from ax.benchmark.benchmark_runner import BenchmarkRunner
+from ax.benchmark.benchmark_runner import BenchmarkRunner, get_total_runtime
 from ax.benchmark.benchmark_test_function import BenchmarkTestFunction
 from ax.benchmark.methods.sobol import get_sobol_generation_strategy
 from ax.core.arm import Arm
@@ -39,13 +39,14 @@
 from ax.core.optimization_config import OptimizationConfig
 from ax.core.search_space import SearchSpace
 from ax.core.trial_status import TrialStatus
-from ax.core.types import TParameterization, TParamValue
+from ax.core.types import TParamValue
 from ax.core.utils import get_model_times
 from ax.service.scheduler import Scheduler
 from ax.service.utils.best_point_mixin import BestPointMixin
 from ax.service.utils.scheduler_options import SchedulerOptions, TrialType
 from ax.utils.common.logger import DEFAULT_LOG_LEVEL, get_logger
 from ax.utils.common.random import with_rng_seed
+from pyre_extensions import assert_is_instance
 
 logger: Logger = get_logger(__name__)
 
@@ -172,23 +173,6 @@ def get_oracle_experiment_from_params(
     return experiment
 
 
-def get_oracle_experiment_from_experiment(
-    problem: BenchmarkProblem, experiment: Experiment
-) -> Experiment:
-    """
-    Get an ``Experiment`` that is the same as the original experiment but has
-    metrics evaluated at oracle values (noiseless ground-truth values
-    evaluated at the target task and fidelity)
-    """
-    return get_oracle_experiment_from_params(
-        problem=problem,
-        dict_of_dict_of_params={
-            trial.index: {arm.name: arm.parameters for arm in trial.arms}
-            for trial in experiment.trials.values()
-        },
-    )
-
-
 def get_benchmark_scheduler_options(
     method: BenchmarkMethod,
     include_sq: bool = False,
@@ -225,6 +209,35 @@ def get_benchmark_scheduler_options(
     )
 
 
+def _get_cumulative_cost(
+    previous_cost: float,
+    new_trials: set[int],
+    experiment: Experiment,
+) -> float:
+    """
+    Get the total cost of running a benchmark where `new_trials` have just
+    completed, and the cost up to that point was `previous_cost`.
+
+    If a backend simulator is used to track runtime the cost is just the
+    simulated time. If there is no backend simulator, it is still possible that
+    trials have varying runtimes without that being simulated, so in that case,
+    runtimes are computed.
+    """
+    runner = assert_is_instance(experiment.runner, BenchmarkRunner)
+    if runner.simulated_backend_runner is not None:
+        return runner.simulated_backend_runner.simulator.time
+
+    per_trial_times = (
+        get_total_runtime(
+            trial=experiment.trials[i],
+            step_runtime_function=runner.step_runtime_function,
+            n_steps=runner.test_function.n_steps,
+        )
+        for i in new_trials
+    )
+    return previous_cost + sum(per_trial_times)
+
+
 def benchmark_replication(
     problem: BenchmarkProblem,
     method: BenchmarkMethod,
@@ -284,16 +297,22 @@ def benchmark_replication(
         options=scheduler_options,
     )
 
-    # list of parameters for each trial
-    best_params_by_trial: list[list[TParameterization]] = []
+    # Each of these lists is added to when a trial completes or stops early.
+    # Since multiple trials can complete at once, there may be fewer elements in
+    # these traces than the number of trials run.
+    cost_trace: list[float] = []
+    best_params_list: list[Mapping[str, TParamValue]] = []  # For inference trace
+    evaluated_arms_list: list[set[Arm]] = []  # For oracle trace
 
     is_mf_or_mt = len(problem.target_fidelity_and_task) > 0
-    trials_used_for_best_point: set[int] = set()
 
     # Run the optimization loop.
     timeout_hours = method.timeout_hours
     remaining_hours = timeout_hours
 
+    previously_completed_trials = set()
+    cost = 0.0
+
     with with_rng_seed(seed=seed), warnings.catch_warnings():
         warnings.filterwarnings(
             "ignore",
@@ -302,28 +321,15 @@
             module="ax.modelbridge.cross_validation",
         )
         start = monotonic()
-        # These next several lines do the same thing as `run_n_trials`, but
+        # These next several lines do the same thing as
+        # `scheduler.run_n_trials`, but
         # decrement the timeout with each step, so that the timeout refers to
         # the total time spent in the optimization loop, not time per trial.
         scheduler.poll_and_process_results()
         for _ in scheduler.run_trials_and_yield_results(
             max_trials=problem.num_trials,
             timeout_hours=remaining_hours,
         ):
-            if timeout_hours is not None:
-                elapsed_hours = (monotonic() - start) / 3600
-                remaining_hours = timeout_hours - elapsed_hours
-                if remaining_hours <= 0.0:
-                    logger.warning("The optimization loop timed out.")
-                    break
-
-            if problem.is_moo or is_mf_or_mt:
-                # Inference trace is not supported for MOO.
-                # It's also not supported for multi-fidelity or multi-task
-                # problems, because Ax's best-point functionality doesn't know
-                # to predict at the target task or fidelity.
-                continue
-
             currently_completed_trials = {
                 t.index
                 for t in experiment.trials.values()
@@ -334,45 +340,70 @@
                 )
             }
             newly_completed_trials = (
-                currently_completed_trials - trials_used_for_best_point
-            )
-            if len(newly_completed_trials) == 0:
-                continue
-            for t in newly_completed_trials:
-                trials_used_for_best_point.add(t)
-
-            best_params = method.get_best_parameters(
-                experiment=experiment,
-                optimization_config=problem.optimization_config,
-                n_points=problem.n_best_points,
+                currently_completed_trials - previously_completed_trials
             )
-            # If multiple trials complete at the same time, add that number of
-            # points to the inference trace so that the trace has length equal to
-            # the number of trials.
-            for _ in newly_completed_trials:
-                best_params_by_trial.append(best_params)
+            previously_completed_trials = currently_completed_trials
+
+            if len(newly_completed_trials) > 0:
+                cost = _get_cumulative_cost(
+                    new_trials=newly_completed_trials,
+                    experiment=experiment,
+                    previous_cost=cost,
+                )
+                cost_trace.append(cost)
+
+                # Track what params are newly evaluated from those trials, for
+                # the oracle trace
+                params = {
+                    arm
+                    for i in newly_completed_trials
+                    for arm in experiment.trials[i].arms
+                }
+                evaluated_arms_list.append(params)
+
+                # Inference trace: Not supported for MOO.
+                # It's also not supported for multi-fidelity or multi-task
+                # problems, because Ax's best-point functionality doesn't know
+                # to predict at the target task or fidelity.
+                if not (problem.is_moo or is_mf_or_mt):
+                    best_params = method.get_best_parameters(
+                        experiment=experiment,
+                        optimization_config=problem.optimization_config,
+                        n_points=problem.n_best_points,
+                    )[0]
+                    best_params_list.append(best_params)
+
+            if timeout_hours is not None:
+                elapsed_hours = (monotonic() - start) / 3600
+                remaining_hours = timeout_hours - elapsed_hours
+                if remaining_hours <= 0.0:
+                    logger.warning("The optimization loop timed out.")
+                    break
 
     scheduler.summarize_final_result()
 
     # Construct inference trace from best parameters
-    inference_trace = np.full(problem.num_trials, np.nan)
-    for trial_index, best_params in enumerate(best_params_by_trial):
-        if len(best_params) == 0:
-            inference_trace[trial_index] = np.nan
-            continue
-        # Construct an experiment with one BatchTrial
-        best_params_oracle_experiment = get_oracle_experiment_from_params(
-            problem=problem,
-            dict_of_dict_of_params={0: {str(i): p for i, p in enumerate(best_params)}},
+    single_params_as_experiments = (
+        get_oracle_experiment_from_params(
+            problem=problem, dict_of_dict_of_params={0: {"0_0": params}}
         )
-        # Get the optimization trace. It will have only one point.
-        inference_trace[trial_index] = BestPointMixin._get_trace(
-            experiment=best_params_oracle_experiment,
-            optimization_config=problem.optimization_config,
-        )[0]
+        for params in best_params_list
+    )
+    inference_trace = np.array(
+        [
+            BestPointMixin._get_trace(
+                experiment=exp, optimization_config=problem.optimization_config
+            )[0]
+            for exp in single_params_as_experiments
+        ]
+    )
 
-    actual_params_oracle_experiment = get_oracle_experiment_from_experiment(
-        problem=problem, experiment=experiment
+    actual_params_oracle_experiment = get_oracle_experiment_from_params(
+        problem=problem,
+        dict_of_dict_of_params={
            i: {arm.name: arm.parameters for arm in arms}
            for i, arms in enumerate(evaluated_arms_list)
        },
     )
     oracle_trace = np.array(
         BestPointMixin._get_trace(
@@ -404,6 +435,7 @@ def benchmark_replication(
         inference_trace=inference_trace,
         optimization_trace=optimization_trace,
         score_trace=score_trace,
+        cost_trace=np.array(cost_trace),
         fit_time=fit_time,
         gen_time=gen_time,
     )
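For intuition about the bookkeeping in the loop above, here is a toy, Ax-free sketch of how `cost_trace` and the per-completion lists stay aligned when several trials finish in the same polling step. The trial runtimes and polling results are made up, and the cost is simply the sum of per-trial runtimes (mirroring the fallback path when no backend simulator is present).

```python
# Toy illustration only; no Ax dependency. Trials are keyed by index, each
# with a runtime (its cost contribution in the non-simulated fallback).
trial_runtime = {0: 3.0, 1: 1.0, 2: 2.0}  # trial 0 finishes last

# Each polling step reports the set of trials that have ceased running so far;
# trials 0 and 2 finish in the same step.
polls = [{1}, {1}, {0, 1, 2}]

previously_completed: set[int] = set()
cost = 0.0
cost_trace: list[float] = []
completion_groups: list[set[int]] = []  # stands in for evaluated_arms_list

for currently_completed in polls:
    newly_completed = currently_completed - previously_completed
    previously_completed = currently_completed
    if not newly_completed:
        continue  # nothing finished since the last poll; traces unchanged
    cost += sum(trial_runtime[i] for i in newly_completed)
    cost_trace.append(cost)
    completion_groups.append(newly_completed)

print(cost_trace)         # [1.0, 6.0] -- two trace elements for three trials
print(completion_groups)  # [{1}, {0, 2}]
```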

ax/benchmark/benchmark_result.py

Lines changed: 43 additions & 25 deletions
@@ -32,40 +32,57 @@ class BenchmarkResult(Base):
         name: Name of the benchmark. Should make it possible to determine the
             problem and the method.
         seed: Seed used for determinism.
-        oracle_trace: For single-objective problems, element i of the
-            optimization trace is the best oracle value of the arms evaluated
-            after the first i trials. For multi-objective problems, element i
-            of the optimization trace is the hypervolume of the oracle values of
-            the arms in the first i trials (which may be ``BatchTrial``s).
-            Oracle values are typically ground-truth (rather than noisy) and
-            evaluated at the target task and fidelity.
-        inference_trace: Inference trace comes from choosing a "best" point
-            based only on data that would be observable in realistic settings,
-            as specified by `BenchmarkMethod.get_best_parameters`,
-            and then evaluating the oracle value of that point according to the
-            problem's `OptimizationConfig`. For multi-objective problems, the
-            hypervolume of a set of points is considered.
+        oracle_trace: For single-objective problems, the oracle trace is the
+            cumulative best oracle objective value seen so far. For
+            multi-objective problems, it is the cumulative hypervolume of
+            feasible oracle objective values.
+
+            Oracle values are typically objective values that are at the ground
+            truth (not noisy) and evaluated at the target task and fidelity.
+
+            The trace may have fewer elements than the number of trials run if
+            multiple trials stop at the same time; the trace is updated whenever
+            trials stop (TrialStatus COMPLETED or EARLY_STOPPED). The number of
+            trials completed is reflected in the `cost_trace`, which is updated
+            at the same time as the `oracle_trace`. For example, if each trial
+            has a cost of 1, and `cost_trace[i] = 4`, then `oracle_trace[i]` is
+            the value of the best of the first four trials to complete, or the
+            feasible hypervolume of those trials.
+        inference_trace: Inference values come from choosing a "best" point or
+            points based only on data that would be observable in realistic
+            settings, as specified by `BenchmarkMethod.get_best_parameters`, and
+            then evaluating the oracle objective value of that point according
+            to the problem's `OptimizationConfig`.
 
             By default, if it is not overridden,
             `BenchmarkMethod.get_best_parameters` uses the empirical best point
             if `use_model_predictions_for_best_point` is False and the best
             point of those evaluated so far if it is True.
 
-            Note: This is not "inference regret", which is a lower-is-better value
-                that is relative to the best possible value. The inference value
-                trace is higher-is-better if the problem is a maximization problem
-                or if the problem is multi-objective (in which case hypervolume is
-                used). Hence, it is signed the same as ``oracle_trace`` and
-                ``optimization_trace``. ``score_trace`` is higher-is-better and
-                relative to the optimum.
-        optimization_trace: Either the ``oracle_trace`` or the
-            ``inference_trace``, depending on whether the ``BenchmarkProblem``
-            specifies ``report_inference_value``. Having ``optimization_trace``
-            specified separately is useful when we need just one value to
-            evaluate how well the benchmark went.
+            As with the oracle trace, the inference trace is updated whenever a
+            trial completes and may have fewer elements than the number of trials.
+
+            Note: This is scaled differently from "inference regret", which is a
+            lower-is-better value that is relative to the best possible value.
+            The inference value trace is higher-is-better if the problem is a
+            maximization problem or if the problem is multi-objective (in which
+            case hypervolume is used). Hence, it is signed the same as
+            `oracle_trace` and `optimization_trace`. `score_trace`, meanwhile,
+            is higher-is-better and relative to the optimum.
+        optimization_trace: Either the `oracle_trace` or the `inference_trace`,
+            depending on whether the `BenchmarkProblem` specifies
+            `report_inference_value`. Having `optimization_trace` specified
+            separately is useful when we need just one value to evaluate how
+            well the benchmark went.
         score_trace: The scores associated with the problem, typically either
             the optimization_trace or inference_value_trace normalized to a
             0-100 scale for comparability between problems.
+        cost_trace: The cumulative cost of completed trials. The `cost_trace` is
+            updated whenever a trial completes, so, like the `oracle_trace` and
+            `inference_trace`, it can have fewer elements than the number of
+            trials if multiple trials complete at the same time. Trials that do
+            not produce `MapData` have a cost of 1, and trials that produce
+            `MapData` have a cost equal to the length of the `MapData`.
         fit_time: Total time spent fitting models.
         gen_time: Total time spent generating candidates.
         experiment: If not ``None``, the Ax experiment associated with the
@@ -81,6 +98,7 @@ class BenchmarkResult(Base):
     inference_trace: npt.NDArray
     optimization_trace: npt.NDArray
     score_trace: npt.NDArray
+    cost_trace: npt.NDArray
 
     fit_time: float
     gen_time: float
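With the new `cost_trace` field, a replication's traces are naturally plotted against cumulative cost rather than trial index. A minimal plotting sketch, assuming matplotlib is available and using made-up trace values in place of a real `BenchmarkResult`:

```python
import matplotlib.pyplot as plt
import numpy as np

# Made-up values standing in for result.cost_trace and result.oracle_trace.
cost_trace = np.array([1.0, 3.0])
oracle_trace = np.array([0.7, 0.95])

# "post" steps: the best-seen value holds until the next completion.
plt.step(cost_trace, oracle_trace, where="post")
plt.xlabel("cumulative cost (simulated time or summed trial runtimes)")
plt.ylabel("best oracle value so far")
plt.title("Oracle trace vs. cost, one replication")
plt.show()
```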

0 commit comments
