From 80913864a234cc4860630c6de8f3b26875a6ecbf Mon Sep 17 00:00:00 2001
From: Elizabeth Santorella
Date: Fri, 21 Feb 2025 16:09:16 -0800
Subject: [PATCH] Track cost; order oracle trace by completion order

Summary: I am not sure if this is what we will want in the long run, but it will unblock benchmarking early stopping.

# What's wrong with the current behavior

**Ordering by start order vs. completion order:** Currently, the oracle trace is ordered by trial order and has one entry for each trial. The inference trace has always been ordered by completion order because it is updated every time a trial ceases running. The order of completion (including early stopping) seems preferable for both, and it's a little weird for the oracle trace to have a different ordering than the inference trace. See here for discussion on this: https://fb.workplace.com/groups/1294299434097422/posts/2563368300523856

**Inability to compare more costly vs. less costly strategies**: Separately, tracking cost is necessary to fairly compare more aggressive vs. less aggressive early-stopping strategies, or to compare stopping early against not stopping at all.

I am bundling these two changes (reordering the oracle trace and introducing cost) because the oracle trace should now only be compared against the cost. Ordering by completion order doesn't make a lot of sense without a notion of cost when multiple trials can complete at the same time.

# New behavior

| time | first trial running | second trial running | objective values | best point   |
| ---- | ------------------- | -------------------- | ---------------- | ------------ |
| 0    | 0                   | 1                    |                  |              |
| 1    | 0                   | 2                    | y_1              | y_a          |
| 2    | 0                   | 2                    | y_1              | not computed |
| 3    | 0                   | 2                    | y_1, y_0, y_2    | y_b          |

Assuming higher is better, this produces

```
BenchmarkResult:
    cost_trace: [1, 3]
    oracle_trace: [y_1, max(y_1, y_0, y_2)]
    inference_trace: [y_a, y_b]
```

Now traces are only updated when a trial completes, so there are 2 trace elements for 3 trials. (We could also just duplicate elements when multiple trials complete at the same time to preserve the length.) See docstrings for more detail.

# What's not ideal about this

I want to flag that a few things are not great about this setup.
* It makes plotting hard: If one replication produces a cost_trace of [3, 5] and another produces a cost_trace of [2, 6], how do we aggregate their optimization traces? We can do this by left-interpolating the optimization traces onto [2, 3, ..., 6] and then aggregating as usual, but it is clunky (see the sketch after this list).
* Even aside from the issue of different replications producing different cost traces, plotting is harder because it must now be done against cost.
* People are typically interested in epoch-by-epoch results for early stopping, and those are not available here.
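To make the aggregation problem concrete, here is a rough sketch (illustration only, not part of this diff) of the left interpolation described above. It assumes numpy arrays shaped like the `cost_trace` and `optimization_trace` fields of `BenchmarkResult`, uses the union of completion costs as the common grid (a denser grid such as [2, 3, ..., 6] works the same way), and the helper name `interpolate_trace_onto_costs` is made up for illustration:

```
import numpy as np


def interpolate_trace_onto_costs(
    cost_trace: np.ndarray, optimization_trace: np.ndarray, cost_grid: np.ndarray
) -> np.ndarray:
    """At each grid cost, take the value of the last completion at or before
    that cost ("left" interpolation); NaN if nothing has completed yet."""
    # Index of the most recent completion with cost <= grid point (-1 if none).
    idx = np.searchsorted(cost_trace, cost_grid, side="right") - 1
    out = np.full(cost_grid.shape, np.nan)
    has_value = idx >= 0
    out[has_value] = optimization_trace[idx[has_value]]
    return out


# Two hypothetical replications with different cost traces.
cost_traces = [np.array([3.0, 5.0]), np.array([2.0, 6.0])]
opt_traces = [np.array([0.7, 0.9]), np.array([0.6, 1.0])]

# Common grid: every cost at which any replication recorded a completion.
grid = np.unique(np.concatenate(cost_traces))
aligned = np.vstack(
    [
        interpolate_trace_onto_costs(c, t, grid)
        for c, t in zip(cost_traces, opt_traces)
    ]
)
# Aggregate as usual, e.g. take the mean over replications at each cost.
mean_trace = np.nanmean(aligned, axis=0)
```

# Better long-term solution

Two alternatives are
* Storing trace values for each time step, which would remove the need to track cost at all: element `i` of the trace would have happened at virtual second `i`.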
* Storing cost/time information at each step in MapData, and then deriving a proper trace from there (we may already have this -- need to check) # Internal: Differential Revision: D69489720 --- ax/benchmark/benchmark.py | 170 ++++++++++++++++----------- ax/benchmark/benchmark_result.py | 68 +++++++---- ax/benchmark/tests/test_benchmark.py | 74 ++++++------ ax/utils/testing/benchmark_stubs.py | 1 + 4 files changed, 180 insertions(+), 133 deletions(-) diff --git a/ax/benchmark/benchmark.py b/ax/benchmark/benchmark.py index 22513d32dea..946b8bffa70 100644 --- a/ax/benchmark/benchmark.py +++ b/ax/benchmark/benchmark.py @@ -30,7 +30,7 @@ from ax.benchmark.benchmark_method import BenchmarkMethod from ax.benchmark.benchmark_problem import BenchmarkProblem from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult -from ax.benchmark.benchmark_runner import BenchmarkRunner +from ax.benchmark.benchmark_runner import BenchmarkRunner, get_total_runtime from ax.benchmark.benchmark_test_function import BenchmarkTestFunction from ax.benchmark.methods.sobol import get_sobol_generation_strategy from ax.core.arm import Arm @@ -39,13 +39,14 @@ from ax.core.optimization_config import OptimizationConfig from ax.core.search_space import SearchSpace from ax.core.trial_status import TrialStatus -from ax.core.types import TParameterization, TParamValue +from ax.core.types import TParamValue from ax.core.utils import get_model_times from ax.service.scheduler import Scheduler from ax.service.utils.best_point_mixin import BestPointMixin from ax.service.utils.scheduler_options import SchedulerOptions, TrialType from ax.utils.common.logger import DEFAULT_LOG_LEVEL, get_logger from ax.utils.common.random import with_rng_seed +from pyre_extensions import assert_is_instance logger: Logger = get_logger(__name__) @@ -172,23 +173,6 @@ def get_oracle_experiment_from_params( return experiment -def get_oracle_experiment_from_experiment( - problem: BenchmarkProblem, experiment: Experiment -) -> Experiment: - """ - Get an ``Experiment`` that is the same as the original experiment but has - metrics evaluated at oracle values (noiseless ground-truth values - evaluated at the target task and fidelity) - """ - return get_oracle_experiment_from_params( - problem=problem, - dict_of_dict_of_params={ - trial.index: {arm.name: arm.parameters for arm in trial.arms} - for trial in experiment.trials.values() - }, - ) - - def get_benchmark_scheduler_options( method: BenchmarkMethod, include_sq: bool = False, @@ -225,6 +209,35 @@ def get_benchmark_scheduler_options( ) +def _get_cumulative_cost( + previous_cost: float, + new_trials: set[int], + experiment: Experiment, +) -> float: + """ + Get the total cost of running a benchmark where `new_trials` have just + completed, and the cost up to that point was `previous_cost`. + + If a backend simulator is used to track runtime the cost is just the + simulated time. If there is no backend simulator, it is still possible that + trials have varying runtimes without that being simulated, so in that case, + runtimes are computed. 
+ """ + runner = assert_is_instance(experiment.runner, BenchmarkRunner) + if runner.simulated_backend_runner is not None: + return runner.simulated_backend_runner.simulator.time + + per_trial_times = ( + get_total_runtime( + trial=experiment.trials[i], + step_runtime_function=runner.step_runtime_function, + n_steps=runner.test_function.n_steps, + ) + for i in new_trials + ) + return previous_cost + sum(per_trial_times) + + def benchmark_replication( problem: BenchmarkProblem, method: BenchmarkMethod, @@ -284,16 +297,22 @@ def benchmark_replication( options=scheduler_options, ) - # list of parameters for each trial - best_params_by_trial: list[list[TParameterization]] = [] + # Each of these lists is added to when a trial completes or stops early. + # Since multiple trials can complete at once, there may be fewer elements in + # these traces than the number of trials run. + cost_trace: list[float] = [] + best_params_list: list[Mapping[str, TParamValue]] = [] # For inference trace + evaluated_arms_list: list[set[Arm]] = [] # For oracle trace is_mf_or_mt = len(problem.target_fidelity_and_task) > 0 - trials_used_for_best_point: set[int] = set() # Run the optimization loop. timeout_hours = method.timeout_hours remaining_hours = timeout_hours + previously_completed_trials = set() + cost = 0.0 + with with_rng_seed(seed=seed), warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -302,7 +321,8 @@ def benchmark_replication( module="ax.modelbridge.cross_validation", ) start = monotonic() - # These next several lines do the same thing as `run_n_trials`, but + # These next several lines do the same thing as + # `scheduler.run_n_trials`, but # decrement the timeout with each step, so that the timeout refers to # the total time spent in the optimization loop, not time per trial. scheduler.poll_and_process_results() @@ -310,20 +330,6 @@ def benchmark_replication( max_trials=problem.num_trials, timeout_hours=remaining_hours, ): - if timeout_hours is not None: - elapsed_hours = (monotonic() - start) / 3600 - remaining_hours = timeout_hours - elapsed_hours - if remaining_hours <= 0.0: - logger.warning("The optimization loop timed out.") - break - - if problem.is_moo or is_mf_or_mt: - # Inference trace is not supported for MOO. - # It's also not supported for multi-fidelity or multi-task - # problems, because Ax's best-point functionality doesn't know - # to predict at the target task or fidelity. - continue - currently_completed_trials = { t.index for t in experiment.trials.values() @@ -334,45 +340,70 @@ def benchmark_replication( ) } newly_completed_trials = ( - currently_completed_trials - trials_used_for_best_point - ) - if len(newly_completed_trials) == 0: - continue - for t in newly_completed_trials: - trials_used_for_best_point.add(t) - - best_params = method.get_best_parameters( - experiment=experiment, - optimization_config=problem.optimization_config, - n_points=problem.n_best_points, + currently_completed_trials - previously_completed_trials ) - # If multiple trials complete at the same time, add that number of - # points to the inference trace so that the trace has length equal to - # the number of trials. 
- for _ in newly_completed_trials: - best_params_by_trial.append(best_params) + previously_completed_trials = currently_completed_trials + + if len(newly_completed_trials) > 0: + cost = _get_cumulative_cost( + new_trials=newly_completed_trials, + experiment=experiment, + previous_cost=cost, + ) + cost_trace.append(cost) + + # Track what params are newly evaluated from those trials, for + # the oracle trace + params = { + arm + for i in newly_completed_trials + for arm in experiment.trials[i].arms + } + evaluated_arms_list.append(params) + + # Inference trace: Not supported for MOO. + # It's also not supported for multi-fidelity or multi-task + # problems, because Ax's best-point functionality doesn't know + # to predict at the target task or fidelity. + if not (problem.is_moo or is_mf_or_mt): + best_params = method.get_best_parameters( + experiment=experiment, + optimization_config=problem.optimization_config, + n_points=problem.n_best_points, + )[0] + best_params_list.append(best_params) + + if timeout_hours is not None: + elapsed_hours = (monotonic() - start) / 3600 + remaining_hours = timeout_hours - elapsed_hours + if remaining_hours <= 0.0: + logger.warning("The optimization loop timed out.") + break scheduler.summarize_final_result() # Construct inference trace from best parameters - inference_trace = np.full(problem.num_trials, np.nan) - for trial_index, best_params in enumerate(best_params_by_trial): - if len(best_params) == 0: - inference_trace[trial_index] = np.nan - continue - # Construct an experiment with one BatchTrial - best_params_oracle_experiment = get_oracle_experiment_from_params( - problem=problem, - dict_of_dict_of_params={0: {str(i): p for i, p in enumerate(best_params)}}, + single_params_as_experiments = ( + get_oracle_experiment_from_params( + problem=problem, dict_of_dict_of_params={0: {"0_0": params}} ) - # Get the optimization trace. It will have only one point. - inference_trace[trial_index] = BestPointMixin._get_trace( - experiment=best_params_oracle_experiment, - optimization_config=problem.optimization_config, - )[0] + for params in best_params_list + ) + inference_trace = np.array( + [ + BestPointMixin._get_trace( + experiment=exp, optimization_config=problem.optimization_config + )[0] + for exp in single_params_as_experiments + ] + ) - actual_params_oracle_experiment = get_oracle_experiment_from_experiment( - problem=problem, experiment=experiment + actual_params_oracle_experiment = get_oracle_experiment_from_params( + problem=problem, + dict_of_dict_of_params={ + i: {arm.name: arm.parameters for arm in arms} + for i, arms in enumerate(evaluated_arms_list) + }, ) oracle_trace = np.array( BestPointMixin._get_trace( @@ -404,6 +435,7 @@ def benchmark_replication( inference_trace=inference_trace, optimization_trace=optimization_trace, score_trace=score_trace, + cost_trace=np.array(cost_trace), fit_time=fit_time, gen_time=gen_time, ) diff --git a/ax/benchmark/benchmark_result.py b/ax/benchmark/benchmark_result.py index 538f3986c0c..dc6cf940541 100644 --- a/ax/benchmark/benchmark_result.py +++ b/ax/benchmark/benchmark_result.py @@ -32,40 +32,57 @@ class BenchmarkResult(Base): name: Name of the benchmark. Should make it possible to determine the problem and the method. seed: Seed used for determinism. - oracle_trace: For single-objective problems, element i of the - optimization trace is the best oracle value of the arms evaluated - after the first i trials. 
For multi-objective problems, element i - of the optimization trace is the hypervolume of the oracle values of - the arms in the first i trials (which may be ``BatchTrial``s). - Oracle values are typically ground-truth (rather than noisy) and - evaluated at the target task and fidelity. - inference_trace: Inference trace comes from choosing a "best" point - based only on data that would be observable in realistic settings, - as specified by `BenchmarkMethod.get_best_parameters`, - and then evaluating the oracle value of that point according to the - problem's `OptimizationConfig`. For multi-objective problems, the - hypervolume of a set of points is considered. + oracle_trace: For single-objective problems, the oracle trace is the + cumulative best oracle objective value seen so far. For + multi-objective problems, it is the cumulative hypervolume of + feasible oracle objective values. + + Oracle values are typically objective values that are at the ground + truth (not noisy) and evaluated at the target task and fidelity. + + The trace may have fewer elements than the number of trials run if + multiple trials stop at the same time; the trace is updated whenever + trials stop (TrialStatus COMPLETED or EARLY_STOPPED). The number of + trials completed is reflected in the `cost_trace`, which is updated + at the same time as the `oracle_trace`. For example, if each trial + has a cost of 1, and `cost_trace[i] = 4`, then `oracle_trace[i]` is + the value of the best of the first four trials to complete, or the + feasible hypervolume of those trials. + inference_trace: Inference values come from choosing a "best" point or + points based only on data that would be observable in realistic + settings, as specified by `BenchmarkMethod.get_best_parameters`, and + then evaluating the oracle objective value of that point according + to the problem's `OptimizationConfig`. By default, if it is not overridden, `BenchmarkMethod.get_best_parameters` uses the empirical best point if `use_model_predictions_for_best_point` is False and the best point of those evaluated so far if it is True. - Note: This is not "inference regret", which is a lower-is-better value - that is relative to the best possible value. The inference value - trace is higher-is-better if the problem is a maximization problem - or if the problem is multi-objective (in which case hypervolume is - used). Hence, it is signed the same as ``oracle_trace`` and - ``optimization_trace``. ``score_trace`` is higher-is-better and - relative to the optimum. - optimization_trace: Either the ``oracle_trace`` or the - ``inference_trace``, depending on whether the ``BenchmarkProblem`` - specifies ``report_inference_value``. Having ``optimization_trace`` - specified separately is useful when we need just one value to - evaluate how well the benchmark went. + As with the oracle trace, the inference trace is updated whenever a + trial completes and may have fewer elements than the number of trials. + + Note: This is scaled differently from "inference regret", which is a + lower-is-better value that is relative to the best possible value. + The inference value trace is higher-is-better if the problem is a + maximization problem or if the problem is multi-objective (in which + case hypervolume is used). Hence, it is signed the same as + `oracle_trace` and `optimization_trace`. `score_trace`, meanwhile, + is higher-is-better and relative to the optimum. 
+ optimization_trace: Either the `oracle_trace` or the `inference_trace`, + depending on whether the `BenchmarkProblem` specifies + `report_inference_value`. Having `optimization_trace` specified + separately is useful when we need just one value to evaluate how + well the benchmark went. score_trace: The scores associated with the problem, typically either the optimization_trace or inference_value_trace normalized to a 0-100 scale for comparability between problems. + cost_trace: The cumulative cost of completed trials. The `cost_trace` is + updated whenever a trial completes, so, like the `oracle_trace` and + `inference_trace`, it can have fewer elements than the number of + trials if multiple trials complete at the same time. Trials that do + not produce `MapData` have a cost of 1, and trials that produce + `MapData` have a cost equal to the length of the `MapData`. fit_time: Total time spent fitting models. gen_time: Total time spent generating candidates. experiment: If not ``None``, the Ax experiment associated with the @@ -81,6 +98,7 @@ class BenchmarkResult(Base): inference_trace: npt.NDArray optimization_trace: npt.NDArray score_trace: npt.NDArray + cost_trace: npt.NDArray fit_time: float gen_time: float diff --git a/ax/benchmark/tests/test_benchmark.py b/ax/benchmark/tests/test_benchmark.py index 18231060042..ec4c207d7df 100644 --- a/ax/benchmark/tests/test_benchmark.py +++ b/ax/benchmark/tests/test_benchmark.py @@ -22,7 +22,6 @@ compute_baseline_value_from_sobol, compute_score_trace, get_benchmark_scheduler_options, - get_oracle_experiment_from_experiment, get_oracle_experiment_from_params, ) from ax.benchmark.benchmark_method import BenchmarkMethod @@ -71,7 +70,7 @@ TestDataset, ) -from ax.utils.testing.core_stubs import get_branin_experiment, get_experiment +from ax.utils.testing.core_stubs import get_experiment from ax.utils.testing.mock import mock_botorch_optimize from botorch.acquisition.knowledge_gradient import qKnowledgeGradient from botorch.acquisition.logei import qLogNoisyExpectedImprovement @@ -163,6 +162,7 @@ def test_benchmark_result_invalid_inputs(self) -> None: oracle_trace=np.array([]), optimization_trace=np.array([]), score_trace=np.array([]), + cost_trace=np.array([]), fit_time=0.0, gen_time=0.0, experiment=get_experiment(), @@ -179,6 +179,7 @@ def test_benchmark_result_invalid_inputs(self) -> None: oracle_trace=np.array([]), optimization_trace=np.array([]), score_trace=np.array([]), + cost_trace=np.array([]), fit_time=0.0, gen_time=0.0, ) @@ -321,14 +322,25 @@ def _test_replication_async(self, map_data: bool) -> None: } # When two trials complete at the same time, the inference trace uses # data from both to get the best point, and repeats it. - # The oracle trace is the same. 
- expected_inference_traces = { + expected_traces = { "All complete at different times": [0, 1, 2, 3], # 0 and 1 complete at the same time, as do 2 and 3 - "Trials complete immediately": [1, 1, 3, 3], - "Trials complete at same time": [1, 1, 3, 3], + "Trials complete immediately": [1, 3], + "Trials complete at same time": [1, 3], "Complete out of order": [1, 1, 3, 3], } + expected_costs = { + "All complete at different times": [1, 3, 7, 12], + "Trials complete immediately": [1, 2], + "Trials complete at same time": [1, 2], + "Complete out of order": [1, 2, 3, 4], + } + expected_backend_simulator_time = { + "All complete at different times": 12, + "Trials complete immediately": 2, + "Trials complete at same time": 2, + "Complete out of order": 4, + } for case_name, step_runtime_fn in step_runtime_fns.items(): with self.subTest(case_name, step_runtime_fn=step_runtime_fn): @@ -368,6 +380,11 @@ def _test_replication_async(self, map_data: bool) -> None: backend_simulator = none_throws( runner.simulated_backend_runner ).simulator + self.assertEqual( + backend_simulator.time, + expected_backend_simulator_time[case_name], + msg=case_name, + ) completed_trials = backend_simulator.state().completed self.assertEqual(len(completed_trials), 4) for trial_index, expected_start_time in enumerate( @@ -394,7 +411,18 @@ def _test_replication_async(self, map_data: bool) -> None: self.assertFalse(np.isnan(result.inference_trace).any()) self.assertEqual( result.inference_trace.tolist(), - expected_inference_traces[case_name], + expected_traces[case_name], + msg=case_name, + ) + self.assertEqual( + result.oracle_trace.tolist(), + expected_traces[case_name], + msg=case_name, + ) + self.assertEqual( + result.cost_trace.tolist(), + expected_costs[case_name], + msg=case_name, ) if map_data: data = assert_is_instance(experiment.lookup_data(), MapData) @@ -863,38 +891,6 @@ def test_get_oracle_experiment_from_params(self) -> None: problem=problem, dict_of_dict_of_params={0: {}} ) - def test_get_oracle_experiment_from_experiment(self) -> None: - problem = create_problem_from_botorch( - test_problem_class=Branin, - test_problem_kwargs={}, - num_trials=5, - ) - - # empty experiment - empty_experiment = get_branin_experiment(with_trial=False) - oracle_experiment = get_oracle_experiment_from_experiment( - problem=problem, experiment=empty_experiment - ) - self.assertEqual(oracle_experiment.search_space, problem.search_space) - self.assertEqual( - oracle_experiment.optimization_config, problem.optimization_config - ) - self.assertEqual(oracle_experiment.trials.keys(), set()) - - experiment = get_branin_experiment( - with_trial=True, - search_space=problem.search_space, - with_status_quo=False, - ) - oracle_experiment = get_oracle_experiment_from_experiment( - problem=problem, experiment=experiment - ) - self.assertEqual(oracle_experiment.search_space, problem.search_space) - self.assertEqual( - oracle_experiment.optimization_config, problem.optimization_config - ) - self.assertEqual(oracle_experiment.trials.keys(), experiment.trials.keys()) - def _test_multi_fidelity_or_multi_task(self, fidelity_or_task: str) -> None: """ Args: diff --git a/ax/utils/testing/benchmark_stubs.py b/ax/utils/testing/benchmark_stubs.py index c493d70b362..77fd711d942 100644 --- a/ax/utils/testing/benchmark_stubs.py +++ b/ax/utils/testing/benchmark_stubs.py @@ -181,6 +181,7 @@ def get_benchmark_result() -> BenchmarkResult: ), inference_trace=np.ones(4), oracle_trace=np.zeros(4), + cost_trace=np.zeros(4), optimization_trace=np.array([3, 2, 1, 0.1]), 
score_trace=np.array([3, 2, 1, 0.1]), fit_time=0.1,