From 80913864a234cc4860630c6de8f3b26875a6ecbf Mon Sep 17 00:00:00 2001
From: Elizabeth Santorella
Date: Fri, 21 Feb 2025 16:09:16 -0800
Subject: [PATCH] Track cost; order oracle trace by completion order

Summary: I am not sure if this is what we will want in the long run, but it will unblock benchmarking early stopping.

# What's wrong with the current behavior

**Ordering by start order vs. completion order:** Currently, the oracle trace is ordered by trial order and has one entry for each trial. The inference trace has always been ordered by completion order because it is updated every time a trial ceases running. The order of completion (including early stopping) seems preferable for both, and it's a little weird for the oracle trace to have a different ordering than the inference trace. See here for discussion on this: https://fb.workplace.com/groups/1294299434097422/posts/2563368300523856

**Inability to compare more costly vs. less costly strategies**: Separately, tracking cost is necessary to fairly compare more aggressive vs. less aggressive early-stopping strategies, or to compare stopping early against not stopping at all.

I am bundling these two changes (reordering the oracle trace and introducing cost) because the oracle trace should now only be compared against the cost. Ordering by completion order doesn't make a lot of sense without a notion of cost when multiple trials can complete at the same time.

# New behavior

| time | first trial running | second trial running | objective values | best point   |
| ---- | ------------------- | -------------------- | ---------------- | ------------ |
| 0    | 0                   | 1                    |                  |              |
| 1    | 0                   | 2                    | y_1              | y_a          |
| 2    | 0                   | 2                    | y_1              | not computed |
| 3    | 0                   | 2                    | y_1, y_0, y_2    | y_b          |

Assuming higher is better, this produces

```
BenchmarkResult:
    cost_trace: [1, 3]
    oracle_trace: [y_1, max(y_1, y_0, y_2)]
    inference_trace: [y_a, y_b]
```

Now traces are only updated when a trial completes, so there are 2 trace elements for 3 trials. (We could also just duplicate elements when multiple trials complete at the same time to preserve the length.) See docstrings for more detail.

# What's not ideal about this

I want to flag that a few things are not great about this setup.
* It makes plotting hard: If one replication produces a cost_trace of [3, 5] and another produces a cost_trace of [2, 6], how do we aggregate their optimization traces? We can do this by left-interpolating the optimization traces onto [2, 3, ..., 6] and then aggregating as usual, but it is clunky (see the sketch after this list).
* Even aside from the issue of different replications producing different cost traces, plotting is harder because it must now be done against cost.
* People are typically interested in epoch-by-epoch results for early stopping, and those are not available here.
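To make the aggregation problem concrete, here is a rough sketch (illustration only, not part of this diff) of the left interpolation described above. It assumes numpy arrays shaped like the `cost_trace` and `optimization_trace` fields of `BenchmarkResult`, uses the union of completion costs as the common grid (a denser grid such as [2, 3, ..., 6] works the same way), and the helper name `interpolate_trace_onto_costs` is made up for illustration:

```
import numpy as np


def interpolate_trace_onto_costs(
    cost_trace: np.ndarray, optimization_trace: np.ndarray, cost_grid: np.ndarray
) -> np.ndarray:
    """At each grid cost, take the value of the last completion at or before
    that cost ("left" interpolation); NaN if nothing has completed yet."""
    # Index of the most recent completion with cost <= grid point (-1 if none).
    idx = np.searchsorted(cost_trace, cost_grid, side="right") - 1
    out = np.full(cost_grid.shape, np.nan)
    has_value = idx >= 0
    out[has_value] = optimization_trace[idx[has_value]]
    return out


# Two hypothetical replications with different cost traces.
cost_traces = [np.array([3.0, 5.0]), np.array([2.0, 6.0])]
opt_traces = [np.array([0.7, 0.9]), np.array([0.6, 1.0])]

# Common grid: every cost at which any replication recorded a completion.
grid = np.unique(np.concatenate(cost_traces))
aligned = np.vstack(
    [
        interpolate_trace_onto_costs(c, t, grid)
        for c, t in zip(cost_traces, opt_traces)
    ]
)
# Aggregate as usual, e.g. take the mean over replications at each cost.
mean_trace = np.nanmean(aligned, axis=0)
```

# Better long-term solution

Two alternatives are
* Storing trace values for each time step, which would remove the need to track cost at all: element `i` of the trace would have happened at virtual second `i`.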
* Storing cost/time information at each step in MapData, and then deriving a proper trace from there (we may already have this -- need to check) # Internal: Differential Revision: D69489720 --- ax/benchmark/benchmark.py | 170 ++++++++++++++++----------- ax/benchmark/benchmark_result.py | 68 +++++++---- ax/benchmark/tests/test_benchmark.py | 74 ++++++------ ax/utils/testing/benchmark_stubs.py | 1 + 4 files changed, 180 insertions(+), 133 deletions(-) diff --git a/ax/benchmark/benchmark.py b/ax/benchmark/benchmark.py index 22513d32dea..946b8bffa70 100644 --- a/ax/benchmark/benchmark.py +++ b/ax/benchmark/benchmark.py @@ -30,7 +30,7 @@ from ax.benchmark.benchmark_method import BenchmarkMethod from ax.benchmark.benchmark_problem import BenchmarkProblem from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult -from ax.benchmark.benchmark_runner import BenchmarkRunner +from ax.benchmark.benchmark_runner import BenchmarkRunner, get_total_runtime from ax.benchmark.benchmark_test_function import BenchmarkTestFunction from ax.benchmark.methods.sobol import get_sobol_generation_strategy from ax.core.arm import Arm @@ -39,13 +39,14 @@ from ax.core.optimization_config import OptimizationConfig from ax.core.search_space import SearchSpace from ax.core.trial_status import TrialStatus -from ax.core.types import TParameterization, TParamValue +from ax.core.types import TParamValue from ax.core.utils import get_model_times from ax.service.scheduler import Scheduler from ax.service.utils.best_point_mixin import BestPointMixin from ax.service.utils.scheduler_options import SchedulerOptions, TrialType from ax.utils.common.logger import DEFAULT_LOG_LEVEL, get_logger from ax.utils.common.random import with_rng_seed +from pyre_extensions import assert_is_instance logger: Logger = get_logger(__name__) @@ -172,23 +173,6 @@ def get_oracle_experiment_from_params( return experiment -def get_oracle_experiment_from_experiment( - problem: BenchmarkProblem, experiment: Experiment -) -> Experiment: - """ - Get an ``Experiment`` that is the same as the original experiment but has - metrics evaluated at oracle values (noiseless ground-truth values - evaluated at the target task and fidelity) - """ - return get_oracle_experiment_from_params( - problem=problem, - dict_of_dict_of_params={ - trial.index: {arm.name: arm.parameters for arm in trial.arms} - for trial in experiment.trials.values() - }, - ) - - def get_benchmark_scheduler_options( method: BenchmarkMethod, include_sq: bool = False, @@ -225,6 +209,35 @@ def get_benchmark_scheduler_options( ) +def _get_cumulative_cost( + previous_cost: float, + new_trials: set[int], + experiment: Experiment, +) -> float: + """ + Get the total cost of running a benchmark where `new_trials` have just + completed, and the cost up to that point was `previous_cost`. + + If a backend simulator is used to track runtime the cost is just the + simulated time. If there is no backend simulator, it is still possible that + trials have varying runtimes without that being simulated, so in that case, + runtimes are computed. 
+ """ + runner = assert_is_instance(experiment.runner, BenchmarkRunner) + if runner.simulated_backend_runner is not None: + return runner.simulated_backend_runner.simulator.time + + per_trial_times = ( + get_total_runtime( + trial=experiment.trials[i], + step_runtime_function=runner.step_runtime_function, + n_steps=runner.test_function.n_steps, + ) + for i in new_trials + ) + return previous_cost + sum(per_trial_times) + + def benchmark_replication( problem: BenchmarkProblem, method: BenchmarkMethod, @@ -284,16 +297,22 @@ def benchmark_replication( options=scheduler_options, ) - # list of parameters for each trial - best_params_by_trial: list[list[TParameterization]] = [] + # Each of these lists is added to when a trial completes or stops early. + # Since multiple trials can complete at once, there may be fewer elements in + # these traces than the number of trials run. + cost_trace: list[float] = [] + best_params_list: list[Mapping[str, TParamValue]] = [] # For inference trace + evaluated_arms_list: list[set[Arm]] = [] # For oracle trace is_mf_or_mt = len(problem.target_fidelity_and_task) > 0 - trials_used_for_best_point: set[int] = set() # Run the optimization loop. timeout_hours = method.timeout_hours remaining_hours = timeout_hours + previously_completed_trials = set() + cost = 0.0 + with with_rng_seed(seed=seed), warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -302,7 +321,8 @@ def benchmark_replication( module="ax.modelbridge.cross_validation", ) start = monotonic() - # These next several lines do the same thing as `run_n_trials`, but + # These next several lines do the same thing as + # `scheduler.run_n_trials`, but # decrement the timeout with each step, so that the timeout refers to # the total time spent in the optimization loop, not time per trial. scheduler.poll_and_process_results() @@ -310,20 +330,6 @@ def benchmark_replication( max_trials=problem.num_trials, timeout_hours=remaining_hours, ): - if timeout_hours is not None: - elapsed_hours = (monotonic() - start) / 3600 - remaining_hours = timeout_hours - elapsed_hours - if remaining_hours <= 0.0: - logger.warning("The optimization loop timed out.") - break - - if problem.is_moo or is_mf_or_mt: - # Inference trace is not supported for MOO. - # It's also not supported for multi-fidelity or multi-task - # problems, because Ax's best-point functionality doesn't know - # to predict at the target task or fidelity. - continue - currently_completed_trials = { t.index for t in experiment.trials.values() @@ -334,45 +340,70 @@ def benchmark_replication( ) } newly_completed_trials = ( - currently_completed_trials - trials_used_for_best_point - ) - if len(newly_completed_trials) == 0: - continue - for t in newly_completed_trials: - trials_used_for_best_point.add(t) - - best_params = method.get_best_parameters( - experiment=experiment, - optimization_config=problem.optimization_config, - n_points=problem.n_best_points, + currently_completed_trials - previously_completed_trials ) - # If multiple trials complete at the same time, add that number of - # points to the inference trace so that the trace has length equal to - # the number of trials. 
- for _ in newly_completed_trials: - best_params_by_trial.append(best_params) + previously_completed_trials = currently_completed_trials + + if len(newly_completed_trials) > 0: + cost = _get_cumulative_cost( + new_trials=newly_completed_trials, + experiment=experiment, + previous_cost=cost, + ) + cost_trace.append(cost) + + # Track what params are newly evaluated from those trials, for + # the oracle trace + params = { + arm + for i in newly_completed_trials + for arm in experiment.trials[i].arms + } + evaluated_arms_list.append(params) + + # Inference trace: Not supported for MOO. + # It's also not supported for multi-fidelity or multi-task + # problems, because Ax's best-point functionality doesn't know + # to predict at the target task or fidelity. + if not (problem.is_moo or is_mf_or_mt): + best_params = method.get_best_parameters( + experiment=experiment, + optimization_config=problem.optimization_config, + n_points=problem.n_best_points, + )[0] + best_params_list.append(best_params) + + if timeout_hours is not None: + elapsed_hours = (monotonic() - start) / 3600 + remaining_hours = timeout_hours - elapsed_hours + if remaining_hours <= 0.0: + logger.warning("The optimization loop timed out.") + break scheduler.summarize_final_result() # Construct inference trace from best parameters - inference_trace = np.full(problem.num_trials, np.nan) - for trial_index, best_params in enumerate(best_params_by_trial): - if len(best_params) == 0: - inference_trace[trial_index] = np.nan - continue - # Construct an experiment with one BatchTrial - best_params_oracle_experiment = get_oracle_experiment_from_params( - problem=problem, - dict_of_dict_of_params={0: {str(i): p for i, p in enumerate(best_params)}}, + single_params_as_experiments = ( + get_oracle_experiment_from_params( + problem=problem, dict_of_dict_of_params={0: {"0_0": params}} ) - # Get the optimization trace. It will have only one point. - inference_trace[trial_index] = BestPointMixin._get_trace( - experiment=best_params_oracle_experiment, - optimization_config=problem.optimization_config, - )[0] + for params in best_params_list + ) + inference_trace = np.array( + [ + BestPointMixin._get_trace( + experiment=exp, optimization_config=problem.optimization_config + )[0] + for exp in single_params_as_experiments + ] + ) - actual_params_oracle_experiment = get_oracle_experiment_from_experiment( - problem=problem, experiment=experiment + actual_params_oracle_experiment = get_oracle_experiment_from_params( + problem=problem, + dict_of_dict_of_params={ + i: {arm.name: arm.parameters for arm in arms} + for i, arms in enumerate(evaluated_arms_list) + }, ) oracle_trace = np.array( BestPointMixin._get_trace( @@ -404,6 +435,7 @@ def benchmark_replication( inference_trace=inference_trace, optimization_trace=optimization_trace, score_trace=score_trace, + cost_trace=np.array(cost_trace), fit_time=fit_time, gen_time=gen_time, ) diff --git a/ax/benchmark/benchmark_result.py b/ax/benchmark/benchmark_result.py index 538f3986c0c..dc6cf940541 100644 --- a/ax/benchmark/benchmark_result.py +++ b/ax/benchmark/benchmark_result.py @@ -32,40 +32,57 @@ class BenchmarkResult(Base): name: Name of the benchmark. Should make it possible to determine the problem and the method. seed: Seed used for determinism. - oracle_trace: For single-objective problems, element i of the - optimization trace is the best oracle value of the arms evaluated - after the first i trials. 
For multi-objective problems, element i - of the optimization trace is the hypervolume of the oracle values of - the arms in the first i trials (which may be ``BatchTrial``s). - Oracle values are typically ground-truth (rather than noisy) and - evaluated at the target task and fidelity. - inference_trace: Inference trace comes from choosing a "best" point - based only on data that would be observable in realistic settings, - as specified by `BenchmarkMethod.get_best_parameters`, - and then evaluating the oracle value of that point according to the - problem's `OptimizationConfig`. For multi-objective problems, the - hypervolume of a set of points is considered. + oracle_trace: For single-objective problems, the oracle trace is the + cumulative best oracle objective value seen so far. For + multi-objective problems, it is the cumulative hypervolume of + feasible oracle objective values. + + Oracle values are typically objective values that are at the ground + truth (not noisy) and evaluated at the target task and fidelity. + + The trace may have fewer elements than the number of trials run if + multiple trials stop at the same time; the trace is updated whenever + trials stop (TrialStatus COMPLETED or EARLY_STOPPED). The number of + trials completed is reflected in the `cost_trace`, which is updated + at the same time as the `oracle_trace`. For example, if each trial + has a cost of 1, and `cost_trace[i] = 4`, then `oracle_trace[i]` is + the value of the best of the first four trials to complete, or the + feasible hypervolume of those trials. + inference_trace: Inference values come from choosing a "best" point or + points based only on data that would be observable in realistic + settings, as specified by `BenchmarkMethod.get_best_parameters`, and + then evaluating the oracle objective value of that point according + to the problem's `OptimizationConfig`. By default, if it is not overridden, `BenchmarkMethod.get_best_parameters` uses the empirical best point if `use_model_predictions_for_best_point` is False and the best point of those evaluated so far if it is True. - Note: This is not "inference regret", which is a lower-is-better value - that is relative to the best possible value. The inference value - trace is higher-is-better if the problem is a maximization problem - or if the problem is multi-objective (in which case hypervolume is - used). Hence, it is signed the same as ``oracle_trace`` and - ``optimization_trace``. ``score_trace`` is higher-is-better and - relative to the optimum. - optimization_trace: Either the ``oracle_trace`` or the - ``inference_trace``, depending on whether the ``BenchmarkProblem`` - specifies ``report_inference_value``. Having ``optimization_trace`` - specified separately is useful when we need just one value to - evaluate how well the benchmark went. + As with the oracle trace, the inference trace is updated whenever a + trial completes and may have fewer elements than the number of trials. + + Note: This is scaled differently from "inference regret", which is a + lower-is-better value that is relative to the best possible value. + The inference value trace is higher-is-better if the problem is a + maximization problem or if the problem is multi-objective (in which + case hypervolume is used). Hence, it is signed the same as + `oracle_trace` and `optimization_trace`. `score_trace`, meanwhile, + is higher-is-better and relative to the optimum. 
+ optimization_trace: Either the `oracle_trace` or the `inference_trace`, + depending on whether the `BenchmarkProblem` specifies + `report_inference_value`. Having `optimization_trace` specified + separately is useful when we need just one value to evaluate how + well the benchmark went. score_trace: The scores associated with the problem, typically either the optimization_trace or inference_value_trace normalized to a 0-100 scale for comparability between problems. + cost_trace: The cumulative cost of completed trials. The `cost_trace` is + updated whenever a trial completes, so, like the `oracle_trace` and + `inference_trace`, it can have fewer elements than the number of + trials if multiple trials complete at the same time. Trials that do + not produce `MapData` have a cost of 1, and trials that produce + `MapData` have a cost equal to the length of the `MapData`. fit_time: Total time spent fitting models. gen_time: Total time spent generating candidates. experiment: If not ``None``, the Ax experiment associated with the @@ -81,6 +98,7 @@ class BenchmarkResult(Base): inference_trace: npt.NDArray optimization_trace: npt.NDArray score_trace: npt.NDArray + cost_trace: npt.NDArray fit_time: float gen_time: float diff --git a/ax/benchmark/tests/test_benchmark.py b/ax/benchmark/tests/test_benchmark.py index 18231060042..ec4c207d7df 100644 --- a/ax/benchmark/tests/test_benchmark.py +++ b/ax/benchmark/tests/test_benchmark.py @@ -22,7 +22,6 @@ compute_baseline_value_from_sobol, compute_score_trace, get_benchmark_scheduler_options, - get_oracle_experiment_from_experiment, get_oracle_experiment_from_params, ) from ax.benchmark.benchmark_method import BenchmarkMethod @@ -71,7 +70,7 @@ TestDataset, ) -from ax.utils.testing.core_stubs import get_branin_experiment, get_experiment +from ax.utils.testing.core_stubs import get_experiment from ax.utils.testing.mock import mock_botorch_optimize from botorch.acquisition.knowledge_gradient import qKnowledgeGradient from botorch.acquisition.logei import qLogNoisyExpectedImprovement @@ -163,6 +162,7 @@ def test_benchmark_result_invalid_inputs(self) -> None: oracle_trace=np.array([]), optimization_trace=np.array([]), score_trace=np.array([]), + cost_trace=np.array([]), fit_time=0.0, gen_time=0.0, experiment=get_experiment(), @@ -179,6 +179,7 @@ def test_benchmark_result_invalid_inputs(self) -> None: oracle_trace=np.array([]), optimization_trace=np.array([]), score_trace=np.array([]), + cost_trace=np.array([]), fit_time=0.0, gen_time=0.0, ) @@ -321,14 +322,25 @@ def _test_replication_async(self, map_data: bool) -> None: } # When two trials complete at the same time, the inference trace uses # data from both to get the best point, and repeats it. - # The oracle trace is the same. 
- expected_inference_traces = { + expected_traces = { "All complete at different times": [0, 1, 2, 3], # 0 and 1 complete at the same time, as do 2 and 3 - "Trials complete immediately": [1, 1, 3, 3], - "Trials complete at same time": [1, 1, 3, 3], + "Trials complete immediately": [1, 3], + "Trials complete at same time": [1, 3], "Complete out of order": [1, 1, 3, 3], } + expected_costs = { + "All complete at different times": [1, 3, 7, 12], + "Trials complete immediately": [1, 2], + "Trials complete at same time": [1, 2], + "Complete out of order": [1, 2, 3, 4], + } + expected_backend_simulator_time = { + "All complete at different times": 12, + "Trials complete immediately": 2, + "Trials complete at same time": 2, + "Complete out of order": 4, + } for case_name, step_runtime_fn in step_runtime_fns.items(): with self.subTest(case_name, step_runtime_fn=step_runtime_fn): @@ -368,6 +380,11 @@ def _test_replication_async(self, map_data: bool) -> None: backend_simulator = none_throws( runner.simulated_backend_runner ).simulator + self.assertEqual( + backend_simulator.time, + expected_backend_simulator_time[case_name], + msg=case_name, + ) completed_trials = backend_simulator.state().completed self.assertEqual(len(completed_trials), 4) for trial_index, expected_start_time in enumerate( @@ -394,7 +411,18 @@ def _test_replication_async(self, map_data: bool) -> None: self.assertFalse(np.isnan(result.inference_trace).any()) self.assertEqual( result.inference_trace.tolist(), - expected_inference_traces[case_name], + expected_traces[case_name], + msg=case_name, + ) + self.assertEqual( + result.oracle_trace.tolist(), + expected_traces[case_name], + msg=case_name, + ) + self.assertEqual( + result.cost_trace.tolist(), + expected_costs[case_name], + msg=case_name, ) if map_data: data = assert_is_instance(experiment.lookup_data(), MapData) @@ -863,38 +891,6 @@ def test_get_oracle_experiment_from_params(self) -> None: problem=problem, dict_of_dict_of_params={0: {}} ) - def test_get_oracle_experiment_from_experiment(self) -> None: - problem = create_problem_from_botorch( - test_problem_class=Branin, - test_problem_kwargs={}, - num_trials=5, - ) - - # empty experiment - empty_experiment = get_branin_experiment(with_trial=False) - oracle_experiment = get_oracle_experiment_from_experiment( - problem=problem, experiment=empty_experiment - ) - self.assertEqual(oracle_experiment.search_space, problem.search_space) - self.assertEqual( - oracle_experiment.optimization_config, problem.optimization_config - ) - self.assertEqual(oracle_experiment.trials.keys(), set()) - - experiment = get_branin_experiment( - with_trial=True, - search_space=problem.search_space, - with_status_quo=False, - ) - oracle_experiment = get_oracle_experiment_from_experiment( - problem=problem, experiment=experiment - ) - self.assertEqual(oracle_experiment.search_space, problem.search_space) - self.assertEqual( - oracle_experiment.optimization_config, problem.optimization_config - ) - self.assertEqual(oracle_experiment.trials.keys(), experiment.trials.keys()) - def _test_multi_fidelity_or_multi_task(self, fidelity_or_task: str) -> None: """ Args: diff --git a/ax/utils/testing/benchmark_stubs.py b/ax/utils/testing/benchmark_stubs.py index c493d70b362..77fd711d942 100644 --- a/ax/utils/testing/benchmark_stubs.py +++ b/ax/utils/testing/benchmark_stubs.py @@ -181,6 +181,7 @@ def get_benchmark_result() -> BenchmarkResult: ), inference_trace=np.ones(4), oracle_trace=np.zeros(4), + cost_trace=np.zeros(4), optimization_trace=np.array([3, 2, 1, 0.1]), 
score_trace=np.array([3, 2, 1, 0.1]), fit_time=0.1,