diff --git a/ax/benchmark/benchmark.py b/ax/benchmark/benchmark.py
index 22513d32dea..946b8bffa70 100644
--- a/ax/benchmark/benchmark.py
+++ b/ax/benchmark/benchmark.py
@@ -30,7 +30,7 @@
 from ax.benchmark.benchmark_method import BenchmarkMethod
 from ax.benchmark.benchmark_problem import BenchmarkProblem
 from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult
-from ax.benchmark.benchmark_runner import BenchmarkRunner
+from ax.benchmark.benchmark_runner import BenchmarkRunner, get_total_runtime
 from ax.benchmark.benchmark_test_function import BenchmarkTestFunction
 from ax.benchmark.methods.sobol import get_sobol_generation_strategy
 from ax.core.arm import Arm
@@ -39,13 +39,14 @@
 from ax.core.optimization_config import OptimizationConfig
 from ax.core.search_space import SearchSpace
 from ax.core.trial_status import TrialStatus
-from ax.core.types import TParameterization, TParamValue
+from ax.core.types import TParamValue
 from ax.core.utils import get_model_times
 from ax.service.scheduler import Scheduler
 from ax.service.utils.best_point_mixin import BestPointMixin
 from ax.service.utils.scheduler_options import SchedulerOptions, TrialType
 from ax.utils.common.logger import DEFAULT_LOG_LEVEL, get_logger
 from ax.utils.common.random import with_rng_seed
+from pyre_extensions import assert_is_instance

 logger: Logger = get_logger(__name__)

@@ -172,23 +173,6 @@ def get_oracle_experiment_from_params(
     return experiment


-def get_oracle_experiment_from_experiment(
-    problem: BenchmarkProblem, experiment: Experiment
-) -> Experiment:
-    """
-    Get an ``Experiment`` that is the same as the original experiment but has
-    metrics evaluated at oracle values (noiseless ground-truth values
-    evaluated at the target task and fidelity)
-    """
-    return get_oracle_experiment_from_params(
-        problem=problem,
-        dict_of_dict_of_params={
-            trial.index: {arm.name: arm.parameters for arm in trial.arms}
-            for trial in experiment.trials.values()
-        },
-    )
-
-
 def get_benchmark_scheduler_options(
     method: BenchmarkMethod,
     include_sq: bool = False,
@@ -225,6 +209,35 @@
     )


+def _get_cumulative_cost(
+    previous_cost: float,
+    new_trials: set[int],
+    experiment: Experiment,
+) -> float:
+    """
+    Get the total cost of running a benchmark where `new_trials` have just
+    completed, and the cost up to that point was `previous_cost`.
+
+    If a backend simulator is used to track runtime, the cost is just the
+    simulated time. If there is no backend simulator, it is still possible that
+    trials have varying runtimes without that being simulated, so in that case,
+    runtimes are computed.
+    """
+    runner = assert_is_instance(experiment.runner, BenchmarkRunner)
+    if runner.simulated_backend_runner is not None:
+        return runner.simulated_backend_runner.simulator.time
+
+    per_trial_times = (
+        get_total_runtime(
+            trial=experiment.trials[i],
+            step_runtime_function=runner.step_runtime_function,
+            n_steps=runner.test_function.n_steps,
+        )
+        for i in new_trials
+    )
+    return previous_cost + sum(per_trial_times)
+
+
 def benchmark_replication(
     problem: BenchmarkProblem,
     method: BenchmarkMethod,
@@ -284,16 +297,22 @@
         options=scheduler_options,
     )

-    # list of parameters for each trial
-    best_params_by_trial: list[list[TParameterization]] = []
+    # Each of these lists is added to when a trial completes or stops early.
+    # Since multiple trials can complete at once, there may be fewer elements in
+    # these traces than the number of trials run.
+    cost_trace: list[float] = []
+    best_params_list: list[Mapping[str, TParamValue]] = []  # For inference trace
+    evaluated_arms_list: list[set[Arm]] = []  # For oracle trace

     is_mf_or_mt = len(problem.target_fidelity_and_task) > 0
-    trials_used_for_best_point: set[int] = set()

     # Run the optimization loop.
     timeout_hours = method.timeout_hours
     remaining_hours = timeout_hours

+    previously_completed_trials = set()
+    cost = 0.0
+
     with with_rng_seed(seed=seed), warnings.catch_warnings():
         warnings.filterwarnings(
             "ignore",
@@ -302,7 +321,8 @@
             module="ax.modelbridge.cross_validation",
         )
         start = monotonic()
-        # These next several lines do the same thing as `run_n_trials`, but
+        # These next several lines do the same thing as
+        # `scheduler.run_n_trials`, but
         # decrement the timeout with each step, so that the timeout refers to
         # the total time spent in the optimization loop, not time per trial.
         scheduler.poll_and_process_results()
@@ -310,20 +330,6 @@
             max_trials=problem.num_trials,
             timeout_hours=remaining_hours,
         ):
-            if timeout_hours is not None:
-                elapsed_hours = (monotonic() - start) / 3600
-                remaining_hours = timeout_hours - elapsed_hours
-                if remaining_hours <= 0.0:
-                    logger.warning("The optimization loop timed out.")
-                    break
-
-            if problem.is_moo or is_mf_or_mt:
-                # Inference trace is not supported for MOO.
-                # It's also not supported for multi-fidelity or multi-task
-                # problems, because Ax's best-point functionality doesn't know
-                # to predict at the target task or fidelity.
-                continue
-
             currently_completed_trials = {
                 t.index
                 for t in experiment.trials.values()
@@ -334,45 +340,70 @@
                 )
             }
             newly_completed_trials = (
-                currently_completed_trials - trials_used_for_best_point
-            )
-            if len(newly_completed_trials) == 0:
-                continue
-            for t in newly_completed_trials:
-                trials_used_for_best_point.add(t)
-
-            best_params = method.get_best_parameters(
-                experiment=experiment,
-                optimization_config=problem.optimization_config,
-                n_points=problem.n_best_points,
+                currently_completed_trials - previously_completed_trials
             )
-            # If multiple trials complete at the same time, add that number of
-            # points to the inference trace so that the trace has length equal to
-            # the number of trials.
-            for _ in newly_completed_trials:
-                best_params_by_trial.append(best_params)
+            previously_completed_trials = currently_completed_trials
+
+            if len(newly_completed_trials) > 0:
+                cost = _get_cumulative_cost(
+                    new_trials=newly_completed_trials,
+                    experiment=experiment,
+                    previous_cost=cost,
+                )
+                cost_trace.append(cost)
+
+                # Track what params are newly evaluated from those trials, for
+                # the oracle trace
+                params = {
+                    arm
+                    for i in newly_completed_trials
+                    for arm in experiment.trials[i].arms
+                }
+                evaluated_arms_list.append(params)
+
+                # Inference trace: Not supported for MOO.
+                # It's also not supported for multi-fidelity or multi-task
+                # problems, because Ax's best-point functionality doesn't know
+                # to predict at the target task or fidelity.
+                if not (problem.is_moo or is_mf_or_mt):
+                    best_params = method.get_best_parameters(
+                        experiment=experiment,
+                        optimization_config=problem.optimization_config,
+                        n_points=problem.n_best_points,
+                    )[0]
+                    best_params_list.append(best_params)
+
+            if timeout_hours is not None:
+                elapsed_hours = (monotonic() - start) / 3600
+                remaining_hours = timeout_hours - elapsed_hours
+                if remaining_hours <= 0.0:
+                    logger.warning("The optimization loop timed out.")
+                    break

         scheduler.summarize_final_result()

     # Construct inference trace from best parameters
-    inference_trace = np.full(problem.num_trials, np.nan)
-    for trial_index, best_params in enumerate(best_params_by_trial):
-        if len(best_params) == 0:
-            inference_trace[trial_index] = np.nan
-            continue
-        # Construct an experiment with one BatchTrial
-        best_params_oracle_experiment = get_oracle_experiment_from_params(
-            problem=problem,
-            dict_of_dict_of_params={0: {str(i): p for i, p in enumerate(best_params)}},
+    single_params_as_experiments = (
+        get_oracle_experiment_from_params(
+            problem=problem, dict_of_dict_of_params={0: {"0_0": params}}
         )
-        # Get the optimization trace. It will have only one point.
-        inference_trace[trial_index] = BestPointMixin._get_trace(
-            experiment=best_params_oracle_experiment,
-            optimization_config=problem.optimization_config,
-        )[0]
+        for params in best_params_list
+    )
+    inference_trace = np.array(
+        [
+            BestPointMixin._get_trace(
+                experiment=exp, optimization_config=problem.optimization_config
+            )[0]
+            for exp in single_params_as_experiments
+        ]
+    )

-    actual_params_oracle_experiment = get_oracle_experiment_from_experiment(
-        problem=problem, experiment=experiment
+    actual_params_oracle_experiment = get_oracle_experiment_from_params(
+        problem=problem,
+        dict_of_dict_of_params={
+            i: {arm.name: arm.parameters for arm in arms}
+            for i, arms in enumerate(evaluated_arms_list)
+        },
     )
     oracle_trace = np.array(
         BestPointMixin._get_trace(
@@ -404,6 +435,7 @@
         inference_trace=inference_trace,
         optimization_trace=optimization_trace,
         score_trace=score_trace,
+        cost_trace=np.array(cost_trace),
         fit_time=fit_time,
         gen_time=gen_time,
     )
diff --git a/ax/benchmark/benchmark_result.py b/ax/benchmark/benchmark_result.py
index 538f3986c0c..dc6cf940541 100644
--- a/ax/benchmark/benchmark_result.py
+++ b/ax/benchmark/benchmark_result.py
@@ -32,40 +32,57 @@ class BenchmarkResult(Base):
         name: Name of the benchmark. Should make it possible to determine the
             problem and the method.
         seed: Seed used for determinism.
-        oracle_trace: For single-objective problems, element i of the
-            optimization trace is the best oracle value of the arms evaluated
-            after the first i trials. For multi-objective problems, element i
-            of the optimization trace is the hypervolume of the oracle values of
-            the arms in the first i trials (which may be ``BatchTrial``s).
-            Oracle values are typically ground-truth (rather than noisy) and
-            evaluated at the target task and fidelity.
-        inference_trace: Inference trace comes from choosing a "best" point
-            based only on data that would be observable in realistic settings,
-            as specified by `BenchmarkMethod.get_best_parameters`,
-            and then evaluating the oracle value of that point according to the
-            problem's `OptimizationConfig`. For multi-objective problems, the
-            hypervolume of a set of points is considered.
+        oracle_trace: For single-objective problems, the oracle trace is the
+            cumulative best oracle objective value seen so far. For
+            multi-objective problems, it is the cumulative hypervolume of
+            feasible oracle objective values.
+
+            Oracle values are typically ground-truth (not noisy) objective
+            values evaluated at the target task and fidelity.
+
+            The trace may have fewer elements than the number of trials run if
+            multiple trials stop at the same time; the trace is updated whenever
+            trials stop (TrialStatus COMPLETED or EARLY_STOPPED). The number of
+            trials completed is reflected in the `cost_trace`, which is updated
+            at the same time as the `oracle_trace`. For example, if each trial
+            has a cost of 1, and `cost_trace[i] = 4`, then `oracle_trace[i]` is
+            the value of the best of the first four trials to complete, or the
+            feasible hypervolume of those trials.
+        inference_trace: Inference values come from choosing a "best" point or
+            points based only on data that would be observable in realistic
+            settings, as specified by `BenchmarkMethod.get_best_parameters`, and
+            then evaluating the oracle objective value of that point according
+            to the problem's `OptimizationConfig`.

             By default, if it is not overridden,
             `BenchmarkMethod.get_best_parameters` uses the empirical best point
             if `use_model_predictions_for_best_point` is False and the best
             point of those evaluated so far if it is True.

-            Note: This is not "inference regret", which is a lower-is-better value
-            that is relative to the best possible value. The inference value
-            trace is higher-is-better if the problem is a maximization problem
-            or if the problem is multi-objective (in which case hypervolume is
-            used). Hence, it is signed the same as ``oracle_trace`` and
-            ``optimization_trace``. ``score_trace`` is higher-is-better and
-            relative to the optimum.
-        optimization_trace: Either the ``oracle_trace`` or the
-            ``inference_trace``, depending on whether the ``BenchmarkProblem``
-            specifies ``report_inference_value``. Having ``optimization_trace``
-            specified separately is useful when we need just one value to
-            evaluate how well the benchmark went.
+            As with the oracle trace, the inference trace is updated whenever a
+            trial completes and may have fewer elements than the number of trials.
+
+            Note: This is scaled differently from "inference regret", which is a
+            lower-is-better value that is relative to the best possible value.
+            The inference value trace is higher-is-better if the problem is a
+            maximization problem or if the problem is multi-objective (in which
+            case hypervolume is used). Hence, it is signed the same as
+            `oracle_trace` and `optimization_trace`. `score_trace`, meanwhile,
+            is higher-is-better and relative to the optimum.
+        optimization_trace: Either the `oracle_trace` or the `inference_trace`,
+            depending on whether the `BenchmarkProblem` specifies
+            `report_inference_value`. Having `optimization_trace` specified
+            separately is useful when we need just one value to evaluate how
+            well the benchmark went.
         score_trace: The scores associated with the problem, typically either
             the optimization_trace or inference_value_trace normalized to a
             0-100 scale for comparability between problems.
+        cost_trace: The cumulative cost of completed trials. The `cost_trace` is
+            updated whenever a trial completes, so, like the `oracle_trace` and
+            `inference_trace`, it can have fewer elements than the number of
+            trials if multiple trials complete at the same time. Trials that do
+            not produce `MapData` have a cost of 1, and trials that produce
+            `MapData` have a cost equal to the length of the `MapData`.
         fit_time: Total time spent fitting models.
         gen_time: Total time spent generating candidates.
         experiment: If not ``None``, the Ax experiment associated with the
@@ -81,6 +98,7 @@ class BenchmarkResult(Base):
     inference_trace: npt.NDArray
     optimization_trace: npt.NDArray
     score_trace: npt.NDArray
+    cost_trace: npt.NDArray
     fit_time: float
     gen_time: float

diff --git a/ax/benchmark/tests/test_benchmark.py b/ax/benchmark/tests/test_benchmark.py
index 18231060042..ec4c207d7df 100644
--- a/ax/benchmark/tests/test_benchmark.py
+++ b/ax/benchmark/tests/test_benchmark.py
@@ -22,7 +22,6 @@
     compute_baseline_value_from_sobol,
     compute_score_trace,
     get_benchmark_scheduler_options,
-    get_oracle_experiment_from_experiment,
     get_oracle_experiment_from_params,
 )
 from ax.benchmark.benchmark_method import BenchmarkMethod
@@ -71,7 +70,7 @@
     TestDataset,
 )

-from ax.utils.testing.core_stubs import get_branin_experiment, get_experiment
+from ax.utils.testing.core_stubs import get_experiment
 from ax.utils.testing.mock import mock_botorch_optimize
 from botorch.acquisition.knowledge_gradient import qKnowledgeGradient
 from botorch.acquisition.logei import qLogNoisyExpectedImprovement
@@ -163,6 +162,7 @@ def test_benchmark_result_invalid_inputs(self) -> None:
                 oracle_trace=np.array([]),
                 optimization_trace=np.array([]),
                 score_trace=np.array([]),
+                cost_trace=np.array([]),
                 fit_time=0.0,
                 gen_time=0.0,
                 experiment=get_experiment(),
@@ -179,6 +179,7 @@
                 oracle_trace=np.array([]),
                 optimization_trace=np.array([]),
                 score_trace=np.array([]),
+                cost_trace=np.array([]),
                 fit_time=0.0,
                 gen_time=0.0,
             )
@@ -321,14 +322,25 @@
         }
         # When two trials complete at the same time, the inference trace uses
         # data from both to get the best point, and repeats it.
-        # The oracle trace is the same.
-        expected_inference_traces = {
+        expected_traces = {
             "All complete at different times": [0, 1, 2, 3],
             # 0 and 1 complete at the same time, as do 2 and 3
-            "Trials complete immediately": [1, 1, 3, 3],
-            "Trials complete at same time": [1, 1, 3, 3],
+            "Trials complete immediately": [1, 3],
+            "Trials complete at same time": [1, 3],
             "Complete out of order": [1, 1, 3, 3],
         }
+        expected_costs = {
+            "All complete at different times": [1, 3, 7, 12],
+            "Trials complete immediately": [1, 2],
+            "Trials complete at same time": [1, 2],
+            "Complete out of order": [1, 2, 3, 4],
+        }
+        expected_backend_simulator_time = {
+            "All complete at different times": 12,
+            "Trials complete immediately": 2,
+            "Trials complete at same time": 2,
+            "Complete out of order": 4,
+        }

         for case_name, step_runtime_fn in step_runtime_fns.items():
             with self.subTest(case_name, step_runtime_fn=step_runtime_fn):
@@ -368,6 +380,11 @@
                 backend_simulator = none_throws(
                     runner.simulated_backend_runner
                 ).simulator
+                self.assertEqual(
+                    backend_simulator.time,
+                    expected_backend_simulator_time[case_name],
+                    msg=case_name,
+                )
                 completed_trials = backend_simulator.state().completed
                 self.assertEqual(len(completed_trials), 4)
                 for trial_index, expected_start_time in enumerate(
@@ -394,7 +411,18 @@
                 self.assertFalse(np.isnan(result.inference_trace).any())
                 self.assertEqual(
                     result.inference_trace.tolist(),
-                    expected_inference_traces[case_name],
+                    expected_traces[case_name],
+                    msg=case_name,
+                )
+                self.assertEqual(
+                    result.oracle_trace.tolist(),
+                    expected_traces[case_name],
+                    msg=case_name,
+                )
+                self.assertEqual(
+                    result.cost_trace.tolist(),
+                    expected_costs[case_name],
                     msg=case_name,
                 )
                 if map_data:
                     data = assert_is_instance(experiment.lookup_data(), MapData)
@@ -863,38 +891,6 @@
                 problem=problem, dict_of_dict_of_params={0: {}}
             )

-    def test_get_oracle_experiment_from_experiment(self) -> None:
-        problem = create_problem_from_botorch(
-            test_problem_class=Branin,
-            test_problem_kwargs={},
-            num_trials=5,
-        )
-
-        # empty experiment
-        empty_experiment = get_branin_experiment(with_trial=False)
-        oracle_experiment = get_oracle_experiment_from_experiment(
-            problem=problem, experiment=empty_experiment
-        )
-        self.assertEqual(oracle_experiment.search_space, problem.search_space)
-        self.assertEqual(
-            oracle_experiment.optimization_config, problem.optimization_config
-        )
-        self.assertEqual(oracle_experiment.trials.keys(), set())
-
-        experiment = get_branin_experiment(
-            with_trial=True,
-            search_space=problem.search_space,
-            with_status_quo=False,
-        )
-        oracle_experiment = get_oracle_experiment_from_experiment(
-            problem=problem, experiment=experiment
-        )
-        self.assertEqual(oracle_experiment.search_space, problem.search_space)
-        self.assertEqual(
-            oracle_experiment.optimization_config, problem.optimization_config
-        )
-        self.assertEqual(oracle_experiment.trials.keys(), experiment.trials.keys())
-
     def _test_multi_fidelity_or_multi_task(self, fidelity_or_task: str) -> None:
         """
         Args:
diff --git a/ax/utils/testing/benchmark_stubs.py b/ax/utils/testing/benchmark_stubs.py
index c493d70b362..77fd711d942 100644
--- a/ax/utils/testing/benchmark_stubs.py
+++ b/ax/utils/testing/benchmark_stubs.py
@@ -181,6 +181,7 @@ def get_benchmark_result() -> BenchmarkResult:
         ),
         inference_trace=np.ones(4),
         oracle_trace=np.zeros(4),
+        cost_trace=np.zeros(4),
         optimization_trace=np.array([3, 2, 1, 0.1]),
         score_trace=np.array([3, 2, 1, 0.1]),
         fit_time=0.1,
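
The cost bookkeeping in the patch is small enough to illustrate outside of Ax. Below is a minimal, standalone sketch (not part of the patch) of the same logic that `_get_cumulative_cost` and the `cost_trace` loop implement: when a backend simulator is present the cost is the simulated clock, otherwise the runtimes of the newly completed trials are added to the running total, and one trace entry is appended per batch of completions. The function name `cumulative_cost` and the runtime numbers below are hypothetical.

def cumulative_cost(
    previous_cost: float,
    runtimes_of_new_trials: list[float],
    simulator_time: float | None = None,
) -> float:
    """Running benchmark cost after a batch of trials completes."""
    if simulator_time is not None:
        # With a backend simulator, the cost is simply the simulated clock.
        return simulator_time
    # Otherwise, add the (possibly heterogeneous) runtimes of the trials
    # that just completed to the running total.
    return previous_cost + sum(runtimes_of_new_trials)


# One cost_trace entry per batch of newly completed trials, so the trace can
# be shorter than the number of trials (here: 4 trials, 3 entries).
cost = 0.0
cost_trace: list[float] = []
for batch in ([1.0], [2.0, 3.0], [4.0]):  # hypothetical per-trial runtimes
    cost = cumulative_cost(previous_cost=cost, runtimes_of_new_trials=batch)
    cost_trace.append(cost)
print(cost_trace)  # [1.0, 6.0, 10.0]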
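Because `cost_trace` is documented to be updated at the same moments as `oracle_trace` and `optimization_trace`, a consumer can treat the pair as a step function of best value versus cumulative cost. The following sketch of that read-out assumes only the pairing described in the docstring; `value_at_budget` and the sample arrays are hypothetical, not part of the Ax API.

import numpy as np


def value_at_budget(
    cost_trace: np.ndarray, optimization_trace: np.ndarray, budget: float
) -> float:
    """Best value attained by the time the cumulative cost reaches `budget`."""
    # Index of the last trace entry whose cumulative cost is within the budget.
    idx = int(np.searchsorted(cost_trace, budget, side="right")) - 1
    if idx < 0:
        # Nothing had completed within this budget.
        return float("nan")
    return float(optimization_trace[idx])


costs = np.array([1.0, 6.0, 10.0])  # hypothetical cost_trace
values = np.array([3.0, 2.5, 1.0])  # hypothetical optimization_trace (minimization)
print(value_at_budget(costs, values, budget=7.0))  # prints 2.5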