Track cost; order oracle trace by completion order #3411

Open · wants to merge 1 commit into base: main
ax/benchmark/benchmark.py (170 changes: 101 additions & 69 deletions)
@@ -30,7 +30,7 @@
from ax.benchmark.benchmark_method import BenchmarkMethod
from ax.benchmark.benchmark_problem import BenchmarkProblem
from ax.benchmark.benchmark_result import AggregatedBenchmarkResult, BenchmarkResult
from ax.benchmark.benchmark_runner import BenchmarkRunner
from ax.benchmark.benchmark_runner import BenchmarkRunner, get_total_runtime
from ax.benchmark.benchmark_test_function import BenchmarkTestFunction
from ax.benchmark.methods.sobol import get_sobol_generation_strategy
from ax.core.arm import Arm
@@ -39,13 +39,14 @@
from ax.core.optimization_config import OptimizationConfig
from ax.core.search_space import SearchSpace
from ax.core.trial_status import TrialStatus
from ax.core.types import TParameterization, TParamValue
from ax.core.types import TParamValue
from ax.core.utils import get_model_times
from ax.service.scheduler import Scheduler
from ax.service.utils.best_point_mixin import BestPointMixin
from ax.service.utils.scheduler_options import SchedulerOptions, TrialType
from ax.utils.common.logger import DEFAULT_LOG_LEVEL, get_logger
from ax.utils.common.random import with_rng_seed
from pyre_extensions import assert_is_instance

logger: Logger = get_logger(__name__)

@@ -172,23 +173,6 @@ def get_oracle_experiment_from_params(
return experiment


def get_oracle_experiment_from_experiment(
problem: BenchmarkProblem, experiment: Experiment
) -> Experiment:
"""
Get an ``Experiment`` that is the same as the original experiment but has
metrics evaluated at oracle values (noiseless ground-truth values
evaluated at the target task and fidelity)
"""
return get_oracle_experiment_from_params(
problem=problem,
dict_of_dict_of_params={
trial.index: {arm.name: arm.parameters for arm in trial.arms}
for trial in experiment.trials.values()
},
)


def get_benchmark_scheduler_options(
method: BenchmarkMethod,
include_sq: bool = False,
@@ -225,6 +209,35 @@ def get_benchmark_scheduler_options(
)


def _get_cumulative_cost(
previous_cost: float,
new_trials: set[int],
experiment: Experiment,
) -> float:
"""
Get the total cost of running a benchmark where `new_trials` have just
completed, and the cost up to that point was `previous_cost`.

If a backend simulator is used to track runtime, the cost is simply the
simulated time. If there is no backend simulator, trials may still have
varying runtimes even though those runtimes are not simulated; in that
case, per-trial runtimes are computed and summed.
"""
runner = assert_is_instance(experiment.runner, BenchmarkRunner)
if runner.simulated_backend_runner is not None:
return runner.simulated_backend_runner.simulator.time

per_trial_times = (
get_total_runtime(
trial=experiment.trials[i],
step_runtime_function=runner.step_runtime_function,
n_steps=runner.test_function.n_steps,
)
for i in new_trials
)
return previous_cost + sum(per_trial_times)


def benchmark_replication(
problem: BenchmarkProblem,
method: BenchmarkMethod,
@@ -284,16 +297,22 @@ def benchmark_replication(
options=scheduler_options,
)

# list of parameters for each trial
best_params_by_trial: list[list[TParameterization]] = []
# Each of these lists is added to when a trial completes or stops early.
# Since multiple trials can complete at once, there may be fewer elements in
# these traces than the number of trials run.
cost_trace: list[float] = []
best_params_list: list[Mapping[str, TParamValue]] = [] # For inference trace
evaluated_arms_list: list[set[Arm]] = [] # For oracle trace

is_mf_or_mt = len(problem.target_fidelity_and_task) > 0
trials_used_for_best_point: set[int] = set()

# Run the optimization loop.
timeout_hours = method.timeout_hours
remaining_hours = timeout_hours

previously_completed_trials = set()
cost = 0.0

with with_rng_seed(seed=seed), warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
@@ -302,28 +321,15 @@
module="ax.modelbridge.cross_validation",
)
start = monotonic()
# These next several lines do the same thing as `run_n_trials`, but
# These next several lines do the same thing as
# `scheduler.run_n_trials`, but
# decrement the timeout with each step, so that the timeout refers to
# the total time spent in the optimization loop, not time per trial.
scheduler.poll_and_process_results()
for _ in scheduler.run_trials_and_yield_results(
max_trials=problem.num_trials,
timeout_hours=remaining_hours,
):
if timeout_hours is not None:
elapsed_hours = (monotonic() - start) / 3600
remaining_hours = timeout_hours - elapsed_hours
if remaining_hours <= 0.0:
logger.warning("The optimization loop timed out.")
break

if problem.is_moo or is_mf_or_mt:
# Inference trace is not supported for MOO.
# It's also not supported for multi-fidelity or multi-task
# problems, because Ax's best-point functionality doesn't know
# to predict at the target task or fidelity.
continue

currently_completed_trials = {
t.index
for t in experiment.trials.values()
@@ -334,45 +340,70 @@
)
}
newly_completed_trials = (
    currently_completed_trials - trials_used_for_best_point
)
if len(newly_completed_trials) == 0:
    continue
for t in newly_completed_trials:
    trials_used_for_best_point.add(t)

best_params = method.get_best_parameters(
    experiment=experiment,
    optimization_config=problem.optimization_config,
    n_points=problem.n_best_points,
)
# If multiple trials complete at the same time, add that number of
# points to the inference trace so that the trace has length equal to
# the number of trials.
for _ in newly_completed_trials:
    best_params_by_trial.append(best_params)
newly_completed_trials = (
    currently_completed_trials - previously_completed_trials
)
previously_completed_trials = currently_completed_trials

if len(newly_completed_trials) > 0:
cost = _get_cumulative_cost(
new_trials=newly_completed_trials,
experiment=experiment,
previous_cost=cost,
)
cost_trace.append(cost)

# Track what params are newly evaluated from those trials, for
# the oracle trace
params = {
arm
for i in newly_completed_trials
for arm in experiment.trials[i].arms
}
evaluated_arms_list.append(params)

# Inference trace: Not supported for MOO.
# It's also not supported for multi-fidelity or multi-task
# problems, because Ax's best-point functionality doesn't know
# to predict at the target task or fidelity.
if not (problem.is_moo or is_mf_or_mt):
best_params = method.get_best_parameters(
experiment=experiment,
optimization_config=problem.optimization_config,
n_points=problem.n_best_points,
)[0]
best_params_list.append(best_params)

if timeout_hours is not None:
elapsed_hours = (monotonic() - start) / 3600
remaining_hours = timeout_hours - elapsed_hours
if remaining_hours <= 0.0:
logger.warning("The optimization loop timed out.")
break

scheduler.summarize_final_result()

# Construct inference trace from best parameters
inference_trace = np.full(problem.num_trials, np.nan)
for trial_index, best_params in enumerate(best_params_by_trial):
    if len(best_params) == 0:
        inference_trace[trial_index] = np.nan
        continue
    # Construct an experiment with one BatchTrial
    best_params_oracle_experiment = get_oracle_experiment_from_params(
        problem=problem,
        dict_of_dict_of_params={0: {str(i): p for i, p in enumerate(best_params)}},
    )
    # Get the optimization trace. It will have only one point.
    inference_trace[trial_index] = BestPointMixin._get_trace(
        experiment=best_params_oracle_experiment,
        optimization_config=problem.optimization_config,
    )[0]
single_params_as_experiments = (
    get_oracle_experiment_from_params(
        problem=problem, dict_of_dict_of_params={0: {"0_0": params}}
    )
    for params in best_params_list
)
inference_trace = np.array(
[
BestPointMixin._get_trace(
experiment=exp, optimization_config=problem.optimization_config
)[0]
for exp in single_params_as_experiments
]
)

actual_params_oracle_experiment = get_oracle_experiment_from_experiment(
    problem=problem, experiment=experiment
)
actual_params_oracle_experiment = get_oracle_experiment_from_params(
problem=problem,
dict_of_dict_of_params={
i: {arm.name: arm.parameters for arm in arms}
for i, arms in enumerate(evaluated_arms_list)
},
)
oracle_trace = np.array(
BestPointMixin._get_trace(
@@ -404,6 +435,7 @@ def benchmark_replication(
inference_trace=inference_trace,
optimization_trace=optimization_trace,
score_trace=score_trace,
cost_trace=np.array(cost_trace),
fit_time=fit_time,
gen_time=gen_time,
)
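The cost bookkeeping in the new loop can be illustrated outside of Ax. The sketch below is not part of the diff; `completion_batches` and `runtimes` are hypothetical stand-ins for what `BenchmarkRunner` and `get_total_runtime` provide, and the loop mirrors how `_get_cumulative_cost` extends the running total only when trials finish.

```python
# Minimal standalone sketch of the cost-trace bookkeeping above; the inputs
# are illustrative stand-ins, not the Ax API.
from typing import Dict, List, Sequence, Set


def build_cost_trace(
    completion_batches: Sequence[Set[int]],  # trial indices that finish together
    runtimes: Dict[int, float],  # per-trial runtime, i.e. the trial's cost
) -> List[float]:
    cost = 0.0
    cost_trace: List[float] = []
    for newly_completed in completion_batches:
        if not newly_completed:
            continue
        # As in _get_cumulative_cost without a simulated backend: add the
        # runtimes of the trials that just completed to the running total.
        cost += sum(runtimes[i] for i in newly_completed)
        cost_trace.append(cost)
    return cost_trace


# Trials 0 and 1 finish together, then trial 2 finishes alone: the trace gets
# two entries for three trials, matching the "fewer elements than trials" note.
print(build_cost_trace([{0, 1}, set(), {2}], {0: 1.0, 1: 2.0, 2: 3.0}))  # [3.0, 6.0]
```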
ax/benchmark/benchmark_result.py (68 changes: 43 additions & 25 deletions)
@@ -32,40 +32,57 @@ class BenchmarkResult(Base):
name: Name of the benchmark. Should make it possible to determine the
problem and the method.
seed: Seed used for determinism.
oracle_trace: For single-objective problems, element i of the
optimization trace is the best oracle value of the arms evaluated
after the first i trials. For multi-objective problems, element i
of the optimization trace is the hypervolume of the oracle values of
the arms in the first i trials (which may be ``BatchTrial``s).
Oracle values are typically ground-truth (rather than noisy) and
evaluated at the target task and fidelity.
inference_trace: Inference trace comes from choosing a "best" point
based only on data that would be observable in realistic settings,
as specified by `BenchmarkMethod.get_best_parameters`,
and then evaluating the oracle value of that point according to the
problem's `OptimizationConfig`. For multi-objective problems, the
hypervolume of a set of points is considered.
oracle_trace: For single-objective problems, the oracle trace is the
cumulative best oracle objective value seen so far. For
multi-objective problems, it is the cumulative hypervolume of
feasible oracle objective values.

Oracle values are typically ground-truth (noiseless) objective values
evaluated at the target task and fidelity.

The trace may have fewer elements than the number of trials run if
multiple trials stop at the same time; the trace is updated whenever
trials stop (TrialStatus COMPLETED or EARLY_STOPPED). The number of
trials completed is reflected in the `cost_trace`, which is updated
at the same time as the `oracle_trace`. For example, if each trial
has a cost of 1, and `cost_trace[i] = 4`, then `oracle_trace[i]` is
the value of the best of the first four trials to complete, or the
feasible hypervolume of those trials.
inference_trace: Inference values come from choosing a "best" point or
points based only on data that would be observable in realistic
settings, as specified by `BenchmarkMethod.get_best_parameters`, and
then evaluating the oracle objective value of that point according
to the problem's `OptimizationConfig`.

By default, if it is not overridden,
`BenchmarkMethod.get_best_parameters` uses the empirical best point
if `use_model_predictions_for_best_point` is False and the best
point of those evaluated so far if it is True.

Note: This is not "inference regret", which is a lower-is-better value
that is relative to the best possible value. The inference value
trace is higher-is-better if the problem is a maximization problem
or if the problem is multi-objective (in which case hypervolume is
used). Hence, it is signed the same as ``oracle_trace`` and
``optimization_trace``. ``score_trace`` is higher-is-better and
relative to the optimum.
optimization_trace: Either the ``oracle_trace`` or the
``inference_trace``, depending on whether the ``BenchmarkProblem``
specifies ``report_inference_value``. Having ``optimization_trace``
specified separately is useful when we need just one value to
evaluate how well the benchmark went.
As with the oracle trace, the inference trace is updated whenever a
trial completes and may have fewer elements than the number of trials.

Note: This is scaled differently from "inference regret", which is a
lower-is-better value that is relative to the best possible value.
The inference value trace is higher-is-better if the problem is a
maximization problem or if the problem is multi-objective (in which
case hypervolume is used). Hence, it is signed the same as
`oracle_trace` and `optimization_trace`. `score_trace`, meanwhile,
is higher-is-better and relative to the optimum.
optimization_trace: Either the `oracle_trace` or the `inference_trace`,
depending on whether the `BenchmarkProblem` specifies
`report_inference_value`. Having `optimization_trace` specified
separately is useful when we need just one value to evaluate how
well the benchmark went.
score_trace: The scores associated with the problem, typically either
the optimization_trace or inference_value_trace normalized to a
0-100 scale for comparability between problems.
cost_trace: The cumulative cost of completed trials. The `cost_trace` is
updated whenever a trial completes, so, like the `oracle_trace` and
`inference_trace`, it can have fewer elements than the number of
trials if multiple trials complete at the same time. Trials that do
not produce `MapData` have a cost of 1, and trials that produce
`MapData` have a cost equal to the length of the `MapData`.
fit_time: Total time spent fitting models.
gen_time: Total time spent generating candidates.
experiment: If not ``None``, the Ax experiment associated with the
@@ -81,6 +98,7 @@ class BenchmarkResult(Base):
inference_trace: npt.NDArray
optimization_trace: npt.NDArray
score_trace: npt.NDArray
cost_trace: npt.NDArray

fit_time: float
gen_time: float
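A worked example of the alignment described in the `oracle_trace` and `cost_trace` docstrings, as a standalone sketch (not Ax code), assuming a single-objective maximization problem with unit-cost trials and made-up oracle values:

```python
# Standalone illustration of how cost_trace and oracle_trace line up when
# several unit-cost trials complete at once. Values are made up.
import numpy as np

# Oracle values grouped by completion event; the second and third trials
# complete at the same time.
completion_batches = [[0.4], [0.7, 0.5], [0.9]]

cost_trace: list[float] = []
oracle_trace: list[float] = []
total_cost, best_so_far = 0.0, -np.inf
for batch in completion_batches:
    total_cost += 1.0 * len(batch)  # each trial has a cost of 1
    best_so_far = max(best_so_far, max(batch))  # cumulative best oracle value
    cost_trace.append(total_cost)
    oracle_trace.append(best_so_far)

print(np.array(cost_trace))    # [1. 3. 4.]   -> three entries for four trials
print(np.array(oracle_trace))  # [0.4 0.7 0.9] -> cumulative best of completed trials
```

With unit costs, the final entry has `cost_trace[i] = 4`, and `oracle_trace[i]` is the best value among the first four trials to complete, matching the example given in the docstring.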