3232from ax .core .runner import Runner
3333from ax .core .trial import Trial
3434from ax .core .trial_status import TrialStatus
35+ from ax .core .utils import compute_metric_availability , MetricAvailability
3536from ax .exceptions .core import (
3637 AxError ,
3738 DataRequiredError ,
8081 "of an optimization and if at least {min_failed} trials have been "
8182 "failed/abandoned, potentially automatically due to issues with the trial."
8283)
83- METRIC_FETCH_ERR_MESSAGE = (
84- "A majority of the trial failures encountered are due to metric fetching errors. "
85- "This could mean the metrics are flaky, broken, or misconfigured. Please check "
86- "that the trial processes/jobs are successfully producing the expected metrics and "
87- "that the metric is correctly configured."
88- )
89-
9084EXPECTED_STAGED_MSG = (
9185 "Expected all trials to be in status {expected} after running or staging, "
9286 "found {t_idx_to_status}."
@@ -191,13 +185,6 @@ class Orchestrator(WithDBSettingsBase, BestPointMixin):
191185 # Saved as a property so that it can be accessed after optimization is complex (ex.
192186 # for global stopping saving calculation).
193187 _num_remaining_requested_trials : int = 0
194- # Total number of MetricFetchEs encountered during the course of optimization. Note
195- # this is different from and may be greater than the number of trials that have
196- # been marked either FAILED or ABANDONED due to metric fetching errors.
197- _num_metric_fetch_e_encountered : int = 0
198- # Number of trials that have been marked either FAILED or ABANDONED due to
199- # MetricFetchE being encountered during _fetch_and_process_trials_data_results
200- _num_trials_bad_due_to_err : int = 0
201188 # Keeps track of whether the allowed failure rate has been exceeded during
202189 # the optimization. If true, allows any pending trials to finish and raises
203190 # an error through self._complete_optimization.
@@ -1073,83 +1060,63 @@ def summarize_final_result(self) -> OptimizationResult:
10731060 """
10741061 return OptimizationResult ()
10751062
1076- def _check_if_failure_rate_exceeded (self , force_check : bool = False ) -> bool :
1077- """Checks if the failure rate (set in Orchestrator options) has been exceeded at
1078- any point during the optimization.
1063+ def error_if_failure_rate_exceeded (self , force_check : bool = False ) -> None :
1064+ """Raises an exception if the failure rate (set in Orchestrator options) has
1065+ been exceeded at any point during the optimization.
10791066
1080- NOTE: Both FAILED and ABANDONED trial statuses count towards the failure rate.
1067+ The failure rate is computed as the ratio of "bad" trials to total trials
1068+ created by this orchestrator. "Bad" trials include:
1069+ - Execution failures: trials with FAILED or ABANDONED status.
1070+ - Metric-incomplete trials: COMPLETED trials whose metric data is not
1071+ fully available (as determined by ``compute_metric_availability``).
10811072
10821073 Args:
10831074 force_check: Indicates whether to force a failure-rate check
10841075 regardless of the number of trials that have been executed. If False
10851076 (default), the check will be skipped if the optimization has fewer than
1086- five failed trials. If True, the check will be performed unless there
1087- are 0 failures.
1077+ ``min_failed_trials_for_failure_rate_check`` bad trials. If True, the
1078+ check will be performed unless there are 0 bad trials.
1079+ """
1080+ # Count runner-level failures (FAILED + ABANDONED).
1081+ num_execution_failures = self ._num_bad_in_orchestrator ()
10881082
1089- Effect on state:
1090- If the failure rate has been exceeded, a warning is logged and the private
1091- attribute `_failure_rate_has_been_exceeded` is set to True, which causes the
1092- `_get_max_pending_trials` to return zero, so that no further trials are
1093- scheduled and an error is raised at the end of the optimization.
1083+ # Count completed trials with incomplete metric availability.
1084+ num_metric_incomplete , missing_metrics_by_trial = (
1085+ self ._get_metric_incomplete_trials ()
1086+ )
10941087
1095- Returns:
1096- Boolean representing whether the failure rate has been exceeded.
1097- """
1098- if self ._failure_rate_has_been_exceeded :
1099- return True
1088+ num_bad = num_execution_failures + num_metric_incomplete
11001089
1101- num_bad_in_orchestrator = self ._num_bad_in_orchestrator ()
1102- # skip check if 0 failures
1103- if num_bad_in_orchestrator == 0 :
1104- return False
1090+ if not self ._failure_rate_has_been_exceeded :
1091+ # Skip check if 0 bad trials.
1092+ if num_bad == 0 :
1093+ return
11051094
1106- # skip check if fewer than min_failed_trials_for_failure_rate_check failures
1107- # unless force_check is True
1108- if (
1109- num_bad_in_orchestrator
1110- < self .options .min_failed_trials_for_failure_rate_check
1111- and not force_check
1112- ):
1113- return False
1095+ # Skip check if fewer than min threshold unless force_check.
1096+ if (
1097+ num_bad < self .options .min_failed_trials_for_failure_rate_check
1098+ and not force_check
1099+ ):
1100+ return
11141101
1115- num_ran_in_orchestrator = self ._num_ran_in_orchestrator ()
1116- failure_rate_exceeded = (
1117- num_bad_in_orchestrator / num_ran_in_orchestrator
1118- ) > self .options .tolerated_trial_failure_rate
1102+ num_ran_in_orchestrator = self ._num_ran_in_orchestrator ()
1103+ failure_rate_exceeded = (
1104+ num_bad / num_ran_in_orchestrator
1105+ ) > self .options .tolerated_trial_failure_rate
1106+
1107+ if not failure_rate_exceeded :
1108+ return
11191109
1120- if failure_rate_exceeded :
1121- if self ._num_trials_bad_due_to_err > num_bad_in_orchestrator / 2 :
1122- self .logger .warning (
1123- "MetricFetchE INFO: Sweep aborted due to an exceeded error rate, "
1124- "which was primarily caused by failure to fetch metrics. Please "
1125- "check if anything could cause your metrics to be flaky or "
1126- "broken."
1127- )
11281110 # NOTE: this private attribute causes `_get_max_pending_trials` to
11291111 # return zero, which causes no further trials to be scheduled.
11301112 self ._failure_rate_has_been_exceeded = True
1131- return True
1132-
1133- return False
1134-
1135- def error_if_failure_rate_exceeded (self , force_check : bool = False ) -> None :
1136- """Raises an exception if the failure rate (set in Orchestrator options) has
1137- been exceeded at any point during the optimization.
11381113
1139- NOTE: Both FAILED and ABANDONED trial statuses count towards the failure rate.
1140-
1141- Args:
1142- force_check: Indicates whether to force a failure-rate check
1143- regardless of the number of trials that have been executed. If False
1144- (default), the check will be skipped if the optimization has fewer than
1145- five failed trials. If True, the check will be performed unless there
1146- are 0 failures.
1147- """
1148- if self ._check_if_failure_rate_exceeded (force_check = force_check ):
1149- raise self ._get_failure_rate_exceeded_error (
1150- num_bad_in_orchestrator = self ._num_bad_in_orchestrator (),
1151- num_ran_in_orchestrator = self ._num_ran_in_orchestrator (),
1152- )
1114+ raise self ._get_failure_rate_exceeded_error (
1115+ num_execution_failures = num_execution_failures ,
1116+ num_metric_incomplete = num_metric_incomplete ,
1117+ num_ran_in_orchestrator = self ._num_ran_in_orchestrator (),
1118+ missing_metrics_by_trial = missing_metrics_by_trial ,
1119+ )
11531120
11541121 def _error_if_status_quo_infeasible (self ) -> None :
11551122 """Raises an exception if the status-quo arm is infeasible and the
@@ -2032,9 +1999,13 @@ def _fetch_and_process_trials_data_results(
20321999 self ,
20332000 trial_indices : Iterable [int ],
20342001 ) -> dict [int , dict [str , MetricFetchResult ]]:
2035- """
2036- Fetches results from experiment and modifies trial statuses depending on
2037- success or failure.
2002+ """Fetch trial data results and log any metric fetch errors.
2003+
2004+ Metric fetch errors are logged but do NOT change trial status.
2005+ ``MetricAvailability`` (computed via ``compute_metric_availability``)
2006+ tracks data completeness separately, and the failure rate check in
2007+ ``error_if_failure_rate_exceeded`` uses it to detect persistent
2008+ metric issues.
20382009 """
20392010
20402011 try :
@@ -2085,41 +2056,12 @@ def _fetch_and_process_trials_data_results(
20852056 f"Failed to fetch { metric_name } for trial { trial_index } with "
20862057 f"status { status } , found { metric_fetch_e } ."
20872058 )
2088- self ._num_metric_fetch_e_encountered += 1
20892059 self ._report_metric_fetch_e (
20902060 trial = self .experiment .trials [trial_index ],
20912061 metric_name = metric_name ,
20922062 metric_fetch_e = metric_fetch_e ,
20932063 )
20942064
2095- # If the fetch failure was for a metric in the optimization config (an
2096- # objective or constraint) mark the trial as failed
2097- optimization_config = self .experiment .optimization_config
2098- if (
2099- optimization_config is not None
2100- and metric_name in optimization_config .metric_names
2101- and not self .experiment .metrics [metric_name ].is_recoverable_fetch_e (
2102- metric_fetch_e = metric_fetch_e
2103- )
2104- ):
2105- status = self ._mark_err_trial_status (
2106- trial = self .experiment .trials [trial_index ],
2107- metric_name = metric_name ,
2108- metric_fetch_e = metric_fetch_e ,
2109- )
2110- self .logger .warning (
2111- f"MetricFetchE INFO: Because { metric_name } is an objective, "
2112- f"marking trial { trial_index } as { status } ."
2113- )
2114- self ._num_trials_bad_due_to_err += 1
2115- continue
2116-
2117- self .logger .info (
2118- "MetricFetchE INFO: Continuing optimization even though "
2119- "MetricFetchE encountered."
2120- )
2121- continue
2122-
21232065 return results
21242066
21252067 def _report_metric_fetch_e (
@@ -2128,39 +2070,122 @@ def _report_metric_fetch_e(
21282070 metric_name : str ,
21292071 metric_fetch_e : MetricFetchE ,
21302072 ) -> None :
2073+ """Hook for subclasses to react to metric fetch errors.
2074+
2075+ Called once per metric fetch error during
2076+ ``_fetch_and_process_trials_data_results``. The default
2077+ implementation is a no-op; override in subclasses to add custom
2078+ reporting (e.g., creating error tables or pastes).
2079+ """
21312080 pass
21322081
2133- def _mark_err_trial_status (
2082+ def _get_metric_incomplete_trials (
21342083 self ,
2135- trial : BaseTrial ,
2136- metric_name : str | None = None ,
2137- metric_fetch_e : MetricFetchE | None = None ,
2138- ) -> TrialStatus :
2139- trial .mark_abandoned (
2140- reason = metric_fetch_e .message if metric_fetch_e else None , unsafe = True
2084+ ) -> tuple [int , dict [int , set [str ]]]:
2085+ """Count completed trials with incomplete metric availability and identify
2086+ which metrics are missing for each.
2087+
2088+ Required metrics include optimization config metrics and any explicitly
2089+ defined early stopping strategy metrics.
2090+
2091+ Returns:
2092+ A tuple of (num_metric_incomplete, missing_metrics_by_trial) where
2093+ missing_metrics_by_trial maps trial index to the set of missing
2094+ metric names.
2095+ """
2096+ opt_config = self .experiment .optimization_config
2097+ if opt_config is None :
2098+ return 0 , {}
2099+
2100+ completed_trial_indices = [
2101+ t .index
2102+ for t in self .experiment .trials .values ()
2103+ if t .status == TrialStatus .COMPLETED
2104+ and t .index >= self ._num_preexisting_trials
2105+ ]
2106+ if len (completed_trial_indices ) == 0 :
2107+ return 0 , {}
2108+
2109+ required_metrics = set (opt_config .metric_names )
2110+
2111+ # Include explicitly defined early stopping strategy metrics.
2112+ # ESS stores metric *signatures*, which may differ from metric names,
2113+ # so we resolve them via experiment.signature_to_metric.
2114+ ess = self .options .early_stopping_strategy
2115+ ess_signatures = ess .metric_signatures if ess is not None else None
2116+ if ess_signatures is not None :
2117+ for sig in ess_signatures :
2118+ metric = self .experiment .signature_to_metric [sig ]
2119+ required_metrics .add (metric .name )
2120+
2121+ metric_availabilities = compute_metric_availability (
2122+ experiment = self .experiment ,
2123+ trial_indices = completed_trial_indices ,
2124+ metric_names = required_metrics ,
21412125 )
2142- return TrialStatus .ABANDONED
2126+
2127+ # Identify which specific metrics are missing per trial.
2128+ data = self .experiment .lookup_data (trial_indices = completed_trial_indices )
2129+ metrics_per_trial : dict [int , set [str ]] = {}
2130+ if len (data .metric_names ) > 0 :
2131+ df = data .full_df
2132+ for trial_idx , group in df .groupby ("trial_index" )["metric_name" ]:
2133+ metrics_per_trial [int (trial_idx )] = set (group .unique ())
2134+
2135+ missing_metrics_by_trial : dict [int , set [str ]] = {}
2136+ for idx , avail in metric_availabilities .items ():
2137+ if avail != MetricAvailability .COMPLETE :
2138+ available = metrics_per_trial .get (idx , set ())
2139+ missing_metrics_by_trial [idx ] = required_metrics - available
2140+
2141+ return len (missing_metrics_by_trial ), missing_metrics_by_trial
21432142
21442143 def _get_failure_rate_exceeded_error (
21452144 self ,
2146- num_bad_in_orchestrator : int ,
2145+ num_execution_failures : int ,
2146+ num_metric_incomplete : int ,
21472147 num_ran_in_orchestrator : int ,
2148+ missing_metrics_by_trial : dict [int , set [str ]],
21482149 ) -> FailureRateExceededError :
2149- return FailureRateExceededError (
2150- (
2151- f"{ METRIC_FETCH_ERR_MESSAGE } \n "
2152- if self ._num_trials_bad_due_to_err > num_bad_in_orchestrator / 2
2153- else ""
2150+ """Build an actionable error message describing why the failure rate was
2151+ exceeded, including runner failures, metric-incomplete trials, which
2152+ metrics are missing, and which trials are affected.
2153+ """
2154+ num_bad = num_execution_failures + num_metric_incomplete
2155+ observed_rate = num_bad / num_ran_in_orchestrator
2156+
2157+ parts : list [str ] = []
2158+ parts .append (
2159+ f"Failure rate exceeded: { num_bad } of { num_ran_in_orchestrator } "
2160+ f"trials were unsuccessful (observed rate: { observed_rate :.0%} , tolerance: "
2161+ f"{ self .options .tolerated_trial_failure_rate :.0%} ). "
2162+ f"Checks are triggered when at least "
2163+ f"{ self .options .min_failed_trials_for_failure_rate_check } trials "
2164+ "are unsuccessful or at the end of the optimization."
2165+ )
2166+
2167+ if num_execution_failures > 0 :
2168+ parts .append (
2169+ f"{ num_execution_failures } trial(s) failed at the execution "
2170+ "level (FAILED or ABANDONED). Check any trial evaluation "
2171+ "processes/jobs to see why they are failing."
21542172 )
2155- + " Orignal error message: "
2156- + FAILURE_EXCEEDED_MSG .format (
2157- f_rate = self .options .tolerated_trial_failure_rate ,
2158- n_failed = num_bad_in_orchestrator ,
2159- n_ran = num_ran_in_orchestrator ,
2160- min_failed = self .options .min_failed_trials_for_failure_rate_check ,
2161- observed_rate = float (num_bad_in_orchestrator ) / num_ran_in_orchestrator ,
2173+
2174+ if num_metric_incomplete > 0 :
2175+ all_missing : set [str ] = set ()
2176+ for missing in missing_metrics_by_trial .values ():
2177+ all_missing .update (missing )
2178+ affected_trials = sorted (missing_metrics_by_trial .keys ())
2179+
2180+ parts .append (
2181+ f"{ num_metric_incomplete } trial(s) have incomplete metric data. "
2182+ f"Missing metrics: { sorted (all_missing )} . "
2183+ f"Affected trials: { affected_trials } . "
2184+ "Check that your metric fetching infrastructure is healthy "
2185+ "and that the metrics are being logged correctly."
21622186 )
2163- )
2187+
2188+ return FailureRateExceededError ("\n " .join (parts ))
21642189
21652190 def _warn_if_non_terminal_trials (self ) -> None :
21662191 """Warns if there are any non-terminal trials on the experiment."""
0 commit comments