Enable use of 'brier_score' and 'average_precision' for Tuner and UQEnsemble. Co-authored-by: vaifai <vaifaipandey1996@gmail.com>

dylanbouchard · vaifai · web-flow · commit 3b7104cbed71 · 2025-06-22T16:31:01.000-04:00
* Initialising new metrics

* Updating docstrings, examples and adding the new metrics to the validate_tuning_inputs method

* Replacing brier_score with average_precision in threshold objectives

* Removing brier_score and average_precision from threshold_objective

* Removing instances of average_precision from demo files

* Removing average_precision from thresh_objective in ensemble.py

* Improving upon the comments

* fix notebook description

* add clip step to ensemble score computation

* fix docstring

* fix unit test

---------

Co-authored-by: vaifai <vaifaipandey1996@gmail.com>
diff --git a/examples/ensemble_tuning_demo.ipynb b/examples/ensemble_tuning_demo.ipynb
@@ -501,8 +501,8 @@
     "        <li><code>ground_truth_answers</code> - (<strong>List[str]</strong>) A list of ideal (correct) responses.</li>\n",
     "        <li><code>grader_function</code> - (<strong>callable, default=None</strong>) A user-defined function that takes a response and a ground truth 'answer' and returns a boolean indicator of whether the response is correct. If not provided, vectara's HHEM is used: https://huggingface.co/vectara/hallucination_evaluation_model</li>\n",
     "        <li><code>num_responses</code> - (<strong>int, default=5</strong>) The number of sampled responses used to compute consistency.</li>\n",
-    "        <li><code>weights_objective</code> - (<strong>str, default='roc_auc'</strong>) Objective function for weight optimization. Must match thresh_objective if one of {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score'}. If same as thresh_objective, joint optimization will be done.</li>\n",
-    "        <li><code>thresh_objective</code> - (<strong>str, default='fbeta_score'</strong>) Objective function for threshold optimization via grid search. One of {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}.</li>\n",
+    "        <li><code>weights_objective</code> - (<strong>str, default='roc_auc'</strong>) Objective function for weight optimization. One of {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss', 'average_precision', 'brier_score'}. Must match thresh_objective if one of {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score'}. If same as thresh_objective, joint optimization will be done.</li>\n",
+    "        <li><code>thresh_objective</code> - (<strong>str, default='fbeta_score'</strong>) Objective function for threshold optimization via grid search. One of {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score'}.</li>\n",
     "        <li><code>thresh_bounds</code> - (<strong>tuple of floats, default=(0,1)</strong>) Bounds to search for threshold.</li>\n",
     "        <li><code>n_trials</code> - (<strong>int, default=100</strong>) Indicates how many trials to search over with optuna optimizer</li>\n",
     "        <li><code>step_size</code> - (<strong>float, default=0.01</strong>) Indicates step size in grid search, if used.</li>\n",
@@ -1229,15 +1229,15 @@
  ],
  "metadata": {
   "environment": {
-   "kernel": "uqlm",
-   "name": "workbench-notebooks.m125",
+   "kernel": "uqlm_my_test",
+   "name": "workbench-notebooks.m126",
    "type": "gcloud",
-   "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/workbench-notebooks:m125"
+   "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/workbench-notebooks:m126"
   },
   "kernelspec": {
-   "display_name": "uqlm",
+   "display_name": "uqlm_my_test",
    "language": "python",
-   "name": "uqlm"
+   "name": "uqlm_my_test"
   },
   "language_info": {
    "codemirror_mode": {
@@ -1249,7 +1249,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.21"
+   "version": "3.11.12"
   }
  },
  "nbformat": 4,
diff --git a/tests/test_tuner.py b/tests/test_tuner.py
@@ -27,7 +27,7 @@ def setup_method(self):
     def test_initialization(self):
         # Test default initialization
         tuner = Tuner()
-        assert list(tuner.objective_to_func.keys()) == ["fbeta_score", "accuracy_score", "balanced_accuracy_score", "log_loss", "roc_auc"]
+        assert list(tuner.objective_to_func.keys()) == ["fbeta_score", "accuracy_score", "balanced_accuracy_score", "log_loss", "roc_auc", "average_precision", "brier_score"]
 
     def test_tune_threshold(self):
         tuner = Tuner()
@@ -61,7 +61,7 @@ def test_validation_errors_and_optimization_paths(self):
         # test unsupported weights_objective
         with pytest.raises(ValueError) as e:
             Tuner().tune_params(score_lists=self.score_lists, correct_indicators=self.correct_indicators, weights_objective="invalid")
-        assert "Only 'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc_score', and 'log_loss' are supported for tuning objectives." in str(e.value)
+        assert "Only 'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc_score', 'log_loss', 'average_precision', and 'brier_score' are supported for tuning objectives." in str(e.value)
 
         # test unsupported thresh_objective
         with pytest.raises(ValueError) as e:
diff --git a/uqlm/scorers/ensemble.py b/uqlm/scorers/ensemble.py
@@ -218,14 +218,14 @@ def tune_from_graded(self, correct_indicators: List[bool], weights_objective: st
         correct_indicators : list of bool
             A list of boolean indicators of whether self.responses are correct.
 
-        weights_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}, default='roc_auc'
+        weights_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss', 'average_precision', 'brier_score'}, default='roc_auc'
             Objective function for weight optimization. Must match thresh_objective if one of 'fbeta_score',
             'accuracy_score', 'balanced_accuracy_score'. If same as thresh_objective, joint optimization will be done.
 
         thresh_bounds : tuple of floats, default=(0,1)
             Bounds to search for threshold
 
-        thresh_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}, default='fbeta_score'
+        thresh_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score'}, default='fbeta_score'
             Objective function for threshold optimization via grid search.
 
         n_trials : int, default=100
@@ -269,14 +269,14 @@ async def tune(self, prompts: List[str], ground_truth_answers: List[str], grader
         num_responses : int, default=5
             The number of sampled responses used to compute consistency.
 
-        weights_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}, default='roc_auc'
+        weights_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss', 'average_precision', 'brier_score'}, default='roc_auc'
             Objective function for weight optimization. Must match thresh_objective if one of 'fbeta_score',
             'accuracy_score', 'balanced_accuracy_score'. If same as thresh_objective, joint optimization will be done.
 
         thresh_bounds : tuple of floats, default=(0,1)
             Bounds to search for threshold
 
-        thresh_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}, default='fbeta_score'
+        thresh_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score'}, default='fbeta_score'
             Objective function for threshold optimization via grid search.
 
         n_trials : int, default=100
diff --git a/uqlm/utils/tuner.py b/uqlm/utils/tuner.py
@@ -18,17 +18,17 @@
 import optuna
 from typing import Any, Dict, List, Tuple
 
-from sklearn.metrics import fbeta_score, balanced_accuracy_score, accuracy_score, roc_auc_score, log_loss
+from sklearn.metrics import fbeta_score, balanced_accuracy_score, accuracy_score, roc_auc_score, log_loss, average_precision_score, brier_score_loss
 
 optuna.logging.set_verbosity(optuna.logging.WARNING)
 
 
 class Tuner:
     def __init__(self) -> None:
         """
-        Class for tuning weights and threshold for UQEnsemble class.
+        Class for tuning weights and threshold for UQEnsemble
         """
-        self.objective_to_func = {"fbeta_score": self._f_score, "accuracy_score": accuracy_score, "balanced_accuracy_score": balanced_accuracy_score, "log_loss": log_loss, "roc_auc": roc_auc_score}
+        self.objective_to_func = {"fbeta_score": self._f_score, "accuracy_score": accuracy_score, "balanced_accuracy_score": balanced_accuracy_score, "log_loss": log_loss, "roc_auc": roc_auc_score, "average_precision": average_precision_score, "brier_score": brier_score_loss}
 
     def tune_threshold(self, y_scores: List[float], correct_indicators: List[bool], thresh_objective: str = "fbeta_score", fscore_beta: float = 1, bounds: Tuple[float, float] = (0, 1), step_size: int = 0.01) -> float:
         """
@@ -42,7 +42,7 @@ def tune_threshold(self, y_scores: List[float], correct_indicators: List[bool],
         correct_indicators : list of bool
             A list of boolean indicators of whether self.original_responses are correct.
 
-        thresh_objective: {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}, default='fbeta_score'
+        thresh_objective: {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score'}, default='fbeta_score'
             Objective function for threshold optimization via grid search.
 
         fscore_beta : float, default=1
@@ -84,11 +84,11 @@ def tune_params(self, score_lists: List[List[float]], correct_indicators: List[b
         correct_indicators : list of bool
             A list of boolean indicators of whether self.original_responses are correct.
 
-        weights_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}, default='roc_auc'
+        weights_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss', 'average_precision', 'brier_score'}, default='roc_auc'
             Objective function for optimization of weights. Must match thresh_objective if one of 'fbeta_score',
             'accuracy_score', 'balanced_accuracy_score'. If same as thresh_objective, joint optimization will be done.
 
-        thresh_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}, default='fbeta_score'
+        thresh_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score'}, default='fbeta_score'
             Objective function for threshold optimization via grid search.
 
         thresh_bounds : tuple of floats, default=(0,1)
@@ -118,9 +118,8 @@ def tune_params(self, score_lists: List[List[float]], correct_indicators: List[b
         self.step_size = step_size
         self.fscore_beta = fscore_beta
         self.optimize_jointly = weights_objective == thresh_objective
-        self.obj_multiplier = 1 if weights_objective == "logloss" else -1
+        self.obj_multiplier = 1 if weights_objective in ["logloss", "brier_score"] else -1
 
-        # Validate inputs are correct
         self._validate_tuning_inputs()
         self.weights_tuning_objective = self.objective_to_func[self.weights_objective]
         self.threshold_tuning_objective = self.objective_to_func[self.thresh_objective]
@@ -171,7 +170,7 @@ def _validate_tuning_inputs(self):
         if self.weights_objective not in self.objective_to_func:
             raise ValueError(
                 """
-                Only 'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc_score', and 'log_loss' are supported for tuning objectives.
+                Only 'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc_score', 'log_loss', 'average_precision', and 'brier_score' are supported for tuning objectives.
                 """
             )
         if self.thresh_objective not in ["fbeta_score", "accuracy_score", "balanced_accuracy_score"]:
@@ -214,7 +213,8 @@ def _compute_ensemble_scores(self, weights: List[float], score_lists: List[List[
         adjusted_weights = weights[:, None] * valid_mask
         normalized_weights = adjusted_weights / np.sum(adjusted_weights, axis=0, keepdims=True)
         stacked_nonan = np.nan_to_num(score_lists, nan=0.0)
-        return np.sum(stacked_nonan * normalized_weights, axis=0)
+        ensemble_scores = np.sum(stacked_nonan * normalized_weights, axis=0)
+        return np.clip(ensemble_scores, 0, 1)
 
     def _grid_search_weights_thresh(self):
         """