Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions examples/ensemble_tuning_demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -501,8 +501,8 @@
" <li><code>ground_truth_answers</code> - (<strong>List[str]</strong>) A list of ideal (correct) responses.</li>\n",
" <li><code>grader_function</code> - (<strong>callable, default=None</strong>) A user-defined function that takes a response and a ground truth 'answer' and returns a boolean indicator of whether the response is correct. If not provided, vectara's HHEM is used: https://huggingface.co/vectara/hallucination_evaluation_model</li>\n",
" <li><code>num_responses</code> - (<strong>int, default=5</strong>) The number of sampled responses used to compute consistency.</li>\n",
" <li><code>weights_objective</code> - (<strong>str, default='roc_auc'</strong>) Objective function for weight optimization. Must match thresh_objective if one of {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score'}. If same as thresh_objective, joint optimization will be done.</li>\n",
" <li><code>thresh_objective</code> - (<strong>str, default='fbeta_score'</strong>) Objective function for threshold optimization via grid search. One of {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}.</li>\n",
" <li><code>weights_objective</code> - (<strong>str, default='roc_auc'</strong>) Objective function for weight optimization. One of {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss', 'average_precision', 'brier_score'}. Must match thresh_objective if one of {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score'}. If same as thresh_objective, joint optimization will be done.</li>\n",
" <li><code>thresh_objective</code> - (<strong>str, default='fbeta_score'</strong>) Objective function for threshold optimization via grid search. One of {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score'}.</li>\n",
" <li><code>thresh_bounds</code> - (<strong>tuple of floats, default=(0,1)</strong>) Bounds to search for threshold.</li>\n",
" <li><code>n_trials</code> - (<strong>int, default=100</strong>) Indicates how many trials to search over with optuna optimizer</li>\n",
" <li><code>step_size</code> - (<strong>float, default=0.01</strong>) Indicates step size in grid search, if used.</li>\n",
Expand Down Expand Up @@ -1229,15 +1229,15 @@
],
"metadata": {
"environment": {
"kernel": "uqlm",
"name": "workbench-notebooks.m125",
"kernel": "uqlm_my_test",
"name": "workbench-notebooks.m126",
"type": "gcloud",
"uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/workbench-notebooks:m125"
"uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/workbench-notebooks:m126"
},
"kernelspec": {
"display_name": "uqlm",
"display_name": "uqlm_my_test",
"language": "python",
"name": "uqlm"
"name": "uqlm_my_test"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -1249,7 +1249,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.21"
"version": "3.11.12"
}
},
"nbformat": 4,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "uqlm"
version = "0.1.6"
version = "0.1.7"
description = "UQLM (Uncertainty Quantification for Language Models) is a Python package for UQ-based LLM hallucination detection."
authors = ["Dylan Bouchard <dylan.bouchard@cvshealth.com>", "Mohit Singh Chauhan <mohitsingh.chauhan@cvshealth.com>"]
maintainers = [
Expand Down
4 changes: 2 additions & 2 deletions tests/test_tuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def setup_method(self):
def test_initialization(self):
# Test default initialization
tuner = Tuner()
assert list(tuner.objective_to_func.keys()) == ["fbeta_score", "accuracy_score", "balanced_accuracy_score", "log_loss", "roc_auc"]
assert list(tuner.objective_to_func.keys()) == ["fbeta_score", "accuracy_score", "balanced_accuracy_score", "log_loss", "roc_auc", "average_precision", "brier_score"]

def test_tune_threshold(self):
tuner = Tuner()
Expand Down Expand Up @@ -61,7 +61,7 @@ def test_validation_errors_and_optimization_paths(self):
# test unsupported weights_objective
with pytest.raises(ValueError) as e:
Tuner().tune_params(score_lists=self.score_lists, correct_indicators=self.correct_indicators, weights_objective="invalid")
assert "Only 'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc_score', and 'log_loss' are supported for tuning objectives." in str(e.value)
assert "Only 'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc_score', 'log_loss', 'average_precision', and 'brier_score' are supported for tuning objectives." in str(e.value)

# test unsupported thresh_objective
with pytest.raises(ValueError) as e:
Expand Down
8 changes: 4 additions & 4 deletions uqlm/scorers/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,14 +218,14 @@ def tune_from_graded(self, correct_indicators: List[bool], weights_objective: st
correct_indicators : list of bool
A list of boolean indicators of whether self.responses are correct.

weights_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}, default='roc_auc'
weights_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss', 'average_precision', 'brier_score'}, default='roc_auc'
Objective function for weight optimization. If it is one of 'fbeta_score', 'accuracy_score', or
'balanced_accuracy_score', it must match thresh_objective. When it equals thresh_objective, joint optimization of weights and threshold is performed.

thresh_bounds : tuple of floats, default=(0,1)
Bounds within which to search for the threshold.

thresh_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}, default='fbeta_score'
thresh_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score'}, default='fbeta_score'
Objective function for threshold optimization via grid search.

n_trials : int, default=100
Expand Down Expand Up @@ -269,14 +269,14 @@ async def tune(self, prompts: List[str], ground_truth_answers: List[str], grader
num_responses : int, default=5
The number of sampled responses used to compute consistency.

weights_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}, default='roc_auc'
weights_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss', 'average_precision', 'brier_score'}, default='roc_auc'
Objective function for weight optimization. If it is one of 'fbeta_score', 'accuracy_score', or
'balanced_accuracy_score', it must match thresh_objective. When it equals thresh_objective, joint optimization of weights and threshold is performed.

thresh_bounds : tuple of floats, default=(0,1)
Bounds within which to search for the threshold.

thresh_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}, default='fbeta_score'
thresh_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score'}, default='fbeta_score'
Objective function for threshold optimization via grid search.

n_trials : int, default=100
Expand Down
20 changes: 10 additions & 10 deletions uqlm/utils/tuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,17 @@
import optuna
from typing import Any, Dict, List, Tuple

from sklearn.metrics import fbeta_score, balanced_accuracy_score, accuracy_score, roc_auc_score, log_loss
from sklearn.metrics import fbeta_score, balanced_accuracy_score, accuracy_score, roc_auc_score, log_loss, average_precision_score, brier_score_loss

optuna.logging.set_verbosity(optuna.logging.WARNING)


class Tuner:
def __init__(self) -> None:
"""
Class for tuning weights and threshold for UQEnsemble class.
Class for tuning weights and threshold for UQEnsemble
"""
self.objective_to_func = {"fbeta_score": self._f_score, "accuracy_score": accuracy_score, "balanced_accuracy_score": balanced_accuracy_score, "log_loss": log_loss, "roc_auc": roc_auc_score}
self.objective_to_func = {"fbeta_score": self._f_score, "accuracy_score": accuracy_score, "balanced_accuracy_score": balanced_accuracy_score, "log_loss": log_loss, "roc_auc": roc_auc_score, "average_precision": average_precision_score, "brier_score": brier_score_loss}

def tune_threshold(self, y_scores: List[float], correct_indicators: List[bool], thresh_objective: str = "fbeta_score", fscore_beta: float = 1, bounds: Tuple[float, float] = (0, 1), step_size: int = 0.01) -> float:
"""
Expand All @@ -42,7 +42,7 @@ def tune_threshold(self, y_scores: List[float], correct_indicators: List[bool],
correct_indicators : list of bool
A list of boolean indicators of whether self.original_responses are correct.

thresh_objective: {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}, default='fbeta_score'
thresh_objective: {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score'}, default='fbeta_score'
Objective function for threshold optimization via grid search.

fscore_beta : float, default=1
Expand Down Expand Up @@ -84,11 +84,11 @@ def tune_params(self, score_lists: List[List[float]], correct_indicators: List[b
correct_indicators : list of bool
A list of boolean indicators of whether self.original_responses are correct.

weights_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}, default='roc_auc'
weights_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss', 'average_precision', 'brier_score'}, default='roc_auc'
Objective function for optimization of weights. If it is one of 'fbeta_score', 'accuracy_score', or
'balanced_accuracy_score', it must match thresh_objective. When it equals thresh_objective, joint optimization of weights and threshold is performed.

thresh_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc', 'log_loss'}, default='fbeta_score'
thresh_objective : {'fbeta_score', 'accuracy_score', 'balanced_accuracy_score'}, default='fbeta_score'
Objective function for threshold optimization via grid search.

thresh_bounds : tuple of floats, default=(0,1)
Expand Down Expand Up @@ -118,9 +118,8 @@ def tune_params(self, score_lists: List[List[float]], correct_indicators: List[b
self.step_size = step_size
self.fscore_beta = fscore_beta
self.optimize_jointly = weights_objective == thresh_objective
self.obj_multiplier = 1 if weights_objective == "logloss" else -1
self.obj_multiplier = 1 if weights_objective in ["logloss", "brier_score"] else -1

# Validate inputs are correct
self._validate_tuning_inputs()
self.weights_tuning_objective = self.objective_to_func[self.weights_objective]
self.threshold_tuning_objective = self.objective_to_func[self.thresh_objective]
Expand Down Expand Up @@ -171,7 +170,7 @@ def _validate_tuning_inputs(self):
if self.weights_objective not in self.objective_to_func:
raise ValueError(
"""
Only 'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc_score', and 'log_loss' are supported for tuning objectives.
Only 'fbeta_score', 'accuracy_score', 'balanced_accuracy_score', 'roc_auc_score', 'log_loss', 'average_precision', and 'brier_score' are supported for tuning objectives.
"""
)
if self.thresh_objective not in ["fbeta_score", "accuracy_score", "balanced_accuracy_score"]:
Expand Down Expand Up @@ -214,7 +213,8 @@ def _compute_ensemble_scores(self, weights: List[float], score_lists: List[List[
adjusted_weights = weights[:, None] * valid_mask
normalized_weights = adjusted_weights / np.sum(adjusted_weights, axis=0, keepdims=True)
stacked_nonan = np.nan_to_num(score_lists, nan=0.0)
return np.sum(stacked_nonan * normalized_weights, axis=0)
ensemble_scores = np.sum(stacked_nonan * normalized_weights, axis=0)
return np.clip(ensemble_scores, 0, 1)

def _grid_search_weights_thresh(self):
"""
Expand Down