Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "uqlm"
version = "0.1.5"
version = "0.1.6"
description = "UQLM (Uncertainty Quantification for Language Models) is a Python package for UQ-based LLM hallucination detection."
authors = ["Dylan Bouchard <dylan.bouchard@cvshealth.com>", "Mohit Singh Chauhan <mohitsingh.chauhan@cvshealth.com>"]
maintainers = [
Expand Down
4 changes: 0 additions & 4 deletions tests/data/scorers/generate_data_llmjudge.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,6 @@ async def main():

judge = LLMJudge(llm=original_llm, max_calls_per_min=250)

judge_result = await judge.judge_responses(prompts=prompts, responses=responses)

extract_answer = judge._extract_answers(responses=judge_result["judge_responses"])

# Generate data for all templates
templates = ["true_false_uncertain", "true_false", "continuous", "likert"]
# Structure: one file with all template data
Expand Down
11 changes: 7 additions & 4 deletions tests/data/similarity/generate_data_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@

import os
import json
import asyncio

# import bert_score
from uqlm.similarity import BertScorer, BLEURTScorer, CosineScorer, MatchScorer
from uqlm.black_box import BertScorer, BLEURTScorer, CosineScorer, MatchScorer


async def main():
Expand Down Expand Up @@ -46,8 +46,11 @@ async def main():
# 2. Bleurt Scorer
bluert = BLEURTScorer()
bluert_result = bluert.evaluate(responses=responses, sampled_responses=sampled_responses)
bluert_scorer_result = []
for i in range(len(responses)):
bluert_scorer_result.append(bluert.bleurt_scorer.score(references=[responses[i]] * len(sampled_responses[i]), candidates=sampled_responses[i]))

store_results.update({"bluert_result": bluert_result})
store_results.update({"bluert_result": bluert_result, "bluert_score": bluert_scorer_result})

# 3. Cosine Similarity Scorer
cosine = CosineScorer()
Expand All @@ -72,4 +75,4 @@ async def main():


if __name__ == "__main__":
main()
asyncio.run(main())
2 changes: 1 addition & 1 deletion tests/data/similarity/similarity_results_file.json

Large diffs are not rendered by default.

34 changes: 31 additions & 3 deletions tests/test_blackboxuq.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
import pytest
import json
from uqlm.scorers import BlackBoxUQ
from uqlm.scorers.baseclass.uncertainty import DEFAULT_BLACK_BOX_SCORERS
from langchain_openai import AzureChatOpenAI
import sys

datafile_path = "tests/data/scorers/blackbox_results_file.json"
with open(datafile_path, "r") as f:
Expand All @@ -28,12 +30,16 @@
MOCKED_RESPONSES = data["responses"]
MOCKED_SAMPLED_RESPONSES = data["sampled_responses"]

mock_object = AzureChatOpenAI(deployment_name="YOUR-DEPLOYMENT", temperature=1, api_key="SECRET_API_KEY", api_version="2024-05-01-preview", azure_endpoint="https://mocked.endpoint.com")

@pytest.fixture
def mock_llm():
    """Build the AzureChatOpenAI object handed to the tests as their LLM.

    The deployment name, API key and endpoint are dummy values.
    """
    llm_settings = {
        "deployment_name": "YOUR-DEPLOYMENT",
        "temperature": 1,
        "api_key": "SECRET_API_KEY",
        "api_version": "2024-05-01-preview",
        "azure_endpoint": "https://mocked.endpoint.com",
    }
    return AzureChatOpenAI(**llm_settings)


@pytest.mark.asyncio
async def test_bbuq(monkeypatch):
uqe = BlackBoxUQ(llm=mock_object, scorers=["noncontradiction", "exact_match", "semantic_negentropy"])
async def test_bbuq(monkeypatch, mock_llm):
uqe = BlackBoxUQ(llm=mock_llm, scorers=["noncontradiction", "exact_match", "semantic_negentropy"])

async def mock_generate_original_responses(*args, **kwargs):
uqe.logprobs = [None] * 5
Expand All @@ -55,3 +61,25 @@ async def mock_generate_candidate_responses(*args, **kwargs):
assert all([results.data["semantic_negentropy"][i] == pytest.approx(data["semantic_negentropy"][i]) for i in range(len(PROMPTS))])

assert results.metadata == metadata

# Test invalid scorer
with pytest.raises(ValueError):
BlackBoxUQ(llm=mock_llm, scorers=["invalid_scorer"])

# Test default scorers
uqe_default = BlackBoxUQ(llm=mock_llm, scorers=None)
assert len(uqe_default.scorers) == len(DEFAULT_BLACK_BOX_SCORERS)

# Mock the entire bleurt module structure for testing
class MockBLEURTScorer:
def __init__(self):
pass

# Create a proper module structure that matches the import path
class MockBlackBoxModule:
BLEURTScorer = MockBLEURTScorer

# Directly modify sys.modules dictionary with the complete module structure
monkeypatch.setitem(sys.modules, "uqlm.black_box", MockBlackBoxModule())

BlackBoxUQ(llm=mock_llm, scorers=["bert_score", "bleurt"])
1 change: 1 addition & 0 deletions tests/test_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
MOCKED_JUDGE_SCORES = data["judge_1"]
MOCKED_LOGPROBS = metadata["logprobs"]


@pytest.fixture
def mock_llm():
"""Extract judge object using pytest.fixture."""
Expand Down
37 changes: 37 additions & 0 deletions tests/test_llmpanel.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,40 @@ async def mock_judge_responses(*args, **kwargs):

assert result.data == expected_result["data"]
assert result.metadata == expected_result["metadata"]


def test_scoring_templates_length_validation():
    """LLMPanel raises ValueError when scoring_templates and judges differ in length."""
    expected_message = "Length of scoring_templates list must be equal to length of judges list"
    first_judge = MagicMock(spec=LLMJudge)
    second_judge = MagicMock(spec=LLMJudge)
    mock_llm = MagicMock(spec=BaseChatModel)

    # Two judges but only one template: the constructor is expected to refuse.
    with pytest.raises(ValueError) as value_error:
        LLMPanel(judges=[first_judge, second_judge], llm=mock_llm, scoring_templates=["template1"])

    # Inspect the message only after the context manager has captured the error.
    assert str(value_error.value) == expected_message


def test_custom_scoring_templates():
    """LLMPanel exposes a caller-supplied scoring_templates list on the panel."""
    only_judge = MagicMock(spec=LLMJudge)
    mock_llm = MagicMock(spec=BaseChatModel)

    # One judge with one matching custom template (the non-default branch).
    panel = LLMPanel(
        judges=[only_judge],
        llm=mock_llm,
        scoring_templates=["custom_template"],
    )

    assert panel.scoring_templates == ["custom_template"]


def test_invalid_judge_type():
    """LLMPanel raises ValueError when a judge is neither LLMJudge nor BaseChatModel."""
    expected_message = "judges must be a list containing instances of either LLMJudge or BaseChatModel"
    mock_llm = MagicMock(spec=BaseChatModel)

    # A plain string is not an accepted judge type.
    with pytest.raises(ValueError) as value_error:
        LLMPanel(judges=["invalid_judge"], llm=mock_llm)

    assert str(value_error.value) == expected_message


def test_basechatmodel_judge_conversion(monkeypatch):
    """A BaseChatModel passed as a judge yields a panel holding exactly one judge."""
    mock_judge = MagicMock(spec=BaseChatModel)
    mock_llm = MagicMock(spec=BaseChatModel)
    mock_llm_judge = MagicMock(spec=LLMJudge)

    def fake_llm_judge(**kwargs):
        """Stand in for the LLMJudge constructor and return the prepared mock."""
        return mock_llm_judge

    # Replace the LLMJudge constructor for the duration of this test.
    monkeypatch.setattr("uqlm.judges.judge.LLMJudge", fake_llm_judge)

    panel = LLMPanel(judges=[mock_judge], llm=mock_llm)
    assert len(panel.judges) == 1
118 changes: 117 additions & 1 deletion tests/test_load_example_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@

import pytest
import pandas as pd
from uqlm.utils.dataloader import load_example_dataset, list_dataset_names
from uqlm.utils.dataloader import load_example_dataset, list_dataset_names, _combine_question_and_choices
from uqlm.utils.dataloader import _dataset_processing


def test_list_dataset_names():
Expand Down Expand Up @@ -48,3 +49,118 @@ def test_load_dataset_with_processing():
df = load_example_dataset("gsm8k", n=100, cols=["question", "answer"])
assert df.shape[0] == 100
assert list(df.columns) == ["question", "answer"]


def test_dataset_processing_type_error():
    """_dataset_processing raises TypeError when given something other than a pandas DataFrame."""
    not_a_frame = "not_a_dataframe"
    with pytest.raises(TypeError):
        _dataset_processing(not_a_frame)


def test_combine_question_and_choices_type_error():
    """_combine_question_and_choices raises TypeError when choice_col is neither a string nor a list."""
    frame = pd.DataFrame({"question": ["Q1"]})
    invalid_choice_col = 123  # an int is neither of the accepted types
    with pytest.raises(TypeError):
        _combine_question_and_choices(frame, "question", invalid_choice_col)


def test_combine_question_and_choices_list_case():
    """Exercise the branch taken when choice_col is passed as a list."""
    frame = pd.DataFrame({"question": ["Q1"], "choiceA": ["A"]})
    choice_columns = ["choiceA"]
    combined = _combine_question_and_choices(frame, "question", choice_columns)
    # The single input row must yield a single output row.
    assert len(combined) == 1


def test_load_example_dataset_with_concat_all():
    """Exercise the split == "all" branch and its concatenate_datasets call."""
    # svamp has concat="all" in config
    svamp = load_example_dataset("svamp")
    assert len(svamp) > 0


def test_dataset_processing_rename_columns():
    """A column listed in rename_columns is present under its new name afterwards."""
    frame = pd.DataFrame({"old_name": ["A", "B"]})
    renamed = _dataset_processing(frame, rename_columns={"old_name": "new_name"})
    column_names = renamed.columns
    assert "new_name" in column_names


def test_dataset_processing_strip_non_numeric():
    """Non-numeric characters are removed from the columns named in strip_non_numeric."""
    frame = pd.DataFrame({"answer": ["A123B", "C456D"]})
    processed = _dataset_processing(frame, strip_non_numeric=["answer"])
    answers = processed["answer"].values
    # "A123B" should have been reduced to its digits.
    assert "123" in answers


def test_dataset_processing_strip_whitespace():
    """Leading/trailing whitespace is removed from the columns named in strip_whitespace."""
    frame = pd.DataFrame({"answer": [" hello ", " world "]})
    processed = _dataset_processing(frame, strip_whitespace=["answer"])
    answers = processed["answer"].values
    assert "hello" in answers


def test_dataset_processing_to_upper():
    """to_upper upper-cases both plain string cells and list-of-string cells."""
    # (cell value, expected result): the string case first, then the list case.
    cases = [
        ("hello", "HELLO"),
        (["hello", "world"], ["HELLO", "WORLD"]),
    ]
    for cell, expected in cases:
        frame = pd.DataFrame({"answer": [cell]})
        processed = _dataset_processing(frame, to_upper=["answer"])
        assert processed["answer"].iloc[0] == expected


def test_dataset_processing_to_lower():
    """to_lower lower-cases both plain string cells and list-of-string cells."""
    # (cell value, expected result): the string case first, then the list case.
    cases = [
        ("HELLO", "hello"),
        (["HELLO", "WORLD"], ["hello", "world"]),
    ]
    for cell, expected in cases:
        frame = pd.DataFrame({"answer": [cell]})
        processed = _dataset_processing(frame, to_lower=["answer"])
        assert processed["answer"].iloc[0] == expected


def test_dataset_processing_combine_question_and_choices():
    """_dataset_processing accepts combine_question_and_choices parameters and keeps the row count."""
    frame = pd.DataFrame({"question": ["Q1"], "choices": ["A"]})
    processed = _dataset_processing(
        frame,
        combine_question_and_choices={"question_col": "question", "choice_col": "choices"},
    )
    assert len(processed) == 1


def test_dataset_processing_regex_filters_no_group():
    """A regex filter given without a group entry leaves both rows in place."""
    frame = pd.DataFrame({"answer": ["A123", "B456"]})
    # Deliberately no "group" key in the filter specification.
    digit_filter = {"pattern": r"(\d+)", "col": "answer", "operation": "search"}
    processed = _dataset_processing(frame, regex_filters=[digit_filter])
    assert len(processed) == 2


def test_dataset_processing_subset_columns_string():
    """subset_columns given as a single string keeps only that column."""
    frame = pd.DataFrame({"question": ["Q1"], "answer": ["A1"], "other": ["O1"]})
    processed = _dataset_processing(frame, subset_columns="question")
    remaining_columns = list(processed.columns)
    assert remaining_columns == ["question"]


def test_dataset_processing_subset_columns_warning():
    """Subsetting with a column missing from the dataset still returns at least one column.

    This targets the path for requested columns that do not exist. It does not
    inspect any warning itself, only the shape of the result.
    """
    frame = pd.DataFrame({"question": ["Q1"], "answer": ["A1"]})
    requested = ["question", "missing_col"]
    processed = _dataset_processing(frame, subset_columns=requested)
    assert len(processed.columns) >= 1


def test_combine_question_and_choices_save_original():
    """With save_original_question=True the result has an original_question column."""
    frame = pd.DataFrame({"question": ["What is 2+2?"], "choices": ["4"]})
    combined = _combine_question_and_choices(
        frame,
        "question",
        "choices",
        save_original_question=True,
    )
    assert "original_question" in combined.columns


def test_combine_question_and_choices_dict_format():
    """Choices supplied with separate text and label columns keep the row count."""
    columns = {
        "question": ["What is 2+2?"],
        "choices": [{"text": "Four", "label": "D"}],
        "text": ["Four"],
        "label": ["D"],
    }
    frame = pd.DataFrame(columns)
    combined = _combine_question_and_choices(
        frame,
        "question",
        "choices",
        choice_text_col="text",
        choice_label_col="label",
    )
    assert len(combined) == 1
49 changes: 45 additions & 4 deletions tests/test_nli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,56 @@
# limitations under the License.

import gc
import pytest
from uqlm.black_box.nli import NLIScorer


def test_nli():
text1 = "Question: What is captial of France, Answer: Paris"
text2 = "Question: What is captial of France, Answer: Capital of France is Paris city."
@pytest.fixture
def text1():
    """Return the short question/answer string used as the first NLI input."""
    short_answer = "Question: What is captial of France, Answer: Paris"
    return short_answer

nli_model = NLIScorer()

@pytest.fixture
def text2():
    """Return the longer question/answer string used as the second NLI input."""
    long_answer = "Question: What is captial of France, Answer: Capital of France is Paris city."
    return long_answer


@pytest.fixture
def nli_model():
    """Return an NLIScorer constructed with its default arguments."""
    scorer = NLIScorer()
    return scorer


@pytest.fixture
def nli_model_cpu():
    """Return an NLIScorer constructed with verbose=True and device="cpu"."""
    scorer_options = {"verbose": True, "device": "cpu"}
    return NLIScorer(**scorer_options)


def test_nli(text1, text2, nli_model):
    """The first probability predicted for (text1, text2) matches the hard-coded reference value."""
    reference_probability = 0.00159405
    tolerance = 1e-5

    probabilities = nli_model.predict(text1, text2)

    # Drop the local reference to the model and run a collection pass before asserting.
    del nli_model
    gc.collect()

    deviation = abs(float(probabilities[0][0]) - reference_probability)
    assert deviation < tolerance


def test_nli2(text1, nli_model_cpu):
    """Comparing a text with five copies of itself scores 1; no entropy is computed on request."""
    identical_candidates = [text1] * 5
    outcome = nli_model_cpu._observed_consistency_i(
        original=text1,
        candidates=identical_candidates,
        use_best=False,
        compute_entropy=False,
    )
    assert outcome["nli_score_i"] == 1
    # compute_entropy=False, so the entropy entry is expected to be left as None.
    assert outcome["semantic_negentropy"] is None


def test_nli3(text1, text2, nli_model_cpu):
    """predict() emits the truncation UserWarning when the first input is very long.

    ``pytest.warns(match=...)`` interprets its argument as a regular
    expression. The expected message contains "." characters, which would
    match any character if left unescaped, so the message is escaped to make
    the check a literal comparison.
    """
    import re  # local import: only this test needs it

    expected_warning = "Maximum response length exceeded for NLI comparison. Truncation will occur. To adjust, change the value of max_length"

    # text1 repeated 50 times is the over-long input that should trigger the warning.
    with pytest.warns(UserWarning, match=re.escape(expected_warning)):
        nli_model_cpu.predict(text1 * 50, text2)

    # Drop the local reference to the model and run a collection pass.
    del nli_model_cpu
    gc.collect()


def test_nli4(text1, nli_model_cpu):
    """_semantic_entropy_process raises ValueError when called with discrete=False."""
    expected_message = "SemanticEntropy currently only supports discrete evaluations"
    repeated_candidates = [text1] * 5

    with pytest.raises(ValueError) as value_error:
        nli_model_cpu._semantic_entropy_process(candidates=repeated_candidates, i=1, discrete=False)

    # Compare the message once the context manager has captured the exception.
    assert str(value_error.value) == expected_message

    # Drop the local reference to the model and run a collection pass.
    del nli_model_cpu
    gc.collect()
10 changes: 2 additions & 8 deletions tests/test_plot_model_accuracies.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,7 @@ def test_plot_model_accuracies_basic():
correct_indicators = np.array([True, False, True, True])
thresholds = np.linspace(0, 0.9, num=10)

try:
plot_model_accuracies(scores, correct_indicators, thresholds)
except Exception as e:
pytest.fail(f"plot_model_accuracies raised an exception {e}")
plot_model_accuracies(scores, correct_indicators, thresholds)
plt.close("all")


Expand All @@ -52,10 +49,7 @@ def test_plot_model_accuracies_with_write_path():
thresholds = np.linspace(0, 0.9, num=10)
write_path = "test_plot.png"

try:
plot_model_accuracies(scores, correct_indicators, thresholds, write_path=write_path)
except Exception as e:
pytest.fail(f"plot_model_accuracies raised an exception {e}")
plot_model_accuracies(scores, correct_indicators, thresholds, write_path=write_path)
plt.close("all")
assert os.path.exists(write_path)
os.remove(write_path)
Loading