cvs-health · dylanbouchard · Jun 19, 2025 · Jun 13, 2025 · Jun 17, 2025 · Jun 17, 2025
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "uqlm"
-version = "0.1.5"
+version = "0.1.6"
 description = "UQLM (Uncertainty Quantification for Language Models) is a Python package for UQ-based LLM hallucination detection."
 authors = ["Dylan Bouchard <dylan.bouchard@cvshealth.com>", "Mohit Singh Chauhan <mohitsingh.chauhan@cvshealth.com>"]
 maintainers = [

diff --git a/tests/data/scorers/generate_data_llmjudge.py b/tests/data/scorers/generate_data_llmjudge.py
@@ -49,10 +49,6 @@ async def main():
 
     judge = LLMJudge(llm=original_llm, max_calls_per_min=250)
 
-    judge_result = await judge.judge_responses(prompts=prompts, responses=responses)
-
-    extract_answer = judge._extract_answers(responses=judge_result["judge_responses"])
-
     # Generate data for all templates
     templates = ["true_false_uncertain", "true_false", "continuous", "likert"]
     # Structure: one file with all template data

diff --git a/tests/data/similarity/generate_data_similarity.py b/tests/data/similarity/generate_data_similarity.py
@@ -14,9 +14,9 @@
 
 import os
 import json
+import asyncio
 
-# import bert_score
-from uqlm.similarity import BertScorer, BLEURTScorer, CosineScorer, MatchScorer
+from uqlm.black_box import BertScorer, BLEURTScorer, CosineScorer, MatchScorer
 
 
 async def main():
@@ -46,8 +46,11 @@ async def main():
     # 2. Bleurt Scorer
     bluert = BLEURTScorer()
     bluert_result = bluert.evaluate(responses=responses, sampled_responses=sampled_responses)
+    bluert_scorer_result = []
+    for i in range(len(responses)):
+        bluert_scorer_result.append(bluert.bleurt_scorer.score(references=[responses[i]] * len(sampled_responses[i]), candidates=sampled_responses[i]))
 
-    store_results.update({"bluert_result": bluert_result})
+    store_results.update({"bluert_result": bluert_result, "bluert_score": bluert_scorer_result})
 
     # 3. Cosine Similarity Scorer
     cosine = CosineScorer()
@@ -72,4 +75,4 @@ async def main():
 
 
 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
diff --git a/tests/data/similarity/similarity_results_file.json b/tests/data/similarity/similarity_results_file.json
diff --git a/tests/test_blackboxuq.py b/tests/test_blackboxuq.py
@@ -15,7 +15,9 @@
 import pytest
 import json
 from uqlm.scorers import BlackBoxUQ
+from uqlm.scorers.baseclass.uncertainty import DEFAULT_BLACK_BOX_SCORERS
 from langchain_openai import AzureChatOpenAI
+import sys
 
 datafile_path = "tests/data/scorers/blackbox_results_file.json"
 with open(datafile_path, "r") as f:
@@ -28,12 +30,16 @@
 MOCKED_RESPONSES = data["responses"]
 MOCKED_SAMPLED_RESPONSES = data["sampled_responses"]
 
-mock_object = AzureChatOpenAI(deployment_name="YOUR-DEPLOYMENT", temperature=1, api_key="SECRET_API_KEY", api_version="2024-05-01-preview", azure_endpoint="https://mocked.endpoint.com")
+
+@pytest.fixture
+def mock_llm():
+    """Define mock LLM object using pytest.fixture."""
+    return AzureChatOpenAI(deployment_name="YOUR-DEPLOYMENT", temperature=1, api_key="SECRET_API_KEY", api_version="2024-05-01-preview", azure_endpoint="https://mocked.endpoint.com")
 
 
 @pytest.mark.asyncio
-async def test_bbuq(monkeypatch):
-    uqe = BlackBoxUQ(llm=mock_object, scorers=["noncontradiction", "exact_match", "semantic_negentropy"])
+async def test_bbuq(monkeypatch, mock_llm):
+    uqe = BlackBoxUQ(llm=mock_llm, scorers=["noncontradiction", "exact_match", "semantic_negentropy"])
 
     async def mock_generate_original_responses(*args, **kwargs):
         uqe.logprobs = [None] * 5
@@ -55,3 +61,25 @@ async def mock_generate_candidate_responses(*args, **kwargs):
     assert all([results.data["semantic_negentropy"][i] == pytest.approx(data["semantic_negentropy"][i]) for i in range(len(PROMPTS))])
 
     assert results.metadata == metadata
+
+    # Test invalid scorer
+    with pytest.raises(ValueError):
+        BlackBoxUQ(llm=mock_llm, scorers=["invalid_scorer"])
+
+    # Test default scorers
+    uqe_default = BlackBoxUQ(llm=mock_llm, scorers=None)
+    assert len(uqe_default.scorers) == len(DEFAULT_BLACK_BOX_SCORERS)
+
+    # Mock the entire bleurt module structure for testing
+    class MockBLEURTScorer:
+        def __init__(self):
+            pass
+
+    # Create a proper module structure that matches the import path
+    class MockBlackBoxModule:
+        BLEURTScorer = MockBLEURTScorer
+
+    # Swap the stub module into sys.modules through monkeypatch.setitem (rather than assigning directly)
+    monkeypatch.setitem(sys.modules, "uqlm.black_box", MockBlackBoxModule())
+
+    BlackBoxUQ(llm=mock_llm, scorers=["bert_score", "bleurt"])
diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py
@@ -31,6 +31,7 @@
 MOCKED_JUDGE_SCORES = data["judge_1"]
 MOCKED_LOGPROBS = metadata["logprobs"]
 
+
 @pytest.fixture
 def mock_llm():
     """Extract judge object using pytest.fixture."""

diff --git a/tests/test_llmpanel.py b/tests/test_llmpanel.py
@@ -73,3 +73,40 @@ async def mock_judge_responses(*args, **kwargs):
 
     assert result.data == expected_result["data"]
     assert result.metadata == expected_result["metadata"]
+
+
+def test_scoring_templates_length_validation():
+    """Test ValueError when scoring_templates length != judges length"""
+    judge1 = MagicMock(spec=LLMJudge)
+    judge2 = MagicMock(spec=LLMJudge)
+    mock_llm = MagicMock(spec=BaseChatModel)
+    with pytest.raises(ValueError) as value_error:
+        LLMPanel(judges=[judge1, judge2], llm=mock_llm, scoring_templates=["template1"])
+    assert "Length of scoring_templates list must be equal to length of judges list" == str(value_error.value)
+
+
+def test_custom_scoring_templates():
+    """Test the else branch when custom scoring_templates provided"""
+    judge1 = MagicMock(spec=LLMJudge)
+    mock_llm = MagicMock(spec=BaseChatModel)
+    panel = LLMPanel(judges=[judge1], llm=mock_llm, scoring_templates=["custom_template"])
+    assert panel.scoring_templates == ["custom_template"]
+
+
+def test_invalid_judge_type():
+    """Test ValueError for invalid judge types"""
+    mock_llm = MagicMock(spec=BaseChatModel)
+    with pytest.raises(ValueError) as value_error:
+        LLMPanel(judges=["invalid_judge"], llm=mock_llm)
+    assert "judges must be a list containing instances of either LLMJudge or BaseChatModel" == str(value_error.value)
+
+
+def test_basechatmodel_judge_conversion(monkeypatch):
+    """Test BaseChatModel judges get converted to LLMJudge"""
+    mock_judge = MagicMock(spec=BaseChatModel)
+    mock_llm = MagicMock(spec=BaseChatModel)
+    mock_llm_judge = MagicMock(spec=LLMJudge)
+    # Mock LLMJudge constructor
+    monkeypatch.setattr("uqlm.judges.judge.LLMJudge", lambda **kwargs: mock_llm_judge)
+    panel = LLMPanel(judges=[mock_judge], llm=mock_llm)
+    assert len(panel.judges) == 1
diff --git a/tests/test_load_example_dataset.py b/tests/test_load_example_dataset.py
@@ -14,7 +14,8 @@
 
 import pytest
 import pandas as pd
-from uqlm.utils.dataloader import load_example_dataset, list_dataset_names
+from uqlm.utils.dataloader import load_example_dataset, list_dataset_names, _combine_question_and_choices
+from uqlm.utils.dataloader import _dataset_processing
 
 
 def test_list_dataset_names():
@@ -48,3 +49,118 @@ def test_load_dataset_with_processing():
     df = load_example_dataset("gsm8k", n=100, cols=["question", "answer"])
     assert df.shape[0] == 100
     assert list(df.columns) == ["question", "answer"]
+
+
+def test_dataset_processing_type_error():
+    # tests that _dataset_processing raises a TypeError when passed something other than a pandas DataFrame
+    with pytest.raises(TypeError):
+        _dataset_processing("not_a_dataframe")
+
+
+def test_combine_question_and_choices_type_error():
+    # tests that _combine_question_and_choices raises a TypeError when choice_col parameter is not a string or a list
+    df = pd.DataFrame({"question": ["Q1"]})
+    with pytest.raises(TypeError):
+        _combine_question_and_choices(df, "question", 123)
+
+
+def test_combine_question_and_choices_list_case():
+    # test the elif isinstance(choice_col, list) branch
+    df = pd.DataFrame({"question": ["Q1"], "choiceA": ["A"]})
+    result = _combine_question_and_choices(df, "question", ["choiceA"])
+    assert len(result) == 1
+
+
+def test_load_example_dataset_with_concat_all():
+    # test the `if split == "all":` branch and the concatenate_datasets call
+    df = load_example_dataset("svamp")  # svamp has concat="all" in config
+    assert len(df) > 0
+
+
+def test_dataset_processing_rename_columns():
+    # test the column renaming functionality
+    df = pd.DataFrame({"old_name": ["A", "B"]})
+    result = _dataset_processing(df, rename_columns={"old_name": "new_name"})
+    assert "new_name" in result.columns
+
+
+def test_dataset_processing_strip_non_numeric():
+    # test removing non-numeric characters from specified columns
+    df = pd.DataFrame({"answer": ["A123B", "C456D"]})
+    result = _dataset_processing(df, strip_non_numeric=["answer"])
+    assert "123" in result["answer"].values
+
+
+def test_dataset_processing_strip_whitespace():
+    # test removing leading/trailing whitespaces from specified columns
+    df = pd.DataFrame({"answer": [" hello ", " world "]})
+    result = _dataset_processing(df, strip_whitespace=["answer"])
+    assert "hello" in result["answer"].values
+
+
+def test_dataset_processing_to_upper():
+    # test converting text to uppercase for both string columns and list columns
+    # test string case
+    df = pd.DataFrame({"answer": ["hello"]})
+    result = _dataset_processing(df, to_upper=["answer"])
+    assert result["answer"].iloc[0] == "HELLO"
+    # test list case
+    df = pd.DataFrame({"answer": [["hello", "world"]]})
+    result = _dataset_processing(df, to_upper=["answer"])
+    assert result["answer"].iloc[0] == ["HELLO", "WORLD"]
+
+
+def test_dataset_processing_to_lower():
+    # test converting text to lowercase for both string columns and list columns
+    # test string case
+    df = pd.DataFrame({"answer": ["HELLO"]})
+    result = _dataset_processing(df, to_lower=["answer"])
+    assert result["answer"].iloc[0] == "hello"
+    # test list case
+    df = pd.DataFrame({"answer": [["HELLO", "WORLD"]]})
+    result = _dataset_processing(df, to_lower=["answer"])
+    assert result["answer"].iloc[0] == ["hello", "world"]
+
+
+def test_dataset_processing_combine_question_and_choices():
+    # test _dataset_processing when called with the combine_question_and_choices argument
+    df = pd.DataFrame({"question": ["Q1"], "choices": ["A"]})
+    combine_params = {"question_col": "question", "choice_col": "choices"}
+    result = _dataset_processing(df, combine_question_and_choices=combine_params)
+    assert len(result) == 1
+
+
+def test_dataset_processing_regex_filters_no_group():
+    # test regex matching when no group is specified
+    df = pd.DataFrame({"answer": ["A123", "B456"]})
+    regex_filters = [{"pattern": r"(\d+)", "col": "answer", "operation": "search"}]
+    result = _dataset_processing(df, regex_filters=regex_filters)
+    assert len(result) == 2
+
+
+def test_dataset_processing_subset_columns_string():
+    # test column subsetting when single column name is provided as a string
+    df = pd.DataFrame({"question": ["Q1"], "answer": ["A1"], "other": ["O1"]})
+    result = _dataset_processing(df, subset_columns="question")
+    assert list(result.columns) == ["question"]
+
+
+def test_dataset_processing_subset_columns_warning():
+    # test warnings when trying to subset columns that do not exist in dataset
+    df = pd.DataFrame({"question": ["Q1"], "answer": ["A1"]})
+    result = _dataset_processing(df, subset_columns=["question", "missing_col"])
+    assert len(result.columns) >= 1
+
+
+def test_combine_question_and_choices_save_original():
+    # test that an "original_question" column is present when save_original_question=True
+    df = pd.DataFrame({"question": ["What is 2+2?"], "choices": ["4"]})
+    result = _combine_question_and_choices(df, "question", "choices", save_original_question=True)
+    assert "original_question" in result.columns
+
+
+def test_combine_question_and_choices_dict_format():
+    # test handling complex choice formats with text and label columns
+    df = pd.DataFrame({"question": ["What is 2+2?"], "choices": [{"text": "Four", "label": "D"}], "text": ["Four"], "label": ["D"]})
+    result = _combine_question_and_choices(df, "question", "choices", choice_text_col="text", choice_label_col="label")
+    assert len(result) == 1
diff --git a/tests/test_nli.py b/tests/test_nli.py
@@ -13,15 +13,56 @@
 # limitations under the License.
 
 import gc
+import pytest
 from uqlm.black_box.nli import NLIScorer
 
 
-def test_nli():
-    text1 = "Question: What is captial of France, Answer: Paris"
-    text2 = "Question: What is captial of France, Answer: Capital of France is Paris city."
+@pytest.fixture
+def text1():
+    return "Question: What is captial of France, Answer: Paris"
 
-    nli_model = NLIScorer()
+
+@pytest.fixture
+def text2():
+    return "Question: What is captial of France, Answer: Capital of France is Paris city."
+
+
+@pytest.fixture
+def nli_model():
+    return NLIScorer()
+
+
+@pytest.fixture
+def nli_model_cpu():
+    return NLIScorer(verbose=True, device="cpu")
+
+
+def test_nli(text1, text2, nli_model):
     probabilities = nli_model.predict(text1, text2)
     del nli_model
     gc.collect()
     assert abs(float(probabilities[0][0]) - 0.00159405) < 1e-5
+
+
+def test_nli2(text1, nli_model_cpu):
+    result = nli_model_cpu._observed_consistency_i(original=text1, candidates=[text1] * 5, use_best=False, compute_entropy=False)
+    assert result["nli_score_i"] == 1
+    assert result["semantic_negentropy"] is None
+
+
+def test_nli3(text1, text2, nli_model_cpu):
+    expected_warning = "Maximum response length exceeded for NLI comparison. Truncation will occur. To adjust, change the value of max_length"
+
+    with pytest.warns(UserWarning, match=expected_warning):
+        nli_model_cpu.predict(text1 * 50, text2)
+    del nli_model_cpu
+    gc.collect()
+
+
+def test_nli4(text1, nli_model_cpu):
+    with pytest.raises(ValueError) as value_error:
+        nli_model_cpu._semantic_entropy_process(candidates=[text1] * 5, i=1, discrete=False)
+    assert "SemanticEntropy currently only supports discrete evaluations" == str(value_error.value)
+
+    del nli_model_cpu
+    gc.collect()
diff --git a/tests/test_plot_model_accuracies.py b/tests/test_plot_model_accuracies.py
@@ -28,10 +28,7 @@ def test_plot_model_accuracies_basic():
     correct_indicators = np.array([True, False, True, True])
     thresholds = np.linspace(0, 0.9, num=10)
 
-    try:
-        plot_model_accuracies(scores, correct_indicators, thresholds)
-    except Exception as e:
-        pytest.fail(f"plot_model_accuracies raised an exception {e}")
+    plot_model_accuracies(scores, correct_indicators, thresholds)
     plt.close("all")
 
 
@@ -52,10 +49,7 @@ def test_plot_model_accuracies_with_write_path():
     thresholds = np.linspace(0, 0.9, num=10)
     write_path = "test_plot.png"
 
-    try:
-        plot_model_accuracies(scores, correct_indicators, thresholds, write_path=write_path)
-    except Exception as e:
-        pytest.fail(f"plot_model_accuracies raised an exception {e}")
+    plot_model_accuracies(scores, correct_indicators, thresholds, write_path=write_path)
     plt.close("all")
     assert os.path.exists(write_path)
     os.remove(write_path)