Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions deepeval/metrics/hallucination/hallucination.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def __init__(
async_mode: bool = True,
strict_mode: bool = False,
verbose_mode: bool = False,
penalize_ambiguous_claims: bool = False,
evaluation_template: Type[
HallucinationTemplate
] = HallucinationTemplate,
Expand All @@ -49,6 +50,7 @@ def __init__(
self.async_mode = async_mode
self.strict_mode = strict_mode
self.verbose_mode = verbose_mode
self.penalize_ambiguous_claims = penalize_ambiguous_claims
self.evaluation_template = evaluation_template

def measure(
Expand Down Expand Up @@ -153,10 +155,13 @@ async def _a_generate_reason(self):
factual_alignments = []
contradictions = []
for verdict in self.verdicts:
if verdict.verdict.strip().lower() == "yes":
v = verdict.verdict.strip().lower()
if v == "yes":
factual_alignments.append(verdict.reason)
else:
elif v == "no":
contradictions.append(verdict.reason)
elif v == "idk" and self.penalize_ambiguous_claims:
contradictions.append(f"(Ambiguous) {verdict.reason}")

prompt: dict = self.evaluation_template.generate_reason(
factual_alignments=factual_alignments,
Expand All @@ -179,10 +184,13 @@ def _generate_reason(self):
factual_alignments = []
contradictions = []
for verdict in self.verdicts:
if verdict.verdict.strip().lower() == "yes":
v = verdict.verdict.strip().lower()
if v == "yes":
factual_alignments.append(verdict.reason)
else:
elif v == "no":
contradictions.append(verdict.reason)
elif v == "idk" and self.penalize_ambiguous_claims:
contradictions.append(f"(Ambiguous) {verdict.reason}")

prompt: dict = self.evaluation_template.generate_reason(
factual_alignments=factual_alignments,
Expand Down Expand Up @@ -237,7 +245,10 @@ def _calculate_score(self) -> float:

hallucination_count = 0
for verdict in self.verdicts:
if verdict.verdict.strip().lower() == "no":
v = verdict.verdict.strip().lower()
if v == "no":
hallucination_count += 1
elif v == "idk" and self.penalize_ambiguous_claims:
hallucination_count += 1

score = hallucination_count / number_of_verdicts
Expand Down
2 changes: 1 addition & 1 deletion deepeval/metrics/hallucination/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@


class HallucinationVerdict(BaseModel):
verdict: Literal["yes", "no"]
verdict: Literal["yes", "no", "idk"]
reason: str


Expand Down
7 changes: 4 additions & 3 deletions deepeval/metrics/hallucination/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ def generate_verdicts(actual_output: str, contexts: List[str]):

{HallucinationTemplate.multimodal_rules}

The 'verdict' key should STRICTLY be either 'yes' or 'no', and states whether the given text agrees with the context.
The 'reason' is the reason for the verdict. When the answer is 'no', try to provide a correction in the reason.
The 'verdict' key should STRICTLY be 'yes', 'no', or 'idk', and states whether the given text agrees with the context.
The 'reason' is the reason for the verdict. When the answer is 'no', try to provide a correction in the reason.
Use 'idk' when the actual output makes a claim that cannot be clearly verified or contradicted by the context alone.

**
IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects.
Expand All @@ -35,7 +36,7 @@ def generate_verdicts(actual_output: str, contexts: List[str]):
"reason": "The actual output contradicts the provided context which states that Einstein won the Nobel Prize in 1968, not 1969.",
"verdict": "no"
}}
]
]
}}

You should NOT incorporate any prior knowledge you have and take each context at face value. Since you are going to generate a verdict for each context, the number of 'verdicts' SHOULD BE STRICTLY EQUAL TO {len(contexts)}.
Expand Down
42 changes: 42 additions & 0 deletions tests/test_metrics/test_hallucination_metric.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
from unittest.mock import MagicMock, patch
import pytest
from deepeval.metrics import HallucinationMetric
from deepeval.metrics.hallucination.schema import HallucinationVerdict
from deepeval.test_case import LLMTestCase, MLLMImage, ToolCall
from deepeval import evaluate

Expand Down Expand Up @@ -171,3 +173,43 @@ def test_multimodal_evaluate_method(self):
results = evaluate([test_case], [metric])

assert results is not None


class TestHallucinationPenalizeAmbiguousClaims:
    """Offline scoring tests for the ``penalize_ambiguous_claims`` flag.

    ``initialize_model`` is patched while the metric is constructed, so
    these tests run without an API key.
    """

    def _make_metric(self, penalize: bool) -> HallucinationMetric:
        """Return a metric built around a stubbed model.

        Args:
            penalize: Value forwarded as ``penalize_ambiguous_claims``.
        """
        stub_model = MagicMock()
        stub_model.get_model_name.return_value = "mock"
        initializer_patch = patch(
            "deepeval.metrics.hallucination.hallucination.initialize_model",
            return_value=(stub_model, True),
        )
        # The patch only needs to be active while the constructor runs.
        with initializer_patch:
            metric = HallucinationMetric(penalize_ambiguous_claims=penalize)
        return metric

    def _score(self, penalize: bool, labelled_reasons) -> float:
        """Score a metric whose verdicts are preset from (verdict, reason) pairs."""
        metric = self._make_metric(penalize=penalize)
        metric.verdicts = [
            HallucinationVerdict(verdict=label, reason=why)
            for label, why in labelled_reasons
        ]
        return metric._calculate_score()

    def test_idk_not_penalized_by_default(self):
        """With the flag off, an 'idk' verdict adds nothing to the score."""
        outcome = self._score(
            False,
            [("yes", "aligned"), ("idk", "uncertain")],
        )
        assert outcome == 0.0

    def test_idk_penalized_when_flag_set(self):
        """With the flag on, one 'idk' out of two verdicts scores 0.5."""
        outcome = self._score(
            True,
            [("yes", "aligned"), ("idk", "uncertain")],
        )
        assert outcome == 0.5

    def test_no_verdict_still_penalized(self):
        """With the flag on, a 'no' plus an 'idk' verdict scores 1.0."""
        outcome = self._score(
            True,
            [("no", "contradicts"), ("idk", "uncertain")],
        )
        assert outcome == 1.0
Loading