39 changes: 39 additions & 0 deletions TRUST_SCORE_METRIC_IMPLEMENTATION.md
@@ -0,0 +1,39 @@
# TrustScoreMetric Implementation

## What is solved?
This PR resolves the feature request to add a new evaluation metric, `TrustScoreMetric`, to the DeepEval library (Issue #2586). The metric scores the trustworthiness of an LLM's output based on the sources retrieved into its RAG context. It is orthogonal to faithfulness, differentiating between highly trusted sources (like SEC filings) and less trusted sources (like unverified blog posts).

## How is it solved?
1. **Core Implementation**: Added `TrustScoreMetric` to `deepeval/metrics/trust_score/trust_score.py`, inheriting from `BaseMetric`.
2. **Parameters**:
   - `source_tiers`: A dictionary mapping source substrings to a trust tier from 1 (most trusted) to 5 (least trusted).
   - `threshold`: A score threshold that determines whether the metric passes (defaults to 0.7).
3. **Scoring Logic**:
   - Parses the `retrieval_context` list in the `LLMTestCase`.
   - Iterates over each context chunk and searches for substring matches against the `source_tiers` dictionary keys.
   - Assigns a score per matched source tier:
     - T1 = 1.0
     - T2 = 0.8
     - T3 = 0.6
     - T4 = 0.4
     - T5 = 0.2
   - Unmatched chunks receive a default score of 0.5.
   - The final score is the average of the chunk scores (see the worked example after this list).
4. **Reasoning**: Automatically builds a human-readable `reason` showing exactly which source matched which tier.
5. **Exports**: Exported the new metric in `deepeval/metrics/trust_score/__init__.py` and exposed it in the top-level `deepeval/metrics/__init__.py`.
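
To make the scoring concrete, here is a minimal usage sketch based on the implementation in this PR; the `source_tiers` values are illustrative, not library defaults:

```python
from deepeval.metrics import TrustScoreMetric
from deepeval.test_case import LLMTestCase

# Illustrative tiers; any substring-to-tier mapping works.
metric = TrustScoreMetric(
    source_tiers={"SEC filing": 1, "Blog post": 4},
    threshold=0.7,
)

test_case = LLMTestCase(
    input="What is the company's revenue?",
    actual_output="The company's revenue is $10M.",
    retrieval_context=[
        "According to the SEC filing, revenue was $10M.",  # Tier 1 -> 1.0
        "A Blog post claims revenue was $10M.",            # Tier 4 -> 0.4
    ],
)

metric.measure(test_case)
assert metric.score == 0.7  # (1.0 + 0.4) / 2
print(metric.reason)        # Explains which source matched which tier
```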

## How to verify it?
There are two ways to verify the functionality:

**1. Run the test suite**
We have written automated tests covering high-trust, low-trust, mixed-trust, and unmatched sources, as well as edge cases such as an empty retrieval context.
```bash
poetry install
poetry run pytest tests/test_metrics/test_trust_score_metric.py
```

**2. Run the provided minimal example**
A self-contained usage example was added to the examples directory. It runs two assertions: one that passes for a highly trusted source and one that fails for a low-trust source.
```bash
poetry run pytest examples/getting_started/test_trust_score.py
```
2 changes: 2 additions & 0 deletions deepeval/metrics/__init__.py
@@ -65,6 +65,7 @@
    ImageHelpfulnessMetric,
    ImageReferenceMetric,
)
from .trust_score.trust_score import TrustScoreMetric

__all__ = [
    # Base classes
@@ -129,4 +130,5 @@
"ImageCoherenceMetric",
"ImageHelpfulnessMetric",
"ImageReferenceMetric",
"TrustScoreMetric",
]
3 changes: 3 additions & 0 deletions deepeval/metrics/trust_score/__init__.py
@@ -0,0 +1,3 @@
from .trust_score import TrustScoreMetric

__all__ = ["TrustScoreMetric"]
130 changes: 130 additions & 0 deletions deepeval/metrics/trust_score/trust_score.py
@@ -0,0 +1,130 @@
from typing import List, Dict, Optional

from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.metrics.utils import (
    check_llm_test_case_params,
    construct_verbose_logs,
)
from deepeval.metrics.api import metric_data_manager
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams


class TrustScoreMetric(BaseMetric):
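    """Deterministic trust metric for RAG retrieval contexts.

    Each retrieval-context chunk is matched against the `source_tiers`
    substrings: tier 1 (most trusted) scores 1.0 down to tier 5 at 0.2,
    unmatched chunks default to 0.5, and the final score is the average
    across all chunks.
    """
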
    _required_params: List[LLMTestCaseParams] = [
        LLMTestCaseParams.RETRIEVAL_CONTEXT,
    ]

    def __init__(
        self,
        source_tiers: Dict[str, int],
        threshold: float = 0.7,
        verbose_mode: bool = False,
    ):
        self.source_tiers = source_tiers
        self.threshold = threshold
        self.verbose_mode = verbose_mode

    def measure(
        self,
        test_case: LLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
        _log_metric_to_confident: bool = True,
    ) -> float:
        check_llm_test_case_params(
            test_case,
            self._required_params,
            None,
            None,
            self,
            None,
            test_case.multimodal,
        )

        with metric_progress_indicator(
            self, _show_indicator=_show_indicator, _in_component=_in_component
        ):
            self._calculate_score_and_reason(test_case.retrieval_context)
            self.success = self.score >= self.threshold

            if self.verbose_mode:
                self.verbose_logs = construct_verbose_logs(
                    self,
                    steps=[
                        f"Score: {self.score:.2f}",
                        f"Reason: {self.reason}",
                    ],
                )

            if _log_metric_to_confident:
                metric_data_manager.post_metric_if_enabled(
                    self, test_case=test_case
                )

            return self.score

    async def a_measure(
        self,
        test_case: LLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
    ) -> float:
        # The calculation is entirely synchronous based on the context strings,
        # so we can just defer to measure().
        return self.measure(
            test_case,
            _show_indicator=_show_indicator,
            _in_component=_in_component,
        )

    def _calculate_score_and_reason(
        self, retrieval_context: Optional[List[str]]
    ):
        if not retrieval_context:
            self.score = 0.5
            self.reason = "Empty retrieval context, defaulting to score of 0.5."
            return

        tier_scores = {1: 1.0, 2: 0.8, 3: 0.6, 4: 0.4, 5: 0.2}
        default_score = 0.5

        chunk_scores = []
        reason_parts = []

        for chunk in retrieval_context:
            matched_tier = None
            matched_source = None

            # Find the best (lowest-numbered) tier that matches.
            for source, tier in self.source_tiers.items():
                if source in chunk:
                    if matched_tier is None or tier < matched_tier:
                        matched_tier = tier
                        matched_source = source

            if matched_tier is not None:
                score = tier_scores.get(matched_tier, default_score)
                chunk_scores.append(score)
                reason_parts.append(
                    f"Matched source '{matched_source}' mapped to "
                    f"Tier {matched_tier} (Score: {score})."
                )
            else:
                chunk_scores.append(default_score)
                reason_parts.append(
                    f"Unmatched source in context chunk, defaulting to "
                    f"Score: {default_score}."
                )

        self.score = (
            sum(chunk_scores) / len(chunk_scores) if chunk_scores else default_score
        )

        reason_str = " ".join(reason_parts)
        self.reason = f"Average score calculated from sources: {reason_str}"

    def is_successful(self) -> bool:
        if self.error is not None:
            self.success = False
        else:
            try:
                self.success = self.score >= self.threshold
            except TypeError:
                # self.score is None if measure() has not been run yet.
                self.success = False
        return self.success

    @property
    def __name__(self):
        return "Trust Score"
37 changes: 37 additions & 0 deletions examples/getting_started/test_trust_score.py
@@ -0,0 +1,37 @@
import pytest
from deepeval import assert_test
from deepeval.test_case import LLMTestCase
from deepeval.metrics import TrustScoreMetric

def test_trust_score():
    # Define trust tiers for sources
    source_tiers = {
        "SEC filing": 1,
        "Bloomberg": 2,
        "Verified news": 3,
        "Blog post": 4,
        "Reddit": 5,
    }

    # Initialize metric
    metric = TrustScoreMetric(source_tiers=source_tiers, threshold=0.7)

    # Response A - high-trust source
    test_case_a = LLMTestCase(
        input="What is the company's revenue?",
        actual_output="The company's revenue is $10M.",
        retrieval_context=["According to the SEC filing, the revenue is $10M."],
    )

    assert_test(test_case_a, [metric])

    # Response B - low-trust source
    test_case_b = LLMTestCase(
        input="What is the company's revenue?",
        actual_output="The company's revenue is $10M.",
        retrieval_context=["I read on a Blog post that the revenue is $10M."],
    )

    # This should fail because the score is 0.4, below the 0.7 threshold.
    with pytest.raises(AssertionError):
        assert_test(test_case_b, [metric])
94 changes: 94 additions & 0 deletions tests/test_metrics/test_trust_score_metric.py
@@ -0,0 +1,94 @@
import pytest
from deepeval.metrics import TrustScoreMetric
from deepeval.test_case import LLMTestCase


class TestTrustScoreMetric:

    @pytest.fixture
    def source_tiers(self):
        return {
            "SEC": 1,
            "Bloomberg": 2,
            "NewsSite": 3,
            "Blog": 4,
            "Reddit": 5,
        }

    def test_high_trust(self, source_tiers):
        metric = TrustScoreMetric(source_tiers=source_tiers, threshold=0.7)
        test_case = LLMTestCase(
            input="What is the revenue?",
            actual_output="The revenue is 100M.",
            retrieval_context=["According to the SEC filings, the revenue is 100M."],
        )
        metric.measure(test_case)
        assert metric.score == 1.0
        assert metric.success is True
        assert "Matched source 'SEC' mapped to Tier 1" in metric.reason

    def test_low_trust(self, source_tiers):
        metric = TrustScoreMetric(source_tiers=source_tiers, threshold=0.7)
        test_case = LLMTestCase(
            input="What is the revenue?",
            actual_output="The revenue is 100M.",
            retrieval_context=["I read on a Blog that the revenue is 100M."],
        )
        metric.measure(test_case)
        assert metric.score == 0.4
        assert metric.success is False
        assert "Matched source 'Blog' mapped to Tier 4" in metric.reason

    def test_mixed_trust(self, source_tiers):
        metric = TrustScoreMetric(source_tiers=source_tiers, threshold=0.7)
        test_case = LLMTestCase(
            input="What is the revenue?",
            actual_output="The revenue is 100M.",
            retrieval_context=[
                "According to the SEC filings, the revenue is 100M.",
                "Also, some Reddit thread said it was 100M.",
            ],
        )
        metric.measure(test_case)
        # Average of T1 (1.0) and T5 (0.2): (1.0 + 0.2) / 2 = 0.6
        assert metric.score == 0.6
        assert metric.success is False
        assert "Tier 1" in metric.reason
        assert "Tier 5" in metric.reason

    def test_unmatched_trust(self, source_tiers):
        metric = TrustScoreMetric(source_tiers=source_tiers, threshold=0.7)
        test_case = LLMTestCase(
            input="What is the revenue?",
            actual_output="The revenue is 100M.",
            retrieval_context=["According to an unknown source, the revenue is 100M."],
        )
        metric.measure(test_case)
        assert metric.score == 0.5
        assert metric.success is False
        assert "Unmatched source" in metric.reason

    def test_empty_retrieval_context(self, source_tiers):
        metric = TrustScoreMetric(source_tiers=source_tiers, threshold=0.7)
        test_case = LLMTestCase(
            input="What is the revenue?",
            actual_output="The revenue is 100M.",
            retrieval_context=[],
        )
        metric.measure(test_case)
        assert metric.score == 0.5
        assert metric.success is False
        assert "Empty retrieval context" in metric.reason

    @pytest.mark.asyncio
    async def test_async_measure(self, source_tiers):
        metric = TrustScoreMetric(source_tiers=source_tiers, threshold=0.7)
        test_case = LLMTestCase(
            input="What is the revenue?",
            actual_output="The revenue is 100M.",
            retrieval_context=["According to Bloomberg, the revenue is 100M."],
        )
        score = await metric.a_measure(test_case)
        assert score == 0.8
        assert metric.score == 0.8
        assert metric.success is True