diff --git a/TRUST_SCORE_METRIC_IMPLEMENTATION.md b/TRUST_SCORE_METRIC_IMPLEMENTATION.md
new file mode 100644
index 0000000000..4f06ed4d45
--- /dev/null
+++ b/TRUST_SCORE_METRIC_IMPLEMENTATION.md
@@ -0,0 +1,39 @@
+# TrustScoreMetric Implementation
+
+## What is solved?
+This PR resolves the feature request to add a new evaluation metric called `TrustScoreMetric` to the DeepEval library (Issue #2586). The new metric evaluates the trustworthiness of an LLM's output based on the sources in its RAG retrieval context. It is intended as a metric orthogonal to faithfulness, differentiating between highly trusted sources (such as SEC filings) and less trusted ones (such as unverified blog posts).
+
+## How is it solved?
+1. **Core Implementation**: Added `TrustScoreMetric` to `deepeval/metrics/trust_score/trust_score.py`, inheriting from `BaseMetric`.
+2. **Parameters**:
+   - `source_tiers`: A dictionary mapping source substrings to a trust tier (1 to 5, where 1 is the most trusted).
+   - `threshold`: The score threshold that determines whether the metric passes (defaults to 0.7).
+3. **Scoring Logic**:
+   - Parses the `retrieval_context` list in the `LLMTestCase`.
+   - Iterates over each context chunk and searches it for substring matches against the `source_tiers` keys; if several sources match, the most trusted (lowest-numbered) tier wins.
+   - Assigns a score per matched source tier:
+     - T1 = 1.0
+     - T2 = 0.8
+     - T3 = 0.6
+     - T4 = 0.4
+     - T5 = 0.2
+   - Chunks with no matching source receive a default score of 0.5.
+   - The final score is the average of the chunk scores (see the illustrative usage sketch at the end of this document).
+4. **Reasoning**: Automatically builds a human-readable `reason` showing exactly which source matched which tier.
+5. **Exports**: Exported the new metric from `deepeval/metrics/trust_score/__init__.py` and exposed it in the top-level `deepeval/metrics/__init__.py`.
+
+## How to verify it?
+There are two ways to verify the functionality:
+
+**1. Run the test suite**
+We have written automated tests covering high-trust, low-trust, mixed-trust, and unmatched sources, as well as edge cases such as an empty retrieval context.
+```bash
+poetry install
+poetry run pytest tests/test_metrics/test_trust_score_metric.py
+```
+
+**2. Run the provided minimal example**
+A self-contained usage example script was added to the examples directory. It runs two assertions (one passing for a highly trusted source, one failing for a low-trust source).
+```bash
+poetry run pytest examples/getting_started/test_trust_score.py
+```
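+
+## Illustrative usage
+The snippet below is a minimal sketch of calling the metric directly (outside `pytest`); the tier mapping and retrieval strings are purely illustrative and only meant to show how the tier scores average out.
+```python
+from deepeval.metrics import TrustScoreMetric
+from deepeval.test_case import LLMTestCase
+
+# Illustrative tier mapping: a lower tier number means a more trusted source.
+metric = TrustScoreMetric(
+    source_tiers={"SEC filing": 1, "Blog post": 4},
+    threshold=0.7,
+)
+test_case = LLMTestCase(
+    input="What is the company's revenue?",
+    actual_output="The company's revenue is $10M.",
+    retrieval_context=[
+        "According to the SEC filing, the revenue is $10M.",  # Tier 1 -> 1.0
+        "A Blog post claims the revenue is $10M.",  # Tier 4 -> 0.4
+    ],
+)
+metric.measure(test_case)
+print(metric.score)   # (1.0 + 0.4) / 2 = 0.7, so the 0.7 threshold is met
+print(metric.reason)  # explains which source matched which tier
+```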
diff --git a/deepeval/metrics/__init__.py b/deepeval/metrics/__init__.py
index 72cb973106..e915f9cd88 100644
--- a/deepeval/metrics/__init__.py
+++ b/deepeval/metrics/__init__.py
@@ -65,6 +65,7 @@
     ImageHelpfulnessMetric,
     ImageReferenceMetric,
 )
+from .trust_score.trust_score import TrustScoreMetric
 
 __all__ = [
     # Base classes
@@ -129,4 +130,5 @@
     "ImageCoherenceMetric",
     "ImageHelpfulnessMetric",
     "ImageReferenceMetric",
+    "TrustScoreMetric",
 ]
diff --git a/deepeval/metrics/trust_score/__init__.py b/deepeval/metrics/trust_score/__init__.py
new file mode 100644
index 0000000000..eb5076c142
--- /dev/null
+++ b/deepeval/metrics/trust_score/__init__.py
@@ -0,0 +1,3 @@
+from .trust_score import TrustScoreMetric
+
+__all__ = ["TrustScoreMetric"]
diff --git a/deepeval/metrics/trust_score/trust_score.py b/deepeval/metrics/trust_score/trust_score.py
new file mode 100644
index 0000000000..cc7f781c77
--- /dev/null
+++ b/deepeval/metrics/trust_score/trust_score.py
@@ -0,0 +1,130 @@
+from typing import List, Dict, Optional
+import asyncio
+
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    check_llm_test_case_params,
+    construct_verbose_logs,
+)
+from deepeval.metrics.api import metric_data_manager
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+
+class TrustScoreMetric(BaseMetric):
+    _required_params: List[LLMTestCaseParams] = [
+        LLMTestCaseParams.RETRIEVAL_CONTEXT,
+    ]
+
+    def __init__(
+        self,
+        source_tiers: Dict[str, int],
+        threshold: float = 0.7,
+        verbose_mode: bool = False,
+    ):
+        self.source_tiers = source_tiers
+        self.threshold = threshold
+        self.verbose_mode = verbose_mode
+
+    def measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
+    ) -> float:
+        check_llm_test_case_params(
+            test_case,
+            self._required_params,
+            None,
+            None,
+            self,
+            None,
+            test_case.multimodal,
+        )
+
+        with metric_progress_indicator(
+            self, _show_indicator=_show_indicator, _in_component=_in_component
+        ):
+            self._calculate_score_and_reason(test_case.retrieval_context)
+            self.success = self.score >= self.threshold
+
+            if self.verbose_mode:
+                self.verbose_logs = construct_verbose_logs(
+                    self,
+                    steps=[
+                        f"Score: {self.score:.2f}",
+                        f"Reason: {self.reason}",
+                    ],
+                )
+
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
+
+            return self.score
+
+    async def a_measure(
+        self,
+        test_case: LLMTestCase,
+        _show_indicator: bool = True,
+        _in_component: bool = False,
+    ) -> float:
+        # The calculation is entirely synchronous based on the context strings,
+        # so we can just defer to measure().
+        return self.measure(
+            test_case,
+            _show_indicator=_show_indicator,
+            _in_component=_in_component,
+        )
+
+    def _calculate_score_and_reason(self, retrieval_context: Optional[List[str]]):
+        if not retrieval_context:
+            self.score = 0.5
+            self.reason = "Empty retrieval context, defaulting to score of 0.5."
+            return
+
+        tier_scores = {1: 1.0, 2: 0.8, 3: 0.6, 4: 0.4, 5: 0.2}
+        default_score = 0.5
+
+        chunk_scores = []
+        reason_parts = []
+
+        for chunk in retrieval_context:
+            matched_tier = None
+            matched_source = None
+
+            # Find the best (lowest number) tier that matches
+            for source, tier in self.source_tiers.items():
+                if source in chunk:
+                    if matched_tier is None or tier < matched_tier:
+                        matched_tier = tier
+                        matched_source = source
+
+            if matched_tier is not None:
+                score = tier_scores.get(matched_tier, default_score)
+                chunk_scores.append(score)
+                reason_parts.append(f"Matched source '{matched_source}' mapped to Tier {matched_tier} (Score: {score}).")
+            else:
+                chunk_scores.append(default_score)
+                reason_parts.append(f"Unmatched source in context chunk, defaulting to Score: {default_score}.")
+
+        self.score = sum(chunk_scores) / len(chunk_scores) if chunk_scores else default_score
+
+        reason_str = " ".join(reason_parts)
+        self.reason = f"Average score calculated from sources: {reason_str}"
+
+    def is_successful(self) -> bool:
+        if self.error is not None:
+            self.success = False
+        else:
+            try:
+                self.success = self.score >= self.threshold
+            except:
+                self.success = False
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Trust Score"
diff --git a/examples/getting_started/test_trust_score.py b/examples/getting_started/test_trust_score.py
new file mode 100644
index 0000000000..51ad56c8b1
--- /dev/null
+++ b/examples/getting_started/test_trust_score.py
@@ -0,0 +1,37 @@
+import pytest
+from deepeval import assert_test
+from deepeval.test_case import LLMTestCase
+from deepeval.metrics import TrustScoreMetric
+
+def test_trust_score():
+    # Define trust tiers for sources
+    source_tiers = {
+        "SEC filing": 1,
+        "Bloomberg": 2,
+        "Verified news": 3,
+        "Blog post": 4,
+        "Reddit": 5
+    }
+
+    # Initialize metric
+    metric = TrustScoreMetric(source_tiers=source_tiers, threshold=0.7)
+
+    # Response A - High trust source
+    test_case_a = LLMTestCase(
+        input="What is the company's revenue?",
+        actual_output="The company's revenue is $10M.",
+        retrieval_context=["According to the SEC filing, the revenue is $10M."]
+    )
+
+    assert_test(test_case_a, [metric])
+
+    # Response B - Low trust source
+    test_case_b = LLMTestCase(
+        input="What is the company's revenue?",
+        actual_output="The company's revenue is $10M.",
+        retrieval_context=["I read on a Blog post that the revenue is $10M."]
+    )
+
+    # This should fail because the score will be 0.4, which is < 0.7
+    with pytest.raises(AssertionError):
+        assert_test(test_case_b, [metric])
diff --git a/tests/test_metrics/test_trust_score_metric.py b/tests/test_metrics/test_trust_score_metric.py
new file mode 100644
index 0000000000..58e310235d
--- /dev/null
+++ b/tests/test_metrics/test_trust_score_metric.py
@@ -0,0 +1,94 @@
+import pytest
+from deepeval.metrics import TrustScoreMetric
+from deepeval.test_case import LLMTestCase
+
+
+class TestTrustScoreMetric:
+
+    @pytest.fixture
+    def source_tiers(self):
+        return {
+            "SEC": 1,
+            "Bloomberg": 2,
+            "NewsSite": 3,
+            "Blog": 4,
+            "Reddit": 5,
+        }
+
+    def test_high_trust(self, source_tiers):
+        metric = TrustScoreMetric(source_tiers=source_tiers, threshold=0.7)
+        test_case = LLMTestCase(
+            input="What is the revenue?",
+            actual_output="The revenue is 100M.",
+            retrieval_context=["According to the SEC filings, the revenue is 100M."]
+        )
+        metric.measure(test_case)
+        assert metric.score == 1.0
+        assert metric.success is True
+        assert "Matched source 'SEC' mapped to Tier 1" in metric.reason
+
+    def test_low_trust(self, source_tiers):
+        metric = TrustScoreMetric(source_tiers=source_tiers, threshold=0.7)
+        test_case = LLMTestCase(
+            input="What is the revenue?",
+            actual_output="The revenue is 100M.",
+            retrieval_context=["I read on a Blog that the revenue is 100M."]
+        )
+        metric.measure(test_case)
+        assert metric.score == 0.4
+        assert metric.success is False
+        assert "Matched source 'Blog' mapped to Tier 4" in metric.reason
+
+    def test_mixed_trust(self, source_tiers):
+        metric = TrustScoreMetric(source_tiers=source_tiers, threshold=0.7)
+        test_case = LLMTestCase(
+            input="What is the revenue?",
+            actual_output="The revenue is 100M.",
+            retrieval_context=[
+                "According to the SEC filings, the revenue is 100M.",
+                "Also, some Reddit thread said it was 100M."
+            ]
+        )
+        metric.measure(test_case)
+        # (T1 1.0 + T5 0.2) / 2 = 0.6
+        assert metric.score == 0.6
+        assert metric.success is False
+        assert "Tier 1" in metric.reason
+        assert "Tier 5" in metric.reason
+
+    def test_unmatched_trust(self, source_tiers):
+        metric = TrustScoreMetric(source_tiers=source_tiers, threshold=0.7)
+        test_case = LLMTestCase(
+            input="What is the revenue?",
+            actual_output="The revenue is 100M.",
+            retrieval_context=["According to an unknown source, the revenue is 100M."]
+        )
+        metric.measure(test_case)
+        assert metric.score == 0.5
+        assert metric.success is False
+        assert "Unmatched source" in metric.reason
+
+    def test_empty_retrieval_context(self, source_tiers):
+        metric = TrustScoreMetric(source_tiers=source_tiers, threshold=0.7)
+        test_case = LLMTestCase(
+            input="What is the revenue?",
+            actual_output="The revenue is 100M.",
+            retrieval_context=[]
+        )
+        metric.measure(test_case)
+        assert metric.score == 0.5
+        assert metric.success is False
+        assert "Empty retrieval context" in metric.reason
+
+    @pytest.mark.asyncio
+    async def test_async_measure(self, source_tiers):
+        metric = TrustScoreMetric(source_tiers=source_tiers, threshold=0.7)
+        test_case = LLMTestCase(
+            input="What is the revenue?",
+            actual_output="The revenue is 100M.",
+            retrieval_context=["According to Bloomberg, the revenue is 100M."]
+        )
+        score = await metric.a_measure(test_case)
+        assert score == 0.8
+        assert metric.score == 0.8
+        assert metric.success is True