faithfulness_evaluator.py
from enum import Enum
from typing import cast

from pydantic import BaseModel, Field
from strands import Agent
from strands.models.model import Model
from typing_extensions import Union

from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
from ..types.trace import EvaluationLevel
from .evaluator import Evaluator
from .prompt_templates.faithfulness import get_template


class FaithfulnessScore(str, Enum):
    """Categorical faithfulness ratings."""

    NOT_AT_ALL = "Not At All"
    NOT_GENERALLY = "Not Generally"
    NEUTRAL = "Neutral/Mixed"
    GENERALLY_YES = "Generally Yes"
    COMPLETELY_YES = "Completely Yes"


class FaithfulnessRating(BaseModel):
    """Structured output for faithfulness evaluation."""

    reasoning: str = Field(description="Step by step reasoning to derive the final score")
    score: FaithfulnessScore = Field(description="Categorical faithfulness rating")


class FaithfulnessEvaluator(Evaluator[InputT, OutputT]):
    """Evaluates faithfulness of agent responses against conversation history."""

    evaluation_level = EvaluationLevel.TRACE_LEVEL

    # Map categorical ratings onto a normalized score in [0.0, 1.0].
    _score_mapping = {
        FaithfulnessScore.NOT_AT_ALL: 0.0,
        FaithfulnessScore.NOT_GENERALLY: 0.25,
        FaithfulnessScore.NEUTRAL: 0.5,
        FaithfulnessScore.GENERALLY_YES: 0.75,
        FaithfulnessScore.COMPLETELY_YES: 1.0,
    }

    def __init__(
        self,
        version: str = "v0",
        model: Union[Model, str, None] = None,
        system_prompt: str | None = None,
    ):
        super().__init__()
        # Fall back to the versioned default template when no system prompt is supplied.
        self.system_prompt = system_prompt if system_prompt is not None else get_template(version).SYSTEM_PROMPT
        self.version = version
        self.model = model

    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        """Synchronously rate the faithfulness of the case's last turn."""
        parsed_input = self._get_last_turn(evaluation_case)
        prompt = self._format_trace_level_prompt(parsed_input)
        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        result = evaluator_agent(prompt, structured_output_model=FaithfulnessRating)
        rating = cast(FaithfulnessRating, result.structured_output)
        normalized_score = self._score_mapping[rating.score]
        return [
            EvaluationOutput(
                score=normalized_score,
                test_pass=normalized_score >= 0.5,
                reason=rating.reasoning,
                label=rating.score,
            )
        ]

    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        """Asynchronously rate the faithfulness of the case's last turn."""
        parsed_input = self._get_last_turn(evaluation_case)
        prompt = self._format_trace_level_prompt(parsed_input)
        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        result = await evaluator_agent.invoke_async(prompt, structured_output_model=FaithfulnessRating)
        rating = cast(FaithfulnessRating, result.structured_output)
        normalized_score = self._score_mapping[rating.score]
        return [
            EvaluationOutput(
                score=normalized_score,
                test_pass=normalized_score >= 0.5,
                reason=rating.reasoning,
                label=rating.score,
            )
        ]
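
Usage note: a minimal sketch of how this evaluator might be driven, assuming the import path, the model identifier string, and the way the evaluation case is obtained; the actual `EvaluationData` schema lives in `..types.evaluation` and is not shown in this file.

    # Minimal usage sketch. The import path and model id are assumptions for
    # illustration; only FaithfulnessEvaluator's constructor and evaluate()
    # signature come from the module above.
    from evals.evaluators.faithfulness_evaluator import FaithfulnessEvaluator  # hypothetical path

    evaluator = FaithfulnessEvaluator(
        version="v0",
        model="example-judge-model-id",  # any strands Model instance or model id string
    )

    # evaluation_case must be an EvaluationData[InputT, OutputT] built from an
    # agent trace; its construction is omitted because the schema is defined elsewhere.
    outputs = evaluator.evaluate(evaluation_case)
    for output in outputs:
        print(output.label, output.score, output.test_pass)

Each returned `EvaluationOutput` carries the judge's categorical label, its normalized score from `_score_mapping`, and a pass flag that is true when the score is at least 0.5 (i.e. Neutral/Mixed or better).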