diff --git a/src/examples/faithfulness_evaluator.py b/src/examples/faithfulness_evaluator.py new file mode 100644 index 0000000..0176244 --- /dev/null +++ b/src/examples/faithfulness_evaluator.py @@ -0,0 +1,44 @@ +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter +from strands import Agent + +from strands_evals import Case, Dataset +from strands_evals.evaluators import FaithfulnessEvaluator +from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.telemetry import StrandsEvalsTelemetry + +# ====================================== +# SETUP TELEMETRY +# ====================================== +telemetry = StrandsEvalsTelemetry() +memory_exporter = InMemorySpanExporter() +span_processor = BatchSpanProcessor(memory_exporter) +telemetry.tracer_provider.add_span_processor(span_processor) + + +# ====================================== +# SETUP AND RUN STRANDS EVAL +# ====================================== + + +def user_task_function(query: str) -> str: + agent = Agent(callback_handler=None) + agent_response = agent(query) + finished_spans = memory_exporter.get_finished_spans() + mapper = StrandsInMemorySessionMapper() + session = mapper.map_to_session(finished_spans, session_id="test-session") + + return {"output": str(agent_response), "trajectory": session} + + +test_cases = [ + Case[str, str](name="knowledge-1", input="What is the capital of France?", metadata={"category": "knowledge"}), + Case[str, str](name="knowledge-2", input="What color is the ocean?", metadata={"category": "knowledge"}), +] + +evaluator = FaithfulnessEvaluator() + +dataset = Dataset[str, str](cases=test_cases, evaluator=evaluator) + +report = dataset.run_evaluations(user_task_function) +report.run_display() diff --git a/src/strands_evals/evaluators/__init__.py b/src/strands_evals/evaluators/__init__.py index 0c28d10..ff76cb7 100644 --- a/src/strands_evals/evaluators/__init__.py +++ b/src/strands_evals/evaluators/__init__.py @@ -1,4 +1,5 @@ from .evaluator import Evaluator +from .faithfulness_evaluator import FaithfulnessEvaluator from .goal_success_rate_evaluator import GoalSuccessRateEvaluator from .helpfulness_evaluator import HelpfulnessEvaluator from .interactions_evaluator import InteractionsEvaluator @@ -12,4 +13,5 @@ "InteractionsEvaluator", "HelpfulnessEvaluator", "GoalSuccessRateEvaluator", + "FaithfulnessEvaluator", ] diff --git a/src/strands_evals/evaluators/evaluator.py b/src/strands_evals/evaluators/evaluator.py index a369d79..f4f9a47 100644 --- a/src/strands_evals/evaluators/evaluator.py +++ b/src/strands_evals/evaluators/evaluator.py @@ -1,11 +1,11 @@ import inspect import logging -from typing_extensions import Any, Generic, TypeVar +from typing_extensions import Any, Generic, TypeGuard, TypeVar from ..extractors import TraceExtractor from ..types.evaluation import EvaluationData, EvaluationOutput -from ..types.trace import Conversation, EvaluationLevel, Session, ToolConfig +from ..types.trace import AssistantMessage, Context, EvaluationLevel, Session, TextContent, ToolConfig, UserMessage logger = logging.getLogger(__name__) @@ -88,18 +88,34 @@ def _format_tools(self, tools: list[ToolConfig]) -> str: """Format available tools for prompt display.""" return "\n".join([f"- {tool.name}: {tool.description or 'No description'}" for tool in tools]) - def _format_conversation_history(self, conversations: list[Conversation]) -> str: - """Format conversation history with tool executions for prompt display.""" + def _format_session_history(self, contexts: list[Context]) -> str: + """Format session history with tool executions for prompt display.""" lines = [] - for conv in conversations: - lines.append(f"User: {conv.user_prompt.text}") - if conv.tool_execution_history: - for tool_exec in conv.tool_execution_history: + for ctx in contexts: + lines.append(f"User: {ctx.user_prompt.text}") + if ctx.tool_execution_history: + for tool_exec in ctx.tool_execution_history: lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})") lines.append(f"Tool: {tool_exec.tool_result.content}") - lines.append(f"Assistant: {conv.agent_response.text}") + lines.append(f"Assistant: {ctx.agent_response.text}") return "\n".join(lines) + def _has_text_content(self, msg: UserMessage | AssistantMessage) -> TypeGuard[UserMessage | AssistantMessage]: + """Check if a message object has accessible text content. + + Args: + msg: Message object to check (UserMessage or AssistantMessage) + + Returns: + True if msg has content attribute with at least one item that is TextContent + """ + return ( + hasattr(msg, "content") + and bool(msg.content) + and len(msg.content) > 0 + and isinstance(msg.content[0], TextContent) + ) + @classmethod def get_type_name(cls) -> str: """ diff --git a/src/strands_evals/evaluators/faithfulness_evaluator.py b/src/strands_evals/evaluators/faithfulness_evaluator.py new file mode 100644 index 0000000..d0ff37d --- /dev/null +++ b/src/strands_evals/evaluators/faithfulness_evaluator.py @@ -0,0 +1,105 @@ +from enum import Enum + +from pydantic import BaseModel, Field +from strands import Agent +from typing_extensions import TypeVar + +from ..types.evaluation import EvaluationData, EvaluationOutput +from ..types.trace import EvaluationLevel, TraceLevelInput +from .evaluator import Evaluator +from .prompt_templates.faithfulness import get_template + +InputT = TypeVar("InputT") +OutputT = TypeVar("OutputT") + + +class FaithfulnessScore(str, Enum): + """Categorical faithfulness ratings.""" + + NOT_AT_ALL = "Not At All" + NOT_GENERALLY = "Not Generally" + NEUTRAL = "Neutral/Mixed" + GENERALLY_YES = "Generally Yes" + COMPLETELY_YES = "Completely Yes" + + +class FaithfulnessRating(BaseModel): + """Structured output for faithfulness evaluation.""" + + reasoning: str = Field(description="Step by step reasoning to derive the final score") + score: FaithfulnessScore = Field(description="Categorical faithfulness rating") + + +class FaithfulnessEvaluator(Evaluator[InputT, OutputT]): + """Evaluates faithfulness of agent responses against conversation history.""" + + evaluation_level = EvaluationLevel.TRACE_LEVEL + + _score_mapping = { + FaithfulnessScore.NOT_AT_ALL: 0.0, + FaithfulnessScore.NOT_GENERALLY: 0.25, + FaithfulnessScore.NEUTRAL: 0.5, + FaithfulnessScore.GENERALLY_YES: 0.75, + FaithfulnessScore.COMPLETELY_YES: 1.0, + } + + def __init__( + self, + version: str = "v0", + model: str | None = None, + system_prompt: str | None = None, + ): + super().__init__() + self.system_prompt = system_prompt if system_prompt is not None else get_template(version).SYSTEM_PROMPT + self.version = version + self.model = model + + def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: + parsed_input = self._get_last_turn(evaluation_case) + prompt = self._format_prompt(parsed_input) + evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) + rating = evaluator_agent.structured_output(FaithfulnessRating, prompt) + normalized_score = self._score_mapping[rating.score] + result = EvaluationOutput(score=normalized_score, test_pass=normalized_score >= 0.5, reason=rating.reasoning) + return [result] + + async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: + parsed_input = self._get_last_turn(evaluation_case) + prompt = self._format_prompt(parsed_input) + evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) + rating = await evaluator_agent.structured_output_async(FaithfulnessRating, prompt) + normalized_score = self._score_mapping[rating.score] + result = EvaluationOutput(score=normalized_score, test_pass=normalized_score >= 0.5, reason=rating.reasoning) + return [result] + + def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput: + """Extract the most recent turn from the conversation for evaluation.""" + parsed_inputs = self._parse_trajectory(evaluation_case) + if not parsed_inputs: + raise ValueError( + "No turn-level inputs could be parsed from the trajectory. " + "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan." + ) + return parsed_inputs[-1] + + def _format_prompt(self, parsed_input: TraceLevelInput) -> str: + """Format evaluation prompt from parsed turn data.""" + parts = [] + + if parsed_input.session_history: + history_lines = [] + for msg in parsed_input.session_history: + if isinstance(msg, list): + # Handle tool execution lists + for tool_exec in msg: + history_lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})") + history_lines.append(f"Tool: {tool_exec.tool_result.content}") + else: + text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else "" + history_lines.append(f"{msg.role.value.capitalize()}: {text}") + history_str = "\n".join(history_lines) + parts.append(f"# Conversation History:\n{history_str}") + + parts.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}") + + return "\n\n".join(parts) diff --git a/src/strands_evals/evaluators/goal_success_rate_evaluator.py b/src/strands_evals/evaluators/goal_success_rate_evaluator.py index 47cf511..3a90099 100644 --- a/src/strands_evals/evaluators/goal_success_rate_evaluator.py +++ b/src/strands_evals/evaluators/goal_success_rate_evaluator.py @@ -5,7 +5,7 @@ from typing_extensions import TypeVar from ..types.evaluation import EvaluationData, EvaluationOutput -from ..types.trace import ConversationLevelInput, EvaluationLevel +from ..types.trace import EvaluationLevel, SessionLevelInput from .evaluator import Evaluator from .prompt_templates.goal_success_rate import get_template @@ -30,7 +30,7 @@ class GoalSuccessRating(BaseModel): class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]): """Evaluates whether all user goals were successfully achieved in a conversation.""" - evaluation_level = EvaluationLevel.CONVERSATION_LEVEL + evaluation_level = EvaluationLevel.SESSION_LEVEL _score_mapping = { GoalSuccessScore.YES: 1.0, @@ -49,8 +49,8 @@ def __init__( self.model = model def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: - conversation_input = self._parse_trajectory(evaluation_case) - prompt = self._format_prompt(conversation_input) + session_input = self._parse_trajectory(evaluation_case) + prompt = self._format_prompt(session_input) evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) rating = evaluator_agent.structured_output(GoalSuccessRating, prompt) normalized_score = self._score_mapping[rating.score] @@ -58,24 +58,22 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva return [result] async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]: - conversation_input = self._parse_trajectory(evaluation_case) - prompt = self._format_prompt(conversation_input) + session_input = self._parse_trajectory(evaluation_case) + prompt = self._format_prompt(session_input) evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None) rating = await evaluator_agent.structured_output_async(GoalSuccessRating, prompt) normalized_score = self._score_mapping[rating.score] result = EvaluationOutput(score=normalized_score, test_pass=normalized_score >= 1.0, reason=rating.reasoning) return [result] - def _format_prompt(self, conversation_input: ConversationLevelInput) -> str: - """Format evaluation prompt from conversation-level input.""" + def _format_prompt(self, session_input: SessionLevelInput) -> str: + """Format evaluation prompt from session-level input.""" parts = [] - if conversation_input.available_tools: - parts.append(f"# Available tools\n{self._format_tools(conversation_input.available_tools)}") + if session_input.available_tools: + parts.append(f"# Available tools\n{self._format_tools(session_input.available_tools)}") - if conversation_input.conversation_history: - parts.append( - f"# Conversation record\n{self._format_conversation_history(conversation_input.conversation_history)}" - ) + if session_input.session_history: + parts.append(f"# Conversation record\n{self._format_session_history(session_input.session_history)}") return "\n\n".join(parts) diff --git a/src/strands_evals/evaluators/helpfulness_evaluator.py b/src/strands_evals/evaluators/helpfulness_evaluator.py index f167bba..5776ef6 100644 --- a/src/strands_evals/evaluators/helpfulness_evaluator.py +++ b/src/strands_evals/evaluators/helpfulness_evaluator.py @@ -5,7 +5,7 @@ from typing_extensions import TypeVar from ..types.evaluation import EvaluationData, EvaluationOutput -from ..types.trace import EvaluationLevel, TurnLevelInput +from ..types.trace import EvaluationLevel, TextContent, ToolExecution, TraceLevelInput from .evaluator import Evaluator from .prompt_templates.helpfulness import get_template @@ -35,7 +35,7 @@ class HelpfulnessRating(BaseModel): class HelpfulnessEvaluator(Evaluator[InputT, OutputT]): """Evaluates helpfulness of agent responses from the user's perspective.""" - evaluation_level = EvaluationLevel.TURN_LEVEL + evaluation_level = EvaluationLevel.TRACE_LEVEL _score_mapping = { HelpfulnessScore.NOT_HELPFUL: 0.0, @@ -78,7 +78,7 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) result = EvaluationOutput(score=normalized_score, test_pass=normalized_score >= 0.5, reason=rating.reasoning) return [result] - def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TurnLevelInput: + def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput: """Extract the most recent turn from the conversation for evaluation.""" parsed_inputs = self._parse_trajectory(evaluation_case) if not parsed_inputs: @@ -88,25 +88,50 @@ def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> Tu ) return parsed_inputs[-1] - def _format_prompt(self, parsed_input: TurnLevelInput) -> str: - """Format evaluation prompt from parsed turn data.""" + def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str: + """Extract user prompt from last message in session history. + + Args: + parsed_input: Trace-level input containing session history + + Returns: + User prompt text, or empty string if not available + """ + if not parsed_input.session_history: + return "" + + last_msg = parsed_input.session_history[-1] + if not isinstance(last_msg, list) and self._has_text_content(last_msg): + first_content = last_msg.content[0] + if isinstance(first_content, TextContent): + return first_content.text + + return "" + + def _format_prompt(self, parsed_input: TraceLevelInput) -> str: + """Format evaluation prompt from parsed trace data. + + Args: + parsed_input: Trace-level input containing agent response and session history + + Returns: + Formatted prompt string with conversation history and target turn + """ parts = [] - if parsed_input.conversation_history: + if parsed_input.session_history: history_lines = [] - for msg in parsed_input.conversation_history: - text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else "" - history_lines.append(f"{msg.role.value.capitalize()}: {text}") + for msg in parsed_input.session_history: + if isinstance(msg, list) and msg and isinstance(msg[0], ToolExecution): + continue # Skip tool execution lists + if not isinstance(msg, list) and self._has_text_content(msg): + first_content = msg.content[0] + if isinstance(first_content, TextContent): + history_lines.append(f"{msg.role.value.capitalize()}: {first_content.text}") history_str = "\n".join(history_lines) parts.append(f"# Previous turns:\n{history_str}") - # Extract user prompt from last message in history if available - user_prompt = "" - if parsed_input.conversation_history: - last_msg = parsed_input.conversation_history[-1] - if hasattr(last_msg, "content") and last_msg.content and hasattr(last_msg.content[0], "text"): - user_prompt = last_msg.content[0].text - + user_prompt = self._extract_user_prompt(parsed_input) parts.append(f"# Target turn to evaluate:\nUser: {user_prompt}\nAssistant: {parsed_input.agent_response.text}") return "\n\n".join(parts) diff --git a/src/strands_evals/evaluators/prompt_templates/faithfulness/__init__.py b/src/strands_evals/evaluators/prompt_templates/faithfulness/__init__.py new file mode 100644 index 0000000..bced619 --- /dev/null +++ b/src/strands_evals/evaluators/prompt_templates/faithfulness/__init__.py @@ -0,0 +1,11 @@ +from . import faithfulness_v0 + +VERSIONS = { + "v0": faithfulness_v0, +} + +DEFAULT_VERSION = "v0" + + +def get_template(version: str = DEFAULT_VERSION): + return VERSIONS[version] diff --git a/src/strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py b/src/strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py new file mode 100644 index 0000000..bc7cff8 --- /dev/null +++ b/src/strands_evals/evaluators/prompt_templates/faithfulness/faithfulness_v0.py @@ -0,0 +1,30 @@ +SYSTEM_PROMPT = """You are an objective judge evaluating an AI assistant's response quality as to faithfulness, i.e. judge whether the AI assistant's response conflicts with the conversation history. The conversation history has multiple turns including: +- User messages (User:) +- Assistant responses (Assistant:) +- API calls/actions taken by the assistant (Action:) +- Tool outputs (Tool:) + +Focus your evaluation on the last assistant message in the conversation history. + +# Evaluation Guidelines: +Rate the faithfulness of the assistant's response using this scale: + +1. Not At All +- Almost all information in the assistant's response conflicts with the conversation history + +2. Not Generally +- Most of the information in the assistant's response conflicts with the conversation history + +3. Neutral/Mixed +- Approximately half of the information in the assistant's response conflicts with the conversation history + +4. Generally Yes +- Information in the assistant's response rarely conflicts with the conversation history + +5. Completely Yes +- The assistant's response does not contain any information conflicting the conversation history +- If the AI assistant's response is chit chat, it is "Completely Yes" + +You should select "Completely Yes" unless you see any information in the AI assistant's response conflicting with the conversation history. + +Please include the analysis in your step by step reasoning.""" diff --git a/src/strands_evals/extractors/trace_extractor.py b/src/strands_evals/extractors/trace_extractor.py index ec16f48..93af08b 100644 --- a/src/strands_evals/extractors/trace_extractor.py +++ b/src/strands_evals/extractors/trace_extractor.py @@ -5,16 +5,17 @@ from ..types.trace import ( AgentInvocationSpan, AssistantMessage, - Conversation, - ConversationLevelInput, + Context, EvaluationLevel, Session, + SessionLevelInput, SpanInfo, TextContent, ToolConfig, + ToolExecution, ToolExecutionSpan, ToolLevelInput, - TurnLevelInput, + TraceLevelInput, UserMessage, ) @@ -27,23 +28,23 @@ class TraceExtractor: def __init__(self, evaluation_level: EvaluationLevel): self.evaluation_level = evaluation_level - def extract(self, session: Session) -> Union[list[TurnLevelInput], list[ToolLevelInput], ConversationLevelInput]: + def extract(self, session: Session) -> Union[list[TraceLevelInput], list[ToolLevelInput], SessionLevelInput]: """Extract evaluation inputs based on configured level.""" if not isinstance(session, Session): raise TypeError(f"Expected Session object, got {type(session).__name__}") - if self.evaluation_level == EvaluationLevel.TURN_LEVEL: - return self._extract_turn_level(session) + if self.evaluation_level == EvaluationLevel.TRACE_LEVEL: + return self._extract_trace_level(session) elif self.evaluation_level == EvaluationLevel.TOOL_LEVEL: return self._extract_tool_level(session) - elif self.evaluation_level == EvaluationLevel.CONVERSATION_LEVEL: - return self._extract_conversation_level(session) + elif self.evaluation_level == EvaluationLevel.SESSION_LEVEL: + return self._extract_session_level(session) else: raise ValueError(f"Unsupported evaluation level: {self.evaluation_level}") - def _extract_turn_level(self, session: Session) -> list[TurnLevelInput]: - """Extract turn-level inputs with conversation history up to each turn.""" - evaluation_inputs: list[TurnLevelInput] = [] + def _extract_trace_level(self, session: Session) -> list[TraceLevelInput]: + """Extract trace-level inputs with session history up to each turn.""" + evaluation_inputs: list[TraceLevelInput] = [] previous_turns: list[Union[UserMessage, AssistantMessage]] = [] for trace in session.traces: @@ -58,12 +59,12 @@ def _extract_turn_level(self, session: Session) -> list[TurnLevelInput]: logger.warning(f"Failed to create user message: {e}") continue - turn_input = TurnLevelInput( + trace_input = TraceLevelInput( span_info=span.span_info, agent_response=TextContent(text=span.agent_response), - conversation_history=list(previous_turns), + session_history=list(previous_turns), ) - evaluation_inputs.append(turn_input) + evaluation_inputs.append(trace_input) try: text_content = TextContent(text=span.agent_response) @@ -74,9 +75,9 @@ def _extract_turn_level(self, session: Session) -> list[TurnLevelInput]: return evaluation_inputs def _extract_tool_level(self, session: Session) -> list[ToolLevelInput]: - """Extract tool-level inputs with conversation and tool context.""" + """Extract tool-level inputs with session and tool context.""" evaluator_inputs: list[ToolLevelInput] = [] - conversation_history: list[Conversation] = [] + session_history: list[Union[UserMessage, list[ToolExecution], AssistantMessage]] = [] available_tools: list[ToolConfig] = [] for trace in session.traces: @@ -102,24 +103,24 @@ def _extract_tool_level(self, session: Session) -> list[ToolLevelInput]: span_info=span.span_info, available_tools=available_tools or [], tool_execution_details=span, - conversation_history=list(conversation_history), + session_history=list(session_history), ) ) if user_prompt and agent_response: - conversation_history.append( - Conversation( - user_prompt=TextContent(text=user_prompt), - agent_response=TextContent(text=agent_response), - tool_execution_history=tool_calls if tool_calls else None, - ) - ) + session_history.append(UserMessage(content=[TextContent(text=user_prompt)])) + if tool_calls: + tool_executions = [ + ToolExecution(tool_call=tc.tool_call, tool_result=tc.tool_result) for tc in tool_calls + ] + session_history.append(tool_executions) + session_history.append(AssistantMessage(content=[TextContent(text=agent_response)])) return evaluator_inputs - def _extract_conversation_level(self, session: Session) -> ConversationLevelInput: - """Extract conversation-level input with full history.""" - conversation_history: list[Conversation] = [] + def _extract_session_level(self, session: Session) -> SessionLevelInput: + """Extract session-level input with full history.""" + session_history: list[Context] = [] available_tools: list[ToolConfig] = [] span_info: SpanInfo | None = None @@ -137,19 +138,25 @@ def _extract_conversation_level(self, session: Session) -> ConversationLevelInpu if span.available_tools and not available_tools: available_tools = span.available_tools - conversation_history.append( - Conversation( + tool_executions = ( + [ToolExecution(tool_call=tc.tool_call, tool_result=tc.tool_result) for tc in tool_calls] + if tool_calls + else None + ) + + session_history.append( + Context( user_prompt=TextContent(text=span.user_prompt), agent_response=TextContent(text=span.agent_response), - tool_execution_history=tool_calls if tool_calls else None, + tool_execution_history=tool_executions, ) ) if not span_info: raise ValueError("No AgentInvocationSpan found in session") - return ConversationLevelInput( + return SessionLevelInput( span_info=span_info, - conversation_history=conversation_history, + session_history=session_history, available_tools=available_tools if available_tools else None, ) diff --git a/src/strands_evals/types/trace.py b/src/strands_evals/types/trace.py index b74b64d..9486ee0 100644 --- a/src/strands_evals/types/trace.py +++ b/src/strands_evals/types/trace.py @@ -31,8 +31,8 @@ class SpanType(str, Enum): class EvaluationLevel(str, Enum): """Type of evaluation based on trace granularity.""" - CONVERSATION_LEVEL = "Conversation" - TURN_LEVEL = "Turn" + SESSION_LEVEL = "Session" + TRACE_LEVEL = "Trace" TOOL_LEVEL = "ToolCall" @@ -140,24 +140,29 @@ class BaseEvaluationInput(BaseModel): span_info: SpanInfo -class Conversation(BaseModel): +class ToolExecution(BaseModel): + tool_call: ToolCall + tool_result: ToolResult + + +class Context(BaseModel): user_prompt: TextContent agent_response: TextContent - tool_execution_history: list[ToolExecutionSpan] | None = None + tool_execution_history: list[ToolExecution] | None = None -class ConversationLevelInput(BaseEvaluationInput): - """Input for conversation-level evaluators""" +class SessionLevelInput(BaseEvaluationInput): + """Input for session-level evaluators""" - conversation_history: list[Conversation] + session_history: list[Context] available_tools: list[ToolConfig] | None = None -class TurnLevelInput(BaseEvaluationInput): - """Input for turn-level evaluators""" +class TraceLevelInput(BaseEvaluationInput): + """Input for trace-level evaluators""" agent_response: TextContent - conversation_history: list[Union[UserMessage, AssistantMessage]] + session_history: list[Union[UserMessage, list[ToolExecution], AssistantMessage]] class ToolLevelInput(BaseEvaluationInput): @@ -165,7 +170,7 @@ class ToolLevelInput(BaseEvaluationInput): available_tools: list[ToolConfig] tool_execution_details: ToolExecutionSpan - conversation_history: list[Conversation] + session_history: list[Union[UserMessage, list[ToolExecution], AssistantMessage]] class EvaluatorScore(BaseModel): diff --git a/tests/strands_evals/evaluators/test_faithfulness_evaluator.py b/tests/strands_evals/evaluators/test_faithfulness_evaluator.py new file mode 100644 index 0000000..7100b46 --- /dev/null +++ b/tests/strands_evals/evaluators/test_faithfulness_evaluator.py @@ -0,0 +1,108 @@ +from datetime import datetime +from unittest.mock import Mock, patch + +import pytest + +from strands_evals.evaluators import FaithfulnessEvaluator +from strands_evals.evaluators.faithfulness_evaluator import FaithfulnessRating, FaithfulnessScore +from strands_evals.types import EvaluationData +from strands_evals.types.trace import ( + AgentInvocationSpan, + EvaluationLevel, + Session, + SpanInfo, + Trace, +) + + +@pytest.fixture +def evaluation_data(): + now = datetime.now() + span_info = SpanInfo(session_id="test-session", start_time=now, end_time=now) + agent_span = AgentInvocationSpan( + span_info=span_info, user_prompt="What is the capital of France?", agent_response="Paris", available_tools=[] + ) + trace = Trace(spans=[agent_span], trace_id="trace1", session_id="test-session") + session = Session(traces=[trace], session_id="test-session") + + return EvaluationData( + input="What is the capital of France?", actual_output="Paris", actual_trajectory=session, name="test" + ) + + +def test_init_with_defaults(): + evaluator = FaithfulnessEvaluator() + + assert evaluator.version == "v0" + assert evaluator.model is None + assert evaluator.system_prompt is not None + assert evaluator.evaluation_level == EvaluationLevel.TRACE_LEVEL + + +def test_init_with_custom_values(): + evaluator = FaithfulnessEvaluator(version="v1", model="gpt-4", system_prompt="Custom") + + assert evaluator.version == "v1" + assert evaluator.model == "gpt-4" + assert evaluator.system_prompt == "Custom" + + +@patch("strands_evals.evaluators.faithfulness_evaluator.Agent") +def test_evaluate(mock_agent_class, evaluation_data): + mock_agent = Mock() + mock_agent.structured_output.return_value = FaithfulnessRating( + reasoning="The response is faithful", score=FaithfulnessScore.COMPLETELY_YES + ) + mock_agent_class.return_value = mock_agent + evaluator = FaithfulnessEvaluator() + + result = evaluator.evaluate(evaluation_data) + + assert len(result) == 1 + assert result[0].score == 1.0 + assert result[0].test_pass is True + assert result[0].reason == "The response is faithful" + + +@pytest.mark.parametrize( + "score,expected_value,expected_pass", + [ + (FaithfulnessScore.NOT_AT_ALL, 0.0, False), + (FaithfulnessScore.NOT_GENERALLY, 0.25, False), + (FaithfulnessScore.NEUTRAL, 0.5, True), + (FaithfulnessScore.GENERALLY_YES, 0.75, True), + (FaithfulnessScore.COMPLETELY_YES, 1.0, True), + ], +) +@patch("strands_evals.evaluators.faithfulness_evaluator.Agent") +def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, expected_pass): + mock_agent = Mock() + mock_agent.structured_output.return_value = FaithfulnessRating(reasoning="Test", score=score) + mock_agent_class.return_value = mock_agent + evaluator = FaithfulnessEvaluator() + + result = evaluator.evaluate(evaluation_data) + + assert len(result) == 1 + assert result[0].score == expected_value + assert result[0].test_pass == expected_pass + + +@pytest.mark.asyncio +@patch("strands_evals.evaluators.faithfulness_evaluator.Agent") +async def test_evaluate_async(mock_agent_class, evaluation_data): + mock_agent = Mock() + + async def mock_structured_output_async(*args, **kwargs): + return FaithfulnessRating(reasoning="The response is faithful", score=FaithfulnessScore.COMPLETELY_YES) + + mock_agent.structured_output_async = mock_structured_output_async + mock_agent_class.return_value = mock_agent + evaluator = FaithfulnessEvaluator() + + result = await evaluator.evaluate_async(evaluation_data) + + assert len(result) == 1 + assert result[0].score == 1.0 + assert result[0].test_pass is True + assert result[0].reason == "The response is faithful" diff --git a/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py b/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py index 12db007..0f9baa5 100644 --- a/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py +++ b/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py @@ -53,7 +53,7 @@ def test_init_with_defaults(): assert evaluator.version == "v0" assert evaluator.model is None assert evaluator.system_prompt is not None - assert evaluator.evaluation_level == EvaluationLevel.CONVERSATION_LEVEL + assert evaluator.evaluation_level == EvaluationLevel.SESSION_LEVEL def test_init_with_custom_values(): @@ -75,6 +75,7 @@ def test_evaluate(mock_agent_class, evaluation_data): result = evaluator.evaluate(evaluation_data) + assert len(result) == 1 assert result[0].score == 1.0 assert result[0].test_pass is True assert result[0].reason == "All goals achieved" @@ -96,6 +97,7 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value, result = evaluator.evaluate(evaluation_data) + assert len(result) == 1 assert result[0].score == expected_value assert result[0].test_pass == expected_pass @@ -114,6 +116,7 @@ async def mock_structured_output_async(*args, **kwargs): result = await evaluator.evaluate_async(evaluation_data) + assert len(result) == 1 assert result[0].score == 1.0 assert result[0].test_pass is True assert result[0].reason == "All goals achieved" diff --git a/tests/strands_evals/evaluators/test_helpfulness_evaluator.py b/tests/strands_evals/evaluators/test_helpfulness_evaluator.py index 9cb294e..66f1039 100644 --- a/tests/strands_evals/evaluators/test_helpfulness_evaluator.py +++ b/tests/strands_evals/evaluators/test_helpfulness_evaluator.py @@ -37,7 +37,7 @@ def test_init_with_defaults(): assert evaluator.model is None assert evaluator.include_inputs is True assert evaluator.system_prompt is not None - assert evaluator.evaluation_level == EvaluationLevel.TURN_LEVEL + assert evaluator.evaluation_level == EvaluationLevel.TRACE_LEVEL def test_init_with_custom_values(): diff --git a/tests/strands_evals/extractors/test_trace_extractor.py b/tests/strands_evals/extractors/test_trace_extractor.py index 01fce55..cafb624 100644 --- a/tests/strands_evals/extractors/test_trace_extractor.py +++ b/tests/strands_evals/extractors/test_trace_extractor.py @@ -5,9 +5,9 @@ from strands_evals.extractors import TraceExtractor from strands_evals.types.trace import ( AgentInvocationSpan, - ConversationLevelInput, EvaluationLevel, Session, + SessionLevelInput, SpanInfo, ToolCall, ToolConfig, @@ -15,7 +15,7 @@ ToolLevelInput, ToolResult, Trace, - TurnLevelInput, + TraceLevelInput, ) @@ -58,36 +58,36 @@ def session_with_tools(): def test_trace_extractor_initialization(): - extractor = TraceExtractor(EvaluationLevel.TURN_LEVEL) - assert extractor.evaluation_level == EvaluationLevel.TURN_LEVEL + extractor = TraceExtractor(EvaluationLevel.TRACE_LEVEL) + assert extractor.evaluation_level == EvaluationLevel.TRACE_LEVEL -def test_extract_turn_level(session_with_conversation): - extractor = TraceExtractor(EvaluationLevel.TURN_LEVEL) +def test_extract_trace_level(session_with_conversation): + extractor = TraceExtractor(EvaluationLevel.TRACE_LEVEL) result = extractor.extract(session_with_conversation) assert isinstance(result, list) assert len(result) == 2 - assert all(isinstance(item, TurnLevelInput) for item in result) + assert all(isinstance(item, TraceLevelInput) for item in result) assert result[0].agent_response.text == "4" assert result[1].agent_response.text == "6" -def test_extract_turn_level_with_conversation_history(session_with_conversation): - """Test that conversation history accumulates correctly across turns.""" - extractor = TraceExtractor(EvaluationLevel.TURN_LEVEL) +def test_extract_trace_level_with_session_history(session_with_conversation): + """Test that session history accumulates correctly across turns.""" + extractor = TraceExtractor(EvaluationLevel.TRACE_LEVEL) result = extractor.extract(session_with_conversation) assert len(result) == 2 # Second turn should have history of first turn assert result[1].agent_response.text == "6" - assert len(result[1].conversation_history) == 3 - assert result[1].conversation_history[0].role.value == "user" - assert result[1].conversation_history[0].content[0].text == "What is 2+2?" - assert result[1].conversation_history[1].role.value == "assistant" - assert result[1].conversation_history[1].content[0].text == "4" - assert result[1].conversation_history[2].role.value == "user" - assert result[1].conversation_history[2].content[0].text == "What is 3+3?" + assert len(result[1].session_history) == 3 + assert result[1].session_history[0].role.value == "user" + assert result[1].session_history[0].content[0].text == "What is 2+2?" + assert result[1].session_history[1].role.value == "assistant" + assert result[1].session_history[1].content[0].text == "4" + assert result[1].session_history[2].role.value == "user" + assert result[1].session_history[2].content[0].text == "What is 3+3?" def test_extract_tool_level(session_with_tools): @@ -102,20 +102,20 @@ def test_extract_tool_level(session_with_tools): assert result[0].tool_execution_details.tool_result.content == "4" -def test_extract_conversation_level(session_with_conversation): - extractor = TraceExtractor(EvaluationLevel.CONVERSATION_LEVEL) +def test_extract_session_level(session_with_conversation): + extractor = TraceExtractor(EvaluationLevel.SESSION_LEVEL) result = extractor.extract(session_with_conversation) - assert isinstance(result, ConversationLevelInput) - assert len(result.conversation_history) == 2 - assert result.conversation_history[0].user_prompt.text == "What is 2+2?" - assert result.conversation_history[0].agent_response.text == "4" - assert result.conversation_history[1].user_prompt.text == "What is 3+3?" - assert result.conversation_history[1].agent_response.text == "6" + assert isinstance(result, SessionLevelInput) + assert len(result.session_history) == 2 + assert result.session_history[0].user_prompt.text == "What is 2+2?" + assert result.session_history[0].agent_response.text == "4" + assert result.session_history[1].user_prompt.text == "What is 3+3?" + assert result.session_history[1].agent_response.text == "6" def test_extract_raises_on_invalid_session_type(): - extractor = TraceExtractor(EvaluationLevel.TURN_LEVEL) + extractor = TraceExtractor(EvaluationLevel.TRACE_LEVEL) with pytest.raises(TypeError, match="Expected Session object"): extractor.extract(["not", "a", "session"]) @@ -129,18 +129,18 @@ def test_extract_raises_on_unsupported_level(): def test_composability_multiple_extractors(session_with_conversation): """Test that multiple extractors can be composed for different purposes.""" - turn_extractor = TraceExtractor(EvaluationLevel.TURN_LEVEL) - conversation_extractor = TraceExtractor(EvaluationLevel.CONVERSATION_LEVEL) + trace_extractor = TraceExtractor(EvaluationLevel.TRACE_LEVEL) + session_extractor = TraceExtractor(EvaluationLevel.SESSION_LEVEL) - turn_result = turn_extractor.extract(session_with_conversation) - conversation_result = conversation_extractor.extract(session_with_conversation) + trace_result = trace_extractor.extract(session_with_conversation) + session_result = session_extractor.extract(session_with_conversation) - assert len(turn_result) == 2 - assert len(conversation_result.conversation_history) == 2 + assert len(trace_result) == 2 + assert len(session_result.session_history) == 2 -def test_extract_empty_session_turn_level(): - extractor = TraceExtractor(EvaluationLevel.TURN_LEVEL) +def test_extract_empty_session_trace_level(): + extractor = TraceExtractor(EvaluationLevel.TRACE_LEVEL) session = Session(traces=[], session_id="test") result = extractor.extract(session) diff --git a/tests/strands_evals/types/test_trace.py b/tests/strands_evals/types/test_trace.py index 8f92fca..67e75b4 100644 --- a/tests/strands_evals/types/test_trace.py +++ b/tests/strands_evals/types/test_trace.py @@ -4,7 +4,7 @@ AgentInvocationSpan, AssistantMessage, ContentType, - Conversation, + Context, InferenceSpan, Role, Session, @@ -17,7 +17,7 @@ ToolLevelInput, ToolResult, Trace, - TurnLevelInput, + TraceLevelInput, UserMessage, ) @@ -139,17 +139,17 @@ def test_session_creation(): assert len(session.traces) == 1 -def test_turn_level_input_creation(): - """Test TurnLevelInput model creation""" +def test_trace_level_input_creation(): + """Test TraceLevelInput model creation""" now = datetime.now() - turn_input = TurnLevelInput( + trace_input = TraceLevelInput( span_info=SpanInfo(session_id="test", start_time=now, end_time=now), agent_response=TextContent(text="4"), - conversation_history=[UserMessage(content=[TextContent(text="Hi")])], + session_history=[UserMessage(content=[TextContent(text="Hi")])], ) - assert turn_input.agent_response.text == "4" - assert len(turn_input.conversation_history) == 1 + assert trace_input.agent_response.text == "4" + assert len(trace_input.session_history) == 1 def test_tool_level_input_creation(): @@ -162,7 +162,7 @@ def test_tool_level_input_creation(): tool_result=ToolResult(content="4"), ) tool_input = ToolLevelInput( - span_info=span_info, available_tools=[], tool_execution_details=tool_exec, conversation_history=[] + span_info=span_info, available_tools=[], tool_execution_details=tool_exec, session_history=[] ) assert tool_input.tool_execution_details.tool_call.name == "calculator" @@ -170,12 +170,12 @@ def test_tool_level_input_creation(): assert tool_input.tool_execution_details.tool_result.content == "4" -def test_conversation_creation(): - """Test Conversation model creation""" - conversation = Conversation( +def test_context_creation(): + """Test Context model creation""" + context = Context( user_prompt=TextContent(text="What is 2+2?"), agent_response=TextContent(text="4"), tool_execution_history=None ) - assert conversation.user_prompt.text == "What is 2+2?" - assert conversation.agent_response.text == "4" - assert conversation.tool_execution_history is None + assert context.user_prompt.text == "What is 2+2?" + assert context.agent_response.text == "4" + assert context.tool_execution_history is None