Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions src/examples/faithfulness_evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from strands import Agent

from strands_evals import Case, Dataset
from strands_evals.evaluators import FaithfulnessEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry

# ======================================
# SETUP TELEMETRY
# ======================================
# Route strands-evals spans to an in-memory exporter so the task function can
# read the finished spans back and rebuild the session trajectory from them.
telemetry = StrandsEvalsTelemetry()
memory_exporter = InMemorySpanExporter()
# NOTE(review): BatchSpanProcessor exports on a background thread after a
# delay; if spans appear to be missing downstream, a SimpleSpanProcessor (or
# an explicit force_flush) makes them available synchronously — TODO confirm.
span_processor = BatchSpanProcessor(memory_exporter)
telemetry.tracer_provider.add_span_processor(span_processor)


# ======================================
# SETUP AND RUN STRANDS EVAL
# ======================================


def user_task_function(query: str) -> dict:
    """Run the agent on *query* and capture its trajectory for evaluation.

    Args:
        query: The user prompt forwarded to the Strands agent.

    Returns:
        A dict with the agent's final text under "output" and the session
        rebuilt from the exported spans under "trajectory".
    """
    agent = Agent(callback_handler=None)
    agent_response = agent(query)
    # Map the spans exported so far (including this invocation) into a session
    # object that trajectory-aware evaluators can consume.
    finished_spans = memory_exporter.get_finished_spans()
    mapper = StrandsInMemorySessionMapper()
    session = mapper.map_to_session(finished_spans, session_id="test-session")

    return {"output": str(agent_response), "trajectory": session}


# Each spec pairs a case name with the question posed to the agent; every case
# shares the "knowledge" metadata category.
_case_specs = [
    ("knowledge-1", "What is the capital of France?"),
    ("knowledge-2", "What color is the ocean?"),
]
test_cases = [
    Case[str, str](name=case_name, input=question, metadata={"category": "knowledge"})
    for case_name, question in _case_specs
]

evaluator = FaithfulnessEvaluator()

# Score every case's agent run for faithfulness and render the report.
dataset = Dataset[str, str](cases=test_cases, evaluator=evaluator)
report = dataset.run_evaluations(user_task_function)
report.run_display()
2 changes: 2 additions & 0 deletions src/strands_evals/evaluators/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .evaluator import Evaluator
from .faithfulness_evaluator import FaithfulnessEvaluator
from .goal_success_rate_evaluator import GoalSuccessRateEvaluator
from .helpfulness_evaluator import HelpfulnessEvaluator
from .interactions_evaluator import InteractionsEvaluator
Expand All @@ -12,4 +13,5 @@
"InteractionsEvaluator",
"HelpfulnessEvaluator",
"GoalSuccessRateEvaluator",
"FaithfulnessEvaluator",
]
34 changes: 25 additions & 9 deletions src/strands_evals/evaluators/evaluator.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import inspect
import logging

from typing_extensions import Any, Generic, TypeVar
from typing_extensions import Any, Generic, TypeGuard, TypeVar

from ..extractors import TraceExtractor
from ..types.evaluation import EvaluationData, EvaluationOutput
from ..types.trace import Conversation, EvaluationLevel, Session, ToolConfig
from ..types.trace import AssistantMessage, Context, EvaluationLevel, Session, TextContent, ToolConfig, UserMessage

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -88,18 +88,34 @@ def _format_tools(self, tools: list[ToolConfig]) -> str:
"""Format available tools for prompt display."""
return "\n".join([f"- {tool.name}: {tool.description or 'No description'}" for tool in tools])

def _format_conversation_history(self, conversations: list[Conversation]) -> str:
"""Format conversation history with tool executions for prompt display."""
def _format_session_history(self, contexts: list[Context]) -> str:
"""Format session history with tool executions for prompt display."""
lines = []
for conv in conversations:
lines.append(f"User: {conv.user_prompt.text}")
if conv.tool_execution_history:
for tool_exec in conv.tool_execution_history:
for ctx in contexts:
lines.append(f"User: {ctx.user_prompt.text}")
if ctx.tool_execution_history:
for tool_exec in ctx.tool_execution_history:
lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
lines.append(f"Tool: {tool_exec.tool_result.content}")
lines.append(f"Assistant: {conv.agent_response.text}")
lines.append(f"Assistant: {ctx.agent_response.text}")
return "\n".join(lines)

def _has_text_content(self, msg: UserMessage | AssistantMessage) -> TypeGuard[UserMessage | AssistantMessage]:
"""Check if a message object has accessible text content.

Args:
msg: Message object to check (UserMessage or AssistantMessage)

Returns:
True if msg has content attribute with at least one item that is TextContent
"""
return (
hasattr(msg, "content")
and bool(msg.content)
and len(msg.content) > 0
and isinstance(msg.content[0], TextContent)
)

@classmethod
def get_type_name(cls) -> str:
"""
Expand Down
105 changes: 105 additions & 0 deletions src/strands_evals/evaluators/faithfulness_evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
from enum import Enum

from pydantic import BaseModel, Field
from strands import Agent
from typing_extensions import TypeVar

from ..types.evaluation import EvaluationData, EvaluationOutput
from ..types.trace import EvaluationLevel, TraceLevelInput
from .evaluator import Evaluator
from .prompt_templates.faithfulness import get_template

InputT = TypeVar("InputT")
OutputT = TypeVar("OutputT")


class FaithfulnessScore(str, Enum):
    """Categorical faithfulness ratings.

    Members are ordered from least to most faithful; FaithfulnessEvaluator
    maps each one onto a normalized numeric score in [0.0, 1.0].
    """

    NOT_AT_ALL = "Not At All"
    NOT_GENERALLY = "Not Generally"
    NEUTRAL = "Neutral/Mixed"
    GENERALLY_YES = "Generally Yes"
    COMPLETELY_YES = "Completely Yes"


class FaithfulnessRating(BaseModel):
    """Structured output for faithfulness evaluation.

    Filled in by the judge agent via structured output. ``reasoning`` is
    declared before ``score`` so the schema asks for justification first.
    """

    # Free-form rationale; surfaced as the evaluation result's "reason".
    reasoning: str = Field(description="Step by step reasoning to derive the final score")
    # Categorical label; normalized to a numeric score by the evaluator.
    score: FaithfulnessScore = Field(description="Categorical faithfulness rating")


class FaithfulnessEvaluator(Evaluator[InputT, OutputT]):
    """Evaluates faithfulness of agent responses against conversation history.

    An LLM judge (a Strands ``Agent``) rates whether the most recent agent
    response is grounded in the preceding session history. The categorical
    rating is normalized to a score in [0.0, 1.0]; 0.5 or higher passes.
    """

    evaluation_level = EvaluationLevel.TRACE_LEVEL

    # Categorical rating -> normalized numeric score.
    _score_mapping = {
        FaithfulnessScore.NOT_AT_ALL: 0.0,
        FaithfulnessScore.NOT_GENERALLY: 0.25,
        FaithfulnessScore.NEUTRAL: 0.5,
        FaithfulnessScore.GENERALLY_YES: 0.75,
        FaithfulnessScore.COMPLETELY_YES: 1.0,
    }

    def __init__(
        self,
        version: str = "v0",
        model: str | None = None,
        system_prompt: str | None = None,
    ):
        """Initialize the evaluator.

        Args:
            version: Prompt-template version key (defaults to "v0").
            model: Optional model identifier for the judge agent; None uses
                the Strands default model.
            system_prompt: Optional override for the template's system prompt.
        """
        super().__init__()
        self.system_prompt = system_prompt if system_prompt is not None else get_template(version).SYSTEM_PROMPT
        self.version = version
        self.model = model

    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        """Synchronously judge the last turn of the case's trajectory."""
        evaluator_agent, prompt = self._prepare(evaluation_case)
        rating = evaluator_agent.structured_output(FaithfulnessRating, prompt)
        return [self._to_output(rating)]

    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        """Asynchronously judge the last turn of the case's trajectory."""
        evaluator_agent, prompt = self._prepare(evaluation_case)
        rating = await evaluator_agent.structured_output_async(FaithfulnessRating, prompt)
        return [self._to_output(rating)]

    def _prepare(self, evaluation_case: EvaluationData[InputT, OutputT]) -> "tuple[Agent, str]":
        """Build the judge agent and its prompt (shared by sync/async paths)."""
        parsed_input = self._get_last_turn(evaluation_case)
        prompt = self._format_prompt(parsed_input)
        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        return evaluator_agent, prompt

    def _to_output(self, rating: FaithfulnessRating) -> EvaluationOutput:
        """Normalize a categorical rating into an EvaluationOutput."""
        normalized_score = self._score_mapping[rating.score]
        return EvaluationOutput(score=normalized_score, test_pass=normalized_score >= 0.5, reason=rating.reasoning)

    def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
        """Extract the most recent turn from the conversation for evaluation.

        Raises:
            ValueError: If no turn-level inputs can be parsed from the
                case's trajectory.
        """
        parsed_inputs = self._parse_trajectory(evaluation_case)
        if not parsed_inputs:
            raise ValueError(
                "No turn-level inputs could be parsed from the trajectory. "
                "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
            )
        return parsed_inputs[-1]

    def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
        """Format evaluation prompt from parsed turn data.

        Renders the session history (messages plus any tool executions)
        followed by the agent response under evaluation.
        """
        parts = []

        if parsed_input.session_history:
            history_lines = []
            for msg in parsed_input.session_history:
                if isinstance(msg, list):
                    # A list entry carries the tool executions of one turn.
                    for tool_exec in msg:
                        history_lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
                        history_lines.append(f"Tool: {tool_exec.tool_result.content}")
                else:
                    # Plain message: render its first text part, if any.
                    text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
                    history_lines.append(f"{msg.role.value.capitalize()}: {text}")
            history_str = "\n".join(history_lines)
            parts.append(f"# Conversation History:\n{history_str}")

        parts.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}")

        return "\n\n".join(parts)
26 changes: 12 additions & 14 deletions src/strands_evals/evaluators/goal_success_rate_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing_extensions import TypeVar

from ..types.evaluation import EvaluationData, EvaluationOutput
from ..types.trace import ConversationLevelInput, EvaluationLevel
from ..types.trace import EvaluationLevel, SessionLevelInput
from .evaluator import Evaluator
from .prompt_templates.goal_success_rate import get_template

Expand All @@ -30,7 +30,7 @@ class GoalSuccessRating(BaseModel):
class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
"""Evaluates whether all user goals were successfully achieved in a conversation."""

evaluation_level = EvaluationLevel.CONVERSATION_LEVEL
evaluation_level = EvaluationLevel.SESSION_LEVEL

_score_mapping = {
GoalSuccessScore.YES: 1.0,
Expand All @@ -49,33 +49,31 @@ def __init__(
self.model = model

def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
conversation_input = self._parse_trajectory(evaluation_case)
prompt = self._format_prompt(conversation_input)
session_input = self._parse_trajectory(evaluation_case)
prompt = self._format_prompt(session_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
rating = evaluator_agent.structured_output(GoalSuccessRating, prompt)
normalized_score = self._score_mapping[rating.score]
result = EvaluationOutput(score=normalized_score, test_pass=normalized_score >= 1.0, reason=rating.reasoning)
return [result]

async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
conversation_input = self._parse_trajectory(evaluation_case)
prompt = self._format_prompt(conversation_input)
session_input = self._parse_trajectory(evaluation_case)
prompt = self._format_prompt(session_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
rating = await evaluator_agent.structured_output_async(GoalSuccessRating, prompt)
normalized_score = self._score_mapping[rating.score]
result = EvaluationOutput(score=normalized_score, test_pass=normalized_score >= 1.0, reason=rating.reasoning)
return [result]

def _format_prompt(self, conversation_input: ConversationLevelInput) -> str:
"""Format evaluation prompt from conversation-level input."""
def _format_prompt(self, session_input: SessionLevelInput) -> str:
"""Format evaluation prompt from session-level input."""
parts = []

if conversation_input.available_tools:
parts.append(f"# Available tools\n{self._format_tools(conversation_input.available_tools)}")
if session_input.available_tools:
parts.append(f"# Available tools\n{self._format_tools(session_input.available_tools)}")

if conversation_input.conversation_history:
parts.append(
f"# Conversation record\n{self._format_conversation_history(conversation_input.conversation_history)}"
)
if session_input.session_history:
parts.append(f"# Conversation record\n{self._format_session_history(session_input.session_history)}")

return "\n\n".join(parts)
57 changes: 41 additions & 16 deletions src/strands_evals/evaluators/helpfulness_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing_extensions import TypeVar

from ..types.evaluation import EvaluationData, EvaluationOutput
from ..types.trace import EvaluationLevel, TurnLevelInput
from ..types.trace import EvaluationLevel, TextContent, ToolExecution, TraceLevelInput
from .evaluator import Evaluator
from .prompt_templates.helpfulness import get_template

Expand Down Expand Up @@ -35,7 +35,7 @@ class HelpfulnessRating(BaseModel):
class HelpfulnessEvaluator(Evaluator[InputT, OutputT]):
"""Evaluates helpfulness of agent responses from the user's perspective."""

evaluation_level = EvaluationLevel.TURN_LEVEL
evaluation_level = EvaluationLevel.TRACE_LEVEL

_score_mapping = {
HelpfulnessScore.NOT_HELPFUL: 0.0,
Expand Down Expand Up @@ -78,7 +78,7 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
result = EvaluationOutput(score=normalized_score, test_pass=normalized_score >= 0.5, reason=rating.reasoning)
return [result]

def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TurnLevelInput:
def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
"""Extract the most recent turn from the conversation for evaluation."""
parsed_inputs = self._parse_trajectory(evaluation_case)
if not parsed_inputs:
Expand All @@ -88,25 +88,50 @@ def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> Tu
)
return parsed_inputs[-1]

def _format_prompt(self, parsed_input: TurnLevelInput) -> str:
"""Format evaluation prompt from parsed turn data."""
def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
"""Extract user prompt from last message in session history.

Args:
parsed_input: Trace-level input containing session history

Returns:
User prompt text, or empty string if not available
"""
if not parsed_input.session_history:
return ""

last_msg = parsed_input.session_history[-1]
if not isinstance(last_msg, list) and self._has_text_content(last_msg):
first_content = last_msg.content[0]
if isinstance(first_content, TextContent):
return first_content.text

return ""

def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
"""Format evaluation prompt from parsed trace data.

Args:
parsed_input: Trace-level input containing agent response and session history

Returns:
Formatted prompt string with conversation history and target turn
"""
parts = []

if parsed_input.conversation_history:
if parsed_input.session_history:
history_lines = []
for msg in parsed_input.conversation_history:
text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
history_lines.append(f"{msg.role.value.capitalize()}: {text}")
for msg in parsed_input.session_history:
if isinstance(msg, list) and msg and isinstance(msg[0], ToolExecution):
continue # Skip tool execution lists
if not isinstance(msg, list) and self._has_text_content(msg):
first_content = msg.content[0]
if isinstance(first_content, TextContent):
history_lines.append(f"{msg.role.value.capitalize()}: {first_content.text}")
history_str = "\n".join(history_lines)
parts.append(f"# Previous turns:\n{history_str}")

# Extract user prompt from last message in history if available
user_prompt = ""
if parsed_input.conversation_history:
last_msg = parsed_input.conversation_history[-1]
if hasattr(last_msg, "content") and last_msg.content and hasattr(last_msg.content[0], "text"):
user_prompt = last_msg.content[0].text

user_prompt = self._extract_user_prompt(parsed_input)
parts.append(f"# Target turn to evaluate:\nUser: {user_prompt}\nAssistant: {parsed_input.agent_response.text}")

return "\n\n".join(parts)
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from . import faithfulness_v0

# Registry of prompt-template versions for the faithfulness evaluator.
# Each value is a module expected to expose a SYSTEM_PROMPT attribute.
VERSIONS = {
    "v0": faithfulness_v0,
}

# Version used when the caller does not specify one.
DEFAULT_VERSION = "v0"


def get_template(version: str = DEFAULT_VERSION):
    """Return the prompt-template module registered for *version*.

    Raises:
        KeyError: If *version* is not a registered template version.
    """
    return VERSIONS[version]
Loading