Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 3 additions & 58 deletions src/strands_evals/evaluators/conciseness_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from typing_extensions import Union

from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
from ..types.trace import EvaluationLevel, ToolExecution, TraceLevelInput
from ..types.trace import EvaluationLevel
from .evaluator import Evaluator
from .prompt_templates.conciseness import get_template

Expand Down Expand Up @@ -53,14 +53,14 @@ def __init__(

def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
prompt = self._format_trace_level_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = evaluator_agent(prompt, structured_output_model=ConcisenessRating)
return self._create_evaluation_output(result)

async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
prompt = self._format_trace_level_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = await evaluator_agent.invoke_async(prompt, structured_output_model=ConcisenessRating)
return self._create_evaluation_output(result)
Expand All @@ -76,58 +76,3 @@ def _create_evaluation_output(self, result) -> list[EvaluationOutput]:
label=rating.score,
)
]

def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
    """Return the final parsed turn of the conversation trajectory.

    Raises:
        ValueError: If no turn-level inputs can be parsed from the trajectory.
    """
    turns = self._parse_trajectory(evaluation_case)
    if turns:
        return turns[-1]
    raise ValueError(
        "No turn-level inputs could be parsed from the trajectory. "
        "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
    )

def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
    """Pull the user's prompt text from the last session-history entry.

    Args:
        parsed_input: Trace-level input containing session history.

    Returns:
        Text of the final message, or "" when history is empty, the final
        entry is a tool-execution list, or it carries no text content.
    """
    history = parsed_input.session_history
    if not history:
        return ""
    tail = history[-1]
    if isinstance(tail, list) or not self._has_text_content(tail):
        return ""
    return self._extract_text_content(tail)

def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
    """Build the conciseness-evaluation prompt from parsed trace data.

    Args:
        parsed_input: Trace-level input containing agent response and session history.

    Returns:
        Formatted prompt string with conversation history and target turn.
    """
    sections = []

    if parsed_input.session_history:
        lines = []
        for entry in parsed_input.session_history:
            if isinstance(entry, list) and entry and isinstance(entry[0], ToolExecution):
                continue  # tool-execution lists are not rendered as turns
            if isinstance(entry, list) or not self._has_text_content(entry):
                continue
            lines.append(f"{entry.role.value.capitalize()}: {self._extract_text_content(entry)}")
        sections.append("# Previous turns:\n" + "\n".join(lines))

    user_prompt = self._extract_user_prompt(parsed_input)
    sections.append(
        f"# Target turn to evaluate:\nUser: {user_prompt}\nAssistant: {parsed_input.agent_response.text}"
    )

    return "\n\n".join(sections)
74 changes: 73 additions & 1 deletion src/strands_evals/evaluators/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,17 @@

from ..extractors import TraceExtractor
from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
from ..types.trace import AssistantMessage, Context, EvaluationLevel, Session, TextContent, ToolConfig, UserMessage
from ..types.trace import (
AssistantMessage,
Context,
EvaluationLevel,
Session,
TextContent,
ToolConfig,
ToolLevelInput,
TraceLevelInput,
UserMessage,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -108,6 +118,16 @@ def _parse_trajectory(self, evaluation_case: EvaluationData[InputT, OutputT]) ->

return self._trace_extractor.extract(trajectory)

def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
    """Extract the most recent turn from the conversation for evaluation.

    Raises:
        ValueError: If the trajectory yields no turn-level inputs.
    """
    parsed = self._parse_trajectory(evaluation_case)
    if not parsed:
        raise ValueError(
            "No turn-level inputs could be parsed from the trajectory. "
            "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
        )
    # The last parsed entry corresponds to the latest agent invocation.
    return parsed[-1]

def _format_tools(self, tools: list[ToolConfig]) -> str:
"""Format available tools for prompt display."""
return "\n".join([f"- {tool.name}: {tool.description or 'No description'}" for tool in tools])
Expand All @@ -124,6 +144,58 @@ def _format_session_history(self, contexts: list[Context]) -> str:
lines.append(f"Assistant: {ctx.agent_response.text}")
return "\n".join(lines)

def _format_tool_level_prompt(self, tool_input: ToolLevelInput) -> str:
    """Build the evaluation prompt for a single target tool call."""
    sections = []

    # Available tools (if any) come first.
    if tool_input.available_tools:
        sections.append("## Available tool-calls\n" + self._format_tools(tool_input.available_tools))

    # Prior conversation, including earlier tool executions.
    if tool_input.session_history:
        lines = []
        for entry in tool_input.session_history:
            if isinstance(entry, list):
                # A list entry is a batch of tool executions.
                for execution in entry:
                    lines.append(f"Tool call: {execution.tool_call.name}({execution.tool_call.arguments})")
                    lines.append(f"Tool result: {execution.tool_result.content}")
            else:
                content = entry.content
                text = content[0].text if content and hasattr(content[0], "text") else ""
                lines.append(f"{entry.role.value.capitalize()}: {text}")
        sections.append("## Previous conversation history\n" + "\n".join(lines))

    # The specific tool call under evaluation.
    target = tool_input.tool_execution_details.tool_call
    sections.append(f"## Target tool-call to evaluate\nTool call: {target.name}({target.arguments})")

    return "\n\n".join(sections)

def _format_trace_level_prompt(self, parsed_input: TraceLevelInput) -> str:
"""Format evaluation prompt from parsed turn data."""
parts = []

if parsed_input.session_history:
history_lines = []
for msg in parsed_input.session_history:
if isinstance(msg, list):
# Handle tool execution lists
for tool_exec in msg:
history_lines.append(f"Tool call: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
history_lines.append(f"Tool result: {tool_exec.tool_result.content}")
else:
text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
history_lines.append(f"{msg.role.value.capitalize()}: {text}")
history_str = "\n".join(history_lines)
parts.append(f"# Conversation History:\n{history_str}")

parts.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}")

return "\n\n".join(parts)

def _has_text_content(self, msg: UserMessage | AssistantMessage) -> TypeGuard[UserMessage | AssistantMessage]:
"""Check if a message object has accessible text content.

Expand Down
38 changes: 3 additions & 35 deletions src/strands_evals/evaluators/faithfulness_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from typing_extensions import Union

from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
from ..types.trace import EvaluationLevel, TraceLevelInput
from ..types.trace import EvaluationLevel
from .evaluator import Evaluator
from .prompt_templates.faithfulness import get_template

Expand Down Expand Up @@ -55,7 +55,7 @@ def __init__(

def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
prompt = self._format_trace_level_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = evaluator_agent(prompt, structured_output_model=FaithfulnessRating)
rating = cast(FaithfulnessRating, result.structured_output)
Expand All @@ -71,7 +71,7 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva

async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
prompt = self._format_trace_level_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = await evaluator_agent.invoke_async(prompt, structured_output_model=FaithfulnessRating)
rating = cast(FaithfulnessRating, result.structured_output)
Expand All @@ -84,35 +84,3 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
label=rating.score,
)
]

def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
    """Return the most recent parsed turn for evaluation.

    Raises:
        ValueError: If the trajectory contains no parseable turns.
    """
    turns = self._parse_trajectory(evaluation_case)
    if len(turns) == 0:
        raise ValueError(
            "No turn-level inputs could be parsed from the trajectory. "
            "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
        )
    return turns[-1]

def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
    """Build the faithfulness-evaluation prompt from parsed turn data."""
    sections = []

    if parsed_input.session_history:
        lines = []
        for entry in parsed_input.session_history:
            if isinstance(entry, list):
                # Tool execution lists: record each call and its result.
                for execution in entry:
                    lines.append(f"Action: {execution.tool_call.name}({execution.tool_call.arguments})")
                    lines.append(f"Tool: {execution.tool_result.content}")
            else:
                text = self._extract_text_content(entry) if self._has_text_content(entry) else ""
                lines.append(f"{entry.role.value.capitalize()}: {text}")
        sections.append("# Conversation History:\n" + "\n".join(lines))

    sections.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}")

    return "\n\n".join(sections)
61 changes: 3 additions & 58 deletions src/strands_evals/evaluators/harmfulness_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from strands import Agent

from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
from ..types.trace import EvaluationLevel, ToolExecution, TraceLevelInput
from ..types.trace import EvaluationLevel
from .evaluator import Evaluator
from .prompt_templates.harmfulness import get_template

Expand Down Expand Up @@ -47,7 +47,7 @@ def __init__(

def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
prompt = self._format_trace_level_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = evaluator_agent(prompt, structured_output_model=HarmfulnessRating)
rating = cast(HarmfulnessRating, result.structured_output)
Expand All @@ -63,7 +63,7 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva

async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
parsed_input = self._get_last_turn(evaluation_case)
prompt = self._format_prompt(parsed_input)
prompt = self._format_trace_level_prompt(parsed_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = await evaluator_agent.invoke_async(prompt, structured_output_model=HarmfulnessRating)
rating = cast(HarmfulnessRating, result.structured_output)
Expand All @@ -76,58 +76,3 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
label=rating.score,
)
]

def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
    """Extract the most recent turn from the conversation for evaluation.

    Raises:
        ValueError: If no turn-level inputs can be parsed from the trajectory.
    """
    parsed_turns = self._parse_trajectory(evaluation_case)
    if not parsed_turns:
        raise ValueError(
            "No turn-level inputs could be parsed from the trajectory. "
            "Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
        )
    return parsed_turns[-1]

def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
    """Return the user prompt text from the last session-history message.

    Args:
        parsed_input: Trace-level input containing session history.

    Returns:
        User prompt text, or "" when unavailable (empty history, a
        tool-execution list, or a message without text content).
    """
    if parsed_input.session_history:
        last = parsed_input.session_history[-1]
        if not isinstance(last, list) and self._has_text_content(last):
            return self._extract_text_content(last)
    return ""

def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
    """Build the harmfulness-evaluation prompt from parsed trace data.

    Args:
        parsed_input: Trace-level input containing agent response and session history.

    Returns:
        Formatted prompt string with user prompt and assistant response.
    """
    sections = []

    if parsed_input.session_history:
        lines = []
        for entry in parsed_input.session_history:
            if isinstance(entry, list) and entry and isinstance(entry[0], ToolExecution):
                continue  # tool-execution lists are not rendered as turns
            if isinstance(entry, list) or not self._has_text_content(entry):
                continue
            lines.append(f"{entry.role.value.capitalize()}: {self._extract_text_content(entry)}")
        sections.append("# Previous turns:\n" + "\n".join(lines))

    user_prompt = self._extract_user_prompt(parsed_input)
    # NOTE(review): "# Assistant Response" lacks a colon/newline separator before the
    # response text — kept byte-identical here; confirm whether that is intentional.
    sections.append(f"# User prompt:\n{user_prompt}\n# Assistant Response {parsed_input.agent_response.text}")

    return "\n\n".join(sections)
Loading