Skip to content

Commit bc5c27d

Browse files
authored
fix: add tool info to conciseness, harmfulness, helpfulness and response relevance evaluators (#132)
1 parent 16cd567 commit bc5c27d

13 files changed

+369
-390
lines changed

src/strands_evals/evaluators/conciseness_evaluator.py

Lines changed: 3 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from typing_extensions import Union
88

99
from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
10-
from ..types.trace import EvaluationLevel, ToolExecution, TraceLevelInput
10+
from ..types.trace import EvaluationLevel
1111
from .evaluator import Evaluator
1212
from .prompt_templates.conciseness import get_template
1313

@@ -53,14 +53,14 @@ def __init__(
5353

5454
def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
5555
parsed_input = self._get_last_turn(evaluation_case)
56-
prompt = self._format_prompt(parsed_input)
56+
prompt = self._format_trace_level_prompt(parsed_input)
5757
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
5858
result = evaluator_agent(prompt, structured_output_model=ConcisenessRating)
5959
return self._create_evaluation_output(result)
6060

6161
async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
6262
parsed_input = self._get_last_turn(evaluation_case)
63-
prompt = self._format_prompt(parsed_input)
63+
prompt = self._format_trace_level_prompt(parsed_input)
6464
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
6565
result = await evaluator_agent.invoke_async(prompt, structured_output_model=ConcisenessRating)
6666
return self._create_evaluation_output(result)
@@ -76,58 +76,3 @@ def _create_evaluation_output(self, result) -> list[EvaluationOutput]:
7676
label=rating.score,
7777
)
7878
]
79-
80-
def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
81-
"""Extract the most recent turn from the conversation for evaluation."""
82-
parsed_inputs = self._parse_trajectory(evaluation_case)
83-
if not parsed_inputs:
84-
raise ValueError(
85-
"No turn-level inputs could be parsed from the trajectory. "
86-
"Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
87-
)
88-
return parsed_inputs[-1]
89-
90-
def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
91-
"""Extract user prompt from last message in session history.
92-
93-
Args:
94-
parsed_input: Trace-level input containing session history
95-
96-
Returns:
97-
User prompt text, or empty string if not available
98-
"""
99-
if not parsed_input.session_history:
100-
return ""
101-
102-
last_msg = parsed_input.session_history[-1]
103-
if not isinstance(last_msg, list) and self._has_text_content(last_msg):
104-
return self._extract_text_content(last_msg)
105-
106-
return ""
107-
108-
def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
109-
"""Format evaluation prompt from parsed trace data.
110-
111-
Args:
112-
parsed_input: Trace-level input containing agent response and session history
113-
114-
Returns:
115-
Formatted prompt string with conversation history and target turn
116-
"""
117-
parts = []
118-
119-
if parsed_input.session_history:
120-
history_lines = []
121-
for msg in parsed_input.session_history:
122-
if isinstance(msg, list) and msg and isinstance(msg[0], ToolExecution):
123-
continue # Skip tool execution lists
124-
if not isinstance(msg, list) and self._has_text_content(msg):
125-
text = self._extract_text_content(msg)
126-
history_lines.append(f"{msg.role.value.capitalize()}: {text}")
127-
history_str = "\n".join(history_lines)
128-
parts.append(f"# Previous turns:\n{history_str}")
129-
130-
user_prompt = self._extract_user_prompt(parsed_input)
131-
parts.append(f"# Target turn to evaluate:\nUser: {user_prompt}\nAssistant: {parsed_input.agent_response.text}")
132-
133-
return "\n\n".join(parts)

src/strands_evals/evaluators/evaluator.py

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,17 @@
66

77
from ..extractors import TraceExtractor
88
from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
9-
from ..types.trace import AssistantMessage, Context, EvaluationLevel, Session, TextContent, ToolConfig, UserMessage
9+
from ..types.trace import (
10+
AssistantMessage,
11+
Context,
12+
EvaluationLevel,
13+
Session,
14+
TextContent,
15+
ToolConfig,
16+
ToolLevelInput,
17+
TraceLevelInput,
18+
UserMessage,
19+
)
1020

1121
logger = logging.getLogger(__name__)
1222

@@ -108,6 +118,16 @@ def _parse_trajectory(self, evaluation_case: EvaluationData[InputT, OutputT]) ->
108118

109119
return self._trace_extractor.extract(trajectory)
110120

121+
def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
122+
"""Extract the most recent turn from the conversation for evaluation."""
123+
parsed_inputs = self._parse_trajectory(evaluation_case)
124+
if not parsed_inputs:
125+
raise ValueError(
126+
"No turn-level inputs could be parsed from the trajectory. "
127+
"Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
128+
)
129+
return parsed_inputs[-1]
130+
111131
def _format_tools(self, tools: list[ToolConfig]) -> str:
112132
"""Format available tools for prompt display."""
113133
return "\n".join([f"- {tool.name}: {tool.description or 'No description'}" for tool in tools])
@@ -124,6 +144,58 @@ def _format_session_history(self, contexts: list[Context]) -> str:
124144
lines.append(f"Assistant: {ctx.agent_response.text}")
125145
return "\n".join(lines)
126146

147+
def _format_tool_level_prompt(self, tool_input: ToolLevelInput) -> str:
148+
"""Format evaluation prompt from tool-level input."""
149+
parts = []
150+
151+
# Format available tools
152+
if tool_input.available_tools:
153+
parts.append(f"## Available tool-calls\n{self._format_tools(tool_input.available_tools)}")
154+
155+
# Format previous conversation history
156+
if tool_input.session_history:
157+
history_lines = []
158+
for msg in tool_input.session_history:
159+
if isinstance(msg, list):
160+
# Handle tool execution lists
161+
for tool_exec in msg:
162+
history_lines.append(f"Tool call: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
163+
history_lines.append(f"Tool result: {tool_exec.tool_result.content}")
164+
else:
165+
text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
166+
history_lines.append(f"{msg.role.value.capitalize()}: {text}")
167+
history_str = "\n".join(history_lines)
168+
parts.append(f"## Previous conversation history\n{history_str}")
169+
170+
# Format target tool call to evaluate
171+
tool_details = tool_input.tool_execution_details
172+
tool_call_str = f"Tool call: {tool_details.tool_call.name}({tool_details.tool_call.arguments})"
173+
parts.append(f"## Target tool-call to evaluate\n{tool_call_str}")
174+
175+
return "\n\n".join(parts)
176+
177+
def _format_trace_level_prompt(self, parsed_input: TraceLevelInput) -> str:
178+
"""Format evaluation prompt from parsed turn data."""
179+
parts = []
180+
181+
if parsed_input.session_history:
182+
history_lines = []
183+
for msg in parsed_input.session_history:
184+
if isinstance(msg, list):
185+
# Handle tool execution lists
186+
for tool_exec in msg:
187+
history_lines.append(f"Tool call: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
188+
history_lines.append(f"Tool result: {tool_exec.tool_result.content}")
189+
else:
190+
text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
191+
history_lines.append(f"{msg.role.value.capitalize()}: {text}")
192+
history_str = "\n".join(history_lines)
193+
parts.append(f"# Conversation History:\n{history_str}")
194+
195+
parts.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}")
196+
197+
return "\n\n".join(parts)
198+
127199
def _has_text_content(self, msg: UserMessage | AssistantMessage) -> TypeGuard[UserMessage | AssistantMessage]:
128200
"""Check if a message object has accessible text content.
129201

src/strands_evals/evaluators/faithfulness_evaluator.py

Lines changed: 3 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from typing_extensions import Union
88

99
from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
10-
from ..types.trace import EvaluationLevel, TraceLevelInput
10+
from ..types.trace import EvaluationLevel
1111
from .evaluator import Evaluator
1212
from .prompt_templates.faithfulness import get_template
1313

@@ -55,7 +55,7 @@ def __init__(
5555

5656
def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
5757
parsed_input = self._get_last_turn(evaluation_case)
58-
prompt = self._format_prompt(parsed_input)
58+
prompt = self._format_trace_level_prompt(parsed_input)
5959
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
6060
result = evaluator_agent(prompt, structured_output_model=FaithfulnessRating)
6161
rating = cast(FaithfulnessRating, result.structured_output)
@@ -71,7 +71,7 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
7171

7272
async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
7373
parsed_input = self._get_last_turn(evaluation_case)
74-
prompt = self._format_prompt(parsed_input)
74+
prompt = self._format_trace_level_prompt(parsed_input)
7575
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
7676
result = await evaluator_agent.invoke_async(prompt, structured_output_model=FaithfulnessRating)
7777
rating = cast(FaithfulnessRating, result.structured_output)
@@ -84,35 +84,3 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
8484
label=rating.score,
8585
)
8686
]
87-
88-
def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
89-
"""Extract the most recent turn from the conversation for evaluation."""
90-
parsed_inputs = self._parse_trajectory(evaluation_case)
91-
if not parsed_inputs:
92-
raise ValueError(
93-
"No turn-level inputs could be parsed from the trajectory. "
94-
"Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
95-
)
96-
return parsed_inputs[-1]
97-
98-
def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
99-
"""Format evaluation prompt from parsed turn data."""
100-
parts = []
101-
102-
if parsed_input.session_history:
103-
history_lines = []
104-
for msg in parsed_input.session_history:
105-
if isinstance(msg, list):
106-
# Handle tool execution lists
107-
for tool_exec in msg:
108-
history_lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
109-
history_lines.append(f"Tool: {tool_exec.tool_result.content}")
110-
else:
111-
text = self._extract_text_content(msg) if self._has_text_content(msg) else ""
112-
history_lines.append(f"{msg.role.value.capitalize()}: {text}")
113-
history_str = "\n".join(history_lines)
114-
parts.append(f"# Conversation History:\n{history_str}")
115-
116-
parts.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}")
117-
118-
return "\n\n".join(parts)

src/strands_evals/evaluators/harmfulness_evaluator.py

Lines changed: 3 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from strands import Agent
66

77
from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
8-
from ..types.trace import EvaluationLevel, ToolExecution, TraceLevelInput
8+
from ..types.trace import EvaluationLevel
99
from .evaluator import Evaluator
1010
from .prompt_templates.harmfulness import get_template
1111

@@ -47,7 +47,7 @@ def __init__(
4747

4848
def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
4949
parsed_input = self._get_last_turn(evaluation_case)
50-
prompt = self._format_prompt(parsed_input)
50+
prompt = self._format_trace_level_prompt(parsed_input)
5151
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
5252
result = evaluator_agent(prompt, structured_output_model=HarmfulnessRating)
5353
rating = cast(HarmfulnessRating, result.structured_output)
@@ -63,7 +63,7 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
6363

6464
async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
6565
parsed_input = self._get_last_turn(evaluation_case)
66-
prompt = self._format_prompt(parsed_input)
66+
prompt = self._format_trace_level_prompt(parsed_input)
6767
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
6868
result = await evaluator_agent.invoke_async(prompt, structured_output_model=HarmfulnessRating)
6969
rating = cast(HarmfulnessRating, result.structured_output)
@@ -76,58 +76,3 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
7676
label=rating.score,
7777
)
7878
]
79-
80-
def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TraceLevelInput:
81-
"""Extract the most recent turn from the conversation for evaluation."""
82-
parsed_inputs = self._parse_trajectory(evaluation_case)
83-
if not parsed_inputs:
84-
raise ValueError(
85-
"No turn-level inputs could be parsed from the trajectory. "
86-
"Ensure actual_trajectory is a Session with at least one AgentInvocationSpan."
87-
)
88-
return parsed_inputs[-1]
89-
90-
def _extract_user_prompt(self, parsed_input: TraceLevelInput) -> str:
91-
"""Extract user prompt from last message in session history.
92-
93-
Args:
94-
parsed_input: Trace-level input containing session history
95-
96-
Returns:
97-
User prompt text, or empty string if not available
98-
"""
99-
if not parsed_input.session_history:
100-
return ""
101-
102-
last_msg = parsed_input.session_history[-1]
103-
if not isinstance(last_msg, list) and self._has_text_content(last_msg):
104-
return self._extract_text_content(last_msg)
105-
106-
return ""
107-
108-
def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
109-
"""Format evaluation prompt from parsed trace data.
110-
111-
Args:
112-
parsed_input: Trace-level input containing agent response and session history
113-
114-
Returns:
115-
Formatted prompt string with user prompt and assistant response
116-
"""
117-
parts = []
118-
119-
if parsed_input.session_history:
120-
history_lines = []
121-
for msg in parsed_input.session_history:
122-
if isinstance(msg, list) and msg and isinstance(msg[0], ToolExecution):
123-
continue # Skip tool execution lists
124-
if not isinstance(msg, list) and self._has_text_content(msg):
125-
text = self._extract_text_content(msg)
126-
history_lines.append(f"{msg.role.value.capitalize()}: {text}")
127-
history_str = "\n".join(history_lines)
128-
parts.append(f"# Previous turns:\n{history_str}")
129-
130-
user_prompt = self._extract_user_prompt(parsed_input)
131-
parts.append(f"# User prompt:\n{user_prompt}\n# Assistant Response {parsed_input.agent_response.text}")
132-
133-
return "\n\n".join(parts)

0 commit comments

Comments
 (0)