
Commit 910ab32

extract and improve prompt formatting methods
1 parent dd18f6c commit 910ab32

Showing 13 changed files with 94 additions and 211 deletions.

src/strands_evals/evaluators/conciseness_evaluator.py

Lines changed: 3 additions & 25 deletions
@@ -7,7 +7,7 @@
 from typing_extensions import Union
 
 from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
-from ..types.trace import EvaluationLevel, TraceLevelInput
+from ..types.trace import EvaluationLevel
 from .evaluator import Evaluator
 from .prompt_templates.conciseness import get_template
 
@@ -53,14 +53,14 @@ def __init__(
 
     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
-        prompt = self._format_prompt(parsed_input)
+        prompt = self._format_trace_level_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         result = evaluator_agent(prompt, structured_output_model=ConcisenessRating)
         return self._create_evaluation_output(result)
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
-        prompt = self._format_prompt(parsed_input)
+        prompt = self._format_trace_level_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         result = await evaluator_agent.invoke_async(prompt, structured_output_model=ConcisenessRating)
         return self._create_evaluation_output(result)
@@ -76,25 +76,3 @@ def _create_evaluation_output(self, result) -> list[EvaluationOutput]:
                 label=rating.score,
             )
         ]
-
-    def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
-        """Format evaluation prompt from parsed turn data."""
-        parts = []
-
-        if parsed_input.session_history:
-            history_lines = []
-            for msg in parsed_input.session_history:
-                if isinstance(msg, list):
-                    # Handle tool execution lists
-                    for tool_exec in msg:
-                        history_lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
-                        history_lines.append(f"Tool: {tool_exec.tool_result.content}")
-                else:
-                    text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
-                    history_lines.append(f"{msg.role.value.capitalize()}: {text}")
-            history_str = "\n".join(history_lines)
-            parts.append(f"# Conversation History:\n{history_str}")
-
-        parts.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}")
-
-        return "\n\n".join(parts)

src/strands_evals/evaluators/evaluator.py

Lines changed: 53 additions & 0 deletions
@@ -13,6 +13,7 @@
     Session,
     TextContent,
     ToolConfig,
+    ToolLevelInput,
     TraceLevelInput,
     UserMessage,
 )
@@ -143,6 +144,58 @@ def _format_session_history(self, contexts: list[Context]) -> str:
             lines.append(f"Assistant: {ctx.agent_response.text}")
         return "\n".join(lines)
 
+    def _format_tool_level_prompt(self, tool_input: ToolLevelInput) -> str:
+        """Format evaluation prompt from tool-level input."""
+        parts = []
+
+        # Format available tools
+        if tool_input.available_tools:
+            parts.append(f"## Available tool-calls\n{self._format_tools(tool_input.available_tools)}")
+
+        # Format previous conversation history
+        if tool_input.session_history:
+            history_lines = []
+            for msg in tool_input.session_history:
+                if isinstance(msg, list):
+                    # Handle tool execution lists
+                    for tool_exec in msg:
+                        history_lines.append(f"Tool call: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
+                        history_lines.append(f"Tool result: {tool_exec.tool_result.content}")
+                else:
+                    text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
+                    history_lines.append(f"{msg.role.value.capitalize()}: {text}")
+            history_str = "\n".join(history_lines)
+            parts.append(f"## Previous conversation history\n{history_str}")
+
+        # Format target tool call to evaluate
+        tool_details = tool_input.tool_execution_details
+        tool_call_str = f"Tool call: {tool_details.tool_call.name}({tool_details.tool_call.arguments})"
+        parts.append(f"## Target tool-call to evaluate\n{tool_call_str}")
+
+        return "\n\n".join(parts)
+
+    def _format_trace_level_prompt(self, parsed_input: TraceLevelInput) -> str:
+        """Format evaluation prompt from parsed turn data."""
+        parts = []
+
+        if parsed_input.session_history:
+            history_lines = []
+            for msg in parsed_input.session_history:
+                if isinstance(msg, list):
+                    # Handle tool execution lists
+                    for tool_exec in msg:
+                        history_lines.append(f"Tool call: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
+                        history_lines.append(f"Tool result: {tool_exec.tool_result.content}")
+                else:
+                    text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
+                    history_lines.append(f"{msg.role.value.capitalize()}: {text}")
+            history_str = "\n".join(history_lines)
+            parts.append(f"# Conversation History:\n{history_str}")
+
+        parts.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}")
+
+        return "\n\n".join(parts)
+
     def _has_text_content(self, msg: UserMessage | AssistantMessage) -> TypeGuard[UserMessage | AssistantMessage]:
         """Check if a message object has accessible text content.

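As a quick illustration of what the newly shared trace-level helper emits, here is a minimal, self-contained sketch of the prompt layout. The dataclasses below are simplified stand-ins invented for this example, not the real TraceLevelInput or message types from strands_evals.types.trace.

from dataclasses import dataclass

# Simplified stand-ins for the real trace types (assumed shapes, illustration only).
@dataclass
class Text:
    text: str

@dataclass
class Message:
    role: str  # the real type exposes an enum, hence msg.role.value in the diff
    content: list

@dataclass
class TraceInput:
    session_history: list
    agent_response: Text

def format_trace_level_prompt(parsed_input: TraceInput) -> str:
    """Mirrors the section layout of the extracted _format_trace_level_prompt."""
    parts = []
    if parsed_input.session_history:
        history = "\n".join(
            f"{msg.role.capitalize()}: {msg.content[0].text}"
            for msg in parsed_input.session_history
        )
        parts.append(f"# Conversation History:\n{history}")
    parts.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}")
    return "\n\n".join(parts)

demo = TraceInput(
    session_history=[Message("user", [Text("What's the capital of France?")])],
    agent_response=Text("The capital of France is Paris."),
)
print(format_trace_level_prompt(demo))
# # Conversation History:
# User: What's the capital of France?
#
# # Assistant's Response:
# The capital of France is Paris.
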
src/strands_evals/evaluators/faithfulness_evaluator.py

Lines changed: 3 additions & 25 deletions
@@ -7,7 +7,7 @@
 from typing_extensions import Union
 
 from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
-from ..types.trace import EvaluationLevel, TraceLevelInput
+from ..types.trace import EvaluationLevel
 from .evaluator import Evaluator
 from .prompt_templates.faithfulness import get_template
 
@@ -55,7 +55,7 @@ def __init__(
 
     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
-        prompt = self._format_prompt(parsed_input)
+        prompt = self._format_trace_level_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         result = evaluator_agent(prompt, structured_output_model=FaithfulnessRating)
         rating = cast(FaithfulnessRating, result.structured_output)
@@ -71,7 +71,7 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
-        prompt = self._format_prompt(parsed_input)
+        prompt = self._format_trace_level_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         result = await evaluator_agent.invoke_async(prompt, structured_output_model=FaithfulnessRating)
         rating = cast(FaithfulnessRating, result.structured_output)
@@ -84,25 +84,3 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
                 label=rating.score,
             )
         ]
-
-    def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
-        """Format evaluation prompt from parsed turn data."""
-        parts = []
-
-        if parsed_input.session_history:
-            history_lines = []
-            for msg in parsed_input.session_history:
-                if isinstance(msg, list):
-                    # Handle tool execution lists
-                    for tool_exec in msg:
-                        history_lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
-                        history_lines.append(f"Tool: {tool_exec.tool_result.content}")
-                else:
-                    text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
-                    history_lines.append(f"{msg.role.value.capitalize()}: {text}")
-            history_str = "\n".join(history_lines)
-            parts.append(f"# Conversation History:\n{history_str}")
-
-        parts.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}")
-
-        return "\n\n".join(parts)

src/strands_evals/evaluators/harmfulness_evaluator.py

Lines changed: 3 additions & 25 deletions
@@ -5,7 +5,7 @@
 from strands import Agent
 
 from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
-from ..types.trace import EvaluationLevel, TraceLevelInput
+from ..types.trace import EvaluationLevel
 from .evaluator import Evaluator
 from .prompt_templates.harmfulness import get_template
 
@@ -47,7 +47,7 @@ def __init__(
 
     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
-        prompt = self._format_prompt(parsed_input)
+        prompt = self._format_trace_level_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         result = evaluator_agent(prompt, structured_output_model=HarmfulnessRating)
         rating = cast(HarmfulnessRating, result.structured_output)
@@ -63,7 +63,7 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
-        prompt = self._format_prompt(parsed_input)
+        prompt = self._format_trace_level_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         result = await evaluator_agent.invoke_async(prompt, structured_output_model=HarmfulnessRating)
         rating = cast(HarmfulnessRating, result.structured_output)
@@ -76,25 +76,3 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
                 label=rating.score,
             )
         ]
-
-    def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
-        """Format evaluation prompt from parsed turn data."""
-        parts = []
-
-        if parsed_input.session_history:
-            history_lines = []
-            for msg in parsed_input.session_history:
-                if isinstance(msg, list):
-                    # Handle tool execution lists
-                    for tool_exec in msg:
-                        history_lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
-                        history_lines.append(f"Tool: {tool_exec.tool_result.content}")
-                else:
-                    text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
-                    history_lines.append(f"{msg.role.value.capitalize()}: {text}")
-            history_str = "\n".join(history_lines)
-            parts.append(f"# Conversation History:\n{history_str}")
-
-        parts.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}")
-
-        return "\n\n".join(parts)

src/strands_evals/evaluators/helpfulness_evaluator.py

Lines changed: 3 additions & 25 deletions
@@ -7,7 +7,7 @@
 from typing_extensions import Union
 
 from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
-from ..types.trace import EvaluationLevel, TraceLevelInput
+from ..types.trace import EvaluationLevel
 from .evaluator import Evaluator
 from .prompt_templates.helpfulness import get_template
 
@@ -61,7 +61,7 @@ def __init__(
 
     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
-        prompt = self._format_prompt(parsed_input)
+        prompt = self._format_trace_level_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         result = evaluator_agent(prompt, structured_output_model=HelpfulnessRating)
         rating = cast(HelpfulnessRating, result.structured_output)
@@ -77,7 +77,7 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
-        prompt = self._format_prompt(parsed_input)
+        prompt = self._format_trace_level_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         result = await evaluator_agent.invoke_async(prompt, structured_output_model=HelpfulnessRating)
         rating = cast(HelpfulnessRating, result.structured_output)
@@ -90,25 +90,3 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
                 label=rating.score,
             )
         ]
-
-    def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
-        """Format evaluation prompt from parsed turn data."""
-        parts = []
-
-        if parsed_input.session_history:
-            history_lines = []
-            for msg in parsed_input.session_history:
-                if isinstance(msg, list):
-                    # Handle tool execution lists
-                    for tool_exec in msg:
-                        history_lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
-                        history_lines.append(f"Tool: {tool_exec.tool_result.content}")
-                else:
-                    text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
-                    history_lines.append(f"{msg.role.value.capitalize()}: {text}")
-            history_str = "\n".join(history_lines)
-            parts.append(f"# Conversation History:\n{history_str}")
-
-        parts.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}")
-
-        return "\n\n".join(parts)

src/strands_evals/evaluators/response_relevance_evaluator.py

Lines changed: 3 additions & 25 deletions
@@ -8,7 +8,7 @@
 from typing_extensions import Union
 
 from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
-from ..types.trace import EvaluationLevel, TraceLevelInput
+from ..types.trace import EvaluationLevel
 from .evaluator import Evaluator
 from .prompt_templates.response_relevance import get_template
 
@@ -58,14 +58,14 @@ def __init__(
 
     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
-        prompt = self._format_prompt(parsed_input)
+        prompt = self._format_trace_level_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         result = evaluator_agent(prompt, structured_output_model=ResponseRelevanceRating)
         return self._create_evaluation_output(result)
 
     async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         parsed_input = self._get_last_turn(evaluation_case)
-        prompt = self._format_prompt(parsed_input)
+        prompt = self._format_trace_level_prompt(parsed_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         result = await evaluator_agent.invoke_async(prompt, structured_output_model=ResponseRelevanceRating)
         return self._create_evaluation_output(result)
@@ -81,25 +81,3 @@ def _create_evaluation_output(self, result: AgentResult) -> list[EvaluationOutpu
                 label=rating.score,
             )
         ]
-
-    def _format_prompt(self, parsed_input: TraceLevelInput) -> str:
-        """Format evaluation prompt from parsed turn data."""
-        parts = []
-
-        if parsed_input.session_history:
-            history_lines = []
-            for msg in parsed_input.session_history:
-                if isinstance(msg, list):
-                    # Handle tool execution lists
-                    for tool_exec in msg:
-                        history_lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
-                        history_lines.append(f"Tool: {tool_exec.tool_result.content}")
-                else:
-                    text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
-                    history_lines.append(f"{msg.role.value.capitalize()}: {text}")
-            history_str = "\n".join(history_lines)
-            parts.append(f"# Conversation History:\n{history_str}")
-
-        parts.append(f"# Assistant's Response:\n{parsed_input.agent_response.text}")
-
-        return "\n\n".join(parts)

src/strands_evals/evaluators/tool_parameter_accuracy_evaluator.py

Lines changed: 3 additions & 33 deletions
@@ -7,7 +7,7 @@
 from typing_extensions import Union
 
 from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
-from ..types.trace import EvaluationLevel, ToolLevelInput
+from ..types.trace import EvaluationLevel
 from .evaluator import Evaluator
 from .prompt_templates.tool_parameter_accuracy import get_template
 
@@ -52,7 +52,7 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
         results = []
 
         for tool_input in tool_inputs:
-            prompt = self._format_prompt(tool_input)
+            prompt = self._format_tool_level_prompt(tool_input)
             evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
             result = evaluator_agent(prompt, structured_output_model=ToolParameterAccuracyRating)
             rating = cast(ToolParameterAccuracyRating, result.structured_output)
@@ -73,7 +73,7 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
         results = []
 
         for tool_input in tool_inputs:
-            prompt = self._format_prompt(tool_input)
+            prompt = self._format_tool_level_prompt(tool_input)
            evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
             result = await evaluator_agent.invoke_async(prompt, structured_output_model=ToolParameterAccuracyRating)
             rating = cast(ToolParameterAccuracyRating, result.structured_output)
@@ -88,33 +88,3 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
             )
 
         return results
-
-    def _format_prompt(self, tool_input: ToolLevelInput) -> str:
-        """Format evaluation prompt from tool-level input."""
-        parts = []
-
-        # Format available tools
-        if tool_input.available_tools:
-            parts.append(f"## Available tool-calls\n{self._format_tools(tool_input.available_tools)}")
-
-        # Format previous conversation history
-        if tool_input.session_history:
-            history_lines = []
-            for msg in tool_input.session_history:
-                if isinstance(msg, list):
-                    # Handle tool execution lists
-                    for tool_exec in msg:
-                        history_lines.append(f"Action: {tool_exec.tool_call.name}({tool_exec.tool_call.arguments})")
-                        history_lines.append(f"Tool: {tool_exec.tool_result.content}")
-                else:
-                    text = msg.content[0].text if msg.content and hasattr(msg.content[0], "text") else ""
-                    history_lines.append(f"{msg.role.value.capitalize()}: {text}")
-            history_str = "\n".join(history_lines)
-            parts.append(f"## Previous conversation history\n{history_str}")
-
-        # Format target tool call to evaluate
-        tool_details = tool_input.tool_execution_details
-        tool_call_str = f"Action: {tool_details.tool_call.name}({tool_details.tool_call.arguments})"
-        parts.append(f"## Target tool-call to evaluate\n{tool_call_str}")
-
-        return "\n\n".join(parts)

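The tool-level variant follows the same pattern with "##" section headings. A comparable sketch, again using a hypothetical stand-in type rather than the real ToolLevelInput API:

from dataclasses import dataclass

# Hypothetical stand-in for the tool-call details carried by ToolLevelInput (illustration only).
@dataclass
class ToolCall:
    name: str
    arguments: dict

def format_tool_level_prompt(available_tools: str, target: ToolCall) -> str:
    """Mirrors the section layout of the extracted _format_tool_level_prompt."""
    parts = [
        f"## Available tool-calls\n{available_tools}",
        f"## Target tool-call to evaluate\nTool call: {target.name}({target.arguments})",
    ]
    return "\n\n".join(parts)

print(format_tool_level_prompt(
    "get_weather(city: str) -> str",  # hypothetical tool listing
    ToolCall("get_weather", {"city": "Seattle"}),
))
# ## Available tool-calls
# get_weather(city: str) -> str
#
# ## Target tool-call to evaluate
# Tool call: get_weather({'city': 'Seattle'})
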
0 commit comments
