Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,5 @@ repl_state
dataset_files
report_files
.venv
*.DS_Store*
*.DS_Store*
uv.lock
4 changes: 2 additions & 2 deletions src/strands_evals/evaluators/coherence_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ class CoherenceRating(BaseModel):

class CoherenceEvaluator(Evaluator[InputT, OutputT]):
"""Evaluates the logical cohesion of the assistant's response.
This evaluator assesses whether the assistant's response maintains logical consistency,
flows naturally, and presents ideas in a well-organized manner. It uses an LLM-as-judge
approach to provide categorical ratings that are then normalized to numeric scores.
Scores:
- NOT_AT_ALL (0.0): Response is completely incoherent or contradictory
- NOT_GENERALLY (0.25): Response has significant logical gaps or inconsistencies
Expand Down
81 changes: 38 additions & 43 deletions src/strands_evals/extractors/tools_use_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,51 +21,46 @@ def extract_agent_tools_used_from_messages(agent_messages):
for i, message in enumerate(agent_messages):
if message.get("role") == "assistant":
message_info = message.get("content")
if len(message_info) > 0:
tools = []
for message in message_info:
if "toolUse" in message:
tools.append(message.get("toolUse"))
if message_info:
# Collect tool uses from this message
tools = [cb.get("toolUse") for cb in message_info if cb.get("toolUse")]
if not tools:
continue

# Build lookup dict of tool results from subsequent user messages
tool_ids_needed = {tool.get("toolUseId") for tool in tools}
tool_results_by_id: dict[str, dict] = {}
for next_message in agent_messages[i + 1 :]:
if next_message.get("role") == "user":
for content_block in next_message.get("content") or []:
tool_result_dict = content_block.get("toolResult")
if tool_result_dict:
tool_id = tool_result_dict.get("toolUseId")
if tool_id in tool_ids_needed and tool_id not in tool_results_by_id:
tool_results_by_id[tool_id] = tool_result_dict
if len(tool_results_by_id) == len(tool_ids_needed):
break

for tool in tools:
if tool:
tool_name = tool.get("name")
tool_input = tool.get("input")
tool_id = tool.get("toolUseId")
# get the tool result from the next message
tool_result = None
is_error = False
next_message_i = i + 1
while next_message_i < len(agent_messages):
next_message = agent_messages[next_message_i]
next_message_i += 1

if next_message.get("role") == "user":
content = next_message.get("content")
if content:
# Find toolResult in content blocks - may not be at index 0
tool_result_dict = None
for content_block in content:
if "toolResult" in content_block:
tool_result_dict = content_block.get("toolResult")
break

if tool_result_dict and tool_result_dict.get("toolUseId") == tool_id:
tool_result_content = tool_result_dict.get("content", [])
# Find first text in tool result content - may not be at index 0
tool_result = None
if tool_result_content:
for result_item in tool_result_content:
if isinstance(result_item, dict) and "text" in result_item:
tool_result = result_item.get("text")
break
is_error = tool_result_dict.get("status") == "error"
break

tools_used.append(
{"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
)
tool = message.get("toolUse")
tool_name = tool.get("name")
tool_input = tool.get("input")
tool_id = tool.get("toolUseId")
tool_result = None
is_error = False

# Find the matching tool result block
tool_result_dict = tool_results_by_id.get(tool_id)
if tool_result_dict:
tool_result_content = tool_result_dict.get("content", [])
for result_item in tool_result_content:
if isinstance(result_item, dict) and "text" in result_item:
tool_result = result_item.get("text")
break
is_error = tool_result_dict.get("status") == "error"

tools_used.append(
{"name": tool_name, "input": tool_input, "tool_result": tool_result, "is_error": is_error}
)
return tools_used


Expand Down
66 changes: 66 additions & 0 deletions tests/strands_evals/extractors/test_tools_use_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,3 +342,69 @@ def test_tools_use_extractor_extract_from_messages_user_message_without_tool_res
assert result[0]["input"] == {"expression": "5+5"}
assert result[0]["tool_result"] == "Result: 10"
assert result[0]["is_error"] is False


def test_tools_use_extractor_extract_from_messages_parallel_tool_calls():
    """Check extraction of two tool calls made in one assistant message.

    The conversation has a single assistant message carrying two ``toolUse``
    blocks, followed by a single user message carrying both ``toolResult``
    blocks. The extractor output is expected to hold one entry per tool call,
    in call order, each with its own name, input and result text.
    """
    user_prompt = {"role": "user", "content": [{"text": "Calculate 2+2 and 3+3"}]}
    assistant_turn = {
        "role": "assistant",
        "content": [
            {"toolUse": {"toolUseId": "tool_1", "name": "calculator", "input": {"expression": "2+2"}}},
            {"toolUse": {"toolUseId": "tool_2", "name": "calculator", "input": {"expression": "3+3"}}},
        ],
    }
    results_turn = {
        "role": "user",
        "content": [
            {"toolResult": {"status": "success", "content": [{"text": "4"}], "toolUseId": "tool_1"}},
            {"toolResult": {"status": "success", "content": [{"text": "6"}], "toolUseId": "tool_2"}},
        ],
    }

    extracted = extract_agent_tools_used_from_messages([user_prompt, assistant_turn, results_turn])

    # (expected input, expected result text) for each call, in call order.
    expectations = [
        ({"expression": "2+2"}, "4"),
        ({"expression": "3+3"}, "6"),
    ]
    assert len(extracted) == 2
    for entry, (wanted_input, wanted_result) in zip(extracted, expectations):
        assert entry["name"] == "calculator"
        assert entry["input"] == wanted_input
        assert entry["tool_result"] == wanted_result


def test_tools_use_extractor_extract_from_messages_reused_tool_ids():
    """Check extraction when the same ``toolUseId`` appears in two separate calls.

    The conversation contains two calculator calls that both use the id
    ``call_123``, each followed by its own ``toolResult`` message. The
    extractor output is expected to pair the first call with ``"4"`` and the
    second call with ``"10"``.
    """
    shared_id = "call_123"

    def calculator_call(expression):
        # Assistant message holding a single calculator toolUse block.
        return {
            "role": "assistant",
            "content": [{"toolUse": {"toolUseId": shared_id, "name": "calculator", "input": {"expression": expression}}}],
        }

    def calculator_result(text):
        # User message holding the successful toolResult for the shared id.
        return {
            "role": "user",
            "content": [{"toolResult": {"status": "success", "content": [{"text": text}], "toolUseId": shared_id}}],
        }

    conversation = [
        {"role": "user", "content": [{"text": "Calculate 2+2"}]},
        calculator_call("2+2"),
        calculator_result("4"),
        {"role": "assistant", "content": [{"text": "The answer is 4"}]},
        {"role": "user", "content": [{"text": "Now calculate 5+5"}]},
        calculator_call("5+5"),
        calculator_result("10"),
    ]

    extracted = extract_agent_tools_used_from_messages(conversation)

    assert len(extracted) == 2
    first, second = extracted
    assert first["name"] == "calculator"
    assert first["input"] == {"expression": "2+2"}
    assert first["tool_result"] == "4"
    assert second["name"] == "calculator"
    assert second["input"] == {"expression": "5+5"}
    assert second["tool_result"] == "10"