Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added src/__init__.py
Empty file.
63 changes: 63 additions & 0 deletions src/examples/actor_simulator_basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from strands import Agent

from strands_evals import ActorSimulator, Case, Dataset
from strands_evals.evaluators import HelpfulnessEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry

# ======================================
# SETUP TELEMETRY
# ======================================
# Wire an in-memory exporter into the strands-evals tracer provider so the
# task function below can harvest the target agent's finished spans per turn.
telemetry = StrandsEvalsTelemetry()
# Collects finished spans in process memory (no external backend required).
memory_exporter = InMemorySpanExporter()
# NOTE(review): BatchSpanProcessor buffers spans asynchronously; spans may not
# be visible in memory_exporter immediately after an agent call — confirm the
# examples rely on force_flush/shutdown semantics, or consider
# SimpleSpanProcessor for deterministic per-turn capture.
span_processor = BatchSpanProcessor(memory_exporter)
telemetry.tracer_provider.add_span_processor(span_processor)


# ======================================
# SETUP AND RUN STRANDS EVAL
# ======================================


def task_function(case: Case) -> dict:
    """Run a simulated multi-turn conversation against a target agent.

    Drives a user simulator (built from the case) against a travel-assistant
    agent, collecting the target agent's OTel spans for each turn via the
    module-level ``memory_exporter``.

    Args:
        case: The evaluation case; ``case.input`` seeds the first user message.

    Returns:
        dict with keys:
            - "output": the agent's final response text (empty string if the
              simulator yields zero turns).
            - "trajectory": a session mapped from the accumulated target spans.
    """
    # Create simulator
    user_sim = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=3)

    # Create target agent
    agent = Agent(system_prompt="You are a helpful travel assistant.", callback_handler=None)

    # Accumulate target spans across all turns
    all_target_spans = []

    # Fix: initialize so the return below cannot raise NameError when the
    # simulator produces no turns (e.g. max_turns <= 0).
    agent_message = ""

    user_message = case.input
    while user_sim.has_next():
        # Clear before each target agent call to ensure we don't capture simulator traces.
        memory_exporter.clear()
        agent_response = agent(user_message)
        agent_message = str(agent_response)
        # Snapshot only the spans emitted by this agent turn.
        turn_spans = list(memory_exporter.get_finished_spans())
        all_target_spans.extend(turn_spans)
        # Let the simulated user react; its reply seeds the next turn.
        user_result = user_sim.act(agent_message)
        user_message = str(user_result.structured_output.message)

    mapper = StrandsInMemorySessionMapper()
    session = mapper.map_to_session(all_target_spans, session_id="test-session")

    return {"output": agent_message, "trajectory": session}


# Define the evaluation cases; Case[str, str] parameterizes input/output types.
test_cases = [
    Case[str, str](
        name="booking-simple",
        input="I need to book a flight to Paris next week",
        # task_description is read by the user simulator to know when the
        # goal is achieved — presumably; verify against ActorSimulator docs.
        metadata={"category": "booking", "task_description": "Flight booking confirmed"},
    )
]

# Judge each case's output for helpfulness via an LLM-based evaluator.
evaluator = HelpfulnessEvaluator()
dataset = Dataset[str, str](cases=test_cases, evaluator=evaluator)

# Execute task_function for every case and render the evaluation report.
report = dataset.run_evaluations(task_function)
report.run_display()
4 changes: 2 additions & 2 deletions src/examples/agents_as_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ async def async_agents_as_tools_example():
"""

### Step 1: Define task ###
def customer_support(task: str):
def customer_support(case: Case):
@tool
def technical_support(query: str) -> str:
"""Handle technical issues, bugs, and troubleshooting."""
Expand Down Expand Up @@ -158,7 +158,7 @@ def returns_exchanges(query: str) -> str:
callback_handler=None,
tools=[technical_support, billing_support, product_info, returns_exchanges],
)
response = orchestrator(task)
response = orchestrator(case.input)
description = tools_use_extractor.extract_tools_description(orchestrator)
trajectory_evaluator.update_trajectory_description(description)
interaction_evaluator.update_interaction_description(description)
Expand Down
4 changes: 2 additions & 2 deletions src/examples/bank_tools_trajectory.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ async def async_descriptive_tools_trajectory_example():
"""

### Step 1: Define task ###
async def get_response(query: str) -> dict:
async def get_response(case: Case) -> dict:
bank_prompt = (
"You are a banker, ensure that only people with sufficient balance can spend them."
" Collect debt from people with negative balance."
Expand All @@ -83,7 +83,7 @@ async def get_response(query: str) -> dict:
agent = Agent(
tools=[get_balance, modify_balance, collect_debt], system_prompt=bank_prompt, callback_handler=None
)
response = await agent.invoke_async(query)
response = await agent.invoke_async(case.input)
trajectory_evaluator.update_trajectory_description(tools_use_extractor.extract_tools_description(agent))
return TaskOutput(
output=str(response), trajectory=tools_use_extractor.extract_agent_tools_used_from_messages(agent.messages)
Expand Down
5 changes: 3 additions & 2 deletions src/examples/dataset_generator/simple_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from strands import Agent

from strands_evals import Case
from strands_evals.evaluators.output_evaluator import OutputEvaluator
from strands_evals.generators.dataset_generator import DatasetGenerator

Expand All @@ -21,12 +22,12 @@ async def simple_dataset_generator():
"""

### Step 1: Define task ###
async def get_response(query: str) -> str:
async def get_response(case: Case) -> str:
"""
Simple task example to get a response from an agent given a query.
"""
agent = Agent(system_prompt="Be as concise as possible", callback_handler=None)
response = await agent.invoke_async(query)
response = await agent.invoke_async(case.input)
return str(response)

# Step 2: Initialize the dataset generator for string types
Expand Down
4 changes: 2 additions & 2 deletions src/examples/evaluate_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ async def async_graph_example():
"""

### Step 1: Define task ###
def research_graph(task: str):
def research_graph(case: Case):
# Create specialized agents
researcher = Agent(name="researcher", system_prompt="You are a research specialist...")
analyst = Agent(name="analyst", system_prompt="You are a data analysis specialist...")
Expand All @@ -52,7 +52,7 @@ def research_graph(task: str):
# Build the graph
graph = builder.build()

result = graph(task)
result = graph(case.input)
interactions = graph_extractor.extract_graph_interactions(result)

return {"interactions": interactions, "trajectory": [node.node_id for node in result.execution_order]}
Expand Down
4 changes: 2 additions & 2 deletions src/examples/evaluate_swarm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ async def async_swarm_example():
"""

### Step 1: Define task ###
def sde_swarm(task: str):
def sde_swarm(case: Case):
# Create specialized agents
researcher = Agent(name="researcher", system_prompt="You are a research specialist...", callback_handler=None)
coder = Agent(name="coder", system_prompt="You are a coding specialist...", callback_handler=None)
Expand All @@ -45,7 +45,7 @@ def sde_swarm(task: str):
repetitive_handoff_min_unique_agents=2,
)

result = swarm(task)
result = swarm(case.input)
interaction_info = swarm_extractor.extract_swarm_interactions(result)

return {"interactions": interaction_info, "trajectory": [node.node_id for node in result.node_history]}
Expand Down
4 changes: 2 additions & 2 deletions src/examples/multi_shots.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ async def async_multi_shots_interactions():
"""

### Step 1: Define task ###
def multi_turns_hacking(query: str) -> str:
def multi_turns_hacking(case: Case) -> str:
"""
Simulates a multi-turn adversarial conversation to test agent safety.

Expand All @@ -38,7 +38,7 @@ def multi_turns_hacking(query: str) -> str:
agent = Agent(system_prompt="Be as concise as possible", callback_handler=None)

new_input = []
agent_repsonse = query
agent_repsonse = case.input
hacker_response = None
interactions = []
turns = 5
Expand Down
4 changes: 2 additions & 2 deletions src/examples/safety_judge_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ async def async_safety_output_judge_example():
"""

### Step 1: Define task ###
async def get_response(query: str) -> str:
async def get_response(case: Case) -> str:
"""
Simple task example to get a response from an agent given a query.
"""
agent = Agent(system_prompt="Be as concise as possible", callback_handler=None)
response = await agent.invoke_async(query)
response = await agent.invoke_async(case.input)
return str(response)

### Step 2: Create test cases ###
Expand Down
8 changes: 4 additions & 4 deletions src/examples/third_party_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def third_party_example():
"""

### Step 1: Define task ###
def get_response(query: str) -> str:
def get_response(case: Case) -> str:
agent = Agent(callback_handler=None)
return str(agent(query))
return str(agent(case.input))

### Step 2: Create test cases ###
test_case1 = Case[str, str](
Expand Down Expand Up @@ -105,9 +105,9 @@ async def async_third_party_example():
"""

### Step 1: Define task ###
async def get_response(query: str) -> str:
async def get_response(case: Case) -> str:
agent = Agent(system_prompt="Be as concise as possible", callback_handler=None)
response = await agent.invoke_async(query)
response = await agent.invoke_async(case.input)
return str(response)

### Step 2: Create test cases ###
Expand Down
6 changes: 5 additions & 1 deletion src/strands_evals/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
__version__ = "0.1.0"

from . import evaluators, extractors, generators, telemetry, types
from . import evaluators, extractors, generators, simulation, telemetry, types
from .case import Case
from .dataset import Dataset
from .simulation import ActorSimulator, UserSimulator
from .telemetry import StrandsEvalsTelemetry, get_tracer

__all__ = [
Expand All @@ -12,7 +13,10 @@
"extractors",
"types",
"generators",
"simulation",
"telemetry",
"StrandsEvalsTelemetry",
"get_tracer",
"ActorSimulator",
"UserSimulator",
]
12 changes: 6 additions & 6 deletions src/strands_evals/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def evaluator(self, new_evaluator: Evaluator[InputT, OutputT]):
self._evaluator = new_evaluator

def _run_task(
self, task: Callable[[InputT], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
) -> EvaluationData[InputT, OutputT]:
"""
Run the task with the inputs from the test case.
Expand All @@ -128,7 +128,7 @@ def _run_task(
expected_interactions=case.expected_interactions,
metadata=case.metadata,
)
task_output = task(case.input)
task_output = task(case)
if isinstance(task_output, dict): # could be evaluating the trajectory as well
evaluation_context.actual_output = task_output.get("output")
evaluation_context.actual_trajectory = task_output.get("trajectory")
Expand All @@ -141,7 +141,7 @@ def _run_task(
return evaluation_context

async def _run_task_async(
self, task: Callable[[InputT], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
) -> EvaluationData[InputT, OutputT]:
"""
Run the task with the inputs from the test case asynchronously.
Expand All @@ -167,10 +167,10 @@ async def _run_task_async(

# Handle both async and sync tasks
if asyncio.iscoroutinefunction(task):
task_output = await task(case.input)
task_output = await task(case)
else:
# Run sync function in separate thread to avoid blocking
task_output = await asyncio.to_thread(task, case.input)
task_output = await asyncio.to_thread(task, case)

if isinstance(task_output, dict):
evaluation_context.actual_output = task_output.get("output")
Expand Down Expand Up @@ -277,7 +277,7 @@ async def _worker(self, queue: asyncio.Queue, task: Callable, results: list):
finally:
queue.task_done()

def run_evaluations(self, task: Callable[[InputT], OutputT | dict[str, Any]]) -> EvaluationReport:
def run_evaluations(self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]]) -> EvaluationReport:
"""
Run the evaluations for all of the test cases with the evaluator.

Expand Down
Loading