Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added src/__init__.py
Empty file.
63 changes: 63 additions & 0 deletions src/examples/actor_simulator_basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from strands import Agent

from strands_evals import ActorSimulator, Case, Dataset
from strands_evals.evaluators import HelpfulnessEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry

# ======================================
# SETUP TELEMETRY
# ======================================
# Wire an in-memory exporter into the strands-evals tracer provider so the
# task function below can harvest the target agent's finished spans per turn.
telemetry = StrandsEvalsTelemetry()
# Collects finished spans in process memory (no external backend required).
memory_exporter = InMemorySpanExporter()
# NOTE(review): BatchSpanProcessor buffers spans asynchronously; spans may not
# be visible in memory_exporter immediately after an agent call — confirm the
# examples rely on force_flush/shutdown semantics, or consider
# SimpleSpanProcessor for deterministic per-turn capture.
span_processor = BatchSpanProcessor(memory_exporter)
telemetry.tracer_provider.add_span_processor(span_processor)


# ======================================
# SETUP AND RUN STRANDS EVAL
# ======================================


def task_function(case: Case) -> dict:
    """Run a simulated multi-turn conversation against a target agent.

    Drives a user simulator (built from the case) against a travel-assistant
    agent, collecting the target agent's OTel spans for each turn via the
    module-level ``memory_exporter``.

    Args:
        case: The evaluation case; ``case.input`` seeds the first user message.

    Returns:
        dict with keys:
            - "output": the agent's final response text (empty string if the
              simulator yields zero turns).
            - "trajectory": a session mapped from the accumulated target spans.
    """
    # Create simulator
    user_sim = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=3)

    # Create target agent
    agent = Agent(system_prompt="You are a helpful travel assistant.", callback_handler=None)

    # Accumulate target spans across all turns
    all_target_spans = []

    # Fix: initialize so the return below cannot raise NameError when the
    # simulator produces no turns (e.g. max_turns <= 0).
    agent_message = ""

    user_message = case.input
    while user_sim.has_next():
        # Clear before each target agent call to ensure we don't capture simulator traces.
        memory_exporter.clear()
        agent_response = agent(user_message)
        agent_message = str(agent_response)
        # Snapshot only the spans emitted by this agent turn.
        turn_spans = list(memory_exporter.get_finished_spans())
        all_target_spans.extend(turn_spans)
        # Let the simulated user react; its reply seeds the next turn.
        user_result = user_sim.act(agent_message)
        user_message = str(user_result.structured_output.message)

    mapper = StrandsInMemorySessionMapper()
    session = mapper.map_to_session(all_target_spans, session_id="test-session")

    return {"output": agent_message, "trajectory": session}


# Define the evaluation cases; Case[str, str] parameterizes input/output types.
test_cases = [
    Case[str, str](
        name="booking-simple",
        input="I need to book a flight to Paris next week",
        # task_description is read by the user simulator to know when the
        # goal is achieved — presumably; verify against ActorSimulator docs.
        metadata={"category": "booking", "task_description": "Flight booking confirmed"},
    )
]

# Judge each case's output for helpfulness via an LLM-based evaluator.
evaluator = HelpfulnessEvaluator()
dataset = Dataset[str, str](cases=test_cases, evaluator=evaluator)

# Execute task_function for every case and render the evaluation report.
report = dataset.run_evaluations(task_function)
report.run_display()
4 changes: 2 additions & 2 deletions src/examples/agents_as_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ async def async_agents_as_tools_example():
"""

### Step 1: Define task ###
def customer_support(task: str):
def customer_support(case: Case):
@tool
def technical_support(query: str) -> str:
"""Handle technical issues, bugs, and troubleshooting."""
Expand Down Expand Up @@ -158,7 +158,7 @@ def returns_exchanges(query: str) -> str:
callback_handler=None,
tools=[technical_support, billing_support, product_info, returns_exchanges],
)
response = orchestrator(task)
response = orchestrator(case.input)
description = tools_use_extractor.extract_tools_description(orchestrator)
trajectory_evaluator.update_trajectory_description(description)
interaction_evaluator.update_interaction_description(description)
Expand Down
4 changes: 2 additions & 2 deletions src/examples/bank_tools_trajectory.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ async def async_descriptive_tools_trajectory_example():
"""

### Step 1: Define task ###
async def get_response(query: str) -> dict:
async def get_response(case: Case) -> dict:
bank_prompt = (
"You are a banker, ensure that only people with sufficient balance can spend them."
" Collect debt from people with negative balance."
Expand All @@ -83,7 +83,7 @@ async def get_response(query: str) -> dict:
agent = Agent(
tools=[get_balance, modify_balance, collect_debt], system_prompt=bank_prompt, callback_handler=None
)
response = await agent.invoke_async(query)
response = await agent.invoke_async(case.input)
trajectory_evaluator.update_trajectory_description(tools_use_extractor.extract_tools_description(agent))
return TaskOutput(
output=str(response), trajectory=tools_use_extractor.extract_agent_tools_used_from_messages(agent.messages)
Expand Down
5 changes: 3 additions & 2 deletions src/examples/dataset_generator/simple_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from strands import Agent

from strands_evals import Case
from strands_evals.evaluators.output_evaluator import OutputEvaluator
from strands_evals.generators.dataset_generator import DatasetGenerator

Expand All @@ -21,12 +22,12 @@ async def simple_dataset_generator():
"""

### Step 1: Define task ###
async def get_response(query: str) -> str:
async def get_response(case: Case) -> str:
"""
Simple task example to get a response from an agent given a query.
"""
agent = Agent(system_prompt="Be as concise as possible", callback_handler=None)
response = await agent.invoke_async(query)
response = await agent.invoke_async(case.input)
return str(response)

# Step 2: Initialize the dataset generator for string types
Expand Down
4 changes: 2 additions & 2 deletions src/examples/evaluate_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ async def async_graph_example():
"""

### Step 1: Define task ###
def research_graph(task: str):
def research_graph(case: Case):
# Create specialized agents
researcher = Agent(name="researcher", system_prompt="You are a research specialist...")
analyst = Agent(name="analyst", system_prompt="You are a data analysis specialist...")
Expand All @@ -52,7 +52,7 @@ def research_graph(task: str):
# Build the graph
graph = builder.build()

result = graph(task)
result = graph(case.input)
interactions = graph_extractor.extract_graph_interactions(result)

return {"interactions": interactions, "trajectory": [node.node_id for node in result.execution_order]}
Expand Down
4 changes: 2 additions & 2 deletions src/examples/evaluate_swarm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ async def async_swarm_example():
"""

### Step 1: Define task ###
def sde_swarm(task: str):
def sde_swarm(case: Case):
# Create specialized agents
researcher = Agent(name="researcher", system_prompt="You are a research specialist...", callback_handler=None)
coder = Agent(name="coder", system_prompt="You are a coding specialist...", callback_handler=None)
Expand All @@ -45,7 +45,7 @@ def sde_swarm(task: str):
repetitive_handoff_min_unique_agents=2,
)

result = swarm(task)
result = swarm(case.input)
interaction_info = swarm_extractor.extract_swarm_interactions(result)

return {"interactions": interaction_info, "trajectory": [node.node_id for node in result.node_history]}
Expand Down
4 changes: 2 additions & 2 deletions src/examples/multi_shots.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ async def async_multi_shots_interactions():
"""

### Step 1: Define task ###
def multi_turns_hacking(query: str) -> str:
def multi_turns_hacking(case: Case) -> str:
"""
Simulates a multi-turn adversarial conversation to test agent safety.

Expand All @@ -38,7 +38,7 @@ def multi_turns_hacking(query: str) -> str:
agent = Agent(system_prompt="Be as concise as possible", callback_handler=None)

new_input = []
agent_repsonse = query
agent_repsonse = case.input
hacker_response = None
interactions = []
turns = 5
Expand Down
4 changes: 2 additions & 2 deletions src/examples/safety_judge_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ async def async_safety_output_judge_example():
"""

### Step 1: Define task ###
async def get_response(query: str) -> str:
async def get_response(case: Case) -> str:
"""
Simple task example to get a response from an agent given a query.
"""
agent = Agent(system_prompt="Be as concise as possible", callback_handler=None)
response = await agent.invoke_async(query)
response = await agent.invoke_async(case.input)
return str(response)

### Step 2: Create test cases ###
Expand Down
8 changes: 4 additions & 4 deletions src/examples/third_party_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def third_party_example():
"""

### Step 1: Define task ###
def get_response(query: str) -> str:
def get_response(case: Case) -> str:
agent = Agent(callback_handler=None)
return str(agent(query))
return str(agent(case.input))

### Step 2: Create test cases ###
test_case1 = Case[str, str](
Expand Down Expand Up @@ -105,9 +105,9 @@ async def async_third_party_example():
"""

### Step 1: Define task ###
async def get_response(query: str) -> str:
async def get_response(case: Case) -> str:
agent = Agent(system_prompt="Be as concise as possible", callback_handler=None)
response = await agent.invoke_async(query)
response = await agent.invoke_async(case.input)
return str(response)

### Step 2: Create test cases ###
Expand Down
6 changes: 5 additions & 1 deletion src/strands_evals/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
__version__ = "0.1.0"

from . import evaluators, extractors, generators, telemetry, types
from . import evaluators, extractors, generators, simulation, telemetry, types
from .case import Case
from .dataset import Dataset
from .simulation import ActorSimulator, UserSimulator
from .telemetry import StrandsEvalsTelemetry, get_tracer

__all__ = [
Expand All @@ -12,7 +13,10 @@
"extractors",
"types",
"generators",
"simulation",
"telemetry",
"StrandsEvalsTelemetry",
"get_tracer",
"ActorSimulator",
"UserSimulator",
]
12 changes: 6 additions & 6 deletions src/strands_evals/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def evaluator(self, new_evaluator: Evaluator[InputT, OutputT]):
self._evaluator = new_evaluator

def _run_task(
self, task: Callable[[InputT], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
) -> EvaluationData[InputT, OutputT]:
"""
Run the task with the inputs from the test case.
Expand All @@ -128,7 +128,7 @@ def _run_task(
expected_interactions=case.expected_interactions,
metadata=case.metadata,
)
task_output = task(case.input)
task_output = task(case)
if isinstance(task_output, dict): # could be evaluating the trajectory as well
evaluation_context.actual_output = task_output.get("output")
evaluation_context.actual_trajectory = task_output.get("trajectory")
Expand All @@ -141,7 +141,7 @@ def _run_task(
return evaluation_context

async def _run_task_async(
self, task: Callable[[InputT], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
) -> EvaluationData[InputT, OutputT]:
"""
Run the task with the inputs from the test case asynchronously.
Expand All @@ -167,10 +167,10 @@ async def _run_task_async(

# Handle both async and sync tasks
if asyncio.iscoroutinefunction(task):
task_output = await task(case.input)
task_output = await task(case)
else:
# Run sync function in separate thread to avoid blocking
task_output = await asyncio.to_thread(task, case.input)
task_output = await asyncio.to_thread(task, case)

if isinstance(task_output, dict):
evaluation_context.actual_output = task_output.get("output")
Expand Down Expand Up @@ -277,7 +277,7 @@ async def _worker(self, queue: asyncio.Queue, task: Callable, results: list):
finally:
queue.task_done()

def run_evaluations(self, task: Callable[[InputT], OutputT | dict[str, Any]]) -> EvaluationReport:
def run_evaluations(self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]]) -> EvaluationReport:
"""
Run the evaluations for all of the test cases with the evaluator.

Expand Down
Loading