diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/examples/actor_simulator_basic.py b/src/examples/actor_simulator_basic.py
new file mode 100644
index 0000000..641e36d
--- /dev/null
+++ b/src/examples/actor_simulator_basic.py
@@ -0,0 +1,63 @@
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from strands import Agent

from strands_evals import ActorSimulator, Case, Dataset
from strands_evals.evaluators import HelpfulnessEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry

# ======================================
# SETUP TELEMETRY
# ======================================
telemetry = StrandsEvalsTelemetry()
memory_exporter = InMemorySpanExporter()
span_processor = BatchSpanProcessor(memory_exporter)
telemetry.tracer_provider.add_span_processor(span_processor)


# ======================================
# SETUP AND RUN STRANDS EVAL
# ======================================


def task_function(case: Case) -> dict:
    """Run a simulated multi-turn conversation against a travel agent.

    Returns a dict with the final agent output and the mapped session
    trajectory built from the target agent's telemetry spans.
    """
    # Create simulator
    user_sim = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=3)

    # Create target agent
    agent = Agent(system_prompt="You are a helpful travel assistant.", callback_handler=None)

    # Accumulate target spans across all turns
    all_target_spans = []

    # Initialize so the return below cannot raise UnboundLocalError when the
    # simulator reports no turns (has_next() False before the first exchange).
    agent_message = ""
    user_message = case.input
    while user_sim.has_next():
        # Clear before each target agent call to ensure we don't capture simulator traces.
        memory_exporter.clear()
        agent_response = agent(user_message)
        agent_message = str(agent_response)
        # BatchSpanProcessor exports on a background thread; flush before
        # reading so get_finished_spans() actually contains this turn's spans.
        telemetry.tracer_provider.force_flush()
        turn_spans = list(memory_exporter.get_finished_spans())
        all_target_spans.extend(turn_spans)
        user_result = user_sim.act(agent_message)
        user_message = str(user_result.structured_output.message)

    mapper = StrandsInMemorySessionMapper()
    session = mapper.map_to_session(all_target_spans, session_id="test-session")

    return {"output": agent_message, "trajectory": session}


test_cases = [
    Case[str, str](
        name="booking-simple",
        input="I need to book a flight to Paris next week",
        metadata={"category": "booking", "task_description": "Flight booking confirmed"},
    )
]

evaluator = HelpfulnessEvaluator()
dataset = Dataset[str, str](cases=test_cases, evaluator=evaluator)

report = dataset.run_evaluations(task_function)
report.run_display()
diff --git a/src/examples/agents_as_tools.py b/src/examples/agents_as_tools.py
index 3daa40a..6d0c1bd 100644
--- a/src/examples/agents_as_tools.py
+++ b/src/examples/agents_as_tools.py
@@ -102,7 +102,7 @@ async def async_agents_as_tools_example():
"""
### Step 1: Define task ###
- def customer_support(task: str):
+ def customer_support(case: Case):
@tool
def technical_support(query: str) -> str:
"""Handle technical issues, bugs, and troubleshooting."""
@@ -158,7 +158,7 @@ def returns_exchanges(query: str) -> str:
callback_handler=None,
tools=[technical_support, billing_support, product_info, returns_exchanges],
)
- response = orchestrator(task)
+ response = orchestrator(case.input)
description = tools_use_extractor.extract_tools_description(orchestrator)
trajectory_evaluator.update_trajectory_description(description)
interaction_evaluator.update_interaction_description(description)
diff --git a/src/examples/bank_tools_trajectory.py b/src/examples/bank_tools_trajectory.py
index 9ad7b5a..6f52b36 100644
--- a/src/examples/bank_tools_trajectory.py
+++ b/src/examples/bank_tools_trajectory.py
@@ -74,7 +74,7 @@ async def async_descriptive_tools_trajectory_example():
"""
### Step 1: Define task ###
- async def get_response(query: str) -> dict:
+ async def get_response(case: Case) -> dict:
bank_prompt = (
"You are a banker, ensure that only people with sufficient balance can spend them."
" Collect debt from people with negative balance."
@@ -83,7 +83,7 @@ async def get_response(query: str) -> dict:
agent = Agent(
tools=[get_balance, modify_balance, collect_debt], system_prompt=bank_prompt, callback_handler=None
)
- response = await agent.invoke_async(query)
+ response = await agent.invoke_async(case.input)
trajectory_evaluator.update_trajectory_description(tools_use_extractor.extract_tools_description(agent))
return TaskOutput(
output=str(response), trajectory=tools_use_extractor.extract_agent_tools_used_from_messages(agent.messages)
diff --git a/src/examples/dataset_generator/simple_dataset.py b/src/examples/dataset_generator/simple_dataset.py
index 4d1c634..7e4f96e 100644
--- a/src/examples/dataset_generator/simple_dataset.py
+++ b/src/examples/dataset_generator/simple_dataset.py
@@ -2,6 +2,7 @@
from strands import Agent
+from strands_evals import Case
from strands_evals.evaluators.output_evaluator import OutputEvaluator
from strands_evals.generators.dataset_generator import DatasetGenerator
@@ -21,12 +22,12 @@ async def simple_dataset_generator():
"""
### Step 1: Define task ###
- async def get_response(query: str) -> str:
+ async def get_response(case: Case) -> str:
"""
Simple task example to get a response from an agent given a query.
"""
agent = Agent(system_prompt="Be as concise as possible", callback_handler=None)
- response = await agent.invoke_async(query)
+ response = await agent.invoke_async(case.input)
return str(response)
# Step 2: Initialize the dataset generator for string types
diff --git a/src/examples/evaluate_graph.py b/src/examples/evaluate_graph.py
index 0159e42..dfc8f0d 100644
--- a/src/examples/evaluate_graph.py
+++ b/src/examples/evaluate_graph.py
@@ -25,7 +25,7 @@ async def async_graph_example():
"""
### Step 1: Define task ###
- def research_graph(task: str):
+ def research_graph(case: Case):
# Create specialized agents
researcher = Agent(name="researcher", system_prompt="You are a research specialist...")
analyst = Agent(name="analyst", system_prompt="You are a data analysis specialist...")
@@ -52,7 +52,7 @@ def research_graph(task: str):
# Build the graph
graph = builder.build()
- result = graph(task)
+ result = graph(case.input)
interactions = graph_extractor.extract_graph_interactions(result)
return {"interactions": interactions, "trajectory": [node.node_id for node in result.execution_order]}
diff --git a/src/examples/evaluate_swarm.py b/src/examples/evaluate_swarm.py
index a19a4fe..d0584ae 100644
--- a/src/examples/evaluate_swarm.py
+++ b/src/examples/evaluate_swarm.py
@@ -25,7 +25,7 @@ async def async_swarm_example():
"""
### Step 1: Define task ###
- def sde_swarm(task: str):
+ def sde_swarm(case: Case):
# Create specialized agents
researcher = Agent(name="researcher", system_prompt="You are a research specialist...", callback_handler=None)
coder = Agent(name="coder", system_prompt="You are a coding specialist...", callback_handler=None)
@@ -45,7 +45,7 @@ def sde_swarm(task: str):
repetitive_handoff_min_unique_agents=2,
)
- result = swarm(task)
+ result = swarm(case.input)
interaction_info = swarm_extractor.extract_swarm_interactions(result)
return {"interactions": interaction_info, "trajectory": [node.node_id for node in result.node_history]}
diff --git a/src/examples/multi_shots.py b/src/examples/multi_shots.py
index 3278f6f..29ae7e7 100644
--- a/src/examples/multi_shots.py
+++ b/src/examples/multi_shots.py
@@ -24,7 +24,7 @@ async def async_multi_shots_interactions():
"""
### Step 1: Define task ###
- def multi_turns_hacking(query: str) -> str:
+ def multi_turns_hacking(case: Case) -> str:
"""
Simulates a multi-turn adversarial conversation to test agent safety.
@@ -38,7 +38,7 @@ def multi_turns_hacking(query: str) -> str:
agent = Agent(system_prompt="Be as concise as possible", callback_handler=None)
new_input = []
- agent_repsonse = query
+ agent_repsonse = case.input
hacker_response = None
interactions = []
turns = 5
diff --git a/src/examples/safety_judge_output.py b/src/examples/safety_judge_output.py
index 617f1cd..9989ebd 100644
--- a/src/examples/safety_judge_output.py
+++ b/src/examples/safety_judge_output.py
@@ -23,12 +23,12 @@ async def async_safety_output_judge_example():
"""
### Step 1: Define task ###
- async def get_response(query: str) -> str:
+ async def get_response(case: Case) -> str:
"""
Simple task example to get a response from an agent given a query.
"""
agent = Agent(system_prompt="Be as concise as possible", callback_handler=None)
- response = await agent.invoke_async(query)
+ response = await agent.invoke_async(case.input)
return str(response)
### Step 2: Create test cases ###
diff --git a/src/examples/third_party_evaluator.py b/src/examples/third_party_evaluator.py
index e7fd758..174238d 100644
--- a/src/examples/third_party_evaluator.py
+++ b/src/examples/third_party_evaluator.py
@@ -30,9 +30,9 @@ def third_party_example():
"""
### Step 1: Define task ###
- def get_response(query: str) -> str:
+ def get_response(case: Case) -> str:
agent = Agent(callback_handler=None)
- return str(agent(query))
+ return str(agent(case.input))
### Step 2: Create test cases ###
test_case1 = Case[str, str](
@@ -105,9 +105,9 @@ async def async_third_party_example():
"""
### Step 1: Define task ###
- async def get_response(query: str) -> str:
+ async def get_response(case: Case) -> str:
agent = Agent(system_prompt="Be as concise as possible", callback_handler=None)
- response = await agent.invoke_async(query)
+ response = await agent.invoke_async(case.input)
return str(response)
### Step 2: Create test cases ###
diff --git a/src/strands_evals/__init__.py b/src/strands_evals/__init__.py
index 8548f25..137906b 100644
--- a/src/strands_evals/__init__.py
+++ b/src/strands_evals/__init__.py
@@ -1,8 +1,9 @@
__version__ = "0.1.0"
-from . import evaluators, extractors, generators, telemetry, types
+from . import evaluators, extractors, generators, simulation, telemetry, types
from .case import Case
from .dataset import Dataset
+from .simulation import ActorSimulator, UserSimulator
from .telemetry import StrandsEvalsTelemetry, get_tracer
__all__ = [
@@ -12,7 +13,10 @@
"extractors",
"types",
"generators",
+ "simulation",
"telemetry",
"StrandsEvalsTelemetry",
"get_tracer",
+ "ActorSimulator",
+ "UserSimulator",
]
diff --git a/src/strands_evals/dataset.py b/src/strands_evals/dataset.py
index dbd7d71..f60eab1 100644
--- a/src/strands_evals/dataset.py
+++ b/src/strands_evals/dataset.py
@@ -104,7 +104,7 @@ def evaluator(self, new_evaluator: Evaluator[InputT, OutputT]):
self._evaluator = new_evaluator
def _run_task(
- self, task: Callable[[InputT], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
+ self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
) -> EvaluationData[InputT, OutputT]:
"""
Run the task with the inputs from the test case.
@@ -128,7 +128,7 @@ def _run_task(
expected_interactions=case.expected_interactions,
metadata=case.metadata,
)
- task_output = task(case.input)
+ task_output = task(case)
if isinstance(task_output, dict): # could be evaluating the trajectory as well
evaluation_context.actual_output = task_output.get("output")
evaluation_context.actual_trajectory = task_output.get("trajectory")
@@ -141,7 +141,7 @@ def _run_task(
return evaluation_context
async def _run_task_async(
- self, task: Callable[[InputT], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
+ self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]], case: Case[InputT, OutputT]
) -> EvaluationData[InputT, OutputT]:
"""
Run the task with the inputs from the test case asynchronously.
@@ -167,10 +167,10 @@ async def _run_task_async(
# Handle both async and sync tasks
if asyncio.iscoroutinefunction(task):
- task_output = await task(case.input)
+ task_output = await task(case)
else:
# Run sync function in separate thread to avoid blocking
- task_output = await asyncio.to_thread(task, case.input)
+ task_output = await asyncio.to_thread(task, case)
if isinstance(task_output, dict):
evaluation_context.actual_output = task_output.get("output")
@@ -277,7 +277,7 @@ async def _worker(self, queue: asyncio.Queue, task: Callable, results: list):
finally:
queue.task_done()
- def run_evaluations(self, task: Callable[[InputT], OutputT | dict[str, Any]]) -> EvaluationReport:
+ def run_evaluations(self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]]) -> EvaluationReport:
"""
Run the evaluations for all of the test cases with the evaluator.
diff --git a/src/strands_evals/simulation/README.md b/src/strands_evals/simulation/README.md
new file mode 100644
index 0000000..5e4b158
--- /dev/null
+++ b/src/strands_evals/simulation/README.md
@@ -0,0 +1,323 @@
+# Actor Simulator
+
+A framework for simulating realistic multi-turn conversations with AI-powered actors for agent evaluation.
+
+## Overview
+
+ActorSimulator creates realistic actor personas that interact with agents in multi-turn conversations. It automatically generates actor profiles from test cases, maintains conversation context, and produces contextually appropriate responses aligned with the actor's goals and traits.
+
+## Quick Start
+
+```python
+from strands import Agent
+from strands_evals import ActorSimulator, Case
+
+# Create agent under test
+agent = Agent(system_prompt="You are a helpful travel assistant.", callback_handler=None)
+
+# Create test case
+case = Case(
+ input="I want to plan a trip to Tokyo with hotel and activities",
+ metadata={"task_description": "Complete travel package arranged"}
+)
+
+# Create user simulator with max_turns
+user_sim = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=5)
+
+# Run conversation
+user_message = case.input
+while user_sim.has_next():
+ agent_response = agent(user_message)
+ user_result = user_sim.act(str(agent_response))
+ user_message = str(user_result.structured_output.message)
+```
+
+## How It Works
+
+1. **Profile Generation**: Creates a realistic actor profile with traits, context, and goals from the test case
+2. **Conversation Initialization**: Sets up conversation with a greeting and the actor's initial query
+3. **Contextual Responses**: Generates responses that maintain consistency with the actor's profile and goals
+4. **Goal Tracking**: Built-in tool allows actors to assess progress toward their goals
+
+## API Reference
+
+### ActorSimulator
+
+Main class for simulating actor behavior in conversations.
+
+#### Factory Method (Recommended)
+
+```python
+ActorSimulator.from_case_for_user_simulator(
+ case: Case,
+ system_prompt_template: str | None = None,
+ tools: list | None = None,
+ model: str | None = None,
+ max_turns: int = 10
+) -> ActorSimulator
+```
+
+Creates an ActorSimulator configured as a user simulator from a test case. Automatically generates a realistic actor profile from `case.input` and optionally `case.metadata["task_description"]`.
+
+**Parameters:**
+- `case`: Test case with input (initial query) and optional task_description in metadata
+- `system_prompt_template`: Custom system prompt template (uses default if None)
+- `tools`: Additional tools for the actor (defaults to goal completion tool only)
+- `model`: Model identifier (uses Strands default if None)
+- `max_turns`: Maximum number of conversation turns (default: 10)
+
+**Example:**
+```python
+case = Case(
+ input="I need help booking a flight to Paris",
+ metadata={"task_description": "Book round-trip flight under $800"}
+)
+
+user_sim = ActorSimulator.from_case_for_user_simulator(
+ case=case,
+ max_turns=5
+)
+```
+
+#### Direct Initialization
+
+```python
+ActorSimulator(
+ actor_profile: ActorProfile,
+ initial_query: str,
+ system_prompt_template: str,
+ tools: list | None = None,
+ model: str | None = None,
+ max_turns: int = 10
+)
+```
+
+Initialize with an existing actor profile. Use this when you have a pre-defined profile instead of generating one from a test case.
+
+**Parameters:**
+- `actor_profile`: ActorProfile object with traits, context, and actor_goal
+- `initial_query`: The actor's first query or message
+- `system_prompt_template`: Template string for actor behavior (formatted with profile)
+- `tools`: Additional tools for the actor
+- `model`: Model identifier
+- `max_turns`: Maximum number of conversation turns (default: 10)
+
+#### Methods
+
+**`act(agent_message: str) -> AgentResult`**
+
+Generate the actor's next message in response to the agent's message.
+
+**Parameters:**
+- `agent_message`: The agent's response to react to
+
+**Returns:**
+- `AgentResult` containing the actor's structured response with reasoning and message
+
+**Example:**
+```python
+agent_response = agent("I can help you book that flight")
+user_result = user_sim.act(str(agent_response))
+user_message = str(user_result.structured_output.message)
+```
+
+**`has_next() -> bool`**
+
+Check if the conversation should continue. Returns False once the simulator's configured stop token is present in the last message or the maximum number of turns has been reached.
+
+**Returns:**
+- `True` if the conversation should continue, `False` otherwise
+
+**Example:**
+```python
+while user_sim.has_next():
+ agent_response = agent(user_message)
+ user_result = user_sim.act(str(agent_response))
+ user_message = str(user_result.structured_output.message)
+```
+
+### Data Models
+
+**ActorProfile:**
+```python
+class ActorProfile(BaseModel):
+ traits: dict[str, Any] # Actor characteristics and personality
+ context: str # Background information and situation
+ actor_goal: str # What the actor wants to achieve
+```
+
+**ActorResponse:**
+```python
+class ActorResponse(BaseModel):
+ reasoning: str # Actor's internal reasoning process
+ message: str # The actual message to send
+```
+
+## Usage Examples
+
+### Complete Multi-Turn Conversation Example
+
+```python
+from strands import Agent
+from strands_evals import ActorSimulator, Case
+
+# Create agent under test
+agent = Agent(system_prompt="You are a helpful travel assistant.", callback_handler=None)
+
+# Create test case
+case = Case(
+ input="I want to plan a trip to Tokyo with hotel and activities",
+ metadata={"task_description": "Complete travel package arranged"}
+)
+
+# Create user simulator
+user_sim = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=5)
+
+# Run conversation
+conversation = []
+user_message = case.input
+
+while user_sim.has_next():
+ # Agent responds
+ agent_response = agent(user_message)
+ agent_message = str(agent_response)
+ conversation.append({"role": "assistant", "content": agent_message})
+
+ # User responds
+ user_result = user_sim.act(agent_message)
+ user_message = str(user_result.structured_output.message)
+ conversation.append({"role": "user", "content": user_message})
+
+print(f"Conversation completed in {len(conversation) // 2} turns")
+```
+
+### Custom Actor Profile
+
+```python
+from strands_evals.types.simulation import ActorProfile
+from strands_evals.simulation.prompt_templates.actor_system_prompt import (
+ DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE
+)
+
+# Create custom actor profile
+actor_profile = ActorProfile(
+ traits={
+ "personality": "analytical and detail-oriented",
+ "communication_style": "direct and concise",
+ "technical_level": "expert"
+ },
+ context="Experienced business traveler with elite status",
+ actor_goal="Book business class flight with specific seat preferences"
+)
+
+# Initialize with custom profile
+user_sim = ActorSimulator(
+ actor_profile=actor_profile,
+ initial_query="I need to book a business class flight to London",
+ system_prompt_template=DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE,
+ max_turns=15
+)
+```
+
+## Tools
+
+### Built-in Goal Completion Tool
+
+ActorSimulator automatically includes a goal completion assessment tool that actors can use to evaluate their progress:
+
+```python
+from strands_evals.simulation.tools.goal_completion import (
+ get_conversation_goal_completion
+)
+
+# The actor can call this tool during conversation to assess progress
+assessment = get_conversation_goal_completion(
+ initial_goal="Book a flight to Tokyo",
+ conversation=[
+ {"role": "user", "content": "I need a flight to Tokyo"},
+ {"role": "assistant", "content": "I can help with that..."}
+ ]
+)
+# Returns assessment with score and reasoning
+```
+
+### Adding Custom Tools
+
+Extend actor capabilities with custom tools:
+
+```python
+from strands import tool
+
+@tool
+def check_booking_status(booking_id: str) -> str:
+ """Check the status of a booking."""
+ return f"Booking {booking_id} is confirmed"
+
+# Add custom tools to the simulator
+user_sim = ActorSimulator.from_case_for_user_simulator(
+ case=case,
+ tools=[check_booking_status]
+)
+```
+
+## Advanced Configuration
+
+### Custom System Prompt Templates
+
+Customize actor behavior with a custom system prompt template. The template receives the actor profile as a format parameter:
+
+```python
+custom_prompt_template = """
+You are simulating a user with the following profile:
+{actor_profile}
+
+Behavior guidelines:
+- Be persistent but professional
+- Express concerns clearly
+- Stay focused on your goal
+
+Respond naturally based on your profile and the conversation context.
+"""
+
+user_sim = ActorSimulator.from_case_for_user_simulator(
+ case=case,
+ system_prompt_template=custom_prompt_template
+)
+```
+
+### Conversation Initialization
+
+ActorSimulator automatically initializes conversations with a random greeting from a predefined set:
+
+```python
+# Built-in greetings:
+# - "hi! how can I help you today?"
+# - "hello! what can I assist you with?"
+# - "hi there! how may I help you?"
+# - "good day! what can I do for you?"
+# - "hello! what would you like to know?"
+
+# The conversation starts with:
+# 1. Random greeting (as user message)
+# 2. Actor's initial query (as assistant message)
+```
+
+### Model Selection
+
+Specify a custom model for the actor simulator:
+
+```python
+user_sim = ActorSimulator.from_case_for_user_simulator(
+ case=case,
+ model="anthropic.claude-3-5-sonnet-20241022-v2:0",
+ max_turns=10
+)
+```
+
+## Best Practices
+
+1. **Include Task Description**: Add `task_description` in case metadata for better goal generation
+2. **Set max_turns**: Configure `max_turns` during initialization to prevent infinite conversations
+3. **Use has_next()**: Always use `has_next()` in your conversation loop to respect turn limits and stop tokens
+4. **Track Conversation**: Append messages to a conversation list for evaluation and debugging
+5. **Access Structured Output**: Use `result.structured_output.message` to get the actor's message and `result.structured_output.reasoning` to see internal reasoning
\ No newline at end of file
diff --git a/src/strands_evals/simulation/__init__.py b/src/strands_evals/simulation/__init__.py
new file mode 100644
index 0000000..6a4be0f
--- /dev/null
+++ b/src/strands_evals/simulation/__init__.py
@@ -0,0 +1,6 @@
"""Simulation subpackage: exposes the actor simulator for agent evaluation."""

from .actor_simulator import ActorSimulator

# Backward-compatible alias: older code imported the class as `UserSimulator`.
UserSimulator = ActorSimulator

__all__ = ["ActorSimulator", "UserSimulator"]
diff --git a/src/strands_evals/simulation/actor_simulator.py b/src/strands_evals/simulation/actor_simulator.py
new file mode 100644
index 0000000..fb1d9c3
--- /dev/null
+++ b/src/strands_evals/simulation/actor_simulator.py
@@ -0,0 +1,292 @@
+import logging
+import random
+
+from strands import Agent
+from strands.agent.agent_result import AgentResult
+from strands.types.content import Message
+from typing_extensions import cast
+
+from strands_evals.case import Case
+from strands_evals.simulation.profiles.actor_profile import DEFAULT_USER_PROFILE_SCHEMA
+from strands_evals.simulation.prompt_templates.actor_profile_extraction import ACTOR_PROFILE_PROMPT_TEMPLATE
+from strands_evals.simulation.prompt_templates.actor_system_prompt import DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE
+from strands_evals.simulation.tools.goal_completion import get_conversation_goal_completion
+from strands_evals.types.simulation import ActorProfile, ActorResponse
+
+logger = logging.getLogger(__name__)
+
+
class ActorSimulator:
    """
    Simulates an actor in multi-turn conversations for agent evaluation.

    ActorSimulator wraps a Strands Agent configured to behave as a specific actor
    (typically a user) in conversation scenarios. It maintains conversation history,
    generates contextually appropriate responses, and can assess goal completion.

    Attributes:
        agent: The underlying Strands Agent configured with actor behavior.
        actor_profile: The actor's profile containing traits, context, and goal.
        initial_query: The actor's first query in the conversation.
        conversation_history: List of conversation messages in Strands format.
        model_id: Model identifier for the underlying agent.
    """

    # Token the actor can emit to signal the conversation is finished.
    # NOTE(review): the previous implementation tested `"" not in message`;
    # the empty string is a substring of every string, so has_next() always
    # returned False and conversation loops never ran. The intended literal
    # was evidently lost — confirm this token matches the one requested by
    # the simulator system-prompt templates.
    STOP_TOKEN = "<STOP>"

    INITIAL_GREETINGS = [
        "hi! how can I help you today?",
        "hello! what can I assist you with?",
        "hi there! how may I help you?",
        "good day! what can I do for you?",
        "hello! what would you like to know?",
    ]

    @classmethod
    def from_case_for_user_simulator(
        cls,
        case: Case,
        system_prompt_template: str | None = None,
        tools: list | None = None,
        model: str | None = None,
        max_turns: int = 10,
    ) -> "ActorSimulator":
        """
        Create an ActorSimulator configured as a user simulator from a test case.

        Generates a realistic user profile and goal from case.input and optionally
        case.metadata["task_description"], then configures the simulator with
        user-specific defaults. If you already have a profile, use __init__() directly.

        Args:
            case: Test case containing input (initial query) and optional metadata with "task_description".
            system_prompt_template: Custom system prompt template. Uses DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE if None.
            tools: Additional tools available to the user. Defaults to goal completion tool only.
            model: Model identifier for the underlying agent. Uses Strands default if None.
            max_turns: Maximum number of conversation turns before stopping (default: 10).

        Returns:
            ActorSimulator configured for user simulation.

        Example:
            ```python
            from strands_evals import Case, ActorSimulator
            from strands import Agent

            case = Case(
                input="I need to book a flight to Paris",
                metadata={"task_description": "Flight booking confirmed"}
            )
            user_sim = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=5)
            agent = Agent(system_prompt="You are a travel assistant.")

            user_message = case.input
            while user_sim.has_next():
                agent_response = agent(user_message)
                user_result = user_sim.act(str(agent_response))
                user_message = str(user_result.structured_output.message)
            ```
        """
        actor_profile = cls._generate_profile_from_case(case)

        if system_prompt_template is None:
            system_prompt_template = DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE

        return cls(
            actor_profile=actor_profile,
            initial_query=case.input,
            system_prompt_template=system_prompt_template,
            tools=tools,
            model=model,
            max_turns=max_turns,
        )

    @staticmethod
    def _generate_profile_from_case(case: Case) -> ActorProfile:
        """
        Generate a user profile from a test case via an LLM call.

        Private helper for from_case_for_user_simulator factory method.
        Uses case.input and optionally case.metadata["task_description"] if present.

        Args:
            case: Test case with input and optional task_description in metadata.

        Returns:
            ActorProfile with generated traits, context, and goal.
        """
        initial_query = case.input
        task_description = case.metadata.get("task_description", "") if case.metadata else ""

        profile_prompt = ACTOR_PROFILE_PROMPT_TEMPLATE.format(
            initial_query=initial_query,
            task_description=task_description,
            example=DEFAULT_USER_PROFILE_SCHEMA,
        )
        profile_agent = Agent(callback_handler=None)
        result = profile_agent(profile_prompt, structured_output_model=ActorProfile)
        return result.structured_output

    def __init__(
        self,
        actor_profile: ActorProfile,
        initial_query: str,
        system_prompt_template: str,
        tools: list | None = None,
        model: str | None = None,
        max_turns: int = 10,
    ):
        """
        Initialize an ActorSimulator with profile and goal.

        Use this constructor when you have a pre-defined ActorProfile. For automatic
        profile generation from test cases, use from_case_for_user_simulator() instead.

        Args:
            actor_profile: ActorProfile object containing traits, context, and actor_goal.
            initial_query: The actor's first query or message.
            system_prompt_template: Template string for system prompt. Must include {actor_profile} placeholder.
            tools: Additional tools available to the actor. Defaults to goal completion tool only.
            model: Model identifier for the underlying agent. Uses Strands default if None.
            max_turns: Maximum number of conversation turns before stopping (default: 10).

        Example:
            ```python
            from strands_evals.simulation import ActorSimulator
            from strands_evals.types.simulation import ActorProfile

            profile = ActorProfile(
                traits={"expertise_level": "expert", "communication_style": "technical"},
                context="A software engineer debugging a production issue.",
                actor_goal="Identify and resolve the memory leak."
            )
            simulator = ActorSimulator(
                actor_profile=profile,
                initial_query="Our service is experiencing high memory usage.",
                system_prompt_template="You are simulating: {actor_profile}",
                max_turns=15
            )
            ```
        """
        self.actor_profile = actor_profile
        self.initial_query = initial_query
        self.conversation_history: list[Message] = []
        self.model_id = model
        self._turn_count = 0
        self._last_message = ""
        self._max_turns = max_turns

        system_prompt = system_prompt_template.format(actor_profile=actor_profile.model_dump())

        # Combine the built-in goal-completion tool with any caller-supplied tools.
        all_tools = [get_conversation_goal_completion]
        if tools:
            all_tools.extend(tools)

        self._initialize_conversation()

        # Create agent seeded with the pre-built conversation history.
        self.agent = Agent(
            system_prompt=system_prompt,
            messages=self.conversation_history,
            tools=all_tools,
            model=self.model_id,
            callback_handler=None,
        )

    def _initialize_conversation(self):
        """
        Initialize the conversation history with a greeting and initial query.

        Sets up the conversation with a random greeting followed by the actor's
        initial query. Roles are from the simulator agent's perspective: the
        greeting is stored as a "user" message and the actor's own opening query
        as an "assistant" message, since the simulated actor IS this agent's
        assistant persona (presumably intentional — verify against the prompt
        templates if conversations look inverted).

        Note: This is a private method called during initialization.
        """
        selected_greeting = random.choice(self.INITIAL_GREETINGS)
        greeting_message = {"role": "user", "content": [{"text": selected_greeting}]}
        self.conversation_history.append(greeting_message)

        initial_query_message = {"role": "assistant", "content": [{"text": self.initial_query.strip()}]}
        self.conversation_history.append(initial_query_message)

    def act(self, agent_message: str) -> AgentResult:
        """
        Generate the next actor message in the conversation.

        Processes the agent's message and generates a contextually appropriate
        response from the actor's perspective, maintaining consistency with the
        actor's profile and goal. Also advances the turn counter and records the
        last message for stop-token detection in has_next().

        Args:
            agent_message: The agent's response to react to (required).

        Returns:
            AgentResult containing the actor's structured response with:
            - structured_output.reasoning: Actor's internal reasoning
            - structured_output.message: Actor's response message

        Example:
            ```python
            agent_response = agent("I need help booking a flight")
            user_result = user_sim.act(str(agent_response))
            print(user_result.structured_output.reasoning)
            print(user_result.structured_output.message)
            ```
        """
        response = self.agent(agent_message.strip(), structured_output_model=ActorResponse)
        self._turn_count += 1
        self._last_message = str(cast(ActorResponse, response.structured_output).message)
        return response

    def has_next(self) -> bool:
        """
        Check if the conversation should continue.

        Returns False if the stop token (STOP_TOKEN) is present in the last
        message or if the maximum number of turns has been reached. Use this in
        a loop to control multi-turn conversations.

        Returns:
            True if the conversation should continue, False otherwise.

        Example:
            ```python
            user_message = case.input
            while user_sim.has_next():
                agent_response = agent(user_message)
                user_result = user_sim.act(str(agent_response))
                user_message = str(user_result.structured_output.message)
            # Conversation ended either by:
            # - Actor including STOP_TOKEN in its message
            # - Reaching max_turns limit
            ```
        """
        if self._turn_count >= self._max_turns:
            return False
        # Bug fix: checking `"" not in ...` always evaluated to False because
        # the empty string is a substring of every string; compare against the
        # named stop token instead.
        return self.STOP_TOKEN not in self._last_message
diff --git a/src/strands_evals/simulation/profiles/__init__.py b/src/strands_evals/simulation/profiles/__init__.py
new file mode 100644
index 0000000..6d4e1d5
--- /dev/null
+++ b/src/strands_evals/simulation/profiles/__init__.py
@@ -0,0 +1,5 @@
+"""Profile templates for actor simulation."""
+
+from .actor_profile import DEFAULT_USER_PROFILE_SCHEMA
+
+__all__ = ["DEFAULT_USER_PROFILE_SCHEMA"]
diff --git a/src/strands_evals/simulation/profiles/actor_profile.py b/src/strands_evals/simulation/profiles/actor_profile.py
new file mode 100644
index 0000000..7d3c467
--- /dev/null
+++ b/src/strands_evals/simulation/profiles/actor_profile.py
@@ -0,0 +1,26 @@
+"""
+Actor profile templates for simulation.
+
+This module provides actor profile structures used as templates
+for generating realistic actor profiles in conversation simulation.
+"""
+
+DEFAULT_USER_PROFILE_SCHEMA = {
+ "traits": {
+ "personal_profile": {
+ "identity": {
+ "first_name": "User",
+ "last_name": "Default",
+ "preferred_name": "User",
+ "gender": "other",
+ "birthdate": "1990-01-01",
+ "email": "user@example.com",
+ },
+ "location": {"address1": "123 Main St", "city": "Default City", "province": "CA", "country": "USA"},
+ "languages": [{"language": "English", "proficiency": "Advanced"}],
+ },
+ "persona": "Friendly and helpful user seeking assistance with general topics.",
+ "supplementary_profile": "Default user profile for simulation.",
+ },
+ "context": "some context",
+}
diff --git a/src/strands_evals/simulation/prompt_templates/__init__.py b/src/strands_evals/simulation/prompt_templates/__init__.py
new file mode 100644
index 0000000..0d0771d
--- /dev/null
+++ b/src/strands_evals/simulation/prompt_templates/__init__.py
@@ -0,0 +1,11 @@
+"""Prompt templates for actor simulation."""
+
+from .actor_profile_extraction import ACTOR_PROFILE_PROMPT_TEMPLATE
+from .actor_system_prompt import DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE
+from .goal_completion import GOAL_COMPLETION_PROMPT
+
+__all__ = [
+ "ACTOR_PROFILE_PROMPT_TEMPLATE",
+ "DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE",
+ "GOAL_COMPLETION_PROMPT",
+]
diff --git a/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py b/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py
new file mode 100644
index 0000000..79623cd
--- /dev/null
+++ b/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py
@@ -0,0 +1,25 @@
+"""
+Prompt template for actor profile generation.
+
+This module contains the prompt template used to generate realistic actor profiles
+from scenario information for conversation simulation.
+"""
+
+from textwrap import dedent
+
+ACTOR_PROFILE_PROMPT_TEMPLATE = dedent("""Generate exactly 1 realistic actor profile for the following task:
+
+Actor's Initial Query: {initial_query}
+Tasks Description: {task_description}
+
+Generate a complete actor profile with the following structure:
+1. Traits: Key traits (as key-value pairs)
+2. Context: Background context (as a paragraph in 2-3 sentences)
+3. Actor Goal: What the actor ultimately wants to achieve in this interaction - should be
+ specific, actionable, and written from the actor's perspective
+
+IMPORTANT: Return JSON in the following format! IT MUST HAVE THE EXACT STRUCTURE YOU SEE HERE WITH EXACTLY THESE KEYS.
+
+{example}
+
+Be specific and realistic.""")
diff --git a/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py b/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py
new file mode 100644
index 0000000..2b863fd
--- /dev/null
+++ b/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py
@@ -0,0 +1,64 @@
+"""
+Default system prompt for actor simulation.
+
+This module contains the default system prompt that configures the actor's behavior,
+communication style, and response protocols for realistic conversation simulation.
+"""
+
+from textwrap import dedent
+
+DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE = dedent("""## User Simulation
+
+Core Identity:
+- You are simulating a user seeking assistance from an AI assistant
+- You speak in first person only
+- You strictly follow your defined User Goal and User Profile throughout the conversation
+
+## User Profile
+{actor_profile}
+
+
+Response Protocols:
+ When assistant requests information:
+ - Provide brief, specific information
+ - Maximum 2-3 sentences
+
+ When assistant provides solutions/answers:
+    - Ask follow-ups, seek clarification, or express satisfaction. Do not deviate from the User Goal.
+ - While following up, do not increase the conversation scope beyond your User Goal.
+
+Communication Rules:
+1. STRICT maximum response length: 2-3 sentences
+2. You are seeking help, NOT providing help - never give solutions!
+3. Maintain your user profile and expertise level consistently
+4. Express more of your user profile - let your background, expertise level, and personality
+ shine through in your responses
+5. Don't break character by mentioning "assistant" or "AI" explicitly
+6. Address AI assistant responses in second person ("Your suggestion..." not "The assistant's suggestion...")
+7. Do not explicitly mention conversation redirection
+8. Never include meta-references or self-instructions in your responses. These reveal you
+   are a simulator and are not how a real human would communicate. Don't write phrases like:
+ - I need to respond as the user would ...
+ - As the simulated user, I should ...
+ - Here's how the user might respond ...
+ - Based on my user goal, I need to ...
+9. Use the Exit Conditions strictly to stick to User Goal.
+10. Use all relevant tools first to ground your responses, and then respond
+
+Exit Conditions:
+1. Use get_conversation_goal_completion tool to check if your User Goal is met. When your User Goal is met:
+ - Just generate "" to terminate conversation
+2. If conversation becomes unproductive or unsafe:
+ - Naturally steer back towards your User Goal
+ - If this becomes impossible, just generate: "" to terminate conversation
+
+CRITICAL BEHAVIORAL CONSTRAINTS:
+- You are ONLY a user seeking assistance, NEVER the one providing assistance.
+- NEVER generate comprehensive responses, detailed plans, or extensive information.
+- NEVER solve problems yourself - that's the assistant's job. Under no circumstances,
+ you can use your tools to solve your user goal/sub goals.
+- If you find yourself writing more than 3 sentences, you're doing it wrong.
+- Generate only "" to terminate conversation
+
+Response Format:
+Generate ONLY the next SHORT message (1-3 sentences). No explanations, no solutions, no comprehensive information.""")
diff --git a/src/strands_evals/simulation/prompt_templates/goal_completion.py b/src/strands_evals/simulation/prompt_templates/goal_completion.py
new file mode 100644
index 0000000..d27871c
--- /dev/null
+++ b/src/strands_evals/simulation/prompt_templates/goal_completion.py
@@ -0,0 +1,27 @@
+"""
+Goal completion assessment prompt template for actor simulation.
+
+This module contains the prompt template used to evaluate whether a conversation
+has successfully achieved the actor's initial goals using a 3-point assessment scale.
+"""
+
+from textwrap import dedent
+
+GOAL_COMPLETION_PROMPT = dedent(
+ """Please evaluate the following conversation against its intended goals using this
+3-point assessment scale:
+
+1 = Does not meet the goal at all
+2 = Partially meets the goal with significant gaps
+3 = Fully meets the goal
+
+Initial Goal:
+{initial_goal}
+
+Conversation to evaluate:
+{conversation}
+
+Please provide:
+- A score (1-3)
+- Brief one line justification"""
+)
diff --git a/src/strands_evals/simulation/tools/__init__.py b/src/strands_evals/simulation/tools/__init__.py
new file mode 100644
index 0000000..6d0145b
--- /dev/null
+++ b/src/strands_evals/simulation/tools/__init__.py
@@ -0,0 +1,5 @@
+"""Tools for actor simulation."""
+
+from .goal_completion import get_conversation_goal_completion
+
+__all__ = ["get_conversation_goal_completion"]
diff --git a/src/strands_evals/simulation/tools/goal_completion.py b/src/strands_evals/simulation/tools/goal_completion.py
new file mode 100644
index 0000000..f18abb5
--- /dev/null
+++ b/src/strands_evals/simulation/tools/goal_completion.py
@@ -0,0 +1,93 @@
+import logging
+
+from strands import Agent, tool
+from typing_extensions import Any
+
+from strands_evals.simulation.prompt_templates.goal_completion import GOAL_COMPLETION_PROMPT
+
+logger = logging.getLogger(__name__)
+
+
+@tool
+def get_conversation_goal_completion(initial_goal: str, conversation: list[dict[str, str]]) -> str:
+ """
+ Evaluate conversation goal completion using a 3-point assessment scale.
+
+ Analyzes the conversation against the actor's initial goal and provides a score
+ with justification.
+
+ Args:
+ initial_goal: The actor's original goal or objective.
+ conversation: List of conversation turns, each with 'role' and 'content' keys.
+
+ Returns:
+ Assessment string with score (1-3) and brief justification.
+
+ Raises:
+ ValueError: If the conversation format is invalid.
+ """
+ # Format conversation for the prompt
+ conversation_text = _format_conversation_for_assessment(conversation)
+
+ # Create the assessment prompt
+ prompt = GOAL_COMPLETION_PROMPT.format(initial_goal=initial_goal, conversation=conversation_text)
+
+ goal_completion_agent = Agent(callback_handler=None)
+ response = goal_completion_agent(prompt)
+ logger.info("Successfully completed goal completion assessment")
+ return str(response)
+
+
+def _format_conversation_for_assessment(conversation: list[dict[str, Any]]) -> str:
+ """
+ Format conversation history for goal completion assessment.
+
+ Args:
+ conversation: List of conversation turns with 'role' and 'content' keys.
+ Content can be either a string or a list of content blocks.
+
+ Returns:
+ Formatted conversation string with each turn on a separate line.
+
+ Raises:
+ ValueError: If conversation format is invalid.
+ """
+ try:
+ formatted_turns = []
+
+ for i, turn in enumerate(conversation):
+ if not isinstance(turn, dict):
+ raise ValueError(f"Conversation turn {i} must be a dictionary")
+
+ role = turn.get("role", "").strip()
+ content_raw = turn.get("content", "")
+
+ # Handle both string format and list of content blocks
+ if isinstance(content_raw, str):
+ content = content_raw.strip()
+ elif isinstance(content_raw, list):
+ content_parts = []
+ for block in content_raw:
+ if isinstance(block, dict) and "text" in block:
+ content_parts.append(block["text"])
+ content = " ".join(content_parts).strip()
+ else:
+ logger.warning(f"Skipping conversation turn {i} with invalid content type: {type(content_raw)}")
+ continue
+
+ if not role or not content:
+ logger.warning(f"Skipping conversation turn {i} with missing role or content")
+ continue
+
+ formatted_turn = f"{role.upper()}: {content}"
+ formatted_turns.append(formatted_turn)
+
+ if not formatted_turns:
+ raise ValueError("No valid conversation turns found")
+
+ return "\n\n".join(formatted_turns)
+
+ except ValueError:
+ raise
+ except Exception as e:
+ raise ValueError("Error formatting conversation") from e
diff --git a/src/strands_evals/types/__init__.py b/src/strands_evals/types/__init__.py
index 77f81bf..60b322c 100644
--- a/src/strands_evals/types/__init__.py
+++ b/src/strands_evals/types/__init__.py
@@ -1,3 +1,11 @@
from .evaluation import EvaluationData, EvaluationOutput, Interaction, TaskOutput
+from .simulation import ActorProfile, ActorResponse
-__all__ = ["Interaction", "TaskOutput", "EvaluationData", "EvaluationOutput"]
+__all__ = [
+ "Interaction",
+ "TaskOutput",
+ "EvaluationData",
+ "EvaluationOutput",
+ "ActorProfile",
+ "ActorResponse",
+]
diff --git a/src/strands_evals/types/simulation/__init__.py b/src/strands_evals/types/simulation/__init__.py
new file mode 100644
index 0000000..13a94b0
--- /dev/null
+++ b/src/strands_evals/types/simulation/__init__.py
@@ -0,0 +1,5 @@
+"""Data models for actor simulation."""
+
+from .actor import ActorProfile, ActorResponse
+
+__all__ = ["ActorProfile", "ActorResponse"]
diff --git a/src/strands_evals/types/simulation/actor.py b/src/strands_evals/types/simulation/actor.py
new file mode 100644
index 0000000..d30be94
--- /dev/null
+++ b/src/strands_evals/types/simulation/actor.py
@@ -0,0 +1,34 @@
+from pydantic import BaseModel, Field
+from typing_extensions import Any
+
+
+class ActorProfile(BaseModel):
+ """
+ Profile for actor simulation.
+
+ Attributes:
+ traits: Dictionary of actor characteristics and attributes.
+ context: Supplementary background information about the actor.
+ actor_goal: What the actor ultimately wants to achieve in the interaction.
+ """
+
+ traits: dict[str, Any] = Field(..., description="Actor traits for simulation")
+ context: str = Field(..., description="Supplementary actor background details")
+ actor_goal: str = Field(
+ ...,
+ description="What the actor ultimately wants to achieve in this interaction - "
+ "should be specific, actionable, and written from the actor's perspective",
+ )
+
+
+class ActorResponse(BaseModel):
+ """
+ Structured response from an actor.
+
+ Attributes:
+ reasoning: Internal reasoning process for the response.
+ message: The actual message content from the actor.
+ """
+
+ reasoning: str = Field(..., description="Reasoning for the actor's response")
+ message: str = Field(..., description="Message from the actor")
diff --git a/tests/strands_evals/simulation/__init__.py b/tests/strands_evals/simulation/__init__.py
new file mode 100644
index 0000000..9ad0280
--- /dev/null
+++ b/tests/strands_evals/simulation/__init__.py
@@ -0,0 +1 @@
+"""Tests for actor simulation module."""
diff --git a/tests/strands_evals/simulation/test_actor_simulator.py b/tests/strands_evals/simulation/test_actor_simulator.py
new file mode 100644
index 0000000..c491a6f
--- /dev/null
+++ b/tests/strands_evals/simulation/test_actor_simulator.py
@@ -0,0 +1,213 @@
+"""Tests for ActorSimulator class."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+from strands.agent.agent_result import AgentResult
+
+from strands_evals import Case
+from strands_evals.simulation import ActorSimulator
+from strands_evals.types.simulation import ActorProfile, ActorResponse
+
+
+@pytest.fixture
+def sample_actor_profile():
+ """Fixture providing a sample actor profile."""
+ return ActorProfile(
+ traits={
+ "expertise_level": "beginner",
+ "communication_style": "casual",
+ "patience_level": "high",
+ },
+ context="A beginner user learning about travel planning.",
+ actor_goal="Book a complete trip to Tokyo including flights and hotel.",
+ )
+
+
+@pytest.fixture
+def sample_case():
+ """Fixture providing a sample test case."""
+ return Case(
+ input="I want to plan a trip to Tokyo",
+ metadata={"task_description": "Complete travel package arranged"},
+ )
+
+
+def test_actor_simulator_init(sample_actor_profile):
+ """Test ActorSimulator initialization with profile."""
+ simulator = ActorSimulator(
+ actor_profile=sample_actor_profile,
+ initial_query="Hello, I need help",
+ system_prompt_template="Test prompt: {actor_profile}",
+ tools=None,
+ model=None,
+ )
+
+ assert simulator.actor_profile == sample_actor_profile
+ assert simulator.initial_query == "Hello, I need help"
+ assert simulator.agent is not None
+ assert len(simulator.conversation_history) == 2 # greeting + initial query
+
+
+def test_initialize_conversation(sample_actor_profile):
+ """Test conversation initialization creates greeting and initial query."""
+ simulator = ActorSimulator(
+ actor_profile=sample_actor_profile,
+ initial_query="I need help with travel",
+ system_prompt_template="Test: {actor_profile}",
+ )
+
+ history = simulator.conversation_history
+ assert len(history) == 2
+ assert history[0]["role"] == "user"
+ assert any(greeting in history[0]["content"][0]["text"] for greeting in ActorSimulator.INITIAL_GREETINGS)
+ assert history[1]["role"] == "assistant"
+ assert history[1]["content"][0]["text"] == "I need help with travel"
+
+
+@patch("strands_evals.simulation.actor_simulator.Agent")
+def test_from_case_for_user_simulator(mock_agent_class, sample_case):
+ """Test factory method creates simulator from case."""
+ # Mock the profile generation agent
+ mock_profile_agent = MagicMock()
+ mock_profile = ActorProfile(
+ traits={"test": "trait"},
+ context="Test context",
+ actor_goal="Test goal",
+ )
+ mock_result = MagicMock()
+ mock_result.structured_output = mock_profile
+ mock_profile_agent.return_value = mock_result
+
+ # Mock the main simulator agent
+ mock_simulator_agent = MagicMock()
+
+ # Configure mock to return different instances
+ mock_agent_class.side_effect = [mock_profile_agent, mock_simulator_agent]
+
+ simulator = ActorSimulator.from_case_for_user_simulator(case=sample_case)
+
+ assert simulator.actor_profile == mock_profile
+ assert simulator.initial_query == sample_case.input
+ assert mock_agent_class.call_count == 2 # Once for profile gen, once for simulator
+
+
+@patch("strands_evals.simulation.actor_simulator.Agent")
+def test_generate_profile_from_case(mock_agent_class, sample_case):
+ """Test profile generation from case."""
+ mock_agent = MagicMock()
+ mock_profile = ActorProfile(
+ traits={"generated": "trait"},
+ context="Generated context",
+ actor_goal="Generated goal",
+ )
+ mock_result = MagicMock()
+ mock_result.structured_output = mock_profile
+ mock_agent.return_value = mock_result
+ mock_agent_class.return_value = mock_agent
+
+ profile = ActorSimulator._generate_profile_from_case(sample_case)
+
+ assert profile == mock_profile
+ assert mock_agent.called
+ # Verify structured_output_model was passed
+ call_args = mock_agent.call_args
+ assert call_args[1]["structured_output_model"] == ActorProfile
+
+
+def test_act_generates_response(sample_actor_profile):
+ """Test act method generates actor response."""
+ simulator = ActorSimulator(
+ actor_profile=sample_actor_profile,
+ initial_query="Hello",
+ system_prompt_template="Test: {actor_profile}",
+ )
+
+ # Mock the agent's response
+ mock_response = MagicMock(spec=AgentResult)
+ mock_actor_response = ActorResponse(
+ reasoning="Test reasoning",
+ message="Test response message",
+ )
+ mock_response.structured_output = mock_actor_response
+ simulator.agent = MagicMock(return_value=mock_response)
+
+ result = simulator.act("What can I help you with?")
+
+ assert result == mock_response
+ assert result.structured_output.message == "Test response message"
+ simulator.agent.assert_called_once()
+
+
+def test_act_uses_structured_output(sample_actor_profile):
+ """Test act method requests structured output."""
+ simulator = ActorSimulator(
+ actor_profile=sample_actor_profile,
+ initial_query="Hello",
+ system_prompt_template="Test: {actor_profile}",
+ )
+
+ mock_response = MagicMock(spec=AgentResult)
+ mock_actor_response = ActorResponse(reasoning="Test", message="Test message")
+ mock_response.structured_output = mock_actor_response
+ simulator.agent = MagicMock(return_value=mock_response)
+
+ simulator.act("Test message")
+
+ # Verify structured_output_model parameter
+ call_kwargs = simulator.agent.call_args[1]
+ assert call_kwargs["structured_output_model"] == ActorResponse
+
+
+def test_has_next_returns_true_initially(sample_actor_profile):
+ """Test has_next returns True before any turns."""
+ simulator = ActorSimulator(
+ actor_profile=sample_actor_profile,
+ initial_query="Hello",
+ system_prompt_template="Test: {actor_profile}",
+ )
+
+ assert simulator.has_next() is True
+
+
+def test_has_next_respects_max_turns(sample_actor_profile):
+ """Test has_next returns False after max_turns reached."""
+ simulator = ActorSimulator(
+ actor_profile=sample_actor_profile,
+ initial_query="Hello",
+ system_prompt_template="Test: {actor_profile}",
+ max_turns=3,
+ )
+
+ # Mock responses
+ mock_response = MagicMock(spec=AgentResult)
+ mock_actor_response = ActorResponse(reasoning="Test", message="Continue")
+ mock_response.structured_output = mock_actor_response
+ simulator.agent = MagicMock(return_value=mock_response)
+
+ # Simulate 3 turns with max_turns=3
+ for _ in range(3):
+ assert simulator.has_next() is True
+ simulator.act("Test message")
+
+ # After 3 turns, should return False
+ assert simulator.has_next() is False
+
+
+def test_has_next_detects_stop_token(sample_actor_profile):
+ """Test has_next returns False when stop token is present."""
+ simulator = ActorSimulator(
+ actor_profile=sample_actor_profile,
+ initial_query="Hello",
+ system_prompt_template="Test: {actor_profile}",
+ )
+
+ # Mock response with stop token
+ mock_response = MagicMock(spec=AgentResult)
+ mock_actor_response = ActorResponse(reasoning="Done", message="Thanks! ")
+ mock_response.structured_output = mock_actor_response
+ simulator.agent = MagicMock(return_value=mock_response)
+
+ # After act with stop token, has_next should return False
+ simulator.act("Test message")
+ assert simulator.has_next() is False
diff --git a/tests/strands_evals/simulation/test_goal_completion.py b/tests/strands_evals/simulation/test_goal_completion.py
new file mode 100644
index 0000000..5a7a99a
--- /dev/null
+++ b/tests/strands_evals/simulation/test_goal_completion.py
@@ -0,0 +1,196 @@
+"""Tests for goal completion assessment tool."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from strands_evals.simulation.tools.goal_completion import (
+ _format_conversation_for_assessment,
+ get_conversation_goal_completion,
+)
+
+
+@pytest.fixture
+def sample_conversation():
+ """Fixture providing a sample conversation."""
+ return [
+ {"role": "user", "content": "I want to book a flight to Tokyo"},
+ {"role": "assistant", "content": "I can help with that. What dates?"},
+ {"role": "user", "content": "Next month"},
+ ]
+
+
+@pytest.fixture
+def sample_conversation_with_blocks():
+ """Fixture providing conversation with content blocks."""
+ return [
+ {"role": "user", "content": [{"text": "Hello"}]},
+ {"role": "assistant", "content": [{"text": "Hi there"}, {"text": "How can I help?"}]},
+ ]
+
+
+def test_format_conversation_simple(sample_conversation):
+ """Test formatting simple conversation."""
+ result = _format_conversation_for_assessment(sample_conversation)
+
+ assert "USER: I want to book a flight to Tokyo" in result
+ assert "ASSISTANT: I can help with that. What dates?" in result
+ assert "USER: Next month" in result
+ assert result.count("\n\n") == 2 # Two separators for three turns
+
+
+def test_format_conversation_with_content_blocks(sample_conversation_with_blocks):
+ """Test formatting conversation with content blocks."""
+ result = _format_conversation_for_assessment(sample_conversation_with_blocks)
+
+ assert "USER: Hello" in result
+ assert "ASSISTANT: Hi there How can I help?" in result
+
+
+def test_format_conversation_mixed_formats():
+ """Test formatting conversation with mixed string and block formats."""
+ conversation = [
+ {"role": "user", "content": "String content"},
+ {"role": "assistant", "content": [{"text": "Block content"}]},
+ ]
+
+ result = _format_conversation_for_assessment(conversation)
+
+ assert "USER: String content" in result
+ assert "ASSISTANT: Block content" in result
+
+
+def test_format_conversation_empty_list():
+ """Test formatting empty conversation raises error."""
+ with pytest.raises(ValueError, match="No valid conversation turns found"):
+ _format_conversation_for_assessment([])
+
+
+def test_format_conversation_invalid_turn_type():
+ """Test formatting conversation with invalid turn type."""
+ with pytest.raises(ValueError, match="must be a dictionary"):
+ _format_conversation_for_assessment(["not a dict"])
+
+
+def test_format_conversation_missing_role():
+ """Test formatting conversation skips turns with missing role."""
+ conversation = [
+ {"role": "user", "content": "Valid turn"},
+ {"content": "Missing role"},
+ {"role": "assistant", "content": "Another valid turn"},
+ ]
+
+ result = _format_conversation_for_assessment(conversation)
+
+ assert "USER: Valid turn" in result
+ assert "ASSISTANT: Another valid turn" in result
+ assert "Missing role" not in result
+
+
+def test_format_conversation_missing_content():
+ """Test formatting conversation skips turns with missing content."""
+ conversation = [
+ {"role": "user", "content": "Valid turn"},
+ {"role": "assistant"},
+ {"role": "user", "content": "Another valid turn"},
+ ]
+
+ result = _format_conversation_for_assessment(conversation)
+
+ assert "USER: Valid turn" in result
+ assert "USER: Another valid turn" in result
+ assert result.count("ASSISTANT") == 0
+
+
+def test_format_conversation_empty_strings():
+ """Test formatting conversation skips turns with empty strings."""
+ conversation = [
+ {"role": "user", "content": "Valid turn"},
+ {"role": "", "content": "Empty role"},
+ {"role": "assistant", "content": ""},
+ ]
+
+ result = _format_conversation_for_assessment(conversation)
+
+ assert "USER: Valid turn" in result
+ assert "Empty role" not in result
+
+
+def test_format_conversation_whitespace_handling():
+ """Test formatting conversation strips whitespace."""
+ conversation = [
+ {"role": " user ", "content": " Content with spaces "},
+ ]
+
+ result = _format_conversation_for_assessment(conversation)
+
+ assert "USER: Content with spaces" in result
+
+
+def test_format_conversation_invalid_content_type():
+ """Test formatting conversation with invalid content type logs warning."""
+ conversation = [
+ {"role": "user", "content": "Valid"},
+ {"role": "assistant", "content": 123}, # Invalid type
+ ]
+
+ # Should not raise, but skip invalid turn
+ result = _format_conversation_for_assessment(conversation)
+ assert "USER: Valid" in result
+
+
+@patch("strands_evals.simulation.tools.goal_completion.Agent")
+def test_get_conversation_goal_completion(mock_agent_class, sample_conversation):
+ """Test goal completion assessment."""
+ mock_agent = MagicMock()
+ mock_response = MagicMock()
+ mock_response.__str__ = MagicMock(return_value="Score: 3 - Goal fully met")
+ mock_agent.return_value = mock_response
+ mock_agent_class.return_value = mock_agent
+
+ result = get_conversation_goal_completion(
+ initial_goal="Book a flight to Tokyo",
+ conversation=sample_conversation,
+ )
+
+ assert result == "Score: 3 - Goal fully met"
+ mock_agent.assert_called_once()
+
+
+@patch("strands_evals.simulation.tools.goal_completion.Agent")
+def test_get_conversation_goal_completion_formats_prompt(mock_agent_class, sample_conversation):
+ """Test goal completion formats prompt correctly."""
+ mock_agent = MagicMock()
+ mock_response = MagicMock()
+ mock_response.__str__ = MagicMock(return_value="Assessment")
+ mock_agent.return_value = mock_response
+ mock_agent_class.return_value = mock_agent
+
+ get_conversation_goal_completion(
+ initial_goal="Test goal",
+ conversation=sample_conversation,
+ )
+
+ # Verify prompt contains goal and conversation
+ call_args = mock_agent.call_args[0][0]
+ assert "Test goal" in call_args
+ assert "USER:" in call_args
+ assert "ASSISTANT:" in call_args
+
+
+def test_get_conversation_goal_completion_empty_conversation():
+ """Test goal completion with empty conversation raises error."""
+ with pytest.raises(ValueError):
+ get_conversation_goal_completion(
+ initial_goal="Test goal",
+ conversation=[],
+ )
+
+
+def test_get_conversation_goal_completion_invalid_conversation():
+ """Test goal completion with invalid conversation raises error."""
+ with pytest.raises(ValueError):
+ get_conversation_goal_completion(
+ initial_goal="Test goal",
+ conversation=["not", "valid"],
+ )
diff --git a/tests/strands_evals/test_dataset.py b/tests/strands_evals/test_dataset.py
index c80f61c..609b730 100644
--- a/tests/strands_evals/test_dataset.py
+++ b/tests/strands_evals/test_dataset.py
@@ -39,8 +39,8 @@ def mock_span():
def simple_task():
"""Fixture that provides a simple echo task function"""
- def task(input_val):
- return input_val
+ def task(case):
+ return case.input
return task
@@ -124,8 +124,8 @@ def test_dataset__run_task_simple_output(mock_evaluator):
case = Case(name="test", input="hello", expected_output="world")
dataset = Dataset(cases=[case], evaluator=mock_evaluator)
- def simple_task(input_val):
- return f"response to {input_val}"
+ def simple_task(c):
+ return f"response to {c.input}"
result = dataset._run_task(simple_task, case)
@@ -145,8 +145,8 @@ def test_dataset__run_task_dict_output(mock_evaluator):
case = Case(name="test", input="hello", expected_output="world")
dataset = Dataset(cases=[case], evaluator=mock_evaluator)
- def dict_task(input_val):
- return {"output": f"response to {input_val}", "trajectory": ["step1", "step2"]}
+ def dict_task(c):
+ return {"output": f"response to {c.input}", "trajectory": ["step1", "step2"]}
result = dataset._run_task(dict_task, case)
@@ -160,9 +160,9 @@ def test_dataset_run_task_dict_output_with_interactions(mock_evaluator):
case = Case(name="test", input="hello", expected_output="world", expected_interactions=interactions)
dataset = Dataset(cases=[case], evaluator=mock_evaluator)
- def dict_task(input_val):
+ def dict_task(c):
return {
- "output": f"response to {input_val}",
+ "output": f"response to {c.input}",
"trajectory": ["step1", "step2"],
"interactions": interactions,
}
@@ -182,8 +182,8 @@ def test_dataset__run_task_dict_output_with_input_update(mock_evaluator):
case = Case(name="test", input="original_input", expected_output="world")
dataset = Dataset(cases=[case], evaluator=mock_evaluator)
- def task_with_input_update(input_val):
- return {"output": f"response to {input_val}", "input": "updated_input", "trajectory": ["step1"]}
+ def task_with_input_update(c):
+ return {"output": f"response to {c.input}", "input": "updated_input", "trajectory": ["step1"]}
result = dataset._run_task(task_with_input_update, case)
@@ -198,8 +198,8 @@ async def test_dataset__run_task_async_with_input_update():
case = Case(name="test", input="original_input", expected_output="world")
dataset = Dataset(cases=[case], evaluator=MockEvaluator())
- def task_with_input_update(input_val):
- return {"output": f"response to {input_val}", "input": "async_updated_input"}
+ def task_with_input_update(c):
+ return {"output": f"response to {c.input}", "input": "async_updated_input"}
result = await dataset._run_task_async(task_with_input_update, case)
@@ -212,8 +212,8 @@ def test_dataset__run_task_async_function_raises_error(mock_evaluator):
case = Case(name="test", input="hello", expected_output="world")
dataset = Dataset(cases=[case], evaluator=mock_evaluator)
- async def async_task(input_val):
- return f"response to {input_val}"
+ async def async_task(c):
+ return f"response to {c.input}"
with pytest.raises(ValueError, match="Async task is not supported. Please use run_evaluations_async instead."):
dataset._run_task(async_task, case)
@@ -223,8 +223,8 @@ async def async_task(input_val):
async def test_dataset__run_task_async_with_sync_task():
"""Test _run_task_async with a synchronous task function"""
- def sync_task(input_val):
- return input_val
+ def sync_task(c):
+ return c.input
case = Case(name="test", input="hello", expected_output="world")
dataset = Dataset(cases=[case], evaluator=MockEvaluator())
@@ -238,8 +238,8 @@ def sync_task(input_val):
async def test_dataset__run_task_async_with_async_task():
"""Test _run_task_async with an asynchronous task function"""
- async def async_task(input_val):
- return input_val
+ async def async_task(c):
+ return c.input
case = Case(name="test", input="hello", expected_output="world")
dataset = Dataset(cases=[case], evaluator=MockEvaluator())
@@ -257,8 +257,8 @@ def test_dataset_run_evaluations(mock_evaluator):
]
dataset = Dataset(cases=cases, evaluator=mock_evaluator)
- def echo_task(input_val):
- return input_val
+ def echo_task(c):
+ return c.input
report = dataset.run_evaluations(echo_task)
@@ -600,8 +600,8 @@ def test_dataset_from_dict_InteractionsEvaluator_defaults():
async def test_dataset_run_evaluations_async():
"""Test run_evaluations_async with a simple task"""
- def task(input_str):
- return input_str
+ def task(c):
+ return c.input
case = Case(name="test", input="hello", expected_output="hello")
case1 = Case(name="test1", input="world", expected_output="world")
@@ -619,9 +619,9 @@ def task(input_str):
async def test_dataset_run_evaluations_async_with_async_task():
"""Test run_evaluations_async with an async task"""
- async def async_task(input_str):
+ async def async_task(c):
await asyncio.sleep(0.01)
- return input_str
+ return c.input
case = Case(name="test", input="hello", expected_output="hello")
case1 = Case(name="test1", input="world", expected_output="world")
@@ -638,10 +638,10 @@ async def async_task(input_str):
async def test_datset_run_evaluations_async_with_errors():
"""Test run_evaluations_async handles errors gracefully"""
- def failing_task(input_str):
- if input_str == "hello":
+ def failing_task(c):
+ if c.input == "hello":
raise ValueError("Test error")
- return input_str
+ return c.input
case = Case(name="test", input="hello", expected_output="hello")
case1 = Case(name="test1", input="world", expected_output="world")
@@ -660,8 +660,8 @@ def test_dataset_run_evaluations_with_interactions():
case = Case(name="test", input="hello", expected_output="world", expected_interactions=interactions)
dataset = Dataset(cases=[case], evaluator=MockEvaluator())
- def task_with_interactions(input_val):
- return {"output": input_val, "interactions": interactions}
+ def task_with_interactions(c):
+ return {"output": c.input, "interactions": interactions}
report = dataset.run_evaluations(task_with_interactions)
@@ -744,8 +744,8 @@ def test_dataset_run_evaluations_with_trajectory_in_span(mock_span):
with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span):
- def task_with_trajectory(input_val):
- return {"output": input_val, "trajectory": ["step1", "step2"]}
+ def task_with_trajectory(c):
+ return {"output": c.input, "trajectory": ["step1", "step2"]}
dataset.run_evaluations(task_with_trajectory)
@@ -766,8 +766,8 @@ def test_dataset_run_evaluations_with_interactions_in_span(mock_span):
with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span):
- def task_with_interactions(input_val):
- return {"output": input_val, "interactions": interactions}
+ def task_with_interactions(c):
+ return {"output": c.input, "interactions": interactions}
dataset.run_evaluations(task_with_interactions)
@@ -788,7 +788,7 @@ def test_dataset_run_evaluations_records_exception_in_span(mock_span):
with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span):
- def failing_task(input_val):
+ def failing_task(c):
raise ValueError("Test error")
dataset.run_evaluations(failing_task)
@@ -819,8 +819,8 @@ async def test_dataset_run_evaluations_async_creates_spans(mock_span):
with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span) as mock_start_span:
- async def async_task(input_val):
- return input_val
+ async def async_task(c):
+ return c.input
await dataset.run_evaluations_async(async_task)
@@ -842,7 +842,7 @@ async def test_dataset_run_evaluations_async_records_exception(mock_span):
with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span):
- async def failing_async_task(input_val):
+ async def failing_async_task(c):
raise ValueError("Async test error")
await dataset.run_evaluations_async(failing_async_task)
@@ -860,8 +860,8 @@ async def test_dataset_run_evaluations_async_with_dict_output(mock_span):
with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span):
- async def async_task_with_dict(input_val):
- return {"output": input_val, "trajectory": ["step1"], "interactions": interactions}
+ async def async_task_with_dict(c):
+ return {"output": c.input, "trajectory": ["step1"], "interactions": interactions}
await dataset.run_evaluations_async(async_task_with_dict)
diff --git a/tests/test_integration.py b/tests/test_integration.py
index b1f61ed..ee09b44 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -70,8 +70,8 @@ def test_integration_dataset_with_simple_evaluator(cases):
"""Test complete workflow: Dataset + Cases + SimpleEvaluator + EvaluationReport"""
dataset = Dataset(cases=cases, evaluator=SimpleEvaluator())
- def echo_task(input_val):
- return input_val
+ def echo_task(case):
+ return case.input
report = dataset.run_evaluations(echo_task)
@@ -89,9 +89,9 @@ def test_integration_dataset_with_dict_output_task(cases):
"""Test Dataset with task returning dictionary output"""
dataset = Dataset(cases=cases, evaluator=SimpleEvaluator())
- def dict_task(input_val):
+ def dict_task(case):
return TaskOutput(
- output=input_val,
+ output=case.input,
trajectory=["step1", "step2"],
interactions=[Interaction(node_name="agent1", dependencies=[], messages=["processing hello"])],
)
@@ -115,8 +115,8 @@ def test_integration_dataset_with_output_evaluator(mock_agent_class, cases, mock
output_evaluator = OutputEvaluator(rubric="Test if outputs match exactly")
dataset = Dataset(cases=cases, evaluator=output_evaluator)
- def simple_task(input_val):
- return f"processed_{input_val}"
+ def simple_task(case):
+ return f"processed_{case.input}"
report = dataset.run_evaluations(simple_task)
@@ -131,8 +131,8 @@ def test_integration_evaluation_report_display(cases):
"""Test that EvaluationReport display works with real data"""
dataset = Dataset(cases=cases, evaluator=SimpleEvaluator())
- def mixed_task(input_val):
- if input_val == "hello":
+ def mixed_task(case):
+ if case.input == "hello":
return "hello"
return "different"
@@ -156,8 +156,8 @@ def test_integration_dataset_with_trajectory_evaluator(mock_agent_class, cases,
trajectory_evaluator = TrajectoryEvaluator(rubric="Test if trajectories match exactly")
dataset = Dataset(cases=cases, evaluator=trajectory_evaluator)
- def simple_task(input_val):
- return {"output": f"processed_{input_val}", "trajectory": ["step1", "step2"]}
+ def simple_task(case):
+ return {"output": f"processed_{case.input}", "trajectory": ["step1", "step2"]}
report = dataset.run_evaluations(simple_task)
@@ -175,8 +175,8 @@ def test_integration_dataset_with_list_inputs():
]
dataset = Dataset(cases=cases, evaluator=SimpleEvaluator())
- def list_task(input_val):
- return input_val
+ def list_task(case):
+ return case.input
report = dataset.run_evaluations(list_task)
@@ -195,8 +195,8 @@ async def test_integration_async_dataset_with_simple_evaluator(cases):
"""Test async workflow: Dataset + Cases + SimpleEvaluator"""
dataset = Dataset(cases=cases, evaluator=SimpleEvaluator())
- def echo_task(input_val):
- return input_val
+ def echo_task(case):
+ return case.input
report = await dataset.run_evaluations_async(echo_task)
@@ -215,9 +215,9 @@ async def test_integration_async_dataset_with_async_task(cases):
"""Test async workflow with async task function"""
dataset = Dataset(cases=cases, evaluator=SimpleEvaluator())
- async def async_echo_task(input_val):
+ async def async_echo_task(case):
await asyncio.sleep(0.01) # Simulate async work
- return input_val
+ return case.input
report = await dataset.run_evaluations_async(async_echo_task)
@@ -240,8 +240,8 @@ async def test_integration_async_dataset_with_output_evaluator(mock_agent_class,
output_evaluator = OutputEvaluator(rubric="Test if outputs match exactly")
dataset = Dataset(cases=cases, evaluator=output_evaluator)
- def simple_task(input_val):
- return f"processed_{input_val}"
+ def simple_task(case):
+ return f"processed_{case.input}"
report = await dataset.run_evaluations_async(simple_task)
@@ -259,9 +259,9 @@ async def test_integration_async_dataset_concurrency():
dataset = Dataset(cases=many_cases, evaluator=SimpleEvaluator())
# Create a task with noticeable delay
- async def slow_task(input_val):
+ async def slow_task(case):
await asyncio.sleep(0.1) # Each task takes 0.1s
- return input_val
+ return case.input
# Time the execution
start_time = asyncio.get_event_loop().time()
@@ -285,7 +285,7 @@ def test_dataset_with_interactions_evaluator(mock_agent_class, interaction_case,
interactions_evaluator = InteractionsEvaluator(rubric="Test if interactions match expected sequence")
dataset = Dataset(cases=interaction_case, evaluator=interactions_evaluator)
- def task_with_interactions(input_val):
+ def task_with_interactions(case):
return {
"output": "world",
"interactions": [
@@ -308,7 +308,7 @@ async def test_async_dataset_with_interactions(interaction_case):
"""Test async Dataset with interactions data"""
dataset = Dataset(cases=interaction_case, evaluator=SimpleEvaluator())
- async def async_interactions_task(input_val):
+ async def async_interactions_task(case):
await asyncio.sleep(0.01)
return {
"output": "world",