From 5aa4cbb6cb3d083db8ecd626a132eb3348a3799e Mon Sep 17 00:00:00 2001 From: Jonathan Buck Date: Thu, 6 Nov 2025 09:17:46 -0800 Subject: [PATCH] feat: add ActorSimulator for multi-turn conversation evaluation Introduces ActorSimulator framework for simulating realistic actors (typically users) in multi-turn conversations with agents under test. Enables systematic evaluation of conversational agents through synthetic user interactions. Key capabilities: 1. Generic ActorSimulator class configurable with arbitrary system prompts 2. from_case_for_user_simulator() factory method to condition simulator to act as a user on the basis of a given Case 3. Automatic profile generation from test cases using LLM inference 4. Built-in goal completion assessment tool for conversation evaluation 5. Support for custom tools and behaviors Design principles: 1. Generic base class (ActorSimulator) with specialized factory methods 2. Clear separation: init() for generic construction, factory for specialization 3. 
Optional task_description in Case metadata (handles vague initial queries) --- src/__init__.py | 0 src/examples/actor_simulator_basic.py | 63 ++++ src/examples/agents_as_tools.py | 4 +- src/examples/bank_tools_trajectory.py | 4 +- .../dataset_generator/simple_dataset.py | 5 +- src/examples/evaluate_graph.py | 4 +- src/examples/evaluate_swarm.py | 4 +- src/examples/multi_shots.py | 4 +- src/examples/safety_judge_output.py | 4 +- src/examples/third_party_evaluator.py | 8 +- src/strands_evals/__init__.py | 6 +- src/strands_evals/dataset.py | 12 +- src/strands_evals/simulation/README.md | 323 ++++++++++++++++++ src/strands_evals/simulation/__init__.py | 6 + .../simulation/actor_simulator.py | 292 ++++++++++++++++ .../simulation/profiles/__init__.py | 5 + .../simulation/profiles/actor_profile.py | 26 ++ .../simulation/prompt_templates/__init__.py | 11 + .../actor_profile_extraction.py | 25 ++ .../prompt_templates/actor_system_prompt.py | 64 ++++ .../prompt_templates/goal_completion.py | 27 ++ .../simulation/tools/__init__.py | 5 + .../simulation/tools/goal_completion.py | 93 +++++ src/strands_evals/types/__init__.py | 10 +- .../types/simulation/__init__.py | 5 + src/strands_evals/types/simulation/actor.py | 34 ++ tests/strands_evals/simulation/__init__.py | 1 + .../simulation/test_actor_simulator.py | 213 ++++++++++++ .../simulation/test_goal_completion.py | 196 +++++++++++ tests/strands_evals/test_dataset.py | 78 ++--- tests/test_integration.py | 44 +-- 31 files changed, 1489 insertions(+), 87 deletions(-) create mode 100644 src/__init__.py create mode 100644 src/examples/actor_simulator_basic.py create mode 100644 src/strands_evals/simulation/README.md create mode 100644 src/strands_evals/simulation/__init__.py create mode 100644 src/strands_evals/simulation/actor_simulator.py create mode 100644 src/strands_evals/simulation/profiles/__init__.py create mode 100644 src/strands_evals/simulation/profiles/actor_profile.py create mode 100644 
src/strands_evals/simulation/prompt_templates/__init__.py create mode 100644 src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py create mode 100644 src/strands_evals/simulation/prompt_templates/actor_system_prompt.py create mode 100644 src/strands_evals/simulation/prompt_templates/goal_completion.py create mode 100644 src/strands_evals/simulation/tools/__init__.py create mode 100644 src/strands_evals/simulation/tools/goal_completion.py create mode 100644 src/strands_evals/types/simulation/__init__.py create mode 100644 src/strands_evals/types/simulation/actor.py create mode 100644 tests/strands_evals/simulation/__init__.py create mode 100644 tests/strands_evals/simulation/test_actor_simulator.py create mode 100644 tests/strands_evals/simulation/test_goal_completion.py diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/examples/actor_simulator_basic.py b/src/examples/actor_simulator_basic.py new file mode 100644 index 0000000..641e36d --- /dev/null +++ b/src/examples/actor_simulator_basic.py @@ -0,0 +1,63 @@ +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter +from strands import Agent + +from strands_evals import ActorSimulator, Case, Dataset +from strands_evals.evaluators import HelpfulnessEvaluator +from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.telemetry import StrandsEvalsTelemetry + +# ====================================== +# SETUP TELEMETRY +# ====================================== +telemetry = StrandsEvalsTelemetry() +memory_exporter = InMemorySpanExporter() +span_processor = BatchSpanProcessor(memory_exporter) +telemetry.tracer_provider.add_span_processor(span_processor) + + +# ====================================== +# SETUP AND RUN STRANDS EVAL +# ====================================== + + +def task_function(case: Case) -> dict: + # Create 
simulator + user_sim = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=3) + + # Create target agent + agent = Agent(system_prompt="You are a helpful travel assistant.", callback_handler=None) + + # Accumulate target spans across all turns + all_target_spans = [] + + user_message = case.input + while user_sim.has_next(): + # Clear before each target agent call to ensure we don't capture simulator traces. + memory_exporter.clear() + agent_response = agent(user_message) + agent_message = str(agent_response) + turn_spans = list(memory_exporter.get_finished_spans()) + all_target_spans.extend(turn_spans) + user_result = user_sim.act(agent_message) + user_message = str(user_result.structured_output.message) + + mapper = StrandsInMemorySessionMapper() + session = mapper.map_to_session(all_target_spans, session_id="test-session") + + return {"output": agent_message, "trajectory": session} + + +test_cases = [ + Case[str, str]( + name="booking-simple", + input="I need to book a flight to Paris next week", + metadata={"category": "booking", "task_description": "Flight booking confirmed"}, + ) +] + +evaluator = HelpfulnessEvaluator() +dataset = Dataset[str, str](cases=test_cases, evaluator=evaluator) + +report = dataset.run_evaluations(task_function) +report.run_display() diff --git a/src/examples/agents_as_tools.py b/src/examples/agents_as_tools.py index 3daa40a..6d0c1bd 100644 --- a/src/examples/agents_as_tools.py +++ b/src/examples/agents_as_tools.py @@ -102,7 +102,7 @@ async def async_agents_as_tools_example(): """ ### Step 1: Define task ### - def customer_support(task: str): + def customer_support(case: Case): @tool def technical_support(query: str) -> str: """Handle technical issues, bugs, and troubleshooting.""" @@ -158,7 +158,7 @@ def returns_exchanges(query: str) -> str: callback_handler=None, tools=[technical_support, billing_support, product_info, returns_exchanges], ) - response = orchestrator(task) + response = orchestrator(case.input) description 
= tools_use_extractor.extract_tools_description(orchestrator) trajectory_evaluator.update_trajectory_description(description) interaction_evaluator.update_interaction_description(description) diff --git a/src/examples/bank_tools_trajectory.py b/src/examples/bank_tools_trajectory.py index 9ad7b5a..6f52b36 100644 --- a/src/examples/bank_tools_trajectory.py +++ b/src/examples/bank_tools_trajectory.py @@ -74,7 +74,7 @@ async def async_descriptive_tools_trajectory_example(): """ ### Step 1: Define task ### - async def get_response(query: str) -> dict: + async def get_response(case: Case) -> dict: bank_prompt = ( "You are a banker, ensure that only people with sufficient balance can spend them." " Collect debt from people with negative balance." @@ -83,7 +83,7 @@ async def get_response(query: str) -> dict: agent = Agent( tools=[get_balance, modify_balance, collect_debt], system_prompt=bank_prompt, callback_handler=None ) - response = await agent.invoke_async(query) + response = await agent.invoke_async(case.input) trajectory_evaluator.update_trajectory_description(tools_use_extractor.extract_tools_description(agent)) return TaskOutput( output=str(response), trajectory=tools_use_extractor.extract_agent_tools_used_from_messages(agent.messages) diff --git a/src/examples/dataset_generator/simple_dataset.py b/src/examples/dataset_generator/simple_dataset.py index 4d1c634..7e4f96e 100644 --- a/src/examples/dataset_generator/simple_dataset.py +++ b/src/examples/dataset_generator/simple_dataset.py @@ -2,6 +2,7 @@ from strands import Agent +from strands_evals import Case from strands_evals.evaluators.output_evaluator import OutputEvaluator from strands_evals.generators.dataset_generator import DatasetGenerator @@ -21,12 +22,12 @@ async def simple_dataset_generator(): """ ### Step 1: Define task ### - async def get_response(query: str) -> str: + async def get_response(case: Case) -> str: """ Simple task example to get a response from an agent given a query. 
""" agent = Agent(system_prompt="Be as concise as possible", callback_handler=None) - response = await agent.invoke_async(query) + response = await agent.invoke_async(case.input) return str(response) # Step 2: Initialize the dataset generator for string types diff --git a/src/examples/evaluate_graph.py b/src/examples/evaluate_graph.py index 0159e42..dfc8f0d 100644 --- a/src/examples/evaluate_graph.py +++ b/src/examples/evaluate_graph.py @@ -25,7 +25,7 @@ async def async_graph_example(): """ ### Step 1: Define task ### - def research_graph(task: str): + def research_graph(case: Case): # Create specialized agents researcher = Agent(name="researcher", system_prompt="You are a research specialist...") analyst = Agent(name="analyst", system_prompt="You are a data analysis specialist...") @@ -52,7 +52,7 @@ def research_graph(task: str): # Build the graph graph = builder.build() - result = graph(task) + result = graph(case.input) interactions = graph_extractor.extract_graph_interactions(result) return {"interactions": interactions, "trajectory": [node.node_id for node in result.execution_order]} diff --git a/src/examples/evaluate_swarm.py b/src/examples/evaluate_swarm.py index a19a4fe..d0584ae 100644 --- a/src/examples/evaluate_swarm.py +++ b/src/examples/evaluate_swarm.py @@ -25,7 +25,7 @@ async def async_swarm_example(): """ ### Step 1: Define task ### - def sde_swarm(task: str): + def sde_swarm(case: Case): # Create specialized agents researcher = Agent(name="researcher", system_prompt="You are a research specialist...", callback_handler=None) coder = Agent(name="coder", system_prompt="You are a coding specialist...", callback_handler=None) @@ -45,7 +45,7 @@ def sde_swarm(task: str): repetitive_handoff_min_unique_agents=2, ) - result = swarm(task) + result = swarm(case.input) interaction_info = swarm_extractor.extract_swarm_interactions(result) return {"interactions": interaction_info, "trajectory": [node.node_id for node in result.node_history]} diff --git 
a/src/examples/multi_shots.py b/src/examples/multi_shots.py index 3278f6f..29ae7e7 100644 --- a/src/examples/multi_shots.py +++ b/src/examples/multi_shots.py @@ -24,7 +24,7 @@ async def async_multi_shots_interactions(): """ ### Step 1: Define task ### - def multi_turns_hacking(query: str) -> str: + def multi_turns_hacking(case: Case) -> str: """ Simulates a multi-turn adversarial conversation to test agent safety. @@ -38,7 +38,7 @@ def multi_turns_hacking(query: str) -> str: agent = Agent(system_prompt="Be as concise as possible", callback_handler=None) new_input = [] - agent_repsonse = query + agent_repsonse = case.input hacker_response = None interactions = [] turns = 5 diff --git a/src/examples/safety_judge_output.py b/src/examples/safety_judge_output.py index 617f1cd..9989ebd 100644 --- a/src/examples/safety_judge_output.py +++ b/src/examples/safety_judge_output.py @@ -23,12 +23,12 @@ async def async_safety_output_judge_example(): """ ### Step 1: Define task ### - async def get_response(query: str) -> str: + async def get_response(case: Case) -> str: """ Simple task example to get a response from an agent given a query. 
""" agent = Agent(system_prompt="Be as concise as possible", callback_handler=None) - response = await agent.invoke_async(query) + response = await agent.invoke_async(case.input) return str(response) ### Step 2: Create test cases ### diff --git a/src/examples/third_party_evaluator.py b/src/examples/third_party_evaluator.py index e7fd758..174238d 100644 --- a/src/examples/third_party_evaluator.py +++ b/src/examples/third_party_evaluator.py @@ -30,9 +30,9 @@ def third_party_example(): """ ### Step 1: Define task ### - def get_response(query: str) -> str: + def get_response(case: Case) -> str: agent = Agent(callback_handler=None) - return str(agent(query)) + return str(agent(case.input)) ### Step 2: Create test cases ### test_case1 = Case[str, str]( @@ -105,9 +105,9 @@ async def async_third_party_example(): """ ### Step 1: Define task ### - async def get_response(query: str) -> str: + async def get_response(case: Case) -> str: agent = Agent(system_prompt="Be as concise as possible", callback_handler=None) - response = await agent.invoke_async(query) + response = await agent.invoke_async(case.input) return str(response) ### Step 2: Create test cases ### diff --git a/src/strands_evals/__init__.py b/src/strands_evals/__init__.py index 8548f25..137906b 100644 --- a/src/strands_evals/__init__.py +++ b/src/strands_evals/__init__.py @@ -1,8 +1,9 @@ __version__ = "0.1.0" -from . import evaluators, extractors, generators, telemetry, types +from . 
import evaluators, extractors, generators, simulation, telemetry, types from .case import Case from .dataset import Dataset +from .simulation import ActorSimulator, UserSimulator from .telemetry import StrandsEvalsTelemetry, get_tracer __all__ = [ @@ -12,7 +13,10 @@ "extractors", "types", "generators", + "simulation", "telemetry", "StrandsEvalsTelemetry", "get_tracer", + "ActorSimulator", + "UserSimulator", ] diff --git a/src/strands_evals/dataset.py b/src/strands_evals/dataset.py index dbd7d71..f60eab1 100644 --- a/src/strands_evals/dataset.py +++ b/src/strands_evals/dataset.py @@ -104,7 +104,7 @@ def evaluator(self, new_evaluator: Evaluator[InputT, OutputT]): self._evaluator = new_evaluator def _run_task( - self, task: Callable[[InputT], OutputT | dict[str, Any]], case: Case[InputT, OutputT] + self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]], case: Case[InputT, OutputT] ) -> EvaluationData[InputT, OutputT]: """ Run the task with the inputs from the test case. @@ -128,7 +128,7 @@ def _run_task( expected_interactions=case.expected_interactions, metadata=case.metadata, ) - task_output = task(case.input) + task_output = task(case) if isinstance(task_output, dict): # could be evaluating the trajectory as well evaluation_context.actual_output = task_output.get("output") evaluation_context.actual_trajectory = task_output.get("trajectory") @@ -141,7 +141,7 @@ def _run_task( return evaluation_context async def _run_task_async( - self, task: Callable[[InputT], OutputT | dict[str, Any]], case: Case[InputT, OutputT] + self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]], case: Case[InputT, OutputT] ) -> EvaluationData[InputT, OutputT]: """ Run the task with the inputs from the test case asynchronously. 
@@ -167,10 +167,10 @@ async def _run_task_async( # Handle both async and sync tasks if asyncio.iscoroutinefunction(task): - task_output = await task(case.input) + task_output = await task(case) else: # Run sync function in separate thread to avoid blocking - task_output = await asyncio.to_thread(task, case.input) + task_output = await asyncio.to_thread(task, case) if isinstance(task_output, dict): evaluation_context.actual_output = task_output.get("output") @@ -277,7 +277,7 @@ async def _worker(self, queue: asyncio.Queue, task: Callable, results: list): finally: queue.task_done() - def run_evaluations(self, task: Callable[[InputT], OutputT | dict[str, Any]]) -> EvaluationReport: + def run_evaluations(self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]]) -> EvaluationReport: """ Run the evaluations for all of the test cases with the evaluator. diff --git a/src/strands_evals/simulation/README.md b/src/strands_evals/simulation/README.md new file mode 100644 index 0000000..5e4b158 --- /dev/null +++ b/src/strands_evals/simulation/README.md @@ -0,0 +1,323 @@ +# Actor Simulator + +A framework for simulating realistic multi-turn conversations with AI-powered actors for agent evaluation. + +## Overview + +ActorSimulator creates realistic actor personas that interact with agents in multi-turn conversations. It automatically generates actor profiles from test cases, maintains conversation context, and produces contextually appropriate responses aligned with the actor's goals and traits. 
+ +## Quick Start + +```python +from strands import Agent +from strands_evals import ActorSimulator, Case + +# Create agent under test +agent = Agent(system_prompt="You are a helpful travel assistant.", callback_handler=None) + +# Create test case +case = Case( + input="I want to plan a trip to Tokyo with hotel and activities", + metadata={"task_description": "Complete travel package arranged"} +) + +# Create user simulator with max_turns +user_sim = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=5) + +# Run conversation +user_message = case.input +while user_sim.has_next(): + agent_response = agent(user_message) + user_result = user_sim.act(str(agent_response)) + user_message = str(user_result.structured_output.message) +``` + +## How It Works + +1. **Profile Generation**: Creates a realistic actor profile with traits, context, and goals from the test case +2. **Conversation Initialization**: Sets up conversation with a greeting and the actor's initial query +3. **Contextual Responses**: Generates responses that maintain consistency with the actor's profile and goals +4. **Goal Tracking**: Built-in tool allows actors to assess progress toward their goals + +## API Reference + +### ActorSimulator + +Main class for simulating actor behavior in conversations. + +#### Factory Method (Recommended) + +```python +ActorSimulator.from_case_for_user_simulator( + case: Case, + system_prompt_template: str | None = None, + tools: list | None = None, + model: str | None = None, + max_turns: int = 10 +) -> ActorSimulator +``` + +Creates an ActorSimulator configured as a user simulator from a test case. Automatically generates a realistic actor profile from `case.input` and optionally `case.metadata["task_description"]`. 
+ +**Parameters:** +- `case`: Test case with input (initial query) and optional task_description in metadata +- `system_prompt_template`: Custom system prompt template (uses default if None) +- `tools`: Additional tools for the actor (defaults to goal completion tool only) +- `model`: Model identifier (uses Strands default if None) +- `max_turns`: Maximum number of conversation turns (default: 10) + +**Example:** +```python +case = Case( + input="I need help booking a flight to Paris", + metadata={"task_description": "Book round-trip flight under $800"} +) + +user_sim = ActorSimulator.from_case_for_user_simulator( + case=case, + max_turns=5 +) +``` + +#### Direct Initialization + +```python +ActorSimulator( + actor_profile: ActorProfile, + initial_query: str, + system_prompt_template: str, + tools: list | None = None, + model: str | None = None, + max_turns: int = 10 +) +``` + +Initialize with an existing actor profile. Use this when you have a pre-defined profile instead of generating one from a test case. + +**Parameters:** +- `actor_profile`: ActorProfile object with traits, context, and actor_goal +- `initial_query`: The actor's first query or message +- `system_prompt_template`: Template string for actor behavior (formatted with profile) +- `tools`: Additional tools for the actor +- `model`: Model identifier +- `max_turns`: Maximum number of conversation turns (default: 10) + +#### Methods + +**`act(agent_message: str) -> AgentResult`** + +Generate the actor's next message in response to the agent's message. + +**Parameters:** +- `agent_message`: The agent's response to react to + +**Returns:** +- `AgentResult` containing the actor's structured response with reasoning and message + +**Example:** +```python +agent_response = agent("I can help you book that flight") +user_result = user_sim.act(str(agent_response)) +user_message = str(user_result.structured_output.message) +``` + +**`has_next() -> bool`** + +Check if the conversation should continue. 
Returns False if the stop token (``) is present in the last message or if the maximum number of turns has been reached. + +**Returns:** +- `True` if the conversation should continue, `False` otherwise + +**Example:** +```python +while user_sim.has_next(): + agent_response = agent(user_message) + user_result = user_sim.act(str(agent_response)) + user_message = str(user_result.structured_output.message) +``` + +### Data Models + +**ActorProfile:** +```python +class ActorProfile(BaseModel): + traits: dict[str, Any] # Actor characteristics and personality + context: str # Background information and situation + actor_goal: str # What the actor wants to achieve +``` + +**ActorResponse:** +```python +class ActorResponse(BaseModel): + reasoning: str # Actor's internal reasoning process + message: str # The actual message to send +``` + +## Usage Examples + +### Complete Multi-Turn Conversation Example + +```python +from strands import Agent +from strands_evals import ActorSimulator, Case + +# Create agent under test +agent = Agent(system_prompt="You are a helpful travel assistant.", callback_handler=None) + +# Create test case +case = Case( + input="I want to plan a trip to Tokyo with hotel and activities", + metadata={"task_description": "Complete travel package arranged"} +) + +# Create user simulator +user_sim = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=5) + +# Run conversation +conversation = [] +user_message = case.input + +while user_sim.has_next(): + # Agent responds + agent_response = agent(user_message) + agent_message = str(agent_response) + conversation.append({"role": "assistant", "content": agent_message}) + + # User responds + user_result = user_sim.act(agent_message) + user_message = str(user_result.structured_output.message) + conversation.append({"role": "user", "content": user_message}) + +print(f"Conversation completed in {len(conversation) // 2} turns") +``` + +### Custom Actor Profile + +```python +from 
strands_evals.types.simulation import ActorProfile +from strands_evals.simulation.prompt_templates.actor_system_prompt import ( + DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE +) + +# Create custom actor profile +actor_profile = ActorProfile( + traits={ + "personality": "analytical and detail-oriented", + "communication_style": "direct and concise", + "technical_level": "expert" + }, + context="Experienced business traveler with elite status", + actor_goal="Book business class flight with specific seat preferences" +) + +# Initialize with custom profile +user_sim = ActorSimulator( + actor_profile=actor_profile, + initial_query="I need to book a business class flight to London", + system_prompt_template=DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE, + max_turns=15 +) +``` + +## Tools + +### Built-in Goal Completion Tool + +ActorSimulator automatically includes a goal completion assessment tool that actors can use to evaluate their progress: + +```python +from strands_evals.simulation.tools.goal_completion import ( + get_conversation_goal_completion +) + +# The actor can call this tool during conversation to assess progress +assessment = get_conversation_goal_completion( + initial_goal="Book a flight to Tokyo", + conversation=[ + {"role": "user", "content": "I need a flight to Tokyo"}, + {"role": "assistant", "content": "I can help with that..."} + ] +) +# Returns assessment with score and reasoning +``` + +### Adding Custom Tools + +Extend actor capabilities with custom tools: + +```python +from strands import tool + +@tool +def check_booking_status(booking_id: str) -> str: + """Check the status of a booking.""" + return f"Booking {booking_id} is confirmed" + +# Add custom tools to the simulator +user_sim = ActorSimulator.from_case_for_user_simulator( + case=case, + tools=[check_booking_status] +) +``` + +## Advanced Configuration + +### Custom System Prompt Templates + +Customize actor behavior with a custom system prompt template. 
The template receives the actor profile as a format parameter: + +```python +custom_prompt_template = """ +You are simulating a user with the following profile: +{actor_profile} + +Behavior guidelines: +- Be persistent but professional +- Express concerns clearly +- Stay focused on your goal + +Respond naturally based on your profile and the conversation context. +""" + +user_sim = ActorSimulator.from_case_for_user_simulator( + case=case, + system_prompt_template=custom_prompt_template +) +``` + +### Conversation Initialization + +ActorSimulator automatically initializes conversations with a random greeting from a predefined set: + +```python +# Built-in greetings: +# - "hi! how can I help you today?" +# - "hello! what can I assist you with?" +# - "hi there! how may I help you?" +# - "good day! what can I do for you?" +# - "hello! what would you like to know?" + +# The conversation starts with: +# 1. Random greeting (as user message) +# 2. Actor's initial query (as assistant message) +``` + +### Model Selection + +Specify a custom model for the actor simulator: + +```python +user_sim = ActorSimulator.from_case_for_user_simulator( + case=case, + model="anthropic.claude-3-5-sonnet-20241022-v2:0", + max_turns=10 +) +``` + +## Best Practices + +1. **Include Task Description**: Add `task_description` in case metadata for better goal generation +2. **Set max_turns**: Configure `max_turns` during initialization to prevent infinite conversations +3. **Use has_next()**: Always use `has_next()` in your conversation loop to respect turn limits and stop tokens +4. **Track Conversation**: Append messages to a conversation list for evaluation and debugging +5. 
**Access Structured Output**: Use `result.structured_output.message` to get the actor's message and `result.structured_output.reasoning` to see internal reasoning \ No newline at end of file diff --git a/src/strands_evals/simulation/__init__.py b/src/strands_evals/simulation/__init__.py new file mode 100644 index 0000000..6a4be0f --- /dev/null +++ b/src/strands_evals/simulation/__init__.py @@ -0,0 +1,6 @@ +from .actor_simulator import ActorSimulator + +# Alias for backward compatibility +UserSimulator = ActorSimulator + +__all__ = ["ActorSimulator", "UserSimulator"] diff --git a/src/strands_evals/simulation/actor_simulator.py b/src/strands_evals/simulation/actor_simulator.py new file mode 100644 index 0000000..fb1d9c3 --- /dev/null +++ b/src/strands_evals/simulation/actor_simulator.py @@ -0,0 +1,292 @@ +import logging +import random + +from strands import Agent +from strands.agent.agent_result import AgentResult +from strands.types.content import Message +from typing_extensions import cast + +from strands_evals.case import Case +from strands_evals.simulation.profiles.actor_profile import DEFAULT_USER_PROFILE_SCHEMA +from strands_evals.simulation.prompt_templates.actor_profile_extraction import ACTOR_PROFILE_PROMPT_TEMPLATE +from strands_evals.simulation.prompt_templates.actor_system_prompt import DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE +from strands_evals.simulation.tools.goal_completion import get_conversation_goal_completion +from strands_evals.types.simulation import ActorProfile, ActorResponse + +logger = logging.getLogger(__name__) + + +class ActorSimulator: + """ + Simulates an actor in multi-turn conversations for agent evaluation. + + ActorSimulator wraps a Strands Agent configured to behave as a specific actor + (typically a user) in conversation scenarios. It maintains conversation history, + generates contextually appropriate responses, and can assess goal completion. + + Attributes: + agent: The underlying Strands Agent configured with actor behavior. 
+ actor_profile: The actor's profile containing traits, context, and goal. + initial_query: The actor's first query in the conversation. + conversation_history: List of conversation messages in Strands format. + model_id: Model identifier for the underlying agent. + """ + + INITIAL_GREETINGS = [ + "hi! how can I help you today?", + "hello! what can I assist you with?", + "hi there! how may I help you?", + "good day! what can I do for you?", + "hello! what would you like to know?", + ] + + @classmethod + def from_case_for_user_simulator( + cls, + case: Case, + system_prompt_template: str | None = None, + tools: list | None = None, + model: str | None = None, + max_turns: int = 10, + ) -> "ActorSimulator": + """ + Create an ActorSimulator configured as a user simulator from a test case. + + Generates a realistic user profile and goal from case.input and optionally + case.metadata["task_description"], then configures the simulator with + user-specific defaults. If you already have a profile, use __init__() directly. + + Args: + case: Test case containing input (initial query) and optional metadata with "task_description". + system_prompt_template: Custom system prompt template. Uses DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE if None. + tools: Additional tools available to the user. Defaults to goal completion tool only. + model: Model identifier for the underlying agent. Uses Strands default if None. + max_turns: Maximum number of conversation turns before stopping (default: 10). + + Returns: + ActorSimulator configured for user simulation. 
+ + Example: + ```python + from strands_evals import Case, ActorSimulator + from strands import Agent + + # Create test case + case = Case( + input="I need to book a flight to Paris", + metadata={"task_description": "Flight booking confirmed"} + ) + + # Create user simulator + user_sim = ActorSimulator.from_case_for_user_simulator( + case=case, + max_turns=5 + ) + + # Create target agent to evaluate + agent = Agent(system_prompt="You are a travel assistant.") + + # Run conversation + user_message = case.input + while user_sim.has_next(): + agent_response = agent(user_message) + user_result = user_sim.act(str(agent_response)) + user_message = str(user_result.structured_output.message) + ``` + """ + actor_profile = cls._generate_profile_from_case(case) + + if system_prompt_template is None: + system_prompt_template = DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE + + return cls( + actor_profile=actor_profile, + initial_query=case.input, + system_prompt_template=system_prompt_template, + tools=tools, + model=model, + max_turns=max_turns, + ) + + @staticmethod + def _generate_profile_from_case(case: Case) -> ActorProfile: + """ + Generate user profile from case. + + Private helper for from_case_for_user_simulator factory method. + Uses case.input and optionally case.metadata["task_description"] if present. + + Args: + case: Test case with input and optional task_description in metadata. + + Returns: + ActorProfile with generated traits, context, and goal. 
+ """ + initial_query = case.input + task_description = case.metadata.get("task_description", "") if case.metadata else "" + + profile_prompt = ACTOR_PROFILE_PROMPT_TEMPLATE.format( + initial_query=initial_query, + task_description=task_description, + example=DEFAULT_USER_PROFILE_SCHEMA, + ) + profile_agent = Agent(callback_handler=None) + result = profile_agent(profile_prompt, structured_output_model=ActorProfile) + return result.structured_output + + def __init__( + self, + actor_profile: ActorProfile, + initial_query: str, + system_prompt_template: str, + tools: list | None = None, + model: str | None = None, + max_turns: int = 10, + ): + """ + Initialize an ActorSimulator with profile and goal. + + Use this constructor when you have a pre-defined ActorProfile. For automatic + profile generation from test cases, use from_case_for_user_simulator() instead. + + Args: + actor_profile: ActorProfile object containing traits, context, and actor_goal. + initial_query: The actor's first query or message. + system_prompt_template: Template string for system prompt. Must include {actor_profile} placeholder. + tools: Additional tools available to the actor. Defaults to goal completion tool only. + model: Model identifier for the underlying agent. Uses Strands default if None. + max_turns: Maximum number of conversation turns before stopping (default: 10). + + Example: + ```python + from strands_evals.simulation import ActorSimulator + from strands_evals.types.simulation import ActorProfile + + # Define custom actor profile + profile = ActorProfile( + traits={ + "expertise_level": "expert", + "communication_style": "technical" + }, + context="A software engineer debugging a production issue.", + actor_goal="Identify and resolve the memory leak." 
+ ) + + # Create simulator with custom profile + simulator = ActorSimulator( + actor_profile=profile, + initial_query="Our service is experiencing high memory usage.", + system_prompt_template="You are simulating: {actor_profile}", + max_turns=15 + ) + ``` + """ + self.actor_profile = actor_profile + self.initial_query = initial_query + self.conversation_history: list[Message] = [] + self.model_id = model + self._turn_count = 0 + self._last_message = "" + self._max_turns = max_turns + + system_prompt = system_prompt_template.format(actor_profile=actor_profile.model_dump()) + + # Combine tools + all_tools = [get_conversation_goal_completion] + if tools: + all_tools.extend(tools) + + self._initialize_conversation() + + # Create agent + self.agent = Agent( + system_prompt=system_prompt, + messages=self.conversation_history, + tools=all_tools, + model=self.model_id, + callback_handler=None, + ) + + def _initialize_conversation(self): + """ + Initialize the conversation history with a greeting and initial query. + + Sets up the conversation with a random greeting from the assistant followed + by the actor's initial query. This establishes the conversation context. + + Note: This is a private method called during initialization. + """ + selected_greeting = random.choice(self.INITIAL_GREETINGS) + greeting_message = {"role": "user", "content": [{"text": selected_greeting}]} + self.conversation_history.append(greeting_message) + + initial_query_message = {"role": "assistant", "content": [{"text": self.initial_query.strip()}]} + self.conversation_history.append(initial_query_message) + + def act(self, agent_message: str) -> AgentResult: + """ + Generate the next actor message in the conversation. + + Processes the agent's message and generates a contextually appropriate + response from the actor's perspective, maintaining consistency with the actor's + profile and goal. The response includes reasoning about the actor's thought + process and the actual message to send. 
+ + Args: + agent_message: The agent's response to react to (required). + + Returns: + AgentResult containing the actor's structured response with: + - structured_output.reasoning: Actor's internal reasoning + - structured_output.message: Actor's response message + + Example: + ```python + # Agent responds to user + agent_response = agent("I need help booking a flight") + + # User simulator generates next message + user_result = user_sim.act(str(agent_response)) + + # Access the response + print(user_result.structured_output.reasoning) # Why the actor responded this way + print(user_result.structured_output.message) # The actual message + + # Continue conversation + next_message = str(user_result.structured_output.message) + ``` + """ + response = self.agent(agent_message.strip(), structured_output_model=ActorResponse) + self._turn_count += 1 + self._last_message = str(cast(ActorResponse, response.structured_output).message) + return response + + def has_next(self) -> bool: + """ + Check if the conversation should continue. + + Returns False if the stop token is present in the last message or if + the maximum number of turns has been reached. Use this in a loop to control + multi-turn conversations. + + Returns: + True if the conversation should continue, False otherwise. 
+ + Example: + ```python + user_message = case.input + + # Continue conversation until completion + while user_sim.has_next(): + agent_response = agent(user_message) + user_result = user_sim.act(str(agent_response)) + user_message = str(user_result.structured_output.message) + + # Conversation ended either by: + # - Actor including token in message + # - Reaching max_turns limit + ``` + """ + if self._turn_count >= self._max_turns: + return False + return "" not in self._last_message diff --git a/src/strands_evals/simulation/profiles/__init__.py b/src/strands_evals/simulation/profiles/__init__.py new file mode 100644 index 0000000..6d4e1d5 --- /dev/null +++ b/src/strands_evals/simulation/profiles/__init__.py @@ -0,0 +1,5 @@ +"""Profile templates for actor simulation.""" + +from .actor_profile import DEFAULT_USER_PROFILE_SCHEMA + +__all__ = ["DEFAULT_USER_PROFILE_SCHEMA"] diff --git a/src/strands_evals/simulation/profiles/actor_profile.py b/src/strands_evals/simulation/profiles/actor_profile.py new file mode 100644 index 0000000..7d3c467 --- /dev/null +++ b/src/strands_evals/simulation/profiles/actor_profile.py @@ -0,0 +1,26 @@ +""" +Actor profile templates for simulation. + +This module provides actor profile structures used as templates +for generating realistic actor profiles in conversation simulation. 
+""" + +DEFAULT_USER_PROFILE_SCHEMA = { + "traits": { + "personal_profile": { + "identity": { + "first_name": "User", + "last_name": "Default", + "preferred_name": "User", + "gender": "other", + "birthdate": "1990-01-01", + "email": "user@example.com", + }, + "location": {"address1": "123 Main St", "city": "Default City", "province": "CA", "country": "USA"}, + "languages": [{"language": "English", "proficiency": "Advanced"}], + }, + "persona": "Friendly and helpful user seeking assistance with general topics.", + "supplementary_profile": "Default user profile for simulation.", + }, + "context": "some context", +} diff --git a/src/strands_evals/simulation/prompt_templates/__init__.py b/src/strands_evals/simulation/prompt_templates/__init__.py new file mode 100644 index 0000000..0d0771d --- /dev/null +++ b/src/strands_evals/simulation/prompt_templates/__init__.py @@ -0,0 +1,11 @@ +"""Prompt templates for actor simulation.""" + +from .actor_profile_extraction import ACTOR_PROFILE_PROMPT_TEMPLATE +from .actor_system_prompt import DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE +from .goal_completion import GOAL_COMPLETION_PROMPT + +__all__ = [ + "ACTOR_PROFILE_PROMPT_TEMPLATE", + "DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE", + "GOAL_COMPLETION_PROMPT", +] diff --git a/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py b/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py new file mode 100644 index 0000000..79623cd --- /dev/null +++ b/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py @@ -0,0 +1,25 @@ +""" +Prompt template for actor profile generation. + +This module contains the prompt template used to generate realistic actor profiles +from scenario information for conversation simulation. 
+""" + +from textwrap import dedent + +ACTOR_PROFILE_PROMPT_TEMPLATE = dedent("""Generate exactly 1 realistic actor profile for the following task: + +Actor's Initial Query: {initial_query} +Task Description: {task_description} + +Generate a complete actor profile with the following structure: +1. Traits: Key traits (as key-value pairs) +2. Context: Background context (as a paragraph in 2-3 sentences) +3. Actor Goal: What the actor ultimately wants to achieve in this interaction - should be + specific, actionable, and written from the actor's perspective + +IMPORTANT: Return JSON in the following format! IT MUST HAVE THE EXACT STRUCTURE YOU SEE HERE WITH EXACTLY THESE KEYS. + +{example} + +Be specific and realistic.""") diff --git a/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py b/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py new file mode 100644 index 0000000..2b863fd --- /dev/null +++ b/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py @@ -0,0 +1,64 @@ +""" +Default system prompt for actor simulation. + +This module contains the default system prompt that configures the actor's behavior, +communication style, and response protocols for realistic conversation simulation. +""" + +from textwrap import dedent + +DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE = dedent("""## User Simulation + +Core Identity: +- You are simulating a user seeking assistance from an AI assistant +- You speak in first person only +- You strictly follow your defined User Goal and User Profile throughout the conversation + +## User Profile +{actor_profile} + + +Response Protocols: + When assistant requests information: + - Provide brief, specific information + - Maximum 2-3 sentences + + When assistant provides solutions/answers: + - Ask follow-ups, seek clarification, or express satisfaction. Do not deviate from the User Goal. + - While following up, do not increase the conversation scope beyond your User Goal. 
+ +Communication Rules: +1. STRICT maximum response length: 2-3 sentences +2. You are seeking help, NOT providing help - never give solutions! +3. Maintain your user profile and expertise level consistently +4. Express more of your user profile - let your background, expertise level, and personality + shine through in your responses +5. Don't break character by mentioning "assistant" or "AI" explicitly +6. Address AI assistant responses in second person ("Your suggestion..." not "The assistant's suggestion...") +7. Do not explicitly mention conversation redirection +8. Never include meta-references or self-instructions in your responses. These reveal you + are a simulator and are not how a real human would communicate. Don't write phrases like: + - I need to respond as the user would ... + - As the simulated user, I should ... + - Here's how the user might respond ... + - Based on my user goal, I need to ... +9. Use the Exit Conditions strictly to stick to User Goal. +10. Use all relevant tools first to ground your responses, and then respond + +Exit Conditions: +1. Use get_conversation_goal_completion tool to check if your User Goal is met. When your User Goal is met: + - Just generate "" to terminate conversation +2. If conversation becomes unproductive or unsafe: + - Naturally steer back towards your User Goal + - If this becomes impossible, just generate: "" to terminate conversation + +CRITICAL BEHAVIORAL CONSTRAINTS: +- You are ONLY a user seeking assistance, NEVER the one providing assistance. +- NEVER generate comprehensive responses, detailed plans, or extensive information. +- NEVER solve problems yourself - that's the assistant's job. Under no circumstances + may you use your tools to solve your user goal/sub goals. +- If you find yourself writing more than 3 sentences, you're doing it wrong. +- Generate only "" to terminate conversation + +Response Format: +Generate ONLY the next SHORT message (1-3 sentences). 
No explanations, no solutions, no comprehensive information.""") diff --git a/src/strands_evals/simulation/prompt_templates/goal_completion.py b/src/strands_evals/simulation/prompt_templates/goal_completion.py new file mode 100644 index 0000000..d27871c --- /dev/null +++ b/src/strands_evals/simulation/prompt_templates/goal_completion.py @@ -0,0 +1,27 @@ +""" +Goal completion assessment prompt template for actor simulation. + +This module contains the prompt template used to evaluate whether a conversation +has successfully achieved the actor's initial goals using a 3-point assessment scale. +""" + +from textwrap import dedent + +GOAL_COMPLETION_PROMPT = dedent( + """Please evaluate the following conversation against its intended goals using this +3-point assessment scale: + +1 = Does not meet the goal at all +2 = Partially meets the goal with significant gaps +3 = Fully meets the goal + +Initial Goal: +{initial_goal} + +Conversation to evaluate: +{conversation} + +Please provide: +- A score (1-3) +- Brief one line justification""" +) diff --git a/src/strands_evals/simulation/tools/__init__.py b/src/strands_evals/simulation/tools/__init__.py new file mode 100644 index 0000000..6d0145b --- /dev/null +++ b/src/strands_evals/simulation/tools/__init__.py @@ -0,0 +1,5 @@ +"""Tools for actor simulation.""" + +from .goal_completion import get_conversation_goal_completion + +__all__ = ["get_conversation_goal_completion"] diff --git a/src/strands_evals/simulation/tools/goal_completion.py b/src/strands_evals/simulation/tools/goal_completion.py new file mode 100644 index 0000000..f18abb5 --- /dev/null +++ b/src/strands_evals/simulation/tools/goal_completion.py @@ -0,0 +1,93 @@ +import logging + +from strands import Agent, tool +from typing_extensions import Any + +from strands_evals.simulation.prompt_templates.goal_completion import GOAL_COMPLETION_PROMPT + +logger = logging.getLogger(__name__) + + +@tool +def get_conversation_goal_completion(initial_goal: str, conversation: 
list[dict[str, str]]) -> str: + """ + Evaluate conversation goal completion using a 3-point assessment scale. + + Analyzes the conversation against the actor's initial goal and provides a score + with justification. + + Args: + initial_goal: The actor's original goal or objective. + conversation: List of conversation turns, each with 'role' and 'content' keys. + + Returns: + Assessment string with score (1-3) and brief justification. + + Raises: + ValueError: If the conversation format is invalid. + """ + # Format conversation for the prompt + conversation_text = _format_conversation_for_assessment(conversation) + + # Create the assessment prompt + prompt = GOAL_COMPLETION_PROMPT.format(initial_goal=initial_goal, conversation=conversation_text) + + goal_completion_agent = Agent(callback_handler=None) + response = goal_completion_agent(prompt) + logger.info("Successfully completed goal completion assessment") + return str(response) + + +def _format_conversation_for_assessment(conversation: list[dict[str, Any]]) -> str: + """ + Format conversation history for goal completion assessment. + + Args: + conversation: List of conversation turns with 'role' and 'content' keys. + Content can be either a string or a list of content blocks. + + Returns: + Formatted conversation string with each turn on a separate line. + + Raises: + ValueError: If conversation format is invalid. 
+ """ + try: + formatted_turns = [] + + for i, turn in enumerate(conversation): + if not isinstance(turn, dict): + raise ValueError(f"Conversation turn {i} must be a dictionary") + + role = turn.get("role", "").strip() + content_raw = turn.get("content", "") + + # Handle both string format and list of content blocks + if isinstance(content_raw, str): + content = content_raw.strip() + elif isinstance(content_raw, list): + content_parts = [] + for block in content_raw: + if isinstance(block, dict) and "text" in block: + content_parts.append(block["text"]) + content = " ".join(content_parts).strip() + else: + logger.warning(f"Skipping conversation turn {i} with invalid content type: {type(content_raw)}") + continue + + if not role or not content: + logger.warning(f"Skipping conversation turn {i} with missing role or content") + continue + + formatted_turn = f"{role.upper()}: {content}" + formatted_turns.append(formatted_turn) + + if not formatted_turns: + raise ValueError("No valid conversation turns found") + + return "\n\n".join(formatted_turns) + + except ValueError: + raise + except Exception as e: + raise ValueError("Error formatting conversation") from e diff --git a/src/strands_evals/types/__init__.py b/src/strands_evals/types/__init__.py index 77f81bf..60b322c 100644 --- a/src/strands_evals/types/__init__.py +++ b/src/strands_evals/types/__init__.py @@ -1,3 +1,11 @@ from .evaluation import EvaluationData, EvaluationOutput, Interaction, TaskOutput +from .simulation import ActorProfile, ActorResponse -__all__ = ["Interaction", "TaskOutput", "EvaluationData", "EvaluationOutput"] +__all__ = [ + "Interaction", + "TaskOutput", + "EvaluationData", + "EvaluationOutput", + "ActorProfile", + "ActorResponse", +] diff --git a/src/strands_evals/types/simulation/__init__.py b/src/strands_evals/types/simulation/__init__.py new file mode 100644 index 0000000..13a94b0 --- /dev/null +++ b/src/strands_evals/types/simulation/__init__.py @@ -0,0 +1,5 @@ +"""Data models for actor 
simulation.""" + +from .actor import ActorProfile, ActorResponse + +__all__ = ["ActorProfile", "ActorResponse"] diff --git a/src/strands_evals/types/simulation/actor.py b/src/strands_evals/types/simulation/actor.py new file mode 100644 index 0000000..d30be94 --- /dev/null +++ b/src/strands_evals/types/simulation/actor.py @@ -0,0 +1,34 @@ +from pydantic import BaseModel, Field +from typing_extensions import Any + + +class ActorProfile(BaseModel): + """ + Profile for actor simulation. + + Attributes: + traits: Dictionary of actor characteristics and attributes. + context: Supplementary background information about the actor. + actor_goal: What the actor ultimately wants to achieve in the interaction. + """ + + traits: dict[str, Any] = Field(..., description="Actor traits for simulation") + context: str = Field(..., description="Supplementary actor background details") + actor_goal: str = Field( + ..., + description="What the actor ultimately wants to achieve in this interaction - " + "should be specific, actionable, and written from the actor's perspective", + ) + + +class ActorResponse(BaseModel): + """ + Structured response from an actor. + + Attributes: + reasoning: Internal reasoning process for the response. + message: The actual message content from the actor. 
+ """ + + reasoning: str = Field(..., description="Reasoning for the actor's response") + message: str = Field(..., description="Message from the actor") diff --git a/tests/strands_evals/simulation/__init__.py b/tests/strands_evals/simulation/__init__.py new file mode 100644 index 0000000..9ad0280 --- /dev/null +++ b/tests/strands_evals/simulation/__init__.py @@ -0,0 +1 @@ +"""Tests for actor simulation module.""" diff --git a/tests/strands_evals/simulation/test_actor_simulator.py b/tests/strands_evals/simulation/test_actor_simulator.py new file mode 100644 index 0000000..c491a6f --- /dev/null +++ b/tests/strands_evals/simulation/test_actor_simulator.py @@ -0,0 +1,213 @@ +"""Tests for ActorSimulator class.""" + +from unittest.mock import MagicMock, patch + +import pytest +from strands.agent.agent_result import AgentResult + +from strands_evals import Case +from strands_evals.simulation import ActorSimulator +from strands_evals.types.simulation import ActorProfile, ActorResponse + + +@pytest.fixture +def sample_actor_profile(): + """Fixture providing a sample actor profile.""" + return ActorProfile( + traits={ + "expertise_level": "beginner", + "communication_style": "casual", + "patience_level": "high", + }, + context="A beginner user learning about travel planning.", + actor_goal="Book a complete trip to Tokyo including flights and hotel.", + ) + + +@pytest.fixture +def sample_case(): + """Fixture providing a sample test case.""" + return Case( + input="I want to plan a trip to Tokyo", + metadata={"task_description": "Complete travel package arranged"}, + ) + + +def test_actor_simulator_init(sample_actor_profile): + """Test ActorSimulator initialization with profile.""" + simulator = ActorSimulator( + actor_profile=sample_actor_profile, + initial_query="Hello, I need help", + system_prompt_template="Test prompt: {actor_profile}", + tools=None, + model=None, + ) + + assert simulator.actor_profile == sample_actor_profile + assert simulator.initial_query == "Hello, I 
need help" + assert simulator.agent is not None + assert len(simulator.conversation_history) == 2 # greeting + initial query + + +def test_initialize_conversation(sample_actor_profile): + """Test conversation initialization creates greeting and initial query.""" + simulator = ActorSimulator( + actor_profile=sample_actor_profile, + initial_query="I need help with travel", + system_prompt_template="Test: {actor_profile}", + ) + + history = simulator.conversation_history + assert len(history) == 2 + assert history[0]["role"] == "user" + assert any(greeting in history[0]["content"][0]["text"] for greeting in ActorSimulator.INITIAL_GREETINGS) + assert history[1]["role"] == "assistant" + assert history[1]["content"][0]["text"] == "I need help with travel" + + +@patch("strands_evals.simulation.actor_simulator.Agent") +def test_from_case_for_user_simulator(mock_agent_class, sample_case): + """Test factory method creates simulator from case.""" + # Mock the profile generation agent + mock_profile_agent = MagicMock() + mock_profile = ActorProfile( + traits={"test": "trait"}, + context="Test context", + actor_goal="Test goal", + ) + mock_result = MagicMock() + mock_result.structured_output = mock_profile + mock_profile_agent.return_value = mock_result + + # Mock the main simulator agent + mock_simulator_agent = MagicMock() + + # Configure mock to return different instances + mock_agent_class.side_effect = [mock_profile_agent, mock_simulator_agent] + + simulator = ActorSimulator.from_case_for_user_simulator(case=sample_case) + + assert simulator.actor_profile == mock_profile + assert simulator.initial_query == sample_case.input + assert mock_agent_class.call_count == 2 # Once for profile gen, once for simulator + + +@patch("strands_evals.simulation.actor_simulator.Agent") +def test_generate_profile_from_case(mock_agent_class, sample_case): + """Test profile generation from case.""" + mock_agent = MagicMock() + mock_profile = ActorProfile( + traits={"generated": "trait"}, + 
context="Generated context", + actor_goal="Generated goal", + ) + mock_result = MagicMock() + mock_result.structured_output = mock_profile + mock_agent.return_value = mock_result + mock_agent_class.return_value = mock_agent + + profile = ActorSimulator._generate_profile_from_case(sample_case) + + assert profile == mock_profile + assert mock_agent.called + # Verify structured_output_model was passed + call_args = mock_agent.call_args + assert call_args[1]["structured_output_model"] == ActorProfile + + +def test_act_generates_response(sample_actor_profile): + """Test act method generates actor response.""" + simulator = ActorSimulator( + actor_profile=sample_actor_profile, + initial_query="Hello", + system_prompt_template="Test: {actor_profile}", + ) + + # Mock the agent's response + mock_response = MagicMock(spec=AgentResult) + mock_actor_response = ActorResponse( + reasoning="Test reasoning", + message="Test response message", + ) + mock_response.structured_output = mock_actor_response + simulator.agent = MagicMock(return_value=mock_response) + + result = simulator.act("What can I help you with?") + + assert result == mock_response + assert result.structured_output.message == "Test response message" + simulator.agent.assert_called_once() + + +def test_act_uses_structured_output(sample_actor_profile): + """Test act method requests structured output.""" + simulator = ActorSimulator( + actor_profile=sample_actor_profile, + initial_query="Hello", + system_prompt_template="Test: {actor_profile}", + ) + + mock_response = MagicMock(spec=AgentResult) + mock_actor_response = ActorResponse(reasoning="Test", message="Test message") + mock_response.structured_output = mock_actor_response + simulator.agent = MagicMock(return_value=mock_response) + + simulator.act("Test message") + + # Verify structured_output_model parameter + call_kwargs = simulator.agent.call_args[1] + assert call_kwargs["structured_output_model"] == ActorResponse + + +def 
test_has_next_returns_true_initially(sample_actor_profile): + """Test has_next returns True before any turns.""" + simulator = ActorSimulator( + actor_profile=sample_actor_profile, + initial_query="Hello", + system_prompt_template="Test: {actor_profile}", + ) + + assert simulator.has_next() is True + + +def test_has_next_respects_max_turns(sample_actor_profile): + """Test has_next returns False after max_turns reached.""" + simulator = ActorSimulator( + actor_profile=sample_actor_profile, + initial_query="Hello", + system_prompt_template="Test: {actor_profile}", + max_turns=3, + ) + + # Mock responses + mock_response = MagicMock(spec=AgentResult) + mock_actor_response = ActorResponse(reasoning="Test", message="Continue") + mock_response.structured_output = mock_actor_response + simulator.agent = MagicMock(return_value=mock_response) + + # Simulate 3 turns with max_turns=3 + for _ in range(3): + assert simulator.has_next() is True + simulator.act("Test message") + + # After 3 turns, should return False + assert simulator.has_next() is False + + +def test_has_next_detects_stop_token(sample_actor_profile): + """Test has_next returns False when stop token is present.""" + simulator = ActorSimulator( + actor_profile=sample_actor_profile, + initial_query="Hello", + system_prompt_template="Test: {actor_profile}", + ) + + # Mock response with stop token + mock_response = MagicMock(spec=AgentResult) + mock_actor_response = ActorResponse(reasoning="Done", message="Thanks! 
") + mock_response.structured_output = mock_actor_response + simulator.agent = MagicMock(return_value=mock_response) + + # After act with stop token, has_next should return False + simulator.act("Test message") + assert simulator.has_next() is False diff --git a/tests/strands_evals/simulation/test_goal_completion.py b/tests/strands_evals/simulation/test_goal_completion.py new file mode 100644 index 0000000..5a7a99a --- /dev/null +++ b/tests/strands_evals/simulation/test_goal_completion.py @@ -0,0 +1,196 @@ +"""Tests for goal completion assessment tool.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from strands_evals.simulation.tools.goal_completion import ( + _format_conversation_for_assessment, + get_conversation_goal_completion, +) + + +@pytest.fixture +def sample_conversation(): + """Fixture providing a sample conversation.""" + return [ + {"role": "user", "content": "I want to book a flight to Tokyo"}, + {"role": "assistant", "content": "I can help with that. What dates?"}, + {"role": "user", "content": "Next month"}, + ] + + +@pytest.fixture +def sample_conversation_with_blocks(): + """Fixture providing conversation with content blocks.""" + return [ + {"role": "user", "content": [{"text": "Hello"}]}, + {"role": "assistant", "content": [{"text": "Hi there"}, {"text": "How can I help?"}]}, + ] + + +def test_format_conversation_simple(sample_conversation): + """Test formatting simple conversation.""" + result = _format_conversation_for_assessment(sample_conversation) + + assert "USER: I want to book a flight to Tokyo" in result + assert "ASSISTANT: I can help with that. What dates?" 
in result + assert "USER: Next month" in result + assert result.count("\n\n") == 2 # Two separators for three turns + + +def test_format_conversation_with_content_blocks(sample_conversation_with_blocks): + """Test formatting conversation with content blocks.""" + result = _format_conversation_for_assessment(sample_conversation_with_blocks) + + assert "USER: Hello" in result + assert "ASSISTANT: Hi there How can I help?" in result + + +def test_format_conversation_mixed_formats(): + """Test formatting conversation with mixed string and block formats.""" + conversation = [ + {"role": "user", "content": "String content"}, + {"role": "assistant", "content": [{"text": "Block content"}]}, + ] + + result = _format_conversation_for_assessment(conversation) + + assert "USER: String content" in result + assert "ASSISTANT: Block content" in result + + +def test_format_conversation_empty_list(): + """Test formatting empty conversation raises error.""" + with pytest.raises(ValueError, match="No valid conversation turns found"): + _format_conversation_for_assessment([]) + + +def test_format_conversation_invalid_turn_type(): + """Test formatting conversation with invalid turn type.""" + with pytest.raises(ValueError, match="must be a dictionary"): + _format_conversation_for_assessment(["not a dict"]) + + +def test_format_conversation_missing_role(): + """Test formatting conversation skips turns with missing role.""" + conversation = [ + {"role": "user", "content": "Valid turn"}, + {"content": "Missing role"}, + {"role": "assistant", "content": "Another valid turn"}, + ] + + result = _format_conversation_for_assessment(conversation) + + assert "USER: Valid turn" in result + assert "ASSISTANT: Another valid turn" in result + assert "Missing role" not in result + + +def test_format_conversation_missing_content(): + """Test formatting conversation skips turns with missing content.""" + conversation = [ + {"role": "user", "content": "Valid turn"}, + {"role": "assistant"}, + {"role": 
"user", "content": "Another valid turn"}, + ] + + result = _format_conversation_for_assessment(conversation) + + assert "USER: Valid turn" in result + assert "USER: Another valid turn" in result + assert result.count("ASSISTANT") == 0 + + +def test_format_conversation_empty_strings(): + """Test formatting conversation skips turns with empty strings.""" + conversation = [ + {"role": "user", "content": "Valid turn"}, + {"role": "", "content": "Empty role"}, + {"role": "assistant", "content": ""}, + ] + + result = _format_conversation_for_assessment(conversation) + + assert "USER: Valid turn" in result + assert "Empty role" not in result + + +def test_format_conversation_whitespace_handling(): + """Test formatting conversation strips whitespace.""" + conversation = [ + {"role": " user ", "content": " Content with spaces "}, + ] + + result = _format_conversation_for_assessment(conversation) + + assert "USER: Content with spaces" in result + + +def test_format_conversation_invalid_content_type(): + """Test formatting conversation with invalid content type logs warning.""" + conversation = [ + {"role": "user", "content": "Valid"}, + {"role": "assistant", "content": 123}, # Invalid type + ] + + # Should not raise, but skip invalid turn + result = _format_conversation_for_assessment(conversation) + assert "USER: Valid" in result + + +@patch("strands_evals.simulation.tools.goal_completion.Agent") +def test_get_conversation_goal_completion(mock_agent_class, sample_conversation): + """Test goal completion assessment.""" + mock_agent = MagicMock() + mock_response = MagicMock() + mock_response.__str__ = MagicMock(return_value="Score: 3 - Goal fully met") + mock_agent.return_value = mock_response + mock_agent_class.return_value = mock_agent + + result = get_conversation_goal_completion( + initial_goal="Book a flight to Tokyo", + conversation=sample_conversation, + ) + + assert result == "Score: 3 - Goal fully met" + mock_agent.assert_called_once() + + 
+@patch("strands_evals.simulation.tools.goal_completion.Agent") +def test_get_conversation_goal_completion_formats_prompt(mock_agent_class, sample_conversation): + """Test goal completion formats prompt correctly.""" + mock_agent = MagicMock() + mock_response = MagicMock() + mock_response.__str__ = MagicMock(return_value="Assessment") + mock_agent.return_value = mock_response + mock_agent_class.return_value = mock_agent + + get_conversation_goal_completion( + initial_goal="Test goal", + conversation=sample_conversation, + ) + + # Verify prompt contains goal and conversation + call_args = mock_agent.call_args[0][0] + assert "Test goal" in call_args + assert "USER:" in call_args + assert "ASSISTANT:" in call_args + + +def test_get_conversation_goal_completion_empty_conversation(): + """Test goal completion with empty conversation raises error.""" + with pytest.raises(ValueError): + get_conversation_goal_completion( + initial_goal="Test goal", + conversation=[], + ) + + +def test_get_conversation_goal_completion_invalid_conversation(): + """Test goal completion with invalid conversation raises error.""" + with pytest.raises(ValueError): + get_conversation_goal_completion( + initial_goal="Test goal", + conversation=["not", "valid"], + ) diff --git a/tests/strands_evals/test_dataset.py b/tests/strands_evals/test_dataset.py index c80f61c..609b730 100644 --- a/tests/strands_evals/test_dataset.py +++ b/tests/strands_evals/test_dataset.py @@ -39,8 +39,8 @@ def mock_span(): def simple_task(): """Fixture that provides a simple echo task function""" - def task(input_val): - return input_val + def task(case): + return case.input return task @@ -124,8 +124,8 @@ def test_dataset__run_task_simple_output(mock_evaluator): case = Case(name="test", input="hello", expected_output="world") dataset = Dataset(cases=[case], evaluator=mock_evaluator) - def simple_task(input_val): - return f"response to {input_val}" + def simple_task(c): + return f"response to {c.input}" result = 
dataset._run_task(simple_task, case) @@ -145,8 +145,8 @@ def test_dataset__run_task_dict_output(mock_evaluator): case = Case(name="test", input="hello", expected_output="world") dataset = Dataset(cases=[case], evaluator=mock_evaluator) - def dict_task(input_val): - return {"output": f"response to {input_val}", "trajectory": ["step1", "step2"]} + def dict_task(c): + return {"output": f"response to {c.input}", "trajectory": ["step1", "step2"]} result = dataset._run_task(dict_task, case) @@ -160,9 +160,9 @@ def test_dataset_run_task_dict_output_with_interactions(mock_evaluator): case = Case(name="test", input="hello", expected_output="world", expected_interactions=interactions) dataset = Dataset(cases=[case], evaluator=mock_evaluator) - def dict_task(input_val): + def dict_task(c): return { - "output": f"response to {input_val}", + "output": f"response to {c.input}", "trajectory": ["step1", "step2"], "interactions": interactions, } @@ -182,8 +182,8 @@ def test_dataset__run_task_dict_output_with_input_update(mock_evaluator): case = Case(name="test", input="original_input", expected_output="world") dataset = Dataset(cases=[case], evaluator=mock_evaluator) - def task_with_input_update(input_val): - return {"output": f"response to {input_val}", "input": "updated_input", "trajectory": ["step1"]} + def task_with_input_update(c): + return {"output": f"response to {c.input}", "input": "updated_input", "trajectory": ["step1"]} result = dataset._run_task(task_with_input_update, case) @@ -198,8 +198,8 @@ async def test_dataset__run_task_async_with_input_update(): case = Case(name="test", input="original_input", expected_output="world") dataset = Dataset(cases=[case], evaluator=MockEvaluator()) - def task_with_input_update(input_val): - return {"output": f"response to {input_val}", "input": "async_updated_input"} + def task_with_input_update(c): + return {"output": f"response to {c.input}", "input": "async_updated_input"} result = await 
dataset._run_task_async(task_with_input_update, case) @@ -212,8 +212,8 @@ def test_dataset__run_task_async_function_raises_error(mock_evaluator): case = Case(name="test", input="hello", expected_output="world") dataset = Dataset(cases=[case], evaluator=mock_evaluator) - async def async_task(input_val): - return f"response to {input_val}" + async def async_task(c): + return f"response to {c.input}" with pytest.raises(ValueError, match="Async task is not supported. Please use run_evaluations_async instead."): dataset._run_task(async_task, case) @@ -223,8 +223,8 @@ async def async_task(input_val): async def test_dataset__run_task_async_with_sync_task(): """Test _run_task_async with a synchronous task function""" - def sync_task(input_val): - return input_val + def sync_task(c): + return c.input case = Case(name="test", input="hello", expected_output="world") dataset = Dataset(cases=[case], evaluator=MockEvaluator()) @@ -238,8 +238,8 @@ def sync_task(input_val): async def test_dataset__run_task_async_with_async_task(): """Test _run_task_async with an asynchronous task function""" - async def async_task(input_val): - return input_val + async def async_task(c): + return c.input case = Case(name="test", input="hello", expected_output="world") dataset = Dataset(cases=[case], evaluator=MockEvaluator()) @@ -257,8 +257,8 @@ def test_dataset_run_evaluations(mock_evaluator): ] dataset = Dataset(cases=cases, evaluator=mock_evaluator) - def echo_task(input_val): - return input_val + def echo_task(c): + return c.input report = dataset.run_evaluations(echo_task) @@ -600,8 +600,8 @@ def test_dataset_from_dict_InteractionsEvaluator_defaults(): async def test_dataset_run_evaluations_async(): """Test run_evaluations_async with a simple task""" - def task(input_str): - return input_str + def task(c): + return c.input case = Case(name="test", input="hello", expected_output="hello") case1 = Case(name="test1", input="world", expected_output="world") @@ -619,9 +619,9 @@ def task(input_str): 
async def test_dataset_run_evaluations_async_with_async_task(): """Test run_evaluations_async with an async task""" - async def async_task(input_str): + async def async_task(c): await asyncio.sleep(0.01) - return input_str + return c.input case = Case(name="test", input="hello", expected_output="hello") case1 = Case(name="test1", input="world", expected_output="world") @@ -638,10 +638,10 @@ async def async_task(input_str): async def test_datset_run_evaluations_async_with_errors(): """Test run_evaluations_async handles errors gracefully""" - def failing_task(input_str): - if input_str == "hello": + def failing_task(c): + if c.input == "hello": raise ValueError("Test error") - return input_str + return c.input case = Case(name="test", input="hello", expected_output="hello") case1 = Case(name="test1", input="world", expected_output="world") @@ -660,8 +660,8 @@ def test_dataset_run_evaluations_with_interactions(): case = Case(name="test", input="hello", expected_output="world", expected_interactions=interactions) dataset = Dataset(cases=[case], evaluator=MockEvaluator()) - def task_with_interactions(input_val): - return {"output": input_val, "interactions": interactions} + def task_with_interactions(c): + return {"output": c.input, "interactions": interactions} report = dataset.run_evaluations(task_with_interactions) @@ -744,8 +744,8 @@ def test_dataset_run_evaluations_with_trajectory_in_span(mock_span): with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span): - def task_with_trajectory(input_val): - return {"output": input_val, "trajectory": ["step1", "step2"]} + def task_with_trajectory(c): + return {"output": c.input, "trajectory": ["step1", "step2"]} dataset.run_evaluations(task_with_trajectory) @@ -766,8 +766,8 @@ def test_dataset_run_evaluations_with_interactions_in_span(mock_span): with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span): - def task_with_interactions(input_val): - return {"output": 
input_val, "interactions": interactions} + def task_with_interactions(c): + return {"output": c.input, "interactions": interactions} dataset.run_evaluations(task_with_interactions) @@ -788,7 +788,7 @@ def test_dataset_run_evaluations_records_exception_in_span(mock_span): with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span): - def failing_task(input_val): + def failing_task(c): raise ValueError("Test error") dataset.run_evaluations(failing_task) @@ -819,8 +819,8 @@ async def test_dataset_run_evaluations_async_creates_spans(mock_span): with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span) as mock_start_span: - async def async_task(input_val): - return input_val + async def async_task(c): + return c.input await dataset.run_evaluations_async(async_task) @@ -842,7 +842,7 @@ async def test_dataset_run_evaluations_async_records_exception(mock_span): with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span): - async def failing_async_task(input_val): + async def failing_async_task(c): raise ValueError("Async test error") await dataset.run_evaluations_async(failing_async_task) @@ -860,8 +860,8 @@ async def test_dataset_run_evaluations_async_with_dict_output(mock_span): with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span): - async def async_task_with_dict(input_val): - return {"output": input_val, "trajectory": ["step1"], "interactions": interactions} + async def async_task_with_dict(c): + return {"output": c.input, "trajectory": ["step1"], "interactions": interactions} await dataset.run_evaluations_async(async_task_with_dict) diff --git a/tests/test_integration.py b/tests/test_integration.py index b1f61ed..ee09b44 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -70,8 +70,8 @@ def test_integration_dataset_with_simple_evaluator(cases): """Test complete workflow: Dataset + Cases + SimpleEvaluator + EvaluationReport""" dataset = 
Dataset(cases=cases, evaluator=SimpleEvaluator()) - def echo_task(input_val): - return input_val + def echo_task(case): + return case.input report = dataset.run_evaluations(echo_task) @@ -89,9 +89,9 @@ def test_integration_dataset_with_dict_output_task(cases): """Test Dataset with task returning dictionary output""" dataset = Dataset(cases=cases, evaluator=SimpleEvaluator()) - def dict_task(input_val): + def dict_task(case): return TaskOutput( - output=input_val, + output=case.input, trajectory=["step1", "step2"], interactions=[Interaction(node_name="agent1", dependencies=[], messages=["processing hello"])], ) @@ -115,8 +115,8 @@ def test_integration_dataset_with_output_evaluator(mock_agent_class, cases, mock output_evaluator = OutputEvaluator(rubric="Test if outputs match exactly") dataset = Dataset(cases=cases, evaluator=output_evaluator) - def simple_task(input_val): - return f"processed_{input_val}" + def simple_task(case): + return f"processed_{case.input}" report = dataset.run_evaluations(simple_task) @@ -131,8 +131,8 @@ def test_integration_evaluation_report_display(cases): """Test that EvaluationReport display works with real data""" dataset = Dataset(cases=cases, evaluator=SimpleEvaluator()) - def mixed_task(input_val): - if input_val == "hello": + def mixed_task(case): + if case.input == "hello": return "hello" return "different" @@ -156,8 +156,8 @@ def test_integration_dataset_with_trajectory_evaluator(mock_agent_class, cases, trajectory_evaluator = TrajectoryEvaluator(rubric="Test if trajectories match exactly") dataset = Dataset(cases=cases, evaluator=trajectory_evaluator) - def simple_task(input_val): - return {"output": f"processed_{input_val}", "trajectory": ["step1", "step2"]} + def simple_task(case): + return {"output": f"processed_{case.input}", "trajectory": ["step1", "step2"]} report = dataset.run_evaluations(simple_task) @@ -175,8 +175,8 @@ def test_integration_dataset_with_list_inputs(): ] dataset = Dataset(cases=cases, 
evaluator=SimpleEvaluator()) - def list_task(input_val): - return input_val + def list_task(case): + return case.input report = dataset.run_evaluations(list_task) @@ -195,8 +195,8 @@ async def test_integration_async_dataset_with_simple_evaluator(cases): """Test async workflow: Dataset + Cases + SimpleEvaluator""" dataset = Dataset(cases=cases, evaluator=SimpleEvaluator()) - def echo_task(input_val): - return input_val + def echo_task(case): + return case.input report = await dataset.run_evaluations_async(echo_task) @@ -215,9 +215,9 @@ async def test_integration_async_dataset_with_async_task(cases): """Test async workflow with async task function""" dataset = Dataset(cases=cases, evaluator=SimpleEvaluator()) - async def async_echo_task(input_val): + async def async_echo_task(case): await asyncio.sleep(0.01) # Simulate async work - return input_val + return case.input report = await dataset.run_evaluations_async(async_echo_task) @@ -240,8 +240,8 @@ async def test_integration_async_dataset_with_output_evaluator(mock_agent_class, output_evaluator = OutputEvaluator(rubric="Test if outputs match exactly") dataset = Dataset(cases=cases, evaluator=output_evaluator) - def simple_task(input_val): - return f"processed_{input_val}" + def simple_task(case): + return f"processed_{case.input}" report = await dataset.run_evaluations_async(simple_task) @@ -259,9 +259,9 @@ async def test_integration_async_dataset_concurrency(): dataset = Dataset(cases=many_cases, evaluator=SimpleEvaluator()) # Create a task with noticeable delay - async def slow_task(input_val): + async def slow_task(case): await asyncio.sleep(0.1) # Each task takes 0.1s - return input_val + return case.input # Time the execution start_time = asyncio.get_event_loop().time() @@ -285,7 +285,7 @@ def test_dataset_with_interactions_evaluator(mock_agent_class, interaction_case, interactions_evaluator = InteractionsEvaluator(rubric="Test if interactions match expected sequence") dataset = Dataset(cases=interaction_case, 
evaluator=interactions_evaluator) - def task_with_interactions(input_val): + def task_with_interactions(case): return { "output": "world", "interactions": [ @@ -308,7 +308,7 @@ async def test_async_dataset_with_interactions(interaction_case): """Test async Dataset with interactions data""" dataset = Dataset(cases=interaction_case, evaluator=SimpleEvaluator()) - async def async_interactions_task(input_val): + async def async_interactions_task(case): await asyncio.sleep(0.01) return { "output": "world",