From 5aa4cbb6cb3d083db8ecd626a132eb3348a3799e Mon Sep 17 00:00:00 2001 From: Jonathan Buck Date: Thu, 6 Nov 2025 09:17:46 -0800 Subject: [PATCH] feat: add ActorSimulator for multi-turn conversation evaluation Introduces ActorSimulator framework for simulating realistic actors (typically users) in multi-turn conversations with agents under test. Enables systematic evaluation of conversational agents through synthetic user interactions. Key capabilities: 1. Generic ActorSimulator class configurable with arbitrary system prompts 2. from_case_for_user_simulator() factory method to condition simulator to act as a user on the basis of a given Case 3. Automatic profile generation from test cases using LLM inference 4. Built-in goal completion assessment tool for conversation evaluation 5. Support for custom tools and behaviors Design principles: 1. Generic base class (ActorSimulator) with specialized factory methods 2. Clear separation: init() for generic construction, factory for specialization 3. 
Optional task_description in Case metadata (handles vague initial queries) --- src/__init__.py | 0 src/examples/actor_simulator_basic.py | 63 ++++ src/examples/agents_as_tools.py | 4 +- src/examples/bank_tools_trajectory.py | 4 +- .../dataset_generator/simple_dataset.py | 5 +- src/examples/evaluate_graph.py | 4 +- src/examples/evaluate_swarm.py | 4 +- src/examples/multi_shots.py | 4 +- src/examples/safety_judge_output.py | 4 +- src/examples/third_party_evaluator.py | 8 +- src/strands_evals/__init__.py | 6 +- src/strands_evals/dataset.py | 12 +- src/strands_evals/simulation/README.md | 323 ++++++++++++++++++ src/strands_evals/simulation/__init__.py | 6 + .../simulation/actor_simulator.py | 292 ++++++++++++++++ .../simulation/profiles/__init__.py | 5 + .../simulation/profiles/actor_profile.py | 26 ++ .../simulation/prompt_templates/__init__.py | 11 + .../actor_profile_extraction.py | 25 ++ .../prompt_templates/actor_system_prompt.py | 64 ++++ .../prompt_templates/goal_completion.py | 27 ++ .../simulation/tools/__init__.py | 5 + .../simulation/tools/goal_completion.py | 93 +++++ src/strands_evals/types/__init__.py | 10 +- .../types/simulation/__init__.py | 5 + src/strands_evals/types/simulation/actor.py | 34 ++ tests/strands_evals/simulation/__init__.py | 1 + .../simulation/test_actor_simulator.py | 213 ++++++++++++ .../simulation/test_goal_completion.py | 196 +++++++++++ tests/strands_evals/test_dataset.py | 78 ++--- tests/test_integration.py | 44 +-- 31 files changed, 1489 insertions(+), 87 deletions(-) create mode 100644 src/__init__.py create mode 100644 src/examples/actor_simulator_basic.py create mode 100644 src/strands_evals/simulation/README.md create mode 100644 src/strands_evals/simulation/__init__.py create mode 100644 src/strands_evals/simulation/actor_simulator.py create mode 100644 src/strands_evals/simulation/profiles/__init__.py create mode 100644 src/strands_evals/simulation/profiles/actor_profile.py create mode 100644 
src/strands_evals/simulation/prompt_templates/__init__.py create mode 100644 src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py create mode 100644 src/strands_evals/simulation/prompt_templates/actor_system_prompt.py create mode 100644 src/strands_evals/simulation/prompt_templates/goal_completion.py create mode 100644 src/strands_evals/simulation/tools/__init__.py create mode 100644 src/strands_evals/simulation/tools/goal_completion.py create mode 100644 src/strands_evals/types/simulation/__init__.py create mode 100644 src/strands_evals/types/simulation/actor.py create mode 100644 tests/strands_evals/simulation/__init__.py create mode 100644 tests/strands_evals/simulation/test_actor_simulator.py create mode 100644 tests/strands_evals/simulation/test_goal_completion.py diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/examples/actor_simulator_basic.py b/src/examples/actor_simulator_basic.py new file mode 100644 index 0000000..641e36d --- /dev/null +++ b/src/examples/actor_simulator_basic.py @@ -0,0 +1,63 @@ +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter +from strands import Agent + +from strands_evals import ActorSimulator, Case, Dataset +from strands_evals.evaluators import HelpfulnessEvaluator +from strands_evals.mappers import StrandsInMemorySessionMapper +from strands_evals.telemetry import StrandsEvalsTelemetry + +# ====================================== +# SETUP TELEMETRY +# ====================================== +telemetry = StrandsEvalsTelemetry() +memory_exporter = InMemorySpanExporter() +span_processor = BatchSpanProcessor(memory_exporter) +telemetry.tracer_provider.add_span_processor(span_processor) + + +# ====================================== +# SETUP AND RUN STRANDS EVAL +# ====================================== + + +def task_function(case: Case) -> dict: + # Create 
simulator + user_sim = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=3) + + # Create target agent + agent = Agent(system_prompt="You are a helpful travel assistant.", callback_handler=None) + + # Accumulate target spans across all turns + all_target_spans = [] + + user_message = case.input + while user_sim.has_next(): + # Clear before each target agent call to ensure we don't capture simulator traces. + memory_exporter.clear() + agent_response = agent(user_message) + agent_message = str(agent_response) + turn_spans = list(memory_exporter.get_finished_spans()) + all_target_spans.extend(turn_spans) + user_result = user_sim.act(agent_message) + user_message = str(user_result.structured_output.message) + + mapper = StrandsInMemorySessionMapper() + session = mapper.map_to_session(all_target_spans, session_id="test-session") + + return {"output": agent_message, "trajectory": session} + + +test_cases = [ + Case[str, str]( + name="booking-simple", + input="I need to book a flight to Paris next week", + metadata={"category": "booking", "task_description": "Flight booking confirmed"}, + ) +] + +evaluator = HelpfulnessEvaluator() +dataset = Dataset[str, str](cases=test_cases, evaluator=evaluator) + +report = dataset.run_evaluations(task_function) +report.run_display() diff --git a/src/examples/agents_as_tools.py b/src/examples/agents_as_tools.py index 3daa40a..6d0c1bd 100644 --- a/src/examples/agents_as_tools.py +++ b/src/examples/agents_as_tools.py @@ -102,7 +102,7 @@ async def async_agents_as_tools_example(): """ ### Step 1: Define task ### - def customer_support(task: str): + def customer_support(case: Case): @tool def technical_support(query: str) -> str: """Handle technical issues, bugs, and troubleshooting.""" @@ -158,7 +158,7 @@ def returns_exchanges(query: str) -> str: callback_handler=None, tools=[technical_support, billing_support, product_info, returns_exchanges], ) - response = orchestrator(task) + response = orchestrator(case.input) description 
= tools_use_extractor.extract_tools_description(orchestrator) trajectory_evaluator.update_trajectory_description(description) interaction_evaluator.update_interaction_description(description) diff --git a/src/examples/bank_tools_trajectory.py b/src/examples/bank_tools_trajectory.py index 9ad7b5a..6f52b36 100644 --- a/src/examples/bank_tools_trajectory.py +++ b/src/examples/bank_tools_trajectory.py @@ -74,7 +74,7 @@ async def async_descriptive_tools_trajectory_example(): """ ### Step 1: Define task ### - async def get_response(query: str) -> dict: + async def get_response(case: Case) -> dict: bank_prompt = ( "You are a banker, ensure that only people with sufficient balance can spend them." " Collect debt from people with negative balance." @@ -83,7 +83,7 @@ async def get_response(query: str) -> dict: agent = Agent( tools=[get_balance, modify_balance, collect_debt], system_prompt=bank_prompt, callback_handler=None ) - response = await agent.invoke_async(query) + response = await agent.invoke_async(case.input) trajectory_evaluator.update_trajectory_description(tools_use_extractor.extract_tools_description(agent)) return TaskOutput( output=str(response), trajectory=tools_use_extractor.extract_agent_tools_used_from_messages(agent.messages) diff --git a/src/examples/dataset_generator/simple_dataset.py b/src/examples/dataset_generator/simple_dataset.py index 4d1c634..7e4f96e 100644 --- a/src/examples/dataset_generator/simple_dataset.py +++ b/src/examples/dataset_generator/simple_dataset.py @@ -2,6 +2,7 @@ from strands import Agent +from strands_evals import Case from strands_evals.evaluators.output_evaluator import OutputEvaluator from strands_evals.generators.dataset_generator import DatasetGenerator @@ -21,12 +22,12 @@ async def simple_dataset_generator(): """ ### Step 1: Define task ### - async def get_response(query: str) -> str: + async def get_response(case: Case) -> str: """ Simple task example to get a response from an agent given a query. 
""" agent = Agent(system_prompt="Be as concise as possible", callback_handler=None) - response = await agent.invoke_async(query) + response = await agent.invoke_async(case.input) return str(response) # Step 2: Initialize the dataset generator for string types diff --git a/src/examples/evaluate_graph.py b/src/examples/evaluate_graph.py index 0159e42..dfc8f0d 100644 --- a/src/examples/evaluate_graph.py +++ b/src/examples/evaluate_graph.py @@ -25,7 +25,7 @@ async def async_graph_example(): """ ### Step 1: Define task ### - def research_graph(task: str): + def research_graph(case: Case): # Create specialized agents researcher = Agent(name="researcher", system_prompt="You are a research specialist...") analyst = Agent(name="analyst", system_prompt="You are a data analysis specialist...") @@ -52,7 +52,7 @@ def research_graph(task: str): # Build the graph graph = builder.build() - result = graph(task) + result = graph(case.input) interactions = graph_extractor.extract_graph_interactions(result) return {"interactions": interactions, "trajectory": [node.node_id for node in result.execution_order]} diff --git a/src/examples/evaluate_swarm.py b/src/examples/evaluate_swarm.py index a19a4fe..d0584ae 100644 --- a/src/examples/evaluate_swarm.py +++ b/src/examples/evaluate_swarm.py @@ -25,7 +25,7 @@ async def async_swarm_example(): """ ### Step 1: Define task ### - def sde_swarm(task: str): + def sde_swarm(case: Case): # Create specialized agents researcher = Agent(name="researcher", system_prompt="You are a research specialist...", callback_handler=None) coder = Agent(name="coder", system_prompt="You are a coding specialist...", callback_handler=None) @@ -45,7 +45,7 @@ def sde_swarm(task: str): repetitive_handoff_min_unique_agents=2, ) - result = swarm(task) + result = swarm(case.input) interaction_info = swarm_extractor.extract_swarm_interactions(result) return {"interactions": interaction_info, "trajectory": [node.node_id for node in result.node_history]} diff --git 
a/src/examples/multi_shots.py b/src/examples/multi_shots.py index 3278f6f..29ae7e7 100644 --- a/src/examples/multi_shots.py +++ b/src/examples/multi_shots.py @@ -24,7 +24,7 @@ async def async_multi_shots_interactions(): """ ### Step 1: Define task ### - def multi_turns_hacking(query: str) -> str: + def multi_turns_hacking(case: Case) -> str: """ Simulates a multi-turn adversarial conversation to test agent safety. @@ -38,7 +38,7 @@ def multi_turns_hacking(query: str) -> str: agent = Agent(system_prompt="Be as concise as possible", callback_handler=None) new_input = [] - agent_repsonse = query + agent_repsonse = case.input hacker_response = None interactions = [] turns = 5 diff --git a/src/examples/safety_judge_output.py b/src/examples/safety_judge_output.py index 617f1cd..9989ebd 100644 --- a/src/examples/safety_judge_output.py +++ b/src/examples/safety_judge_output.py @@ -23,12 +23,12 @@ async def async_safety_output_judge_example(): """ ### Step 1: Define task ### - async def get_response(query: str) -> str: + async def get_response(case: Case) -> str: """ Simple task example to get a response from an agent given a query. 
""" agent = Agent(system_prompt="Be as concise as possible", callback_handler=None) - response = await agent.invoke_async(query) + response = await agent.invoke_async(case.input) return str(response) ### Step 2: Create test cases ### diff --git a/src/examples/third_party_evaluator.py b/src/examples/third_party_evaluator.py index e7fd758..174238d 100644 --- a/src/examples/third_party_evaluator.py +++ b/src/examples/third_party_evaluator.py @@ -30,9 +30,9 @@ def third_party_example(): """ ### Step 1: Define task ### - def get_response(query: str) -> str: + def get_response(case: Case) -> str: agent = Agent(callback_handler=None) - return str(agent(query)) + return str(agent(case.input)) ### Step 2: Create test cases ### test_case1 = Case[str, str]( @@ -105,9 +105,9 @@ async def async_third_party_example(): """ ### Step 1: Define task ### - async def get_response(query: str) -> str: + async def get_response(case: Case) -> str: agent = Agent(system_prompt="Be as concise as possible", callback_handler=None) - response = await agent.invoke_async(query) + response = await agent.invoke_async(case.input) return str(response) ### Step 2: Create test cases ### diff --git a/src/strands_evals/__init__.py b/src/strands_evals/__init__.py index 8548f25..137906b 100644 --- a/src/strands_evals/__init__.py +++ b/src/strands_evals/__init__.py @@ -1,8 +1,9 @@ __version__ = "0.1.0" -from . import evaluators, extractors, generators, telemetry, types +from . 
import evaluators, extractors, generators, simulation, telemetry, types from .case import Case from .dataset import Dataset +from .simulation import ActorSimulator, UserSimulator from .telemetry import StrandsEvalsTelemetry, get_tracer __all__ = [ @@ -12,7 +13,10 @@ "extractors", "types", "generators", + "simulation", "telemetry", "StrandsEvalsTelemetry", "get_tracer", + "ActorSimulator", + "UserSimulator", ] diff --git a/src/strands_evals/dataset.py b/src/strands_evals/dataset.py index dbd7d71..f60eab1 100644 --- a/src/strands_evals/dataset.py +++ b/src/strands_evals/dataset.py @@ -104,7 +104,7 @@ def evaluator(self, new_evaluator: Evaluator[InputT, OutputT]): self._evaluator = new_evaluator def _run_task( - self, task: Callable[[InputT], OutputT | dict[str, Any]], case: Case[InputT, OutputT] + self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]], case: Case[InputT, OutputT] ) -> EvaluationData[InputT, OutputT]: """ Run the task with the inputs from the test case. @@ -128,7 +128,7 @@ def _run_task( expected_interactions=case.expected_interactions, metadata=case.metadata, ) - task_output = task(case.input) + task_output = task(case) if isinstance(task_output, dict): # could be evaluating the trajectory as well evaluation_context.actual_output = task_output.get("output") evaluation_context.actual_trajectory = task_output.get("trajectory") @@ -141,7 +141,7 @@ def _run_task( return evaluation_context async def _run_task_async( - self, task: Callable[[InputT], OutputT | dict[str, Any]], case: Case[InputT, OutputT] + self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]], case: Case[InputT, OutputT] ) -> EvaluationData[InputT, OutputT]: """ Run the task with the inputs from the test case asynchronously. 
@@ -167,10 +167,10 @@ async def _run_task_async( # Handle both async and sync tasks if asyncio.iscoroutinefunction(task): - task_output = await task(case.input) + task_output = await task(case) else: # Run sync function in separate thread to avoid blocking - task_output = await asyncio.to_thread(task, case.input) + task_output = await asyncio.to_thread(task, case) if isinstance(task_output, dict): evaluation_context.actual_output = task_output.get("output") @@ -277,7 +277,7 @@ async def _worker(self, queue: asyncio.Queue, task: Callable, results: list): finally: queue.task_done() - def run_evaluations(self, task: Callable[[InputT], OutputT | dict[str, Any]]) -> EvaluationReport: + def run_evaluations(self, task: Callable[[Case[InputT, OutputT]], OutputT | dict[str, Any]]) -> EvaluationReport: """ Run the evaluations for all of the test cases with the evaluator. diff --git a/src/strands_evals/simulation/README.md b/src/strands_evals/simulation/README.md new file mode 100644 index 0000000..5e4b158 --- /dev/null +++ b/src/strands_evals/simulation/README.md @@ -0,0 +1,323 @@ +# Actor Simulator + +A framework for simulating realistic multi-turn conversations with AI-powered actors for agent evaluation. + +## Overview + +ActorSimulator creates realistic actor personas that interact with agents in multi-turn conversations. It automatically generates actor profiles from test cases, maintains conversation context, and produces contextually appropriate responses aligned with the actor's goals and traits. 
+ +## Quick Start + +```python +from strands import Agent +from strands_evals import ActorSimulator, Case + +# Create agent under test +agent = Agent(system_prompt="You are a helpful travel assistant.", callback_handler=None) + +# Create test case +case = Case( + input="I want to plan a trip to Tokyo with hotel and activities", + metadata={"task_description": "Complete travel package arranged"} +) + +# Create user simulator with max_turns +user_sim = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=5) + +# Run conversation +user_message = case.input +while user_sim.has_next(): + agent_response = agent(user_message) + user_result = user_sim.act(str(agent_response)) + user_message = str(user_result.structured_output.message) +``` + +## How It Works + +1. **Profile Generation**: Creates a realistic actor profile with traits, context, and goals from the test case +2. **Conversation Initialization**: Sets up conversation with a greeting and the actor's initial query +3. **Contextual Responses**: Generates responses that maintain consistency with the actor's profile and goals +4. **Goal Tracking**: Built-in tool allows actors to assess progress toward their goals + +## API Reference + +### ActorSimulator + +Main class for simulating actor behavior in conversations. + +#### Factory Method (Recommended) + +```python +ActorSimulator.from_case_for_user_simulator( + case: Case, + system_prompt_template: str | None = None, + tools: list | None = None, + model: str | None = None, + max_turns: int = 10 +) -> ActorSimulator +``` + +Creates an ActorSimulator configured as a user simulator from a test case. Automatically generates a realistic actor profile from `case.input` and optionally `case.metadata["task_description"]`. 
+ +**Parameters:** +- `case`: Test case with input (initial query) and optional task_description in metadata +- `system_prompt_template`: Custom system prompt template (uses default if None) +- `tools`: Additional tools for the actor (defaults to goal completion tool only) +- `model`: Model identifier (uses Strands default if None) +- `max_turns`: Maximum number of conversation turns (default: 10) + +**Example:** +```python +case = Case( + input="I need help booking a flight to Paris", + metadata={"task_description": "Book round-trip flight under $800"} +) + +user_sim = ActorSimulator.from_case_for_user_simulator( + case=case, + max_turns=5 +) +``` + +#### Direct Initialization + +```python +ActorSimulator( + actor_profile: ActorProfile, + initial_query: str, + system_prompt_template: str, + tools: list | None = None, + model: str | None = None, + max_turns: int = 10 +) +``` + +Initialize with an existing actor profile. Use this when you have a pre-defined profile instead of generating one from a test case. + +**Parameters:** +- `actor_profile`: ActorProfile object with traits, context, and actor_goal +- `initial_query`: The actor's first query or message +- `system_prompt_template`: Template string for actor behavior (formatted with profile) +- `tools`: Additional tools for the actor +- `model`: Model identifier +- `max_turns`: Maximum number of conversation turns (default: 10) + +#### Methods + +**`act(agent_message: str) -> AgentResult`** + +Generate the actor's next message in response to the agent's message. + +**Parameters:** +- `agent_message`: The agent's response to react to + +**Returns:** +- `AgentResult` containing the actor's structured response with reasoning and message + +**Example:** +```python +agent_response = agent("I can help you book that flight") +user_result = user_sim.act(str(agent_response)) +user_message = str(user_result.structured_output.message) +``` + +**`has_next() -> bool`** + +Check if the conversation should continue. 
Returns False if the stop token (``) is present in the last message or if the maximum number of turns has been reached. + +**Returns:** +- `True` if the conversation should continue, `False` otherwise + +**Example:** +```python +while user_sim.has_next(): + agent_response = agent(user_message) + user_result = user_sim.act(str(agent_response)) + user_message = str(user_result.structured_output.message) +``` + +### Data Models + +**ActorProfile:** +```python +class ActorProfile(BaseModel): + traits: dict[str, Any] # Actor characteristics and personality + context: str # Background information and situation + actor_goal: str # What the actor wants to achieve +``` + +**ActorResponse:** +```python +class ActorResponse(BaseModel): + reasoning: str # Actor's internal reasoning process + message: str # The actual message to send +``` + +## Usage Examples + +### Complete Multi-Turn Conversation Example + +```python +from strands import Agent +from strands_evals import ActorSimulator, Case + +# Create agent under test +agent = Agent(system_prompt="You are a helpful travel assistant.", callback_handler=None) + +# Create test case +case = Case( + input="I want to plan a trip to Tokyo with hotel and activities", + metadata={"task_description": "Complete travel package arranged"} +) + +# Create user simulator +user_sim = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=5) + +# Run conversation +conversation = [] +user_message = case.input + +while user_sim.has_next(): + # Agent responds + agent_response = agent(user_message) + agent_message = str(agent_response) + conversation.append({"role": "assistant", "content": agent_message}) + + # User responds + user_result = user_sim.act(agent_message) + user_message = str(user_result.structured_output.message) + conversation.append({"role": "user", "content": user_message}) + +print(f"Conversation completed in {len(conversation) // 2} turns") +``` + +### Custom Actor Profile + +```python +from 
strands_evals.types.simulation import ActorProfile +from strands_evals.simulation.prompt_templates.actor_system_prompt import ( + DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE +) + +# Create custom actor profile +actor_profile = ActorProfile( + traits={ + "personality": "analytical and detail-oriented", + "communication_style": "direct and concise", + "technical_level": "expert" + }, + context="Experienced business traveler with elite status", + actor_goal="Book business class flight with specific seat preferences" +) + +# Initialize with custom profile +user_sim = ActorSimulator( + actor_profile=actor_profile, + initial_query="I need to book a business class flight to London", + system_prompt_template=DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE, + max_turns=15 +) +``` + +## Tools + +### Built-in Goal Completion Tool + +ActorSimulator automatically includes a goal completion assessment tool that actors can use to evaluate their progress: + +```python +from strands_evals.simulation.tools.goal_completion import ( + get_conversation_goal_completion +) + +# The actor can call this tool during conversation to assess progress +assessment = get_conversation_goal_completion( + initial_goal="Book a flight to Tokyo", + conversation=[ + {"role": "user", "content": "I need a flight to Tokyo"}, + {"role": "assistant", "content": "I can help with that..."} + ] +) +# Returns assessment with score and reasoning +``` + +### Adding Custom Tools + +Extend actor capabilities with custom tools: + +```python +from strands import tool + +@tool +def check_booking_status(booking_id: str) -> str: + """Check the status of a booking.""" + return f"Booking {booking_id} is confirmed" + +# Add custom tools to the simulator +user_sim = ActorSimulator.from_case_for_user_simulator( + case=case, + tools=[check_booking_status] +) +``` + +## Advanced Configuration + +### Custom System Prompt Templates + +Customize actor behavior with a custom system prompt template. 
The template receives the actor profile as a format parameter: + +```python +custom_prompt_template = """ +You are simulating a user with the following profile: +{actor_profile} + +Behavior guidelines: +- Be persistent but professional +- Express concerns clearly +- Stay focused on your goal + +Respond naturally based on your profile and the conversation context. +""" + +user_sim = ActorSimulator.from_case_for_user_simulator( + case=case, + system_prompt_template=custom_prompt_template +) +``` + +### Conversation Initialization + +ActorSimulator automatically initializes conversations with a random greeting from a predefined set: + +```python +# Built-in greetings: +# - "hi! how can I help you today?" +# - "hello! what can I assist you with?" +# - "hi there! how may I help you?" +# - "good day! what can I do for you?" +# - "hello! what would you like to know?" + +# The conversation starts with: +# 1. Random greeting (as user message) +# 2. Actor's initial query (as assistant message) +``` + +### Model Selection + +Specify a custom model for the actor simulator: + +```python +user_sim = ActorSimulator.from_case_for_user_simulator( + case=case, + model="anthropic.claude-3-5-sonnet-20241022-v2:0", + max_turns=10 +) +``` + +## Best Practices + +1. **Include Task Description**: Add `task_description` in case metadata for better goal generation +2. **Set max_turns**: Configure `max_turns` during initialization to prevent infinite conversations +3. **Use has_next()**: Always use `has_next()` in your conversation loop to respect turn limits and stop tokens +4. **Track Conversation**: Append messages to a conversation list for evaluation and debugging +5. 
**Access Structured Output**: Use `result.structured_output.message` to get the actor's message and `result.structured_output.reasoning` to see internal reasoning \ No newline at end of file diff --git a/src/strands_evals/simulation/__init__.py b/src/strands_evals/simulation/__init__.py new file mode 100644 index 0000000..6a4be0f --- /dev/null +++ b/src/strands_evals/simulation/__init__.py @@ -0,0 +1,6 @@ +from .actor_simulator import ActorSimulator + +# Alias for backward compatibility +UserSimulator = ActorSimulator + +__all__ = ["ActorSimulator", "UserSimulator"] diff --git a/src/strands_evals/simulation/actor_simulator.py b/src/strands_evals/simulation/actor_simulator.py new file mode 100644 index 0000000..fb1d9c3 --- /dev/null +++ b/src/strands_evals/simulation/actor_simulator.py @@ -0,0 +1,292 @@ +import logging +import random + +from strands import Agent +from strands.agent.agent_result import AgentResult +from strands.types.content import Message +from typing_extensions import cast + +from strands_evals.case import Case +from strands_evals.simulation.profiles.actor_profile import DEFAULT_USER_PROFILE_SCHEMA +from strands_evals.simulation.prompt_templates.actor_profile_extraction import ACTOR_PROFILE_PROMPT_TEMPLATE +from strands_evals.simulation.prompt_templates.actor_system_prompt import DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE +from strands_evals.simulation.tools.goal_completion import get_conversation_goal_completion +from strands_evals.types.simulation import ActorProfile, ActorResponse + +logger = logging.getLogger(__name__) + + +class ActorSimulator: + """ + Simulates an actor in multi-turn conversations for agent evaluation. + + ActorSimulator wraps a Strands Agent configured to behave as a specific actor + (typically a user) in conversation scenarios. It maintains conversation history, + generates contextually appropriate responses, and can assess goal completion. + + Attributes: + agent: The underlying Strands Agent configured with actor behavior. 
+ actor_profile: The actor's profile containing traits, context, and goal. + initial_query: The actor's first query in the conversation. + conversation_history: List of conversation messages in Strands format. + model_id: Model identifier for the underlying agent. + """ + + INITIAL_GREETINGS = [ + "hi! how can I help you today?", + "hello! what can I assist you with?", + "hi there! how may I help you?", + "good day! what can I do for you?", + "hello! what would you like to know?", + ] + + @classmethod + def from_case_for_user_simulator( + cls, + case: Case, + system_prompt_template: str | None = None, + tools: list | None = None, + model: str | None = None, + max_turns: int = 10, + ) -> "ActorSimulator": + """ + Create an ActorSimulator configured as a user simulator from a test case. + + Generates a realistic user profile and goal from case.input and optionally + case.metadata["task_description"], then configures the simulator with + user-specific defaults. If you already have a profile, use __init__() directly. + + Args: + case: Test case containing input (initial query) and optional metadata with "task_description". + system_prompt_template: Custom system prompt template. Uses DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE if None. + tools: Additional tools available to the user. Defaults to goal completion tool only. + model: Model identifier for the underlying agent. Uses Strands default if None. + max_turns: Maximum number of conversation turns before stopping (default: 10). + + Returns: + ActorSimulator configured for user simulation. 
+ + Example: + ```python + from strands_evals import Case, ActorSimulator + from strands import Agent + + # Create test case + case = Case( + input="I need to book a flight to Paris", + metadata={"task_description": "Flight booking confirmed"} + ) + + # Create user simulator + user_sim = ActorSimulator.from_case_for_user_simulator( + case=case, + max_turns=5 + ) + + # Create target agent to evaluate + agent = Agent(system_prompt="You are a travel assistant.") + + # Run conversation + user_message = case.input + while user_sim.has_next(): + agent_response = agent(user_message) + user_result = user_sim.act(str(agent_response)) + user_message = str(user_result.structured_output.message) + ``` + """ + actor_profile = cls._generate_profile_from_case(case) + + if system_prompt_template is None: + system_prompt_template = DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE + + return cls( + actor_profile=actor_profile, + initial_query=case.input, + system_prompt_template=system_prompt_template, + tools=tools, + model=model, + max_turns=max_turns, + ) + + @staticmethod + def _generate_profile_from_case(case: Case) -> ActorProfile: + """ + Generate user profile from case. + + Private helper for from_case_for_user_simulator factory method. + Uses case.input and optionally case.metadata["task_description"] if present. + + Args: + case: Test case with input and optional task_description in metadata. + + Returns: + ActorProfile with generated traits, context, and goal. 
+ """ + initial_query = case.input + task_description = case.metadata.get("task_description", "") if case.metadata else "" + + profile_prompt = ACTOR_PROFILE_PROMPT_TEMPLATE.format( + initial_query=initial_query, + task_description=task_description, + example=DEFAULT_USER_PROFILE_SCHEMA, + ) + profile_agent = Agent(callback_handler=None) + result = profile_agent(profile_prompt, structured_output_model=ActorProfile) + return result.structured_output + + def __init__( + self, + actor_profile: ActorProfile, + initial_query: str, + system_prompt_template: str, + tools: list | None = None, + model: str | None = None, + max_turns: int = 10, + ): + """ + Initialize an ActorSimulator with profile and goal. + + Use this constructor when you have a pre-defined ActorProfile. For automatic + profile generation from test cases, use from_case_for_user_simulator() instead. + + Args: + actor_profile: ActorProfile object containing traits, context, and actor_goal. + initial_query: The actor's first query or message. + system_prompt_template: Template string for system prompt. Must include {actor_profile} placeholder. + tools: Additional tools available to the actor. Defaults to goal completion tool only. + model: Model identifier for the underlying agent. Uses Strands default if None. + max_turns: Maximum number of conversation turns before stopping (default: 10). + + Example: + ```python + from strands_evals.simulation import ActorSimulator + from strands_evals.types.simulation import ActorProfile + + # Define custom actor profile + profile = ActorProfile( + traits={ + "expertise_level": "expert", + "communication_style": "technical" + }, + context="A software engineer debugging a production issue.", + actor_goal="Identify and resolve the memory leak." 
+ ) + + # Create simulator with custom profile + simulator = ActorSimulator( + actor_profile=profile, + initial_query="Our service is experiencing high memory usage.", + system_prompt_template="You are simulating: {actor_profile}", + max_turns=15 + ) + ``` + """ + self.actor_profile = actor_profile + self.initial_query = initial_query + self.conversation_history: list[Message] = [] + self.model_id = model + self._turn_count = 0 + self._last_message = "" + self._max_turns = max_turns + + system_prompt = system_prompt_template.format(actor_profile=actor_profile.model_dump()) + + # Combine tools + all_tools = [get_conversation_goal_completion] + if tools: + all_tools.extend(tools) + + self._initialize_conversation() + + # Create agent + self.agent = Agent( + system_prompt=system_prompt, + messages=self.conversation_history, + tools=all_tools, + model=self.model_id, + callback_handler=None, + ) + + def _initialize_conversation(self): + """ + Initialize the conversation history with a greeting and initial query. + + Sets up the conversation with a random greeting from the assistant followed + by the actor's initial query. This establishes the conversation context. + + Note: This is a private method called during initialization. + """ + selected_greeting = random.choice(self.INITIAL_GREETINGS) + greeting_message = {"role": "user", "content": [{"text": selected_greeting}]} + self.conversation_history.append(greeting_message) + + initial_query_message = {"role": "assistant", "content": [{"text": self.initial_query.strip()}]} + self.conversation_history.append(initial_query_message) + + def act(self, agent_message: str) -> AgentResult: + """ + Generate the next actor message in the conversation. + + Processes the agent's message and generates a contextually appropriate + response from the actor's perspective, maintaining consistency with the actor's + profile and goal. The response includes reasoning about the actor's thought + process and the actual message to send. 
+ + Args: + agent_message: The agent's response to react to (required). + + Returns: + AgentResult containing the actor's structured response with: + - structured_output.reasoning: Actor's internal reasoning + - structured_output.message: Actor's response message + + Example: + ```python + # Agent responds to user + agent_response = agent("I need help booking a flight") + + # User simulator generates next message + user_result = user_sim.act(str(agent_response)) + + # Access the response + print(user_result.structured_output.reasoning) # Why the actor responded this way + print(user_result.structured_output.message) # The actual message + + # Continue conversation + next_message = str(user_result.structured_output.message) + ``` + """ + response = self.agent(agent_message.strip(), structured_output_model=ActorResponse) + self._turn_count += 1 + self._last_message = str(cast(ActorResponse, response.structured_output).message) + return response + + def has_next(self) -> bool: + """ + Check if the conversation should continue. + + Returns False if the stop token is present in the last message or if + the maximum number of turns has been reached. Use this in a loop to control + multi-turn conversations. + + Returns: + True if the conversation should continue, False otherwise. 
+ + Example: + ```python + user_message = case.input + + # Continue conversation until completion + while user_sim.has_next(): + agent_response = agent(user_message) + user_result = user_sim.act(str(agent_response)) + user_message = str(user_result.structured_output.message) + + # Conversation ended either by: + # - Actor including token in message + # - Reaching max_turns limit + ``` + """ + if self._turn_count >= self._max_turns: + return False + return "" not in self._last_message diff --git a/src/strands_evals/simulation/profiles/__init__.py b/src/strands_evals/simulation/profiles/__init__.py new file mode 100644 index 0000000..6d4e1d5 --- /dev/null +++ b/src/strands_evals/simulation/profiles/__init__.py @@ -0,0 +1,5 @@ +"""Profile templates for actor simulation.""" + +from .actor_profile import DEFAULT_USER_PROFILE_SCHEMA + +__all__ = ["DEFAULT_USER_PROFILE_SCHEMA"] diff --git a/src/strands_evals/simulation/profiles/actor_profile.py b/src/strands_evals/simulation/profiles/actor_profile.py new file mode 100644 index 0000000..7d3c467 --- /dev/null +++ b/src/strands_evals/simulation/profiles/actor_profile.py @@ -0,0 +1,26 @@ +""" +Actor profile templates for simulation. + +This module provides actor profile structures used as templates +for generating realistic actor profiles in conversation simulation. 
+""" + +DEFAULT_USER_PROFILE_SCHEMA = { + "traits": { + "personal_profile": { + "identity": { + "first_name": "User", + "last_name": "Default", + "preferred_name": "User", + "gender": "other", + "birthdate": "1990-01-01", + "email": "user@example.com", + }, + "location": {"address1": "123 Main St", "city": "Default City", "province": "CA", "country": "USA"}, + "languages": [{"language": "English", "proficiency": "Advanced"}], + }, + "persona": "Friendly and helpful user seeking assistance with general topics.", + "supplementary_profile": "Default user profile for simulation.", + }, + "context": "some context", +} diff --git a/src/strands_evals/simulation/prompt_templates/__init__.py b/src/strands_evals/simulation/prompt_templates/__init__.py new file mode 100644 index 0000000..0d0771d --- /dev/null +++ b/src/strands_evals/simulation/prompt_templates/__init__.py @@ -0,0 +1,11 @@ +"""Prompt templates for actor simulation.""" + +from .actor_profile_extraction import ACTOR_PROFILE_PROMPT_TEMPLATE +from .actor_system_prompt import DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE +from .goal_completion import GOAL_COMPLETION_PROMPT + +__all__ = [ + "ACTOR_PROFILE_PROMPT_TEMPLATE", + "DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE", + "GOAL_COMPLETION_PROMPT", +] diff --git a/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py b/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py new file mode 100644 index 0000000..79623cd --- /dev/null +++ b/src/strands_evals/simulation/prompt_templates/actor_profile_extraction.py @@ -0,0 +1,25 @@ +""" +Prompt template for actor profile generation. + +This module contains the prompt template used to generate realistic actor profiles +from scenario information for conversation simulation. 
+""" + +from textwrap import dedent + +ACTOR_PROFILE_PROMPT_TEMPLATE = dedent("""Generate exactly 1 realistic actor profile for the following task: + +Actor's Initial Query: {initial_query} +Task Description: {task_description} + +Generate a complete actor profile with the following structure: +1. Traits: Key traits (as key-value pairs) +2. Context: Background context (as a paragraph in 2-3 sentences) +3. Actor Goal: What the actor ultimately wants to achieve in this interaction - should be + specific, actionable, and written from the actor's perspective + +IMPORTANT: Return JSON in the following format! IT MUST HAVE THE EXACT STRUCTURE YOU SEE HERE WITH EXACTLY THESE KEYS. + +{example} + +Be specific and realistic.""") diff --git a/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py b/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py new file mode 100644 index 0000000..2b863fd --- /dev/null +++ b/src/strands_evals/simulation/prompt_templates/actor_system_prompt.py @@ -0,0 +1,64 @@ +""" +Default system prompt for actor simulation. + +This module contains the default system prompt that configures the actor's behavior, +communication style, and response protocols for realistic conversation simulation. +""" + +from textwrap import dedent + +DEFAULT_USER_SIMULATOR_PROMPT_TEMPLATE = dedent("""## User Simulation + +Core Identity: +- You are simulating a user seeking assistance from an AI assistant +- You speak in first person only +- You strictly follow your defined User Goal and User Profile throughout the conversation + +## User Profile +{actor_profile} + + +Response Protocols: + When assistant requests information: + - Provide brief, specific information + - Maximum 2-3 sentences + + When assistant provides solutions/answers: + - Ask follow-ups, seek clarification, or express satisfaction. Do not deviate from the User Goal. + - While following up, do not increase the conversation scope beyond your User Goal. 
+ +Communication Rules: +1. STRICT maximum response length: 2-3 sentences +2. You are seeking help, NOT providing help - never give solutions! +3. Maintain your user profile and expertise level consistently +4. Express more of your user profile - let your background, expertise level, and personality + shine through in your responses +5. Don't break character by mentioning "assistant" or "AI" explicitly +6. Address AI assistant responses in second person ("Your suggestion..." not "The assistant's suggestion...") +7. Do not explicitly mention conversation redirection +8. Never include meta-references or self-instructions in your responses. These reveal you + are a simulator and are not how a real human would communicate. Don't write phrases like: + - I need to respond as the user would ... + - As the simulated user, I should ... + - Here's how the user might respond ... + - Based on my user goal, I need to ... +9. Use the Exit Conditions strictly to stick to User Goal. +10. Use all relevant tools first to ground your responses, and then respond + +Exit Conditions: +1. Use get_conversation_goal_completion tool to check if your User Goal is met. When your User Goal is met: + - Just generate "" to terminate conversation +2. If conversation becomes unproductive or unsafe: + - Naturally steer back towards your User Goal + - If this becomes impossible, just generate: "" to terminate conversation + +CRITICAL BEHAVIORAL CONSTRAINTS: +- You are ONLY a user seeking assistance, NEVER the one providing assistance. +- NEVER generate comprehensive responses, detailed plans, or extensive information. +- NEVER solve problems yourself - that's the assistant's job. Under no circumstances + may you use your tools to solve your user goal/sub goals. +- If you find yourself writing more than 3 sentences, you're doing it wrong. +- Generate only "" to terminate conversation + +Response Format: +Generate ONLY the next SHORT message (1-3 sentences). 
No explanations, no solutions, no comprehensive information.""") diff --git a/src/strands_evals/simulation/prompt_templates/goal_completion.py b/src/strands_evals/simulation/prompt_templates/goal_completion.py new file mode 100644 index 0000000..d27871c --- /dev/null +++ b/src/strands_evals/simulation/prompt_templates/goal_completion.py @@ -0,0 +1,27 @@ +""" +Goal completion assessment prompt template for actor simulation. + +This module contains the prompt template used to evaluate whether a conversation +has successfully achieved the actor's initial goals using a 3-point assessment scale. +""" + +from textwrap import dedent + +GOAL_COMPLETION_PROMPT = dedent( + """Please evaluate the following conversation against its intended goals using this +3-point assessment scale: + +1 = Does not meet the goal at all +2 = Partially meets the goal with significant gaps +3 = Fully meets the goal + +Initial Goal: +{initial_goal} + +Conversation to evaluate: +{conversation} + +Please provide: +- A score (1-3) +- Brief one line justification""" +) diff --git a/src/strands_evals/simulation/tools/__init__.py b/src/strands_evals/simulation/tools/__init__.py new file mode 100644 index 0000000..6d0145b --- /dev/null +++ b/src/strands_evals/simulation/tools/__init__.py @@ -0,0 +1,5 @@ +"""Tools for actor simulation.""" + +from .goal_completion import get_conversation_goal_completion + +__all__ = ["get_conversation_goal_completion"] diff --git a/src/strands_evals/simulation/tools/goal_completion.py b/src/strands_evals/simulation/tools/goal_completion.py new file mode 100644 index 0000000..f18abb5 --- /dev/null +++ b/src/strands_evals/simulation/tools/goal_completion.py @@ -0,0 +1,93 @@ +import logging + +from strands import Agent, tool +from typing_extensions import Any + +from strands_evals.simulation.prompt_templates.goal_completion import GOAL_COMPLETION_PROMPT + +logger = logging.getLogger(__name__) + + +@tool +def get_conversation_goal_completion(initial_goal: str, conversation: 
list[dict[str, str]]) -> str: + """ + Evaluate conversation goal completion using a 3-point assessment scale. + + Analyzes the conversation against the actor's initial goal and provides a score + with justification. + + Args: + initial_goal: The actor's original goal or objective. + conversation: List of conversation turns, each with 'role' and 'content' keys. + + Returns: + Assessment string with score (1-3) and brief justification. + + Raises: + ValueError: If the conversation format is invalid. + """ + # Format conversation for the prompt + conversation_text = _format_conversation_for_assessment(conversation) + + # Create the assessment prompt + prompt = GOAL_COMPLETION_PROMPT.format(initial_goal=initial_goal, conversation=conversation_text) + + goal_completion_agent = Agent(callback_handler=None) + response = goal_completion_agent(prompt) + logger.info("Successfully completed goal completion assessment") + return str(response) + + +def _format_conversation_for_assessment(conversation: list[dict[str, Any]]) -> str: + """ + Format conversation history for goal completion assessment. + + Args: + conversation: List of conversation turns with 'role' and 'content' keys. + Content can be either a string or a list of content blocks. + + Returns: + Formatted conversation string with each turn on a separate line. + + Raises: + ValueError: If conversation format is invalid. 
+ """ + try: + formatted_turns = [] + + for i, turn in enumerate(conversation): + if not isinstance(turn, dict): + raise ValueError(f"Conversation turn {i} must be a dictionary") + + role = turn.get("role", "").strip() + content_raw = turn.get("content", "") + + # Handle both string format and list of content blocks + if isinstance(content_raw, str): + content = content_raw.strip() + elif isinstance(content_raw, list): + content_parts = [] + for block in content_raw: + if isinstance(block, dict) and "text" in block: + content_parts.append(block["text"]) + content = " ".join(content_parts).strip() + else: + logger.warning(f"Skipping conversation turn {i} with invalid content type: {type(content_raw)}") + continue + + if not role or not content: + logger.warning(f"Skipping conversation turn {i} with missing role or content") + continue + + formatted_turn = f"{role.upper()}: {content}" + formatted_turns.append(formatted_turn) + + if not formatted_turns: + raise ValueError("No valid conversation turns found") + + return "\n\n".join(formatted_turns) + + except ValueError: + raise + except Exception as e: + raise ValueError("Error formatting conversation") from e diff --git a/src/strands_evals/types/__init__.py b/src/strands_evals/types/__init__.py index 77f81bf..60b322c 100644 --- a/src/strands_evals/types/__init__.py +++ b/src/strands_evals/types/__init__.py @@ -1,3 +1,11 @@ from .evaluation import EvaluationData, EvaluationOutput, Interaction, TaskOutput +from .simulation import ActorProfile, ActorResponse -__all__ = ["Interaction", "TaskOutput", "EvaluationData", "EvaluationOutput"] +__all__ = [ + "Interaction", + "TaskOutput", + "EvaluationData", + "EvaluationOutput", + "ActorProfile", + "ActorResponse", +] diff --git a/src/strands_evals/types/simulation/__init__.py b/src/strands_evals/types/simulation/__init__.py new file mode 100644 index 0000000..13a94b0 --- /dev/null +++ b/src/strands_evals/types/simulation/__init__.py @@ -0,0 +1,5 @@ +"""Data models for actor 
simulation.""" + +from .actor import ActorProfile, ActorResponse + +__all__ = ["ActorProfile", "ActorResponse"] diff --git a/src/strands_evals/types/simulation/actor.py b/src/strands_evals/types/simulation/actor.py new file mode 100644 index 0000000..d30be94 --- /dev/null +++ b/src/strands_evals/types/simulation/actor.py @@ -0,0 +1,34 @@ +from pydantic import BaseModel, Field +from typing_extensions import Any + + +class ActorProfile(BaseModel): + """ + Profile for actor simulation. + + Attributes: + traits: Dictionary of actor characteristics and attributes. + context: Supplementary background information about the actor. + actor_goal: What the actor ultimately wants to achieve in the interaction. + """ + + traits: dict[str, Any] = Field(..., description="Actor traits for simulation") + context: str = Field(..., description="Supplementary actor background details") + actor_goal: str = Field( + ..., + description="What the actor ultimately wants to achieve in this interaction - " + "should be specific, actionable, and written from the actor's perspective", + ) + + +class ActorResponse(BaseModel): + """ + Structured response from an actor. + + Attributes: + reasoning: Internal reasoning process for the response. + message: The actual message content from the actor. 
+ """ + + reasoning: str = Field(..., description="Reasoning for the actor's response") + message: str = Field(..., description="Message from the actor") diff --git a/tests/strands_evals/simulation/__init__.py b/tests/strands_evals/simulation/__init__.py new file mode 100644 index 0000000..9ad0280 --- /dev/null +++ b/tests/strands_evals/simulation/__init__.py @@ -0,0 +1 @@ +"""Tests for actor simulation module.""" diff --git a/tests/strands_evals/simulation/test_actor_simulator.py b/tests/strands_evals/simulation/test_actor_simulator.py new file mode 100644 index 0000000..c491a6f --- /dev/null +++ b/tests/strands_evals/simulation/test_actor_simulator.py @@ -0,0 +1,213 @@ +"""Tests for ActorSimulator class.""" + +from unittest.mock import MagicMock, patch + +import pytest +from strands.agent.agent_result import AgentResult + +from strands_evals import Case +from strands_evals.simulation import ActorSimulator +from strands_evals.types.simulation import ActorProfile, ActorResponse + + +@pytest.fixture +def sample_actor_profile(): + """Fixture providing a sample actor profile.""" + return ActorProfile( + traits={ + "expertise_level": "beginner", + "communication_style": "casual", + "patience_level": "high", + }, + context="A beginner user learning about travel planning.", + actor_goal="Book a complete trip to Tokyo including flights and hotel.", + ) + + +@pytest.fixture +def sample_case(): + """Fixture providing a sample test case.""" + return Case( + input="I want to plan a trip to Tokyo", + metadata={"task_description": "Complete travel package arranged"}, + ) + + +def test_actor_simulator_init(sample_actor_profile): + """Test ActorSimulator initialization with profile.""" + simulator = ActorSimulator( + actor_profile=sample_actor_profile, + initial_query="Hello, I need help", + system_prompt_template="Test prompt: {actor_profile}", + tools=None, + model=None, + ) + + assert simulator.actor_profile == sample_actor_profile + assert simulator.initial_query == "Hello, I 
need help" + assert simulator.agent is not None + assert len(simulator.conversation_history) == 2 # greeting + initial query + + +def test_initialize_conversation(sample_actor_profile): + """Test conversation initialization creates greeting and initial query.""" + simulator = ActorSimulator( + actor_profile=sample_actor_profile, + initial_query="I need help with travel", + system_prompt_template="Test: {actor_profile}", + ) + + history = simulator.conversation_history + assert len(history) == 2 + assert history[0]["role"] == "user" + assert any(greeting in history[0]["content"][0]["text"] for greeting in ActorSimulator.INITIAL_GREETINGS) + assert history[1]["role"] == "assistant" + assert history[1]["content"][0]["text"] == "I need help with travel" + + +@patch("strands_evals.simulation.actor_simulator.Agent") +def test_from_case_for_user_simulator(mock_agent_class, sample_case): + """Test factory method creates simulator from case.""" + # Mock the profile generation agent + mock_profile_agent = MagicMock() + mock_profile = ActorProfile( + traits={"test": "trait"}, + context="Test context", + actor_goal="Test goal", + ) + mock_result = MagicMock() + mock_result.structured_output = mock_profile + mock_profile_agent.return_value = mock_result + + # Mock the main simulator agent + mock_simulator_agent = MagicMock() + + # Configure mock to return different instances + mock_agent_class.side_effect = [mock_profile_agent, mock_simulator_agent] + + simulator = ActorSimulator.from_case_for_user_simulator(case=sample_case) + + assert simulator.actor_profile == mock_profile + assert simulator.initial_query == sample_case.input + assert mock_agent_class.call_count == 2 # Once for profile gen, once for simulator + + +@patch("strands_evals.simulation.actor_simulator.Agent") +def test_generate_profile_from_case(mock_agent_class, sample_case): + """Test profile generation from case.""" + mock_agent = MagicMock() + mock_profile = ActorProfile( + traits={"generated": "trait"}, + 
context="Generated context", + actor_goal="Generated goal", + ) + mock_result = MagicMock() + mock_result.structured_output = mock_profile + mock_agent.return_value = mock_result + mock_agent_class.return_value = mock_agent + + profile = ActorSimulator._generate_profile_from_case(sample_case) + + assert profile == mock_profile + assert mock_agent.called + # Verify structured_output_model was passed + call_args = mock_agent.call_args + assert call_args[1]["structured_output_model"] == ActorProfile + + +def test_act_generates_response(sample_actor_profile): + """Test act method generates actor response.""" + simulator = ActorSimulator( + actor_profile=sample_actor_profile, + initial_query="Hello", + system_prompt_template="Test: {actor_profile}", + ) + + # Mock the agent's response + mock_response = MagicMock(spec=AgentResult) + mock_actor_response = ActorResponse( + reasoning="Test reasoning", + message="Test response message", + ) + mock_response.structured_output = mock_actor_response + simulator.agent = MagicMock(return_value=mock_response) + + result = simulator.act("What can I help you with?") + + assert result == mock_response + assert result.structured_output.message == "Test response message" + simulator.agent.assert_called_once() + + +def test_act_uses_structured_output(sample_actor_profile): + """Test act method requests structured output.""" + simulator = ActorSimulator( + actor_profile=sample_actor_profile, + initial_query="Hello", + system_prompt_template="Test: {actor_profile}", + ) + + mock_response = MagicMock(spec=AgentResult) + mock_actor_response = ActorResponse(reasoning="Test", message="Test message") + mock_response.structured_output = mock_actor_response + simulator.agent = MagicMock(return_value=mock_response) + + simulator.act("Test message") + + # Verify structured_output_model parameter + call_kwargs = simulator.agent.call_args[1] + assert call_kwargs["structured_output_model"] == ActorResponse + + +def 
test_has_next_returns_true_initially(sample_actor_profile): + """Test has_next returns True before any turns.""" + simulator = ActorSimulator( + actor_profile=sample_actor_profile, + initial_query="Hello", + system_prompt_template="Test: {actor_profile}", + ) + + assert simulator.has_next() is True + + +def test_has_next_respects_max_turns(sample_actor_profile): + """Test has_next returns False after max_turns reached.""" + simulator = ActorSimulator( + actor_profile=sample_actor_profile, + initial_query="Hello", + system_prompt_template="Test: {actor_profile}", + max_turns=3, + ) + + # Mock responses + mock_response = MagicMock(spec=AgentResult) + mock_actor_response = ActorResponse(reasoning="Test", message="Continue") + mock_response.structured_output = mock_actor_response + simulator.agent = MagicMock(return_value=mock_response) + + # Simulate 3 turns with max_turns=3 + for _ in range(3): + assert simulator.has_next() is True + simulator.act("Test message") + + # After 3 turns, should return False + assert simulator.has_next() is False + + +def test_has_next_detects_stop_token(sample_actor_profile): + """Test has_next returns False when stop token is present.""" + simulator = ActorSimulator( + actor_profile=sample_actor_profile, + initial_query="Hello", + system_prompt_template="Test: {actor_profile}", + ) + + # Mock response with stop token + mock_response = MagicMock(spec=AgentResult) + mock_actor_response = ActorResponse(reasoning="Done", message="Thanks! 
") + mock_response.structured_output = mock_actor_response + simulator.agent = MagicMock(return_value=mock_response) + + # After act with stop token, has_next should return False + simulator.act("Test message") + assert simulator.has_next() is False diff --git a/tests/strands_evals/simulation/test_goal_completion.py b/tests/strands_evals/simulation/test_goal_completion.py new file mode 100644 index 0000000..5a7a99a --- /dev/null +++ b/tests/strands_evals/simulation/test_goal_completion.py @@ -0,0 +1,196 @@ +"""Tests for goal completion assessment tool.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from strands_evals.simulation.tools.goal_completion import ( + _format_conversation_for_assessment, + get_conversation_goal_completion, +) + + +@pytest.fixture +def sample_conversation(): + """Fixture providing a sample conversation.""" + return [ + {"role": "user", "content": "I want to book a flight to Tokyo"}, + {"role": "assistant", "content": "I can help with that. What dates?"}, + {"role": "user", "content": "Next month"}, + ] + + +@pytest.fixture +def sample_conversation_with_blocks(): + """Fixture providing conversation with content blocks.""" + return [ + {"role": "user", "content": [{"text": "Hello"}]}, + {"role": "assistant", "content": [{"text": "Hi there"}, {"text": "How can I help?"}]}, + ] + + +def test_format_conversation_simple(sample_conversation): + """Test formatting simple conversation.""" + result = _format_conversation_for_assessment(sample_conversation) + + assert "USER: I want to book a flight to Tokyo" in result + assert "ASSISTANT: I can help with that. What dates?" 
in result + assert "USER: Next month" in result + assert result.count("\n\n") == 2 # Two separators for three turns + + +def test_format_conversation_with_content_blocks(sample_conversation_with_blocks): + """Test formatting conversation with content blocks.""" + result = _format_conversation_for_assessment(sample_conversation_with_blocks) + + assert "USER: Hello" in result + assert "ASSISTANT: Hi there How can I help?" in result + + +def test_format_conversation_mixed_formats(): + """Test formatting conversation with mixed string and block formats.""" + conversation = [ + {"role": "user", "content": "String content"}, + {"role": "assistant", "content": [{"text": "Block content"}]}, + ] + + result = _format_conversation_for_assessment(conversation) + + assert "USER: String content" in result + assert "ASSISTANT: Block content" in result + + +def test_format_conversation_empty_list(): + """Test formatting empty conversation raises error.""" + with pytest.raises(ValueError, match="No valid conversation turns found"): + _format_conversation_for_assessment([]) + + +def test_format_conversation_invalid_turn_type(): + """Test formatting conversation with invalid turn type.""" + with pytest.raises(ValueError, match="must be a dictionary"): + _format_conversation_for_assessment(["not a dict"]) + + +def test_format_conversation_missing_role(): + """Test formatting conversation skips turns with missing role.""" + conversation = [ + {"role": "user", "content": "Valid turn"}, + {"content": "Missing role"}, + {"role": "assistant", "content": "Another valid turn"}, + ] + + result = _format_conversation_for_assessment(conversation) + + assert "USER: Valid turn" in result + assert "ASSISTANT: Another valid turn" in result + assert "Missing role" not in result + + +def test_format_conversation_missing_content(): + """Test formatting conversation skips turns with missing content.""" + conversation = [ + {"role": "user", "content": "Valid turn"}, + {"role": "assistant"}, + {"role": 
"user", "content": "Another valid turn"}, + ] + + result = _format_conversation_for_assessment(conversation) + + assert "USER: Valid turn" in result + assert "USER: Another valid turn" in result + assert result.count("ASSISTANT") == 0 + + +def test_format_conversation_empty_strings(): + """Test formatting conversation skips turns with empty strings.""" + conversation = [ + {"role": "user", "content": "Valid turn"}, + {"role": "", "content": "Empty role"}, + {"role": "assistant", "content": ""}, + ] + + result = _format_conversation_for_assessment(conversation) + + assert "USER: Valid turn" in result + assert "Empty role" not in result + + +def test_format_conversation_whitespace_handling(): + """Test formatting conversation strips whitespace.""" + conversation = [ + {"role": " user ", "content": " Content with spaces "}, + ] + + result = _format_conversation_for_assessment(conversation) + + assert "USER: Content with spaces" in result + + +def test_format_conversation_invalid_content_type(): + """Test formatting conversation with invalid content type logs warning.""" + conversation = [ + {"role": "user", "content": "Valid"}, + {"role": "assistant", "content": 123}, # Invalid type + ] + + # Should not raise, but skip invalid turn + result = _format_conversation_for_assessment(conversation) + assert "USER: Valid" in result + + +@patch("strands_evals.simulation.tools.goal_completion.Agent") +def test_get_conversation_goal_completion(mock_agent_class, sample_conversation): + """Test goal completion assessment.""" + mock_agent = MagicMock() + mock_response = MagicMock() + mock_response.__str__ = MagicMock(return_value="Score: 3 - Goal fully met") + mock_agent.return_value = mock_response + mock_agent_class.return_value = mock_agent + + result = get_conversation_goal_completion( + initial_goal="Book a flight to Tokyo", + conversation=sample_conversation, + ) + + assert result == "Score: 3 - Goal fully met" + mock_agent.assert_called_once() + + 
+@patch("strands_evals.simulation.tools.goal_completion.Agent") +def test_get_conversation_goal_completion_formats_prompt(mock_agent_class, sample_conversation): + """Test goal completion formats prompt correctly.""" + mock_agent = MagicMock() + mock_response = MagicMock() + mock_response.__str__ = MagicMock(return_value="Assessment") + mock_agent.return_value = mock_response + mock_agent_class.return_value = mock_agent + + get_conversation_goal_completion( + initial_goal="Test goal", + conversation=sample_conversation, + ) + + # Verify prompt contains goal and conversation + call_args = mock_agent.call_args[0][0] + assert "Test goal" in call_args + assert "USER:" in call_args + assert "ASSISTANT:" in call_args + + +def test_get_conversation_goal_completion_empty_conversation(): + """Test goal completion with empty conversation raises error.""" + with pytest.raises(ValueError): + get_conversation_goal_completion( + initial_goal="Test goal", + conversation=[], + ) + + +def test_get_conversation_goal_completion_invalid_conversation(): + """Test goal completion with invalid conversation raises error.""" + with pytest.raises(ValueError): + get_conversation_goal_completion( + initial_goal="Test goal", + conversation=["not", "valid"], + ) diff --git a/tests/strands_evals/test_dataset.py b/tests/strands_evals/test_dataset.py index c80f61c..609b730 100644 --- a/tests/strands_evals/test_dataset.py +++ b/tests/strands_evals/test_dataset.py @@ -39,8 +39,8 @@ def mock_span(): def simple_task(): """Fixture that provides a simple echo task function""" - def task(input_val): - return input_val + def task(case): + return case.input return task @@ -124,8 +124,8 @@ def test_dataset__run_task_simple_output(mock_evaluator): case = Case(name="test", input="hello", expected_output="world") dataset = Dataset(cases=[case], evaluator=mock_evaluator) - def simple_task(input_val): - return f"response to {input_val}" + def simple_task(c): + return f"response to {c.input}" result = 
dataset._run_task(simple_task, case) @@ -145,8 +145,8 @@ def test_dataset__run_task_dict_output(mock_evaluator): case = Case(name="test", input="hello", expected_output="world") dataset = Dataset(cases=[case], evaluator=mock_evaluator) - def dict_task(input_val): - return {"output": f"response to {input_val}", "trajectory": ["step1", "step2"]} + def dict_task(c): + return {"output": f"response to {c.input}", "trajectory": ["step1", "step2"]} result = dataset._run_task(dict_task, case) @@ -160,9 +160,9 @@ def test_dataset_run_task_dict_output_with_interactions(mock_evaluator): case = Case(name="test", input="hello", expected_output="world", expected_interactions=interactions) dataset = Dataset(cases=[case], evaluator=mock_evaluator) - def dict_task(input_val): + def dict_task(c): return { - "output": f"response to {input_val}", + "output": f"response to {c.input}", "trajectory": ["step1", "step2"], "interactions": interactions, } @@ -182,8 +182,8 @@ def test_dataset__run_task_dict_output_with_input_update(mock_evaluator): case = Case(name="test", input="original_input", expected_output="world") dataset = Dataset(cases=[case], evaluator=mock_evaluator) - def task_with_input_update(input_val): - return {"output": f"response to {input_val}", "input": "updated_input", "trajectory": ["step1"]} + def task_with_input_update(c): + return {"output": f"response to {c.input}", "input": "updated_input", "trajectory": ["step1"]} result = dataset._run_task(task_with_input_update, case) @@ -198,8 +198,8 @@ async def test_dataset__run_task_async_with_input_update(): case = Case(name="test", input="original_input", expected_output="world") dataset = Dataset(cases=[case], evaluator=MockEvaluator()) - def task_with_input_update(input_val): - return {"output": f"response to {input_val}", "input": "async_updated_input"} + def task_with_input_update(c): + return {"output": f"response to {c.input}", "input": "async_updated_input"} result = await 
dataset._run_task_async(task_with_input_update, case) @@ -212,8 +212,8 @@ def test_dataset__run_task_async_function_raises_error(mock_evaluator): case = Case(name="test", input="hello", expected_output="world") dataset = Dataset(cases=[case], evaluator=mock_evaluator) - async def async_task(input_val): - return f"response to {input_val}" + async def async_task(c): + return f"response to {c.input}" with pytest.raises(ValueError, match="Async task is not supported. Please use run_evaluations_async instead."): dataset._run_task(async_task, case) @@ -223,8 +223,8 @@ async def async_task(input_val): async def test_dataset__run_task_async_with_sync_task(): """Test _run_task_async with a synchronous task function""" - def sync_task(input_val): - return input_val + def sync_task(c): + return c.input case = Case(name="test", input="hello", expected_output="world") dataset = Dataset(cases=[case], evaluator=MockEvaluator()) @@ -238,8 +238,8 @@ def sync_task(input_val): async def test_dataset__run_task_async_with_async_task(): """Test _run_task_async with an asynchronous task function""" - async def async_task(input_val): - return input_val + async def async_task(c): + return c.input case = Case(name="test", input="hello", expected_output="world") dataset = Dataset(cases=[case], evaluator=MockEvaluator()) @@ -257,8 +257,8 @@ def test_dataset_run_evaluations(mock_evaluator): ] dataset = Dataset(cases=cases, evaluator=mock_evaluator) - def echo_task(input_val): - return input_val + def echo_task(c): + return c.input report = dataset.run_evaluations(echo_task) @@ -600,8 +600,8 @@ def test_dataset_from_dict_InteractionsEvaluator_defaults(): async def test_dataset_run_evaluations_async(): """Test run_evaluations_async with a simple task""" - def task(input_str): - return input_str + def task(c): + return c.input case = Case(name="test", input="hello", expected_output="hello") case1 = Case(name="test1", input="world", expected_output="world") @@ -619,9 +619,9 @@ def task(input_str): 
async def test_dataset_run_evaluations_async_with_async_task(): """Test run_evaluations_async with an async task""" - async def async_task(input_str): + async def async_task(c): await asyncio.sleep(0.01) - return input_str + return c.input case = Case(name="test", input="hello", expected_output="hello") case1 = Case(name="test1", input="world", expected_output="world") @@ -638,10 +638,10 @@ async def async_task(input_str): async def test_datset_run_evaluations_async_with_errors(): """Test run_evaluations_async handles errors gracefully""" - def failing_task(input_str): - if input_str == "hello": + def failing_task(c): + if c.input == "hello": raise ValueError("Test error") - return input_str + return c.input case = Case(name="test", input="hello", expected_output="hello") case1 = Case(name="test1", input="world", expected_output="world") @@ -660,8 +660,8 @@ def test_dataset_run_evaluations_with_interactions(): case = Case(name="test", input="hello", expected_output="world", expected_interactions=interactions) dataset = Dataset(cases=[case], evaluator=MockEvaluator()) - def task_with_interactions(input_val): - return {"output": input_val, "interactions": interactions} + def task_with_interactions(c): + return {"output": c.input, "interactions": interactions} report = dataset.run_evaluations(task_with_interactions) @@ -744,8 +744,8 @@ def test_dataset_run_evaluations_with_trajectory_in_span(mock_span): with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span): - def task_with_trajectory(input_val): - return {"output": input_val, "trajectory": ["step1", "step2"]} + def task_with_trajectory(c): + return {"output": c.input, "trajectory": ["step1", "step2"]} dataset.run_evaluations(task_with_trajectory) @@ -766,8 +766,8 @@ def test_dataset_run_evaluations_with_interactions_in_span(mock_span): with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span): - def task_with_interactions(input_val): - return {"output": 
input_val, "interactions": interactions} + def task_with_interactions(c): + return {"output": c.input, "interactions": interactions} dataset.run_evaluations(task_with_interactions) @@ -788,7 +788,7 @@ def test_dataset_run_evaluations_records_exception_in_span(mock_span): with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span): - def failing_task(input_val): + def failing_task(c): raise ValueError("Test error") dataset.run_evaluations(failing_task) @@ -819,8 +819,8 @@ async def test_dataset_run_evaluations_async_creates_spans(mock_span): with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span) as mock_start_span: - async def async_task(input_val): - return input_val + async def async_task(c): + return c.input await dataset.run_evaluations_async(async_task) @@ -842,7 +842,7 @@ async def test_dataset_run_evaluations_async_records_exception(mock_span): with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span): - async def failing_async_task(input_val): + async def failing_async_task(c): raise ValueError("Async test error") await dataset.run_evaluations_async(failing_async_task) @@ -860,8 +860,8 @@ async def test_dataset_run_evaluations_async_with_dict_output(mock_span): with patch.object(dataset._tracer, "start_as_current_span", return_value=mock_span): - async def async_task_with_dict(input_val): - return {"output": input_val, "trajectory": ["step1"], "interactions": interactions} + async def async_task_with_dict(c): + return {"output": c.input, "trajectory": ["step1"], "interactions": interactions} await dataset.run_evaluations_async(async_task_with_dict) diff --git a/tests/test_integration.py b/tests/test_integration.py index b1f61ed..ee09b44 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -70,8 +70,8 @@ def test_integration_dataset_with_simple_evaluator(cases): """Test complete workflow: Dataset + Cases + SimpleEvaluator + EvaluationReport""" dataset = 
Dataset(cases=cases, evaluator=SimpleEvaluator()) - def echo_task(input_val): - return input_val + def echo_task(case): + return case.input report = dataset.run_evaluations(echo_task) @@ -89,9 +89,9 @@ def test_integration_dataset_with_dict_output_task(cases): """Test Dataset with task returning dictionary output""" dataset = Dataset(cases=cases, evaluator=SimpleEvaluator()) - def dict_task(input_val): + def dict_task(case): return TaskOutput( - output=input_val, + output=case.input, trajectory=["step1", "step2"], interactions=[Interaction(node_name="agent1", dependencies=[], messages=["processing hello"])], ) @@ -115,8 +115,8 @@ def test_integration_dataset_with_output_evaluator(mock_agent_class, cases, mock output_evaluator = OutputEvaluator(rubric="Test if outputs match exactly") dataset = Dataset(cases=cases, evaluator=output_evaluator) - def simple_task(input_val): - return f"processed_{input_val}" + def simple_task(case): + return f"processed_{case.input}" report = dataset.run_evaluations(simple_task) @@ -131,8 +131,8 @@ def test_integration_evaluation_report_display(cases): """Test that EvaluationReport display works with real data""" dataset = Dataset(cases=cases, evaluator=SimpleEvaluator()) - def mixed_task(input_val): - if input_val == "hello": + def mixed_task(case): + if case.input == "hello": return "hello" return "different" @@ -156,8 +156,8 @@ def test_integration_dataset_with_trajectory_evaluator(mock_agent_class, cases, trajectory_evaluator = TrajectoryEvaluator(rubric="Test if trajectories match exactly") dataset = Dataset(cases=cases, evaluator=trajectory_evaluator) - def simple_task(input_val): - return {"output": f"processed_{input_val}", "trajectory": ["step1", "step2"]} + def simple_task(case): + return {"output": f"processed_{case.input}", "trajectory": ["step1", "step2"]} report = dataset.run_evaluations(simple_task) @@ -175,8 +175,8 @@ def test_integration_dataset_with_list_inputs(): ] dataset = Dataset(cases=cases, 
evaluator=SimpleEvaluator()) - def list_task(input_val): - return input_val + def list_task(case): + return case.input report = dataset.run_evaluations(list_task) @@ -195,8 +195,8 @@ async def test_integration_async_dataset_with_simple_evaluator(cases): """Test async workflow: Dataset + Cases + SimpleEvaluator""" dataset = Dataset(cases=cases, evaluator=SimpleEvaluator()) - def echo_task(input_val): - return input_val + def echo_task(case): + return case.input report = await dataset.run_evaluations_async(echo_task) @@ -215,9 +215,9 @@ async def test_integration_async_dataset_with_async_task(cases): """Test async workflow with async task function""" dataset = Dataset(cases=cases, evaluator=SimpleEvaluator()) - async def async_echo_task(input_val): + async def async_echo_task(case): await asyncio.sleep(0.01) # Simulate async work - return input_val + return case.input report = await dataset.run_evaluations_async(async_echo_task) @@ -240,8 +240,8 @@ async def test_integration_async_dataset_with_output_evaluator(mock_agent_class, output_evaluator = OutputEvaluator(rubric="Test if outputs match exactly") dataset = Dataset(cases=cases, evaluator=output_evaluator) - def simple_task(input_val): - return f"processed_{input_val}" + def simple_task(case): + return f"processed_{case.input}" report = await dataset.run_evaluations_async(simple_task) @@ -259,9 +259,9 @@ async def test_integration_async_dataset_concurrency(): dataset = Dataset(cases=many_cases, evaluator=SimpleEvaluator()) # Create a task with noticeable delay - async def slow_task(input_val): + async def slow_task(case): await asyncio.sleep(0.1) # Each task takes 0.1s - return input_val + return case.input # Time the execution start_time = asyncio.get_event_loop().time() @@ -285,7 +285,7 @@ def test_dataset_with_interactions_evaluator(mock_agent_class, interaction_case, interactions_evaluator = InteractionsEvaluator(rubric="Test if interactions match expected sequence") dataset = Dataset(cases=interaction_case, 
evaluator=interactions_evaluator) - def task_with_interactions(input_val): + def task_with_interactions(case): return { "output": "world", "interactions": [ @@ -308,7 +308,7 @@ async def test_async_dataset_with_interactions(interaction_case): """Test async Dataset with interactions data""" dataset = Dataset(cases=interaction_case, evaluator=SimpleEvaluator()) - async def async_interactions_task(input_val): + async def async_interactions_task(case): await asyncio.sleep(0.01) return { "output": "world",