11import asyncio
22
3+ from opentelemetry import trace
4+ from opentelemetry .sdk .trace .export import BatchSpanProcessor
5+ from opentelemetry .sdk .trace .export .in_memory_span_exporter import InMemorySpanExporter
36from strands import Agent
47
58from strands_evals import ActorSimulator , Case , Dataset
6- from strands_evals .evaluators import OutputEvaluator
9+ from strands_evals .evaluators import HelpfulnessEvaluator
10+ from strands_evals .mappers import StrandsInMemorySessionMapper
11+ from strands_evals .telemetry import StrandsEvalsTelemetry
712
# ======================================
# TELEMETRY SETUP
# ======================================
# Target agent: finished spans are routed to an in-memory exporter so the
# task function can read them back after the conversation.
target_exporter = InMemorySpanExporter()
target_telemetry = StrandsEvalsTelemetry()
target_telemetry.tracer_provider.add_span_processor(
    BatchSpanProcessor(target_exporter)
)

# Simulator: separate telemetry object with no exporter attached.
simulator_telemetry = StrandsEvalsTelemetry()
23+
24+
async def task_function(case: Case) -> dict:
    """Simulate a multi-turn conversation between a simulated user and the agent.

    The agent under test and the user simulator take turns, starting from
    ``case.input``, until the simulated user emits ``<stop/>`` or the turn
    limit is reached. Spans recorded by ``target_exporter`` are then mapped
    into a session object.

    Args:
        case: Test case supplying the first user message (``case.input``) and
            the configuration for the user simulator.

    Returns:
        A dict with two keys: ``"output"`` is the agent's last reply as a
        string (empty if the agent never ran), and ``"trajectory"`` is the
        session built from the target exporter's finished spans.
    """
    max_turns = 10  # failsafe so a simulator that never stops cannot loop forever

    # NOTE(review): the OpenTelemetry API documents that the global tracer
    # provider can only be set once; later calls are reported as ignored.
    # Confirm that the provider switching below really separates agent spans
    # from simulator spans.

    # Create agent under test
    trace.set_tracer_provider(target_telemetry.tracer_provider)
    agent = Agent(system_prompt="You are a helpful travel assistant.", callback_handler=None)

    # Create user simulator from case
    trace.set_tracer_provider(simulator_telemetry.tracer_provider)
    user_sim = ActorSimulator.from_case_for_user_simulator(case=case)

    user_message = case.input
    # Initialised up front so the return value is defined even when the very
    # first user message already contains the stop marker.
    agent_message = ""

    for _ in range(max_turns):
        if "<stop/>" in user_message:
            break

        # Agent responds
        trace.set_tracer_provider(target_telemetry.tracer_provider)
        agent_message = str(agent(user_message))

        # User acts
        trace.set_tracer_provider(simulator_telemetry.tracer_provider)
        user_result = user_sim.act(agent_message)
        user_message = str(user_result.structured_output.message)

    # The exporter sits behind a BatchSpanProcessor, which exports in the
    # background; flush first so the most recent spans are not missed.
    target_telemetry.tracer_provider.force_flush()

    # Collect traces
    # NOTE(review): target_exporter is module-level and is never cleared here,
    # so spans from earlier cases may still be present — confirm whether it
    # should be cleared per case.
    finished_spans = target_exporter.get_finished_spans()
    mapper = StrandsInMemorySessionMapper()
    session = mapper.map_to_session(finished_spans, session_id="test-session")

    return {"output": agent_message, "trajectory": session}
4658
4759
4860# Create test cases
@@ -62,15 +74,15 @@ async def simulate_conversation(case: Case) -> dict:
6274]
6375
# Evaluator applied to every case in the dataset
evaluator = HelpfulnessEvaluator()

# Dataset pairing the test cases with the evaluator
dataset = Dataset[str, str](
    cases=test_cases,
    evaluator=evaluator,
)
6981
7082
7183# Run evaluations
async def main() -> None:
    """Run ``task_function`` over every case in ``dataset`` and display the report."""
    evaluation_report = await dataset.run_evaluations_async(task_function)
    evaluation_report.run_display()
7587
7688
0 commit comments