scenario/python/scenario/scenario_executor.py at 30aeb1fd4f490548b6da0fbe10e3bf86bd5adde7 · langwatch/scenario · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
Scenario execution engine for agent testing.

This module contains the core ScenarioExecutor class that orchestrates the execution
of scenario tests, managing the interaction between user simulators, agents under test,
and judge agents to determine test success or failure.
"""

import json
import sys
from typing import (
    TYPE_CHECKING,
    Any,
    Awaitable,
    Callable,
    Dict,
    List,
    Optional,
    Set,
    Tuple,
    Union,
    TypedDict,
    cast,
)

if TYPE_CHECKING:
    from .voice.playback import FfmpegPlayback
import logging
import time
import warnings
import termcolor
import asyncio
import concurrent.futures

logger = logging.getLogger("scenario")

from scenario.config import ScenarioConfig
from langwatch.attributes import AttributeKey
from scenario._utils import (
    convert_agent_return_types_to_openai_messages,
    check_valid_return_type,
    print_openai_messages,
    show_spinner,
    await_if_awaitable,
    get_batch_run_id,
    generate_scenario_run_id,
    SerializableWithStringFallback,
)
from openai.types.chat import (
    ChatCompletionMessageParam,
    ChatCompletionUserMessageParam,
    ChatCompletionAssistantMessageParam,
)

from .types import (
    AgentInput,
    AgentRole,
    ChatCompletionMessageParamWithTrace,
    JudgmentRequest,
    ScenarioResult,
    ScriptStep,
)
from ._error_messages import agent_response_not_awaitable
from .cache import context_scenario
from .agent_adapter import AgentAdapter
from .script import proceed
from pksuid import PKSUID
from .scenario_state import ScenarioState
from ._events import (
    ScenarioEventBus,
    ScenarioEvent,
    ScenarioRunStartedEvent,
    ScenarioMessageSnapshotEvent,
    ScenarioRunFinishedEvent,
    ScenarioRunStartedEventMetadata,
    ScenarioRunFinishedEventResults,
    ScenarioRunFinishedEventVerdict,
    ScenarioRunFinishedEventStatus,
    convert_messages_to_api_client_messages,
)
from rx.subject.subject import Subject
from rx.core.observable.observable import Observable

import litellm
import langwatch
import langwatch.telemetry.context
from langwatch.telemetry.tracing import LangWatchTrace


def _extract_text_content(content: object) -> str:
    """Extract a plain-text string from a message content value.

    ``content`` may be a plain string or a list of content-part dicts
    (e.g. ``[{"type": "text", "text": "hello"}, {"type": "image_url", ...}]``).
    Passing a list directly to LangWatch's ``trace.update()`` produces a
    Python repr string (``"[{'type': 'text', ...}]"``), which is unreadable.
    This helper concatenates only the ``"text"`` parts so the trace value is
    always a human-readable string.

    Args:
        content: A string, a list of content parts, or any other value.

    Returns:
        ``content`` itself when it is a string; the text parts joined with a
        single space when it is a list; ``str(content)`` otherwise.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        candidate_texts = (
            part.get("text", "")
            for part in content
            if isinstance(part, dict) and part.get("type") == "text"
        )
        # A malformed text part (e.g. ``"text": None``) would make ``join``
        # raise TypeError; skip it rather than crash while only building a
        # trace label.
        return " ".join(text for text in candidate_texts if isinstance(text, str))
    return str(content)


class ScenarioExecutor:
    """
    Core orchestrator for scenario-based agent testing.

    The ScenarioExecutor manages the complete lifecycle of a scenario test, including:
    - Orchestrating conversations between user simulators, agents, and judges
    - Managing turn-based execution flow
    - Handling script-based scenario control
    - Collecting and reporting test results
    - Supporting debug mode for interactive testing

    This class serves as both a builder (for configuration) and an executor (for running tests).
    Most users will interact with it through the high-level `scenario.run()` function rather
    than instantiating it directly.

    Attributes:
        name: Human-readable name for the scenario
        description: Detailed description of what the scenario tests
        agents: List of agent adapters participating in the scenario
        script: Optional list of script steps to control scenario flow
        config: Configuration settings for execution behavior
    """

    # Scenario definition, assigned in __init__.
    name: str
    description: str
    agents: List[AgentAdapter]
    script: List[ScriptStep]

    # Per-scenario overrides merged into the default config in __init__.
    config: ScenarioConfig

    # Run state; (re)created by reset().
    _state: ScenarioState
    _total_start_time: float
    # Messages queued by add_message(), keyed by index into ``agents``.
    _pending_messages: Dict[int, List[ChatCompletionMessageParam]]

    # NOTE: the mutable defaults below live on the class and are shared by
    # every instance until rebound. _new_turn() rebinds the two "pending"
    # collections and reset() rebinds _agent_times, so they must not be
    # mutated in place before those methods have run.
    _pending_roles_on_turn: List[AgentRole] = []
    _pending_agents_on_turn: Set[AgentAdapter] = set()
    # Keyed by index into ``agents`` (see _reached_max_turns()).
    _agent_times: Dict[int, float] = {}
    # Event stream created in __init__ and exposed through ``events``.
    _events: Subject
    # Trace opened by _new_turn(); unset until the first call.
    _trace: LangWatchTrace
    # Set by _voice_connect_all() when audio playback is enabled.
    _ffmpeg_playback: Optional["FfmpegPlayback"] = None

    # Subscribed to ``_events`` in __init__.
    event_bus: ScenarioEventBus

    batch_run_id: str
    scenario_set_id: str

    def __init__(
        self,
        name: str,
        description: str,
        agents: Optional[List[AgentAdapter]] = None,
        script: Optional[List[ScriptStep]] = None,
        # Config
        max_turns: Optional[int] = None,
        verbose: Optional[Union[bool, int]] = None,
        cache_key: Optional[str] = None,
        debug: Optional[bool] = None,
        event_bus: Optional[ScenarioEventBus] = None,
        set_id: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        on_audio_chunk: Optional[Callable[[Any], None]] = None,
        on_voice_event: Optional[Callable[[Any], None]] = None,
        audio_playback: bool = False,
    ):
        """
        Initialize a scenario executor.

        Args:
            name: Human-readable name for the scenario (used in reports and logs)
            description: Detailed description of what the scenario tests.
                        This guides the user simulator's behavior and provides context.
            agents: List of agent adapters participating in the scenario.
                   Typically includes: agent under test, user simulator, and judge.
                   Defaults to a new empty list for each executor.
            script: Optional list of script steps to control scenario flow.
                   If not provided, defaults to automatic proceeding.
            max_turns: Maximum number of conversation turns before timeout.
                      Overrides global configuration for this scenario.
            verbose: Whether to show detailed output during execution.
                    Can be True/False or integer level (2 for extra details).
            cache_key: Cache key for deterministic behavior across runs.
                      Overrides global configuration for this scenario.
            debug: Whether to enable debug mode with step-by-step execution.
                  Overrides global configuration for this scenario.
            event_bus: Optional event bus that will subscribe to this executor's events
            set_id: Optional set identifier for grouping related scenarios
            metadata: Optional metadata to attach to the scenario run.
                     Accepts arbitrary key-value pairs. The ``langwatch`` key
                     is reserved for platform-internal use.
            on_audio_chunk: Optional callback stored for voice scenarios. When
                     ``audio_playback`` is enabled it is wrapped so each chunk
                     is fed to the playback process before being forwarded.
            on_voice_event: Optional callback stored for voice scenarios
                     (its call sites are outside this method).
            audio_playback: When True, an ffmpeg playback process is started
                     as voice adapters are connected.
        """
        self.name = name
        self.description = description
        # Never fall back to a shared default list: a mutable ``[]`` default
        # would be the same object for every executor created without agents.
        self.agents = agents if agents is not None else []
        self.script = script or [proceed()]
        self.metadata = metadata
        self._on_audio_chunk = on_audio_chunk
        self._on_voice_event = on_voice_event
        self._audio_playback = audio_playback

        config = ScenarioConfig(
            max_turns=max_turns,
            verbose=verbose,
            cache_key=cache_key,
            debug=debug,
            headless=None,
        )
        self.config = (ScenarioConfig.default_config or ScenarioConfig()).merge(config)

        self.batch_run_id = get_batch_run_id()
        self.scenario_set_id = set_id or "default"
        self._scenario_run_id = generate_scenario_run_id()

        # Create executor's own event stream
        self._events = Subject()

        # Create and configure event bus to subscribe to our events
        self.event_bus = event_bus or ScenarioEventBus()
        self.event_bus.subscribe_to_events(self._events)

    @property
    def events(self) -> Observable:
        """Return the executor's event stream so subscribers (e.g. the event bus) can observe it."""
        stream = self._events
        return stream

    def _emit_event(self, event: ScenarioEvent) -> None:
        """
        Timestamp a domain event and publish it on the internal stream.

        The event's ``timestamp`` is overwritten with the current wall-clock
        time in whole milliseconds, then the event is pushed to ``_events``
        so that every subscriber (such as the event bus) receives it.

        Args:
            event: The scenario event to publish
        """
        now_ms = int(time.time() * 1000)
        event.timestamp = now_ms
        self._events.on_next(event)

    def reset(self):
        """
        Reset the scenario executor to initial state.

        This method reinitializes all internal state for a fresh scenario run,
        including conversation history, turn counters, agent timing information
        and collected checkpoint results. ``run()`` calls it before executing
        the script; it is not called from ``__init__``.

        Side effects: opens a new turn trace via ``_new_turn()`` and registers
        this executor in ``context_scenario``.
        """
        # Fresh conversation state with a newly generated thread id.
        self._state = ScenarioState(
            description=self.description,
            messages=[],
            thread_id=str(PKSUID("scenariothread")),
            current_turn=0,
            config=self.config,
            _executor=self,
        )
        # Pydantic doesn't actually set the _executor field from the constructor, as it's private, so we need to do it manually
        self._state._executor = self

        self._pending_messages = {}
        self._total_start_time = time.time()
        self._agent_times = {}
        self._checkpoint_results: List[dict] = []

        # _new_turn() needs self._state (set above) and increments
        # current_turn, so the counter is put back to 0 right after it.
        self._new_turn()
        self._state.current_turn = 0

        # Make this executor the current scenario in the context variable.
        context_scenario.set(self)

    @property
    def _compiled_checkpoints(self) -> tuple[List[str], List[str]]:
        """Flatten every recorded checkpoint into ``(passed, failed)`` criteria lists, in checkpoint order."""
        checkpoints = self._checkpoint_results
        all_passed: List[str] = [
            criterion
            for checkpoint in checkpoints
            for criterion in checkpoint["passed_criteria"]
        ]
        all_failed: List[str] = [
            criterion
            for checkpoint in checkpoints
            for criterion in checkpoint["failed_criteria"]
        ]
        return all_passed, all_failed

    def add_message(
        self, message: ChatCompletionMessageParam, from_agent_idx: Optional[int] = None
    ):
        """
        Add a message to the conversation and broadcast to other agents.

        This method adds a message to the conversation history and makes it available
        to other agents in their next call. It's used internally by the executor
        and can be called from script steps to inject custom messages.

        The message is modified in place: the id of the current turn's trace is
        stored under its ``"trace_id"`` key. User and assistant messages also
        update the trace's input/output respectively.

        Args:
            message: OpenAI-compatible message to add to the conversation
            from_agent_idx: Index of the agent that generated this message.
                           Used to avoid broadcasting the message back to its creator.

        Example:
            ```
            def inject_system_message(state: ScenarioState) -> None:
                state.add_message({
                    "role": "system",
                    "content": "The user is now in a hurry"
                })

            # Use in script
            result = await scenario.run(
               name="system message test",
               agents=[agent, user_sim, judge],
               script=[
                   scenario.user("Hello"),
                   scenario.agent(),
                   inject_system_message,
                   scenario.user(),  # Will see the system message
                   scenario.succeed()
               ]
            )
            ```
        """
        message = cast(ChatCompletionMessageParamWithTrace, message)
        message["trace_id"] = self._trace.trace_id
        self._state.messages.append(message)

        # Broadcast the message to every agent except its author
        for idx, _ in enumerate(self.agents):
            if idx == from_agent_idx:
                continue
            self._pending_messages.setdefault(idx, []).append(message)

        # Update trace with input/output.
        # Extract text from content (str or list of content parts) so we
        # always pass a str to LangWatch — avoids Python repr of list objects.
        role = message["role"]
        if role == "user":
            self._trace.update(input=_extract_text_content(message["content"]))
        elif role == "assistant":
            content = message.get("content")
            if content is None:
                # Tool-call-only assistant messages carry ``content: None``
                # (or no content at all). Dump the whole message instead of
                # recording the literal string "None" as the trace output.
                content = json.dumps(message, cls=SerializableWithStringFallback)
            self._trace.update(output=_extract_text_content(content))

    def rollback_messages_to(self, index: int) -> List[ChatCompletionMessageParam]:
        """Drop every message at position `index` or later.

        The conversation history is truncated in place and the dropped
        message objects are also filtered out of every agent's pending queue
        (matched by object identity), so no agent is later handed a message
        that no longer exists in the history.

        .. note::
            Intended to be called from within an agent's ``call()``.  The
            executor runs agents one at a time, so nobody else can be holding
            stale ``new_messages`` at that point.  Called from elsewhere, the
            ``new_messages`` already handed out may disagree with the history.

        Args:
            index: First position to remove.  Values beyond the end of the
                history are clamped to its length (nothing is removed).

        Returns:
            The removed messages, oldest first (empty list if none).

        Raises:
            ValueError: If *index* is negative.
        """
        if index < 0:
            raise ValueError(
                f"rollback_messages_to: index must be >= 0, got {index}"
            )

        state = self._state
        # Asking to roll back past the end simply removes nothing.
        index = min(index, len(state.messages))

        dropped = list(state.messages[index:])
        if not dropped:
            return []

        dropped_ids = {id(msg) for msg in dropped}
        del state.messages[index:]

        for agent_idx, queue in self._pending_messages.items():
            self._pending_messages[agent_idx] = [
                queued for queued in queue if id(queued) not in dropped_ids
            ]

        # Leave a marker on the current trace: the messages are gone from the
        # conversation, but dashboards can still show that a rollback occurred.
        trace = getattr(self, "_trace", None)
        if trace is not None:
            rollback_metadata = {
                "scenario.rollback_index": index,
                "scenario.rollback_removed_count": len(dropped),
            }
            try:
                trace.update(metadata=rollback_metadata)
            except Exception as exc:
                warnings.warn(
                    f"Failed to update trace metadata during rollback: {exc}",
                    stacklevel=2,
                )

        return cast(List[ChatCompletionMessageParam], dropped)

    def add_messages(
        self,
        messages: List[ChatCompletionMessageParam],
        from_agent_idx: Optional[int] = None,
    ):
        """
        Append several messages to the conversation, in order.

        Thin wrapper that forwards each item to add_message() together with
        the same ``from_agent_idx``.

        Args:
            messages: List of OpenAI-compatible messages to add
            from_agent_idx: Index of the agent that generated these messages

        Example:
            ```
            # Agent returns multiple messages for a complex interaction
            messages = [
                {"role": "assistant", "content": "Let me search for that..."},
                {"role": "assistant", "content": "Here's what I found: ..."}
            ]
            executor.add_messages(messages, from_agent_idx=0)
            ```
        """
        append_one = self.add_message
        for single_message in messages:
            append_one(single_message, from_agent_idx)

    def _new_turn(self):
        """
        Start a new conversation turn.

        Exits the trace of the previous turn (if one exists), enters a new
        "Scenario Turn" trace, stamps scenario attributes on its root span,
        re-arms the pending agents/roles for the turn and finally increments
        the turn counter.
        """
        previous_trace = getattr(self, "_trace", None)
        if previous_trace is not None:
            previous_trace.__exit__(None, None, None)

        turn_metadata = {
            "labels": ["scenario"],
            "thread_id": self._state.thread_id,
            "scenario.name": self.name,
            "scenario.batch_id": self.batch_run_id,
            "scenario.set_id": self.scenario_set_id,
            "scenario.turn": self._state.current_turn,
        }
        self._trace = langwatch.trace(
            name="Scenario Turn", metadata=turn_metadata
        ).__enter__()

        root_span = self._trace.root_span
        if root_span is not None:
            span_attributes = {
                "langwatch.origin": "simulation",
                "scenario.run_id": self._scenario_run_id,
            }
            # Modality resolutions only exist once run() has computed them.
            resolutions = getattr(self, '_modality_resolutions', {})
            for role, tier_value in resolutions.items():
                prefix = f"scenario.modality.{role}"
                span_attributes[f"{prefix}.resolved"] = tier_value
                span_attributes[f"{prefix}.tier"] = tier_value
            root_span.set_attributes(span_attributes)

        # Every agent and every role gets one chance to act in the new turn.
        self._pending_agents_on_turn = set(self.agents)
        self._pending_roles_on_turn = [AgentRole.USER, AgentRole.AGENT, AgentRole.JUDGE]
        self._state.current_turn += 1

    async def step(self) -> Union[List[ChatCompletionMessageParam], ScenarioResult]:
        """
        Run one step of the scenario.

        Delegates to the internal ``_step()`` (which calls the next agent in
        the current turn's sequence and processes its response) and insists
        that it produced something.

        Returns:
            Either a list of messages (if the scenario continues) or a
            ScenarioResult (if the scenario should end)

        Raises:
            ValueError: If the internal step produced no result

        Note:
            This is primarily an internal method. Most users should use the
            high-level run() method or script DSL functions instead.
        """
        outcome = await self._step()
        if outcome is not None:
            return outcome
        raise ValueError("No result from step")

    async def _step(
        self,
        go_to_next_turn=True,
        on_turn: Optional[
            Union[
                Callable[["ScenarioState"], None],
                Callable[["ScenarioState"], Awaitable[None]],
            ]
        ] = None,
    ) -> Union[List[ChatCompletionMessageParam], ScenarioResult, None]:
        """
        Call the next pending agent, opening a new turn when the current one is exhausted.

        Args:
            go_to_next_turn: When False, return ``None`` instead of starting a
                new turn once no roles are pending.
            on_turn: Optional sync or async callback invoked with the state
                right after a new turn is opened.

        Returns:
            Whatever the called agent step returns, the max-turns result when
            the turn limit is hit, or ``None`` (see ``go_to_next_turn``).
        """
        while True:
            if not self._pending_roles_on_turn:
                if not go_to_next_turn:
                    return None

                self._new_turn()

                if on_turn:
                    await await_if_awaitable(on_turn(self._state))

                turn_limit = self.config.max_turns or 10
                if self._state.current_turn >= turn_limit:
                    return self._reached_max_turns()

            role_in_line = self._pending_roles_on_turn[0]
            idx, next_agent = self._next_agent_for_role(role_in_line)
            if next_agent:
                self._pending_agents_on_turn.remove(next_agent)
                return await self._call_agent(idx, role=role_in_line)

            # Nobody left to play this role in the current turn: retire the
            # role and look at the next one.
            self._pending_roles_on_turn.pop(0)

    def _next_agent_for_role(
        self, role: AgentRole
    ) -> Tuple[int, Optional[AgentAdapter]]:
        """
        Find the first agent that can still act in ``role`` this turn.

        An agent qualifies when its role equals ``role``, it has not acted yet
        this turn, and its role is still pending for the turn.

        Returns:
            ``(index, agent)`` for the first match in ``self.agents`` order,
            or ``(-1, None)`` when nobody qualifies.
        """
        matches = (
            (position, candidate)
            for position, candidate in enumerate(self.agents)
            if role == candidate.role
            and candidate in self._pending_agents_on_turn
            and candidate.role in self._pending_roles_on_turn
        )
        return next(matches, (-1, None))

    def _reached_max_turns(self, error_message: Optional[str] = None) -> ScenarioResult:
        """
        Build the failing result used when the scenario ends without a verdict.

        Args:
            error_message: Reasoning to report instead of the default
                "reached maximum turns" text.

        Returns:
            A ``ScenarioResult`` with ``success=False``, the conversation so
            far, the elapsed total time and the summed time of agents whose
            role is ``AgentRole.AGENT``.
        """
        # Only agents under test count towards agent_time; agents without a
        # recorded time are ignored.
        agent_time = sum(
            self._agent_times[idx]
            for idx, agent in enumerate(self.agents)
            if agent.role == AgentRole.AGENT and idx in self._agent_times
        )
        reasoning = (
            error_message
            or f"Reached maximum turns ({self.config.max_turns or 10}) without conclusion"
        )

        return ScenarioResult(
            success=False,
            messages=self._state.messages,
            reasoning=reasoning,
            total_time=time.time() - self._total_start_time,
            agent_time=agent_time,
        )

    async def run(self) -> ScenarioResult:
        """
        Run a scenario against the agent under test.

        Args:
            context: Optional initial context for the agent

        Returns:
            ScenarioResult containing the test outcome
        """
        scenario_run_id = generate_scenario_run_id()
        self._scenario_run_id = scenario_run_id
        _check_failure: Optional[BaseException] = None

        # Connect all voice adapters before script runs; disconnect in finally.
        await self._voice_connect_all()

        # Resolve modality per role and store for span stamping.
        from .voice.modality_resolver import resolve_modality
        from .user_simulator_agent import UserSimulatorAgent
        from .judge_agent import JudgeAgent

        self._modality_resolutions: dict = {}  # role -> tier value string
        for agent in self.agents:
            if isinstance(agent, UserSimulatorAgent):
                decl = getattr(agent, 'modality', None)
                tier, _mod_warnings = resolve_modality(declaration=decl, model_id=getattr(agent, 'model', '') or '')
                for w in _mod_warnings:
                    logger.warning(w)
                self._modality_resolutions['simulator'] = tier.value
            elif isinstance(agent, JudgeAgent):
                decl = getattr(agent, 'modality', None)
                tier, _mod_warnings = resolve_modality(declaration=decl, model_id=getattr(agent, 'model', '') or '')
                for w in _mod_warnings:
                    logger.warning(w)
                self._modality_resolutions['judge'] = tier.value

        try:
            self._emit_run_started_event(scenario_run_id)

            if self.config.verbose:
                print("")  # new line

            self.reset()

            for i, script_step in enumerate(self.script):
                try:
                    callable = script_step(self._state)
                    if isinstance(callable, Awaitable):
                        result = await callable
                    else:
                        result = callable
                except AssertionError as e:
                    _check_failure = e
                    break

                self._emit_message_snapshot_event(scenario_run_id)

                if isinstance(result, ScenarioResult):
                    compiled_passed, _ = self._compiled_checkpoints
                    result.passed_criteria = compiled_passed + result.passed_criteria

                    status = (
                        ScenarioRunFinishedEventStatus.SUCCESS
                        if result.success
                        else ScenarioRunFinishedEventStatus.FAILED
                    )
                    result = self._attach_voice_output(result)
                    self._emit_run_finished_event(scenario_run_id, result, status)
                    return result

            if _check_failure is not None:
                compiled_passed, compiled_failed = self._compiled_checkpoints
                error_result = ScenarioResult(
                    success=False,
                    messages=self._state.messages,
                    reasoning=f"Scenario failed with error: {str(_check_failure)}",
                    passed_criteria=compiled_passed,
                    failed_criteria=compiled_failed + [str(_check_failure)],
                    total_time=time.time() - self._total_start_time,
                    agent_time=0,
                )
                self._emit_run_finished_event(
                    scenario_run_id,
                    error_result,
                    ScenarioRunFinishedEventStatus.ERROR,
                )
                raise _check_failure

            elif self._checkpoint_results:
                compiled_passed, compiled_failed = self._compiled_checkpoints
                agent_roles_agents_idx = [
                    idx
                    for idx, agent in enumerate(self.agents)
                    if agent.role == AgentRole.AGENT
                ]
                agent_times = [
                    self._agent_times[idx]
                    for idx in agent_roles_agents_idx
                    if idx in self._agent_times
                ]
                agent_time = sum(agent_times)

                result = ScenarioResult(
                    success=len(compiled_failed) == 0,
                    messages=self._state.messages,
                    reasoning="All inline criteria checkpoints passed",
                    passed_criteria=compiled_passed,
                    failed_criteria=compiled_failed,
                    total_time=time.time() - self._total_start_time,
                    agent_time=agent_time,
                )
                result = self._attach_voice_output(result)

                status = (
                    ScenarioRunFinishedEventStatus.SUCCESS
                    if result.success
                    else ScenarioRunFinishedEventStatus.FAILED
                )
                self._emit_run_finished_event(scenario_run_id, result, status)
                return result
            else:
                result = self._reached_max_turns(
                    """Reached end of script without conclusion, add one of the following:

- Add `scenario.judge()` to the script to force criteria judgement
- Add `scenario.succeed()` or `scenario.fail()` to end the test with an explicit result
- If your script already has a judge but is hitting max_turns, increase `max_turns` in your config
                    """
                )

                status = (
                    ScenarioRunFinishedEventStatus.SUCCESS
                    if result.success
                    else ScenarioRunFinishedEventStatus.FAILED
                )
                self._emit_run_finished_event(scenario_run_id, result, status)
                return result

        except Exception as e:
            if _check_failure is not None:
                # Already handled above — just propagate
                raise

            # Publish failure event before propagating the error
            error_result = ScenarioResult(
                success=False,
                messages=self._state.messages,
                reasoning=f"Scenario failed with error: {str(e)}",
                total_time=time.time() - self._total_start_time,
                agent_time=0,
            )
            self._emit_run_finished_event(
                scenario_run_id, error_result, ScenarioRunFinishedEventStatus.ERROR
            )
            raise  # Re-raise the exception after cleanup
        finally:
            await self._voice_disconnect_all()

    async def _voice_connect_all(self) -> None:
        """Reset per-run voice state and connect every VoiceAgentAdapter.

        After re-initialising the recording, timeline, latency and playback
        attributes (and starting ffmpeg playback when ``_audio_playback`` is
        truthy), three phases run in order:

        1. Modality validation for each voice adapter that exposes a model id.
        2. ``connect()`` on each voice adapter.
        3. A check that adapters support streaming transcripts when a script
           step is flagged ``_requires_streaming_transcripts``.

        Raises:
            ModalityNegotiationError: if an adapter's ``connect()`` raises
                ``PendingTransportError``.
            UnsupportedCapabilityError: if a flagged script step meets a
                voice adapter whose capabilities lack streaming transcripts.
        """
        from .voice.adapter import VoiceAgentAdapter
        from .voice.recording import LatencyMetrics, VoiceRecording
        from .voice.playback import FfmpegPlayback

        def _voice_adapters():
            # Lazy view over self.agents, re-read for every phase.
            return (a for a in self.agents if isinstance(a, VoiceAgentAdapter))

        # Fresh capture state for this run.
        self._voice_recording: VoiceRecording = VoiceRecording()
        self._voice_timeline: list = []
        self._voice_latency: LatencyMetrics = LatencyMetrics()
        self._voice_recording_started_at: float = time.monotonic()
        self._pending_agent_task = None
        self._ffmpeg_playback = None

        if self._audio_playback:
            playback = FfmpegPlayback()
            playback.start()
            self._ffmpeg_playback = playback
            # Chain onto the caller's on_audio_chunk (if any) so local playback
            # and the user callback both receive every chunk.
            downstream = self._on_audio_chunk

            def _tee_chunk(chunk: Any) -> None:
                playback.feed(chunk)
                if downstream is not None:
                    downstream(chunk)

            self._on_audio_chunk = _tee_chunk

        # Phase 1: static validation, performed before any connect() call.
        from .voice.modality_resolver import ModalityNegotiationError, validate_modality_setup, resolve_modality
        for adapter in _voice_adapters():
            model_id = getattr(adapter, 'model', None) or getattr(adapter, '_model', '') or ''
            if not model_id:
                continue
            tier, modality_warnings = resolve_modality(declaration=None, model_id=model_id)
            for message in modality_warnings:
                logger.warning(message)
            validate_modality_setup(
                tier=tier,
                adapter_input_formats=list(adapter.capabilities.input_formats),
                adapter_name=type(adapter).__name__,
            )

        # Phase 2: connect, translating transport failures into a
        # modality-negotiation error.
        from .voice.adapters._stub import PendingTransportError
        for adapter in _voice_adapters():
            try:
                await adapter.connect()
            except PendingTransportError as exc:
                raise ModalityNegotiationError(
                    f"Live transport {type(adapter).__name__!r} cannot honor "
                    f"required modality — connect failed: {exc}. "
                    f"Negotiated requirement: audio-in (pcm16/24000)"
                ) from exc

        # Phase 3: script steps that need streaming transcripts require every
        # voice adapter to advertise that capability.
        from .voice.capabilities import UnsupportedCapabilityError
        for step in self.script:
            if not getattr(step, '_requires_streaming_transcripts', False):
                continue
            for adapter in _voice_adapters():
                if adapter.capabilities.streaming_transcripts:
                    continue
                raise UnsupportedCapabilityError(
                    type(adapter).__name__,
                    "streaming_transcripts",
                    hint=(
                        "interrupt(after_words=N) needs incremental transcripts. "
                        "Use interrupt(content) without after_words on this adapter — "
                        "the executor fires barge-in at the agent's first audio chunk."
                    ),
                )

    def _attach_voice_output(self, result: ScenarioResult) -> ScenarioResult:
        """Copy captured voice artifacts onto ``result`` and return it.

        ``result`` is returned untouched when no agent is a
        VoiceAgentAdapter. Otherwise ``audio``, ``timeline`` and ``latency``
        are each set only when the corresponding captured data is non-empty.
        """
        from .voice.adapter import VoiceAgentAdapter

        if not any(isinstance(a, VoiceAgentAdapter) for a in self.agents):
            return result

        # These attributes may never have been set, hence the defaults.
        recording = getattr(self, "_voice_recording", None)
        timeline = getattr(self, "_voice_timeline", None)
        latency = getattr(self, "_voice_latency", None)

        if recording is not None and recording.segments:
            result.audio = recording
            # The recording carries its own copy of the timeline so that
            # save_segments() can write events into the manifest; the result
            # exposes the timeline separately below.
            recording.timeline = list(timeline) if timeline else []
            # An agent segment whose span contains a user_interrupt event has
            # a transcript reflecting the agent's INTENDED reply, not what
            # actually played before the audio was cut. Flag it so consumers
            # (manifest readers, judges) know to re-transcribe from bytes.
            interrupts = [e for e in (timeline or []) if e.type == "user_interrupt"]
            for segment in recording.segments:
                if segment.speaker == "agent" and any(
                    segment.start_time <= event.time <= segment.end_time
                    for event in interrupts
                ):
                    segment.transcript_truncated = True

        if timeline:
            result.timeline = list(timeline)
        if latency is not None and latency.measurements:
            result.latency = latency
        return result

    async def _voice_disconnect_all(self) -> None:
        """Invoke ``disconnect()`` on every VoiceAgentAdapter.

        Swallows exceptions so cleanup always completes — disconnect failures
        are logged but do not mask the primary scenario result. Afterwards
        the ffmpeg playback process, if one was started, is stopped on a
        worker thread and the reference is cleared.
        """
        from .voice.adapter import VoiceAgentAdapter

        for agent in self.agents:
            if not isinstance(agent, VoiceAgentAdapter):
                continue
            try:
                await agent.disconnect()
            except Exception:
                logger.warning(
                    "voice adapter %s disconnect failed",
                    type(agent).__name__,
                    exc_info=True,
                )

        # _ffmpeg_playback is assigned in _voice_connect_all. This method runs
        # from a ``finally`` and can execute when connect never ran, so read
        # the attribute defensively rather than raising AttributeError on top
        # of the original failure.
        playback = getattr(self, "_ffmpeg_playback", None)
        if playback is None:
            return

        try:
            # stop() is run on a worker thread to keep the event loop free.
            await asyncio.to_thread(playback.stop)
        except Exception:
            logger.warning(
                "ffmpeg playback stop failed during voice disconnect",
                exc_info=True,
            )
        self._ffmpeg_playback = None

    async def _call_agent(
        self, idx: int, role: AgentRole, judgment_request: Optional[JudgmentRequest] = None
    ) -> Union[List[ChatCompletionMessageParam], ScenarioResult, None]:
        """Call the agent at ``idx`` and record what it returned.

        In debug mode a USER turn first prompts on stdin; a non-empty line is
        returned directly as the user message and the agent is not called.

        Otherwise the agent's ``call()`` is awaited inside a trace span, the
        elapsed time is added to ``_agent_times[idx]`` and the agent's pending
        messages are cleared. A ``ScenarioResult`` response is recorded as an
        evaluation on the span and returned as-is. Any other response is
        converted to OpenAI messages, appended via ``add_messages`` and
        returned.

        Args:
            idx: Index of the agent in ``self.agents``.
            role: Role the agent plays for this turn.
            judgment_request: Optional request forwarded in the agent input.

        Returns:
            The typed debug message, the agent's ``ScenarioResult``, or the
            list of converted messages.

        Raises:
            RuntimeError: wraps any exception raised while calling the agent
                or processing its response, prefixed with the agent's class
                name. Errors from the debug prompt are not wrapped.
        """
        agent = self.agents[idx]

        if role == AgentRole.USER and self.config.debug:
            print(
                f"\n{self._scenario_name()}{termcolor.colored('[Debug Mode]', 'yellow')} Press enter to continue or type a message to send"
            )
            input_message = input(
                self._scenario_name() + termcolor.colored("User: ", "green")
            )

            # Clear the input prompt lines completely
            for _ in range(3):
                sys.stdout.write("\033[F")  # Move up to the input line
                sys.stdout.write("\033[2K")  # Clear the entire input line
            sys.stdout.flush()  # Make sure the clearing is visible

            if input_message:
                return [
                    ChatCompletionUserMessageParam(role="user", content=input_message)
                ]

        try:
            with self._trace.span(
                type="agent", name=f"{agent.__class__.__name__}.call"
            ) as span:
                span.set_attributes(
                    {
                        AttributeKey.LangWatchThreadId: self._state.thread_id,
                        "scenario.role": role.value if isinstance(role, AgentRole) else str(role),
                    }
                )
                with show_spinner(
                    text=(
                        "Judging..."
                        if role == AgentRole.JUDGE
                        else f"{role.value if isinstance(role, AgentRole) else role}:"
                    ),
                    color=(
                        "blue"
                        if role == AgentRole.AGENT
                        else "green" if role == AgentRole.USER else "yellow"
                    ),
                    enabled=self.config.verbose,
                ):
                    start_time = time.time()

                    # Suppress noisy pydantic serializer warnings emitted by
                    # litellm + langwatch tracing when dispatching the
                    # ChatCompletionMessageParam union (developer/system/user/
                    # assistant/tool/function variants). The await below is
                    # where litellm.completion actually runs and where the
                    # warnings fire, so it must stay inside this scope.
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")

                        self._trace.autotrack_litellm_calls(litellm)

                        agent_response = agent.call(
                            AgentInput(
                                thread_id=self._state.thread_id,
                                messages=cast(
                                    List[ChatCompletionMessageParam],
                                    self._state.messages,
                                ),
                                new_messages=self._pending_messages.get(idx, []),
                                judgment_request=judgment_request,
                                scenario_state=self._state,
                            )
                        )
                        if not isinstance(agent_response, Awaitable):
                            raise Exception(
                                agent_response_not_awaitable(agent.__class__.__name__),
                            )

                        agent_response = await agent_response

                # Accumulate wall-clock time spent in this agent across turns.
                if idx not in self._agent_times:
                    self._agent_times[idx] = 0
                self._agent_times[idx] += time.time() - start_time

                self._pending_messages[idx] = []
                check_valid_return_type(agent_response, agent.__class__.__name__)

                if isinstance(agent_response, ScenarioResult):
                    # Score is the fraction of criteria that passed, so it
                    # stays within [0, 1]. Dividing passed by failed (the
                    # previous formula) gave e.g. 3.0 for 3 passed / 1 failed.
                    # A verdict with no failed criteria scores 1.0.
                    failed_criteria = agent_response.failed_criteria
                    if failed_criteria:
                        passed_count = len(agent_response.passed_criteria)
                        score = passed_count / (passed_count + len(failed_criteria))
                    else:
                        score = 1.0

                    # TODO: should be an event
                    span.add_evaluation(
                        name=f"{agent.__class__.__name__} Judgment",
                        status="processed",
                        passed=agent_response.success,
                        details=agent_response.reasoning,
                        score=score,
                    )

                    return agent_response

                messages = convert_agent_return_types_to_openai_messages(
                    agent_response,
                    role="user" if role == AgentRole.USER else "assistant",
                )

                self.add_messages(messages, from_agent_idx=idx)

                if messages and self.config.verbose:
                    print_openai_messages(
                        self._scenario_name(),
                        [m for m in messages if m["role"] != "system"],
                    )

                # Voice path: if a wait=False (or interrupt-scheduled) agent
                # turn is in flight when the user-sim produces its turn, fire
                # the interrupt sequence so the new audio lands mid-response.
                if role == AgentRole.USER and messages:
                    pending = getattr(self, "_pending_agent_task", None)
                    if pending is not None and not pending.done():
                        await self._fire_user_interrupt(messages[-1])

                return messages
        except Exception as e:
            agent_name = agent.__class__.__name__
            # str(e) is empty for no-args exceptions like asyncio.TimeoutError().
            # Fall back to the exception type name so the error body is never blank.
            error_detail = str(e) or type(e).__name__
            raise RuntimeError(f"[{agent_name}] {error_detail}") from e

    def _scenario_name(self):
        """Return the yellow ``[Scenario: <name>] `` prefix, or ``""``.

        The prefix is only produced when ``config.verbose`` equals 2.
        """
        if self.config.verbose != 2:
            return ""
        return termcolor.colored(f"[Scenario: {self.name}] ", "yellow")