langwatch · drewdrewthis · Jun 18, 2026 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/python/scenario/judge_agent.py b/python/scenario/judge_agent.py
@@ -27,6 +27,7 @@
 from ._tracing import judge_span_collector, JudgeSpanCollector
 from .types import AgentInput, AgentReturnTypes, AgentRole, ScenarioResult
 from .voice._transcribe import transcribe_segments
+from .voice.modality_resolver import ModalityTier, resolve_modality
 
 
 logger = logging.getLogger("scenario")
@@ -247,6 +248,7 @@ def __init__(
         include_audio: Optional[bool] = None,
         include_timeline: Optional[bool] = None,
         include_traces: Optional[bool] = None,
+        modality: Optional[str] = None,
         **extra_params,
     ):
         """
@@ -274,6 +276,13 @@ def __init__(
             max_discovery_steps: Maximum number of expand/grep tool calls the judge
                                 can make before being forced to return a verdict.
                                 Defaults to 10.
+            modality: Explicit modality declaration for this role. Accepted values:
+                     ``"audio-in"`` (LLM receives raw audio), ``"stt-bridge"``
+                     (audio transcribed to text before the LLM), or ``"text"``
+                     (no audio in the stack). Complementary to ``include_audio``:
+                     ``include_audio=True/False`` takes precedence; ``modality=``
+                     applies when ``include_audio`` is ``None``. When ``None``
+                     (default), the modality is auto-detected from litellm capabilities.
 
         Raises:
             Exception: If no model is configured either in parameters or global config
@@ -318,6 +327,7 @@ def __init__(
         self.include_audio = include_audio
         self.include_timeline = include_timeline
         self.include_traces = include_traces
+        self.modality = modality
 
         if model:
             self.model = model
@@ -361,18 +371,24 @@ def __init__(
             raise Exception(agent_not_configured_error_message("JudgeAgent"))
 
     # --------------------------------------------- voice auto-detection (§4.3)
-    # Small single-purpose helpers; kept out of call() to preserve SRP.
-    _AUDIO_CAPABLE_MODEL_SUBSTRINGS = ("gpt-4o", "gemini-2.5", "gemini-2.0-flash")
-
-    def _model_supports_audio(self) -> bool:
-        m = (self.model or "").lower()
-        return any(s in m for s in self._AUDIO_CAPABLE_MODEL_SUBSTRINGS)
-
     def effective_include_audio(self, conversation_has_audio: bool) -> bool:
-        """Resolve include_audio: explicit wins, otherwise auto from model capability."""
+        """Resolve include_audio: explicit wins, otherwise use modality resolver.
+
+        Intentional behavior change (Bundle 3 / AC3b):
+          Before: gpt-4o → audio-capable (substring match).
+          After:  gpt-4o → text path (litellm advisory returns False).
+          Before: gpt-audio-mini → NOT audio-capable (not in list).
+          After:  gpt-audio-mini → audio-capable (litellm advisory returns True).
+        The old substring list was wrong; the resolver is the source of truth.
+        """
         if self.include_audio is not None:
+            # Explicit include_audio beats modality=/auto-detection, but audio must still be present (AC3c)
             return self.include_audio and conversation_has_audio
-        return conversation_has_audio and self._model_supports_audio()
+        # include_audio unset: resolve from self.modality, else the litellm advisory (AC0, Bundle 6)
+        tier, warnings = resolve_modality(declaration=self.modality, model_id=self.model or "")
+        for w in warnings:
+            logger.warning(w)
+        return conversation_has_audio and (tier == ModalityTier.AUDIO_IN)
 
     def effective_include_timeline(self, conversation_has_audio: bool) -> bool:
         """Default timeline True for voice, False for text — unless explicitly set."""

diff --git a/python/scenario/scenario_executor.py b/python/scenario/scenario_executor.py
@@ -455,10 +455,14 @@ def _new_turn(self):
         ).__enter__()
 
         if self._trace.root_span is not None:
-            self._trace.root_span.set_attributes({
+            attrs = {
                 "langwatch.origin": "simulation",
                 "scenario.run_id": self._scenario_run_id,
-            })
+            }
+            for role, tier_value in getattr(self, '_modality_resolutions', {}).items():
+                attrs[f"scenario.modality.{role}.resolved"] = tier_value
+                attrs[f"scenario.modality.{role}.tier"] = tier_value
+            self._trace.root_span.set_attributes(attrs)
 
         self._pending_agents_on_turn = set(self.agents)
         self._pending_roles_on_turn = [
@@ -575,6 +579,26 @@ async def run(self) -> ScenarioResult:
         # Connect all voice adapters before script runs; disconnect in finally.
         await self._voice_connect_all()
 
+        # Resolve modality per role and store for span stamping.
+        from .voice.modality_resolver import resolve_modality
+        from .user_simulator_agent import UserSimulatorAgent
+        from .judge_agent import JudgeAgent
+
+        self._modality_resolutions: dict = {}  # role -> tier value string
+        for agent in self.agents:
+            if isinstance(agent, UserSimulatorAgent):
+                decl = getattr(agent, 'modality', None)
+                tier, _mod_warnings = resolve_modality(declaration=decl, model_id=getattr(agent, 'model', '') or '')
+                for w in _mod_warnings:
+                    logger.warning(w)
+                self._modality_resolutions['simulator'] = tier.value
+            elif isinstance(agent, JudgeAgent):
+                decl = getattr(agent, 'modality', None)
+                tier, _mod_warnings = resolve_modality(declaration=decl, model_id=getattr(agent, 'model', '') or '')
+                for w in _mod_warnings:
+                    logger.warning(w)
+                self._modality_resolutions['judge'] = tier.value
+
         try:
             self._emit_run_started_event(scenario_run_id)
 
@@ -724,9 +748,50 @@ def _playback_and_forward(chunk: Any) -> None:
 
             self._on_audio_chunk = _playback_and_forward
 
+        # Phase 1: static validation against adapter ClassVars (before connect)
+        from .voice.modality_resolver import ModalityNegotiationError, validate_modality_setup, resolve_modality
+        for agent in self.agents:
+            if isinstance(agent, VoiceAgentAdapter):
+                model_id = getattr(agent, 'model', None) or getattr(agent, '_model', '') or ''
+                if model_id:
+                    tier, _mod_warnings = resolve_modality(declaration=None, model_id=model_id)
+                    for w in _mod_warnings:
+                        logger.warning(w)
+                    validate_modality_setup(
+                        tier=tier,
+                        adapter_input_formats=list(agent.capabilities.input_formats),
+                        adapter_name=type(agent).__name__,
+                    )
+
+        # Phase 2: connect with live-transport failure catching
+        from .voice.adapters._stub import PendingTransportError
         for agent in self.agents:
             if isinstance(agent, VoiceAgentAdapter):
-                await agent.connect()
+                try:
+                    await agent.connect()
+                except PendingTransportError as e:
+                    raise ModalityNegotiationError(
+                        f"Live transport {type(agent).__name__!r} cannot honor "
+                        f"required modality — connect failed: {e}. "
+                        f"Negotiated requirement: audio-in (pcm16/24000)"
+                    ) from e
+
+        # Phase 3: validate script step requirements against connected adapter capabilities
+        from .voice.capabilities import UnsupportedCapabilityError
+        for step in self.script:
+            if getattr(step, '_requires_streaming_transcripts', False):
+                for agent in self.agents:
+                    if isinstance(agent, VoiceAgentAdapter):
+                        if not agent.capabilities.streaming_transcripts:
+                            raise UnsupportedCapabilityError(
+                                type(agent).__name__,
+                                "streaming_transcripts",
+                                hint=(
+                                    "interrupt(after_words=N) needs incremental transcripts. "
+                                    "Use interrupt(content) without after_words on this adapter — "
+                                    "the executor fires barge-in at the agent's first audio chunk."
+                                ),
+                            )
 
     def _attach_voice_output(self, result: ScenarioResult) -> ScenarioResult:
         """Populate result.audio/timeline/latency if any voice adapter ran."""

diff --git a/python/scenario/user_simulator_agent.py b/python/scenario/user_simulator_agent.py
@@ -22,6 +22,7 @@
 
 from ._error_messages import agent_not_configured_error_message
 from .types import AgentInput, AgentReturnTypes, AgentRole
+from .voice.modality_resolver import ModalityTier, resolve_modality
 
 
 logger = logging.getLogger("scenario")
@@ -158,6 +159,7 @@ def __init__(
         persona: Optional[str] = None,
         audio_effects: Optional[List[Callable[[bytes], bytes]]] = None,
         interrupt_probability: float = 0.0,
+        modality: Optional[str] = None,
         **extra_params,
     ):
         """
@@ -176,6 +178,11 @@ def __init__(
                        If not provided, uses model defaults.
             system_prompt: Custom system prompt to override default user simulation behavior.
                           Use this to create specialized user personas or behaviors.
+            modality: Explicit modality declaration for this role. Accepted values:
+                     ``"audio-in"`` (LLM receives raw audio), ``"stt-bridge"``
+                     (audio transcribed to text before the LLM), or ``"text"``
+                     (no audio in the stack). When ``None`` (default), the modality
+                     is auto-detected from the model's litellm capabilities.
 
         Raises:
             Exception: If no model is configured either in parameters or global config
@@ -217,6 +224,7 @@ def __init__(
         if not 0.0 <= interrupt_probability <= 1.0:
             raise ValueError("interrupt_probability must be in [0, 1]")
         self.interrupt_probability = interrupt_probability
+        self.modality = modality
 
         if model:
             self.model = model
@@ -364,11 +372,20 @@ async def _generate_text(
 
         scenario = input.scenario_state
 
+        tier, _warnings = resolve_modality(declaration=self.modality, model_id=self.model or "")
+        for w in _warnings:
+            logger.warning(w)
+
         persona_block = (
             f"\n\n<persona>\n{self.persona}\n</persona>\n"
             if self.persona
             else ""
         )
+        _history = (
+            list(input.messages)
+            if tier == ModalityTier.AUDIO_IN
+            else _strip_audio_content(input.messages)
+        )
         messages = [
             {
                 "role": "system",
@@ -410,7 +427,7 @@ async def _generate_text(
 {persona_block}"""),
             },
             {"role": "assistant", "content": "Hello, how can I help you today?"},
-            *_strip_audio_content(input.messages),
+            *_history,
         ]
 
         # User to assistant role reversal

diff --git a/python/scenario/voice/__init__.py b/python/scenario/voice/__init__.py
@@ -51,6 +51,7 @@
     transcribe,
 )
 from ._transcribe import transcribe_segments
+from .modality_resolver import ModalityNegotiationError, ModalityTier, resolve_modality
 from .tts import register_tts_provider, synthesize
 from .vad import WebRTCVadFallback
 
@@ -86,7 +87,10 @@
     "extract_audio",
     "get_stt_provider",
     "message_has_audio",
+    "ModalityNegotiationError",
+    "ModalityTier",
     "register_tts_provider",
+    "resolve_modality",
     "set_stt_provider",
     "silent_chunk",
     "synthesize",

diff --git a/python/scenario/voice/modality_resolver.py b/python/scenario/voice/modality_resolver.py
@@ -0,0 +1,107 @@
+"""Per-role voice modality resolution.
+
+Declaration-first: an explicit per-role modality beats the litellm advisory;
+a mismatch only yields a warning. With no declaration, the advisory decides.
+"""
+from __future__ import annotations
+import logging
+from enum import Enum
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+
+class ModalityTier(str, Enum):  # str mixin: members are also strings equal to their values
+    AUDIO_IN = "audio-in"    # LLM receives raw audio parts
+    STT_BRIDGE = "stt-bridge"  # audio -> STT -> text before LLM (declared only, never auto-detected)
+    TEXT = "text"            # no audio in the stack
+
+
+class ModalityNegotiationError(Exception):
+    """Raised when a modality declaration is unknown or cannot be honored.
+
+    For adapter conflicts the message names both the modality and the
+    conflicting capability value (e.g. 'audio-in' and 'mulaw/8000').
+    """
+
+
+def _litellm_advisory(model_id: str) -> bool:
+    """Return True if litellm believes model_id can ingest audio input."""
+    try:
+        import litellm.utils  # import sits inside the try, so an ImportError also yields False
+        return bool(litellm.utils.supports_audio_input(model=model_id))
+    except Exception:  # deliberate best-effort: any failure is treated as "no audio input"
+        return False
+
+
+def resolve_modality(
+    *,
+    declaration: Optional[str],  # None = no explicit declaration
+    model_id: str,
+) -> tuple[ModalityTier, list[str]]:
+    """Resolve the modality tier for a single role.
+
+    Returns (tier, warnings).  Warnings are human-readable strings the caller
+    should emit via logger.warning().
+
+    Resolution rules:
+    - If declaration is given AND litellm agrees -> use declared tier, no warning.
+    - If declaration is given AND litellm disagrees -> use declared tier, emit WARNING.
+    - If no declaration -> litellm advisory decides (AUDIO_IN or TEXT), no warning.
+    - If declaration is not a valid tier string -> raise ModalityNegotiationError.
+    """
+    if declaration is None:
+        audio_ok = _litellm_advisory(model_id)
+        return (ModalityTier.AUDIO_IN if audio_ok else ModalityTier.TEXT), []
+
+    # Validate before consulting litellm so a bad declaration fails fast.
+    try:
+        declared_tier = ModalityTier(declaration)
+    except ValueError as err:
+        raise ModalityNegotiationError(
+            f"Unknown modality declaration {declaration!r}; valid values: "
+            + ", ".join(t.value for t in ModalityTier)
+        ) from err
+
+    warnings: list[str] = []
+    advisory_audio = _litellm_advisory(model_id)
+    declared_audio = declared_tier == ModalityTier.AUDIO_IN
+
+    if declared_audio and not advisory_audio:
+        warnings.append(
+            f"Model {model_id!r} declared modality 'audio-in' but litellm "
+            f"reports it does NOT support audio input. "
+            f"The declared modality 'audio-in' will be used. "
+            f"If this is wrong, remove the declaration or file a litellm issue."
+        )
+    elif not declared_audio and advisory_audio:
+        warnings.append(
+            f"Model {model_id!r} declared modality {declaration!r} but litellm "
+            f"reports it DOES support audio input. "
+            f"The declared modality {declaration!r} will be used."
+        )
+
+    return declared_tier, warnings
+
+
+def validate_modality_setup(
+    *,
+    tier: ModalityTier,
+    adapter_input_formats: list[str],
+    adapter_name: str,
+) -> None:
+    """Raise ModalityNegotiationError if tier is statically incompatible with adapter.
+
+    Only 'audio-in' is checked, and only if the adapter advertises formats:
+    none starting with "pcm16" (e.g. mulaw-only telephony) is a hard conflict.
+    """
+    if tier != ModalityTier.AUDIO_IN:
+        return
+    lacks_pcm16 = not any([fmt.startswith("pcm16") for fmt in adapter_input_formats])
+    if adapter_input_formats and lacks_pcm16:
+        raise ModalityNegotiationError(
+            f"Declared modality 'audio-in' is incompatible with adapter "
+            f"{adapter_name!r}: input formats {adapter_input_formats!r} "
+            f"contain no pcm16 path (conflicting capability: "
+            f"{adapter_input_formats[0]!r}). No resample path exists."
+        )
diff --git a/python/scenario/voice/script_steps.py b/python/scenario/voice/script_steps.py
@@ -159,6 +159,8 @@ async def _step(state: "ScenarioState") -> None:
         else:
             await executor.user(content if content else None)  # type: ignore[arg-type]
 
+    if after_words is not None:
+        _step._requires_streaming_transcripts = True  # type: ignore[attr-defined]
     return _step