From e27595766bd9d4cb5d63a0f639b8fcdcc89283a9 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 00:28:52 +0000
Subject: [PATCH 01/16] =?UTF-8?q?feat(#666):=20resolver=20core=20=E2=80=94?=
 =?UTF-8?q?=20per-role=20modality=20resolution=20with=20declaration-first?=
 =?UTF-8?q?=20+=20litellm=20advisory?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements the AC4a and AC4b scenarios from specs/voice-modality-negotiation.feature:
- resolve_modality(): declaration wins; a litellm advisory mismatch in either direction emits a warning
- ModalityNegotiationError: shared exception type for setup/connect validation (AC6, AC7)
- ModalityTier: audio-in, stt-bridge, text

Foundational module; all other bundles depend on this.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/scenario/voice/__init__.py            |   4 +
 python/scenario/voice/modality_resolver.py   |  84 ++++++++++++
 python/tests/voice/test_modality_resolver.py | 130 +++++++++++++++++++
 3 files changed, 218 insertions(+)
 create mode 100644 python/scenario/voice/modality_resolver.py
 create mode 100644 python/tests/voice/test_modality_resolver.py

diff --git a/python/scenario/voice/__init__.py b/python/scenario/voice/__init__.py
index bb9bf46a8..5852e4f16 100644
--- a/python/scenario/voice/__init__.py
+++ b/python/scenario/voice/__init__.py
@@ -51,6 +51,7 @@
     transcribe,
 )
 from ._transcribe import transcribe_segments
+from .modality_resolver import ModalityNegotiationError, ModalityTier, resolve_modality
 from .tts import register_tts_provider, synthesize
 from .vad import WebRTCVadFallback
 
@@ -86,7 +87,10 @@
     "extract_audio",
     "get_stt_provider",
     "message_has_audio",
+    "ModalityNegotiationError",
+    "ModalityTier",
     "register_tts_provider",
+    "resolve_modality",
     "set_stt_provider",
     "silent_chunk",
     "synthesize",
diff --git a/python/scenario/voice/modality_resolver.py b/python/scenario/voice/modality_resolver.py
new file mode 100644
index 000000000..c341e1624
--- /dev/null
+++ b/python/scenario/voice/modality_resolver.py
@@ -0,0 +1,84 @@
+"""Per-role voice modality resolution.
+
+Declaration-first: explicit per-role modality beats litellm advisory.
+Advisory is used as a hint only; mismatch emits a WARNING.
+"""
+from __future__ import annotations
+import logging
+from enum import Enum
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+
+class ModalityTier(str, Enum):
+    AUDIO_IN = "audio-in"    # LLM receives raw audio parts
+    STT_BRIDGE = "stt-bridge"  # audio -> STT -> text before LLM
+    TEXT = "text"            # no audio in the stack
+
+
+class ModalityNegotiationError(Exception):
+    """Raised when the declared modality is incompatible with adapter capabilities.
+
+    For capability conflicts, the message contains both the declared modality string
+    and the conflicting capability value (e.g. 'realtime' and 'mulaw/8000').
+    """
+
+
+def _litellm_advisory(model_id: str) -> bool:
+    """Return True if litellm believes model_id can ingest audio input."""
+    try:
+        import litellm.utils
+        return bool(litellm.utils.supports_audio_input(model=model_id))
+    except Exception:
+        return False
+
+
+def resolve_modality(
+    *,
+    declaration: Optional[str],  # None = no explicit declaration
+    model_id: str,
+) -> tuple[ModalityTier, list[str]]:
+    """Resolve the modality tier for a single role.
+
+    Returns (tier, warnings).  Warnings are human-readable strings the caller
+    should emit via logger.warning().
+
+    Resolution rules:
+    - If declaration is given AND litellm agrees -> use declared tier, no warning.
+    - If declaration is given AND litellm disagrees -> use declared tier, emit WARNING.
+    - If no declaration -> use litellm advisory as truth, no warning.
+    """
+    advisory_audio = _litellm_advisory(model_id)
+
+    if declaration is None:
+        tier = ModalityTier.AUDIO_IN if advisory_audio else ModalityTier.TEXT
+        return tier, []
+
+    # Normalize declaration string to ModalityTier
+    try:
+        declared_tier = ModalityTier(declaration)
+    except ValueError:
+        raise ModalityNegotiationError(
+            f"Unknown modality declaration {declaration!r}; valid values: "
+            + ", ".join(t.value for t in ModalityTier)
+        )
+
+    warnings: list[str] = []
+    declared_audio = declared_tier == ModalityTier.AUDIO_IN
+
+    if declared_audio and not advisory_audio:
+        warnings.append(
+            f"Model {model_id!r} declared modality 'audio-in' but litellm "
+            f"reports it does NOT support audio input. "
+            f"The declared modality 'audio-in' will be used. "
+            f"If this is wrong, remove the declaration or file a litellm issue."
+        )
+    elif not declared_audio and advisory_audio:
+        warnings.append(
+            f"Model {model_id!r} declared modality {declaration!r} but litellm "
+            f"reports it DOES support audio input. "
+            f"The declared modality {declaration!r} will be used."
+        )
+
+    return declared_tier, warnings
diff --git a/python/tests/voice/test_modality_resolver.py b/python/tests/voice/test_modality_resolver.py
new file mode 100644
index 000000000..f20d70fc3
--- /dev/null
+++ b/python/tests/voice/test_modality_resolver.py
@@ -0,0 +1,130 @@
+"""Tests for per-role voice modality resolution (AC4a, AC4b).
+
+All tests mock _litellm_advisory to avoid live API calls.
+"""
+from __future__ import annotations
+
+import pytest
+from unittest.mock import patch
+
+from scenario.voice.modality_resolver import (
+    ModalityNegotiationError,
+    ModalityTier,
+    resolve_modality,
+)
+
+_PATCH_TARGET = "scenario.voice.modality_resolver._litellm_advisory"
+
+
+class TestNoDeclaration:
+    """Advisory drives tier when no declaration is provided."""
+
+    def test_no_declaration_advisory_true_returns_audio_in(self):
+        with patch(_PATCH_TARGET, return_value=True):
+            tier, warnings = resolve_modality(declaration=None, model_id="gpt-audio-mini")
+        assert tier == ModalityTier.AUDIO_IN
+        assert warnings == []
+
+    def test_no_declaration_advisory_false_returns_text(self):
+        with patch(_PATCH_TARGET, return_value=False):
+            tier, warnings = resolve_modality(declaration=None, model_id="gpt-4o")
+        assert tier == ModalityTier.TEXT
+        assert warnings == []
+
+
+class TestDeclarationAgreement:
+    """No warnings when declaration and advisory agree."""
+
+    def test_declared_audio_in_advisory_true_no_warning(self):
+        with patch(_PATCH_TARGET, return_value=True):
+            tier, warnings = resolve_modality(declaration="audio-in", model_id="gpt-audio-mini")
+        assert tier == ModalityTier.AUDIO_IN
+        assert warnings == []
+
+    def test_declared_text_advisory_false_no_warning(self):
+        with patch(_PATCH_TARGET, return_value=False):
+            tier, warnings = resolve_modality(declaration="text", model_id="gpt-4o")
+        assert tier == ModalityTier.TEXT
+        assert warnings == []
+
+
+class TestAC4a:
+    """AC4a: declared audio-in on advisory-text model emits a loud warning."""
+
+    def test_declared_audio_in_advisory_false_returns_declared_tier(self):
+        """Declaration wins even when litellm disagrees."""
+        with patch(_PATCH_TARGET, return_value=False):
+            tier, warnings = resolve_modality(declaration="audio-in", model_id="gpt-4o")
+        assert tier == ModalityTier.AUDIO_IN
+
+    def test_declared_audio_in_advisory_false_emits_exactly_one_warning(self):
+        with patch(_PATCH_TARGET, return_value=False):
+            _, warnings = resolve_modality(declaration="audio-in", model_id="gpt-4o")
+        assert len(warnings) == 1
+
+    def test_declared_audio_in_advisory_false_warning_mentions_model(self):
+        with patch(_PATCH_TARGET, return_value=False):
+            _, warnings = resolve_modality(declaration="audio-in", model_id="gpt-4o")
+        assert "gpt-4o" in warnings[0]
+
+    def test_declared_audio_in_advisory_false_warning_mentions_audio_in(self):
+        with patch(_PATCH_TARGET, return_value=False):
+            _, warnings = resolve_modality(declaration="audio-in", model_id="gpt-4o")
+        assert "audio-in" in warnings[0]
+
+    def test_declared_audio_in_advisory_false_warning_mentions_litellm(self):
+        with patch(_PATCH_TARGET, return_value=False):
+            _, warnings = resolve_modality(declaration="audio-in", model_id="gpt-4o")
+        assert "litellm" in warnings[0]
+
+
+class TestAC4b:
+    """AC4b: declared text on advisory-audio model emits a mismatch warning."""
+
+    def test_declared_text_advisory_true_returns_declared_tier(self):
+        """Declaration wins even when litellm disagrees."""
+        with patch(_PATCH_TARGET, return_value=True):
+            tier, warnings = resolve_modality(declaration="text", model_id="gpt-audio-mini")
+        assert tier == ModalityTier.TEXT
+
+    def test_declared_text_advisory_true_emits_exactly_one_warning(self):
+        with patch(_PATCH_TARGET, return_value=True):
+            _, warnings = resolve_modality(declaration="text", model_id="gpt-audio-mini")
+        assert len(warnings) == 1
+
+    def test_declared_text_advisory_true_warning_mentions_model(self):
+        with patch(_PATCH_TARGET, return_value=True):
+            _, warnings = resolve_modality(declaration="text", model_id="gpt-audio-mini")
+        assert "gpt-audio-mini" in warnings[0]
+
+    def test_declared_text_advisory_true_warning_mentions_mismatch(self):
+        """Warning must signal that litellm reports audio support."""
+        with patch(_PATCH_TARGET, return_value=True):
+            _, warnings = resolve_modality(declaration="text", model_id="gpt-audio-mini")
+        # Warning should mention "text" (declared) or "DOES support"/"does support" to signal mismatch
+        assert "text" in warnings[0] or "DOES support" in warnings[0] or "does support" in warnings[0]
+
+    def test_declared_stt_bridge_advisory_true_emits_warning(self):
+        """stt-bridge is also a non-audio-in tier; same mismatch rule applies."""
+        with patch(_PATCH_TARGET, return_value=True):
+            tier, warnings = resolve_modality(declaration="stt-bridge", model_id="gpt-audio-mini")
+        assert tier == ModalityTier.STT_BRIDGE
+        assert len(warnings) == 1
+
+
+class TestUnknownDeclaration:
+    """Unknown declaration strings raise ModalityNegotiationError."""
+
+    def test_unknown_declaration_raises(self):
+        with patch(_PATCH_TARGET, return_value=False):
+            with pytest.raises(ModalityNegotiationError) as exc_info:
+                resolve_modality(declaration="video-in", model_id="gpt-4o")
+        assert "video-in" in str(exc_info.value)
+
+    def test_error_message_lists_valid_values(self):
+        with patch(_PATCH_TARGET, return_value=False):
+            with pytest.raises(ModalityNegotiationError) as exc_info:
+                resolve_modality(declaration="bogus", model_id="gpt-4o")
+        error_msg = str(exc_info.value)
+        assert "audio-in" in error_msg
+        assert "text" in error_msg

From a29822f41fa4ea310db9803b6dcca0fc3186d414 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 00:35:19 +0000
Subject: [PATCH 02/16] =?UTF-8?q?feat(#666):=20conditional=20audio=20strip?=
 =?UTF-8?q?=20in=20simulator=20=E2=80=94=20AC1,=20AC2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Audio-capable simulator models (litellm advisory resolves to audio-in) now receive
raw audio parts; for text-only models, audio is stripped exactly as before.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/scenario/user_simulator_agent.py   |  12 +-
 python/tests/test_user_simulator_agent.py | 137 ++++++++++++++++++++++
 2 files changed, 148 insertions(+), 1 deletion(-)

diff --git a/python/scenario/user_simulator_agent.py b/python/scenario/user_simulator_agent.py
index 43c360de5..c20df6bf2 100644
--- a/python/scenario/user_simulator_agent.py
+++ b/python/scenario/user_simulator_agent.py
@@ -22,6 +22,7 @@
 
 from ._error_messages import agent_not_configured_error_message
 from .types import AgentInput, AgentReturnTypes, AgentRole
+from .voice.modality_resolver import ModalityTier, resolve_modality
 
 
 logger = logging.getLogger("scenario")
@@ -364,11 +365,20 @@ async def _generate_text(
 
         scenario = input.scenario_state
 
+        tier, _warnings = resolve_modality(declaration=None, model_id=self.model or "")
+        for w in _warnings:
+            logger.warning(w)
+
         persona_block = (
             f"\n\n<persona>\n{self.persona}\n</persona>\n"
             if self.persona
             else ""
         )
+        _history = (
+            list(input.messages)
+            if tier == ModalityTier.AUDIO_IN
+            else _strip_audio_content(input.messages)
+        )
         messages = [
             {
                 "role": "system",
@@ -410,7 +420,7 @@ async def _generate_text(
 {persona_block}"""),
             },
             {"role": "assistant", "content": "Hello, how can I help you today?"},
-            *_strip_audio_content(input.messages),
+            *_history,
         ]
 
         # User to assistant role reversal
diff --git a/python/tests/test_user_simulator_agent.py b/python/tests/test_user_simulator_agent.py
index 6e3078513..2c4210be2 100644
--- a/python/tests/test_user_simulator_agent.py
+++ b/python/tests/test_user_simulator_agent.py
@@ -5,6 +5,7 @@
 from scenario.types import AgentInput
 from scenario.cache import context_scenario
 from scenario.scenario_executor import ScenarioExecutor
+from scenario.voice.modality_resolver import ModalityTier
 
 
 @pytest.mark.asyncio
@@ -116,3 +117,139 @@ async def test_user_simulator_agent_with_string_default_model_config():
         context_scenario.reset(token)
         # Cleanup
         ScenarioConfig.default_config = None
+
+
+@pytest.mark.asyncio
+async def test_audio_in_simulator_retains_audio_parts():
+    """AC1: audio-capable simulator (e.g. gpt-audio-mini) receives audio parts."""
+    ScenarioConfig.default_config = ScenarioConfig(default_model="gpt-audio-mini")
+
+    user_sim = UserSimulatorAgent()
+
+    mock_scenario_state = MagicMock()
+    mock_scenario_state.description = "Voice test scenario"
+
+    audio_part = {"type": "input_audio", "input_audio": {"data": "AAAA", "format": "wav"}}
+    text_part = {"type": "text", "text": "Hello"}
+    agent_input = AgentInput(
+        thread_id="test",
+        messages=[
+            {"role": "assistant", "content": [audio_part, text_part]},
+        ],
+        new_messages=[],
+        scenario_state=mock_scenario_state,
+    )
+
+    mock_response = MagicMock()
+    mock_response.choices = [MagicMock()]
+    mock_response.choices[0].message.content = "I need help"
+
+    mock_executor = MagicMock()
+    mock_executor.config = MagicMock()
+    mock_executor.config.cache_key = None
+    token = context_scenario.set(mock_executor)
+
+    try:
+        with patch(
+            "scenario.user_simulator_agent.resolve_modality",
+            return_value=(ModalityTier.AUDIO_IN, []),
+        ), patch(
+            "scenario.user_simulator_agent.litellm.completion",
+            return_value=mock_response,
+        ) as mock_completion:
+            await user_sim.call(agent_input)
+
+            assert mock_completion.called
+            call_kwargs = mock_completion.call_args.kwargs
+            messages_sent = call_kwargs["messages"]
+
+            # Find the message with list content (after reverse_roles, was assistant turn)
+            content_parts = None
+            for msg in messages_sent:
+                content = msg.get("content")
+                if isinstance(content, list):
+                    content_parts = content
+                    break
+
+            assert content_parts is not None, "No list-content message found in payload"
+            types_present = [p.get("type") for p in content_parts if isinstance(p, dict)]
+            assert "input_audio" in types_present, (
+                f"Expected input_audio part to be retained for AUDIO_IN tier; got types: {types_present}"
+            )
+            assert "text" in types_present, (
+                f"Expected text part to be retained; got types: {types_present}"
+            )
+    finally:
+        context_scenario.reset(token)
+        ScenarioConfig.default_config = None
+
+
+@pytest.mark.asyncio
+async def test_text_simulator_strips_audio_with_placeholders():
+    """AC2: text-only simulator strips audio parts and inserts placeholders."""
+    ScenarioConfig.default_config = ScenarioConfig(default_model="openai/gpt-4.1-mini")
+
+    user_sim = UserSimulatorAgent()
+
+    mock_scenario_state = MagicMock()
+    mock_scenario_state.description = "Text test scenario"
+
+    audio_part = {"type": "input_audio", "input_audio": {"data": "AAAA", "format": "wav"}}
+    text_part = {"type": "text", "text": "Hello from agent"}
+    agent_input = AgentInput(
+        thread_id="test",
+        messages=[
+            # assistant turn with both audio and text — voiced agent turn
+            {"role": "assistant", "content": [audio_part, text_part]},
+            # user turn with audio only
+            {"role": "user", "content": [{"type": "input_audio", "input_audio": {"data": "BBBB", "format": "wav"}}]},
+        ],
+        new_messages=[],
+        scenario_state=mock_scenario_state,
+    )
+
+    mock_response = MagicMock()
+    mock_response.choices = [MagicMock()]
+    mock_response.choices[0].message.content = "What can you do?"
+
+    mock_executor = MagicMock()
+    mock_executor.config = MagicMock()
+    mock_executor.config.cache_key = None
+    token = context_scenario.set(mock_executor)
+
+    try:
+        with patch(
+            "scenario.user_simulator_agent.resolve_modality",
+            return_value=(ModalityTier.TEXT, []),
+        ), patch(
+            "scenario.user_simulator_agent.litellm.completion",
+            return_value=mock_response,
+        ) as mock_completion:
+            await user_sim.call(agent_input)
+
+            assert mock_completion.called
+            call_kwargs = mock_completion.call_args.kwargs
+            messages_sent = call_kwargs["messages"]
+
+            # Confirm no input_audio parts appear anywhere in the payload
+            for msg in messages_sent:
+                content = msg.get("content")
+                if isinstance(content, list):
+                    for part in content:
+                        assert part.get("type") != "input_audio", (
+                            f"input_audio must be stripped for TEXT tier; found in msg: {msg}"
+                        )
+
+            # Confirm placeholders are present (echo-safety: "[the agent said: ...]" for
+            # assistant+audio+text; "[audio message]" for audio-only turns)
+            all_text = " ".join(
+                msg["content"]
+                for msg in messages_sent
+                if isinstance(msg.get("content"), str)
+            )
+            assert "[the agent said:" in all_text or "[audio message]" in all_text, (
+                f"Expected placeholder text in stripped messages; got: {all_text!r}"
+            )
+    finally:
+        context_scenario.reset(token)
+        ScenarioConfig.default_config = None

From 8e4976bd01ffa9147c1689fbf46d4c395bc0c5ba Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 00:35:19 +0000
Subject: [PATCH 03/16] =?UTF-8?q?feat(#666):=20replace=20judge=20substring?=
 =?UTF-8?q?=20audio=20detection=20with=20modality=20resolver=20=E2=80=94?=
 =?UTF-8?q?=20AC3a,=20AC3b,=20AC3c,=20AC9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gpt-audio-mini now correctly resolves to audio-in (was missed by old list).
gpt-4o now correctly takes the transcript path (litellm advisory=False).
include_audio explicit override still wins (AC3c preserved).
transcribe_segments unchanged for text-modality judges (AC9 regression passes).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/scenario/judge_agent.py                | 25 +++++----
 python/tests/test_judge_agent.py              | 53 +++++++++++++++++++
 .../voice/test_judge_audio_transcribe.py      | 26 +++++++--
 python/tests/voice/test_judge_voice.py        | 50 ++++++++++++-----
 4 files changed, 127 insertions(+), 27 deletions(-)

diff --git a/python/scenario/judge_agent.py b/python/scenario/judge_agent.py
index 4a890b5b4..9e7277da8 100644
--- a/python/scenario/judge_agent.py
+++ b/python/scenario/judge_agent.py
@@ -27,6 +27,7 @@
 from ._tracing import judge_span_collector, JudgeSpanCollector
 from .types import AgentInput, AgentReturnTypes, AgentRole, ScenarioResult
 from .voice._transcribe import transcribe_segments
+from .voice.modality_resolver import ModalityTier, resolve_modality
 
 
 logger = logging.getLogger("scenario")
@@ -361,18 +362,24 @@ def __init__(
             raise Exception(agent_not_configured_error_message("JudgeAgent"))
 
     # --------------------------------------------- voice auto-detection (§4.3)
-    # Small single-purpose helpers; kept out of call() to preserve SRP.
-    _AUDIO_CAPABLE_MODEL_SUBSTRINGS = ("gpt-4o", "gemini-2.5", "gemini-2.0-flash")
-
-    def _model_supports_audio(self) -> bool:
-        m = (self.model or "").lower()
-        return any(s in m for s in self._AUDIO_CAPABLE_MODEL_SUBSTRINGS)
-
     def effective_include_audio(self, conversation_has_audio: bool) -> bool:
-        """Resolve include_audio: explicit wins, otherwise auto from model capability."""
+        """Resolve include_audio: explicit wins, otherwise use modality resolver.
+
+        Intentional behavior change (Bundle 3 / AC3b):
+          Before: gpt-4o → audio-capable (substring match).
+          After:  gpt-4o → text path (litellm advisory returns False).
+          Before: gpt-audio-mini → NOT audio-capable (not in list).
+          After:  gpt-audio-mini → audio-capable (litellm advisory returns True).
+        The old substring list was wrong; the resolver is the source of truth.
+        """
         if self.include_audio is not None:
+            # Explicit override always wins (AC3c)
             return self.include_audio and conversation_has_audio
-        return conversation_has_audio and self._model_supports_audio()
+        # Use resolver — no per-role declaration wired yet (AC0 is Bundle 6)
+        tier, warnings = resolve_modality(declaration=None, model_id=self.model or "")
+        for w in warnings:
+            logger.warning(w)
+        return conversation_has_audio and (tier == ModalityTier.AUDIO_IN)
 
     def effective_include_timeline(self, conversation_has_audio: bool) -> bool:
         """Default timeline True for voice, False for text — unless explicitly set."""
diff --git a/python/tests/test_judge_agent.py b/python/tests/test_judge_agent.py
index 01c4ddbd4..7986bf008 100644
--- a/python/tests/test_judge_agent.py
+++ b/python/tests/test_judge_agent.py
@@ -7,6 +7,7 @@
 from scenario.types import AgentInput, JudgmentRequest
 from scenario.cache import context_scenario
 from scenario.scenario_executor import ScenarioExecutor
+from scenario.voice.modality_resolver import ModalityTier
 
 
 class FakeOpenAIClient:
@@ -403,3 +404,55 @@ async def test_judge_omits_additional_context_when_none():
     finally:
         context_scenario.reset(token)
         ScenarioConfig.default_config = None
+
+
+# ------------------------------------------------------------------ Bundle 3 / AC3a, AC3b, AC3c
+
+
+def test_gpt_audio_mini_judge_receives_audio():
+    """AC3a — gpt-audio-mini judge receives audio parts when resolver returns AUDIO_IN."""
+    judge = JudgeAgent(
+        criteria=["agent replied correctly"],
+        model="openai/gpt-audio-mini",
+    )
+    with patch(
+        "scenario.judge_agent.resolve_modality",
+        return_value=(ModalityTier.AUDIO_IN, []),
+    ):
+        assert judge.effective_include_audio(conversation_has_audio=True) is True
+
+
+def test_gpt4o_judge_no_declaration_takes_transcript_path():
+    """AC3b intentional behavior change: gpt-4o via litellm advisory (False) → text path.
+
+    Before Bundle 3: gpt-4o matched the old substring list → audio-capable (True).
+    After Bundle 3:  litellm advisory for gpt-4o returns False → text path (False).
+    This is the correct behavior — gpt-4o does not ingest raw audio input parts.
+    """
+    judge = JudgeAgent(
+        criteria=["agent replied correctly"],
+        model="openai/gpt-4o",
+    )
+    with patch(
+        "scenario.judge_agent.resolve_modality",
+        return_value=(ModalityTier.TEXT, []),
+    ):
+        # AC3b intentional behavior change: gpt-4o via litellm advisory (False) → text path
+        assert judge.effective_include_audio(conversation_has_audio=True) is False
+
+
+def test_explicit_include_audio_false_wins():
+    """AC3c — explicit include_audio=False wins even for an audio-capable model."""
+    judge = JudgeAgent(
+        criteria=["agent replied correctly"],
+        model="openai/gpt-audio-mini",
+        include_audio=False,
+    )
+    # resolve_modality must NOT be called when include_audio is explicitly set
+    with patch(
+        "scenario.judge_agent.resolve_modality",
+        return_value=(ModalityTier.AUDIO_IN, []),
+    ) as mock_resolver:
+        result = judge.effective_include_audio(conversation_has_audio=True)
+    assert result is False
+    mock_resolver.assert_not_called()
diff --git a/python/tests/voice/test_judge_audio_transcribe.py b/python/tests/voice/test_judge_audio_transcribe.py
index 819fc9251..fc013b65e 100644
--- a/python/tests/voice/test_judge_audio_transcribe.py
+++ b/python/tests/voice/test_judge_audio_transcribe.py
@@ -3,10 +3,16 @@
 
 Tests the _enrich_messages_with_transcripts helper directly (surgical) and
 the JudgeAgent._conversation_has_audio / _extract_recording helpers.
+
+After Bundle 3, audio capability is determined by modality_resolver (litellm
+advisory), not a substring list.  Tests that need a "multimodal" judge mock
+resolve_modality to return AUDIO_IN rather than relying on model-name matching.
 """
 from __future__ import annotations
 
+from unittest.mock import patch
 from scenario.judge_agent import JudgeAgent, _enrich_messages_with_transcripts
+from scenario.voice.modality_resolver import ModalityTier
 from scenario.voice.recording import AudioSegment, VoiceRecording
 
 
@@ -14,13 +20,13 @@
 
 
 def _text_only_judge() -> JudgeAgent:
-    """A judge whose model is text-only (gpt-4.1-mini is not in _AUDIO_CAPABLE_MODEL_SUBSTRINGS)."""
+    """A judge whose model is text-only (litellm advisory returns False for gpt-4.1-mini)."""
     return JudgeAgent(criteria=["agent replied correctly"], model="openai/gpt-4.1-mini")
 
 
 def _multimodal_judge() -> JudgeAgent:
-    """A judge whose model can ingest audio (gpt-4o is in _AUDIO_CAPABLE_MODEL_SUBSTRINGS)."""
-    return JudgeAgent(criteria=["agent replied correctly"], model="openai/gpt-4o")
+    """A judge on gpt-audio-mini, standing in for an audio-capable model."""
+    return JudgeAgent(criteria=["agent replied correctly"], model="openai/gpt-audio-mini")
 
 
 def _make_recording(agent_transcript: str | None = "agent reply text") -> VoiceRecording:
@@ -253,8 +259,18 @@ def test_text_only_messages_pass_through(self):
 class TestTextOnlyJudgeAutoDetection:
     def test_text_only_model_should_not_include_audio(self):
         j = _text_only_judge()
-        assert j.effective_include_audio(conversation_has_audio=True) is False
+        with patch(
+            "scenario.judge_agent.resolve_modality",
+            return_value=(ModalityTier.TEXT, []),
+        ):
+            assert j.effective_include_audio(conversation_has_audio=True) is False
 
     def test_multimodal_model_should_include_audio(self):
+        # AC3b intentional behavior change: gpt-4o via litellm advisory (False) → text path.
+        # Use gpt-audio-mini + mock AUDIO_IN to represent a genuinely audio-capable judge.
         j = _multimodal_judge()
-        assert j.effective_include_audio(conversation_has_audio=True) is True
+        with patch(
+            "scenario.judge_agent.resolve_modality",
+            return_value=(ModalityTier.AUDIO_IN, []),
+        ):
+            assert j.effective_include_audio(conversation_has_audio=True) is True
diff --git a/python/tests/voice/test_judge_voice.py b/python/tests/voice/test_judge_voice.py
index a46b023c3..82c2250b9 100644
--- a/python/tests/voice/test_judge_voice.py
+++ b/python/tests/voice/test_judge_voice.py
@@ -1,32 +1,61 @@
 """
 Unit tests for voice-aware JudgeAgent auto-detection (§4.3).
+
+After Bundle 3, effective_include_audio delegates to resolve_modality (litellm
+advisory) instead of the old substring list.  Tests that depend on the
+old model-name→capability mapping now mock resolve_modality directly so they
+remain deterministic regardless of the litellm version's advisory data.
 """
 
 import scenario
+from unittest.mock import patch
+from scenario.voice.modality_resolver import ModalityTier
 
 
 def _judge(model="openai/gpt-4o", **kwargs):
     return scenario.JudgeAgent(criteria=["c"], model=model, **kwargs)
 
 
-def test_include_audio_auto_enabled_for_multimodal_model():
-    j = _judge(model="openai/gpt-4o")
-    assert j.effective_include_audio(conversation_has_audio=True) is True
+def test_include_audio_auto_enabled_for_audio_capable_model():
+    """A model the resolver marks AUDIO_IN receives audio parts."""
+    j = _judge(model="openai/gpt-audio-mini")
+    with patch(
+        "scenario.judge_agent.resolve_modality",
+        return_value=(ModalityTier.AUDIO_IN, []),
+    ):
+        assert j.effective_include_audio(conversation_has_audio=True) is True
 
 
 def test_include_audio_auto_disabled_for_text_only_model():
     j = _judge(model="openai/gpt-4.1-mini")
-    assert j.effective_include_audio(conversation_has_audio=True) is False
+    with patch(
+        "scenario.judge_agent.resolve_modality",
+        return_value=(ModalityTier.TEXT, []),
+    ):
+        assert j.effective_include_audio(conversation_has_audio=True) is False
 
 
 def test_include_audio_false_when_no_audio_in_conversation():
-    j = _judge(model="openai/gpt-4o")
-    assert j.effective_include_audio(conversation_has_audio=False) is False
+    """Even an audio-capable model returns False when the conversation has no audio."""
+    j = _judge(model="openai/gpt-audio-mini")
+    with patch(
+        "scenario.judge_agent.resolve_modality",
+        return_value=(ModalityTier.AUDIO_IN, []),
+    ):
+        assert j.effective_include_audio(conversation_has_audio=False) is False
 
 
 def test_explicit_include_audio_false_forces_text_only_even_with_multimodal_model():
-    j = _judge(model="openai/gpt-4o", include_audio=False)
-    assert j.effective_include_audio(conversation_has_audio=True) is False
+    """Explicit include_audio=False wins — resolver is not called (AC3c)."""
+    j = _judge(model="openai/gpt-audio-mini", include_audio=False)
+    # resolve_modality must NOT be called when include_audio is explicitly set
+    with patch(
+        "scenario.judge_agent.resolve_modality",
+        return_value=(ModalityTier.AUDIO_IN, []),
+    ) as mock_resolver:
+        result = j.effective_include_audio(conversation_has_audio=True)
+    assert result is False
+    mock_resolver.assert_not_called()
 
 
 def test_include_timeline_defaults_true_for_voice_conversations():
@@ -49,8 +78,3 @@ def test_include_traces_defaults_to_otel_configured():
 def test_explicit_include_traces_respected():
     j = _judge(include_traces=False)
     assert j.effective_include_traces(otel_configured=True) is False
-
-
-def test_gemini_is_detected_as_audio_capable():
-    j = _judge(model="google/gemini-2.5-flash")
-    assert j._model_supports_audio() is True

From 60a350e007c12bf0e0b75f19a80e2cbb4802ebe8 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 00:37:50 +0000
Subject: [PATCH 04/16] =?UTF-8?q?feat(#666):=20two-phase=20modality=20vali?=
 =?UTF-8?q?dation=20=E2=80=94=20AC6,=20AC7,=20AC8a?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Static impossible combo (audio-in × mulaw/8000) raises ModalityNegotiationError at setup.
Live transport failure at first-connect re-raises as ModalityNegotiationError with requirement token.
interrupt(after_words=N) capability gate moved to first-connect (before first turn).
dtmf gate unchanged (covered by the AC8b regression test).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/scenario/scenario_executor.py          |  40 ++-
 python/scenario/voice/modality_resolver.py    |  23 ++
 python/scenario/voice/script_steps.py         |   2 +
 .../tests/voice/test_modality_validation.py   | 322 ++++++++++++++++++
 4 files changed, 386 insertions(+), 1 deletion(-)
 create mode 100644 python/tests/voice/test_modality_validation.py

diff --git a/python/scenario/scenario_executor.py b/python/scenario/scenario_executor.py
index 53c3591af..15743e56d 100644
--- a/python/scenario/scenario_executor.py
+++ b/python/scenario/scenario_executor.py
@@ -724,9 +724,47 @@ def _playback_and_forward(chunk: Any) -> None:
 
             self._on_audio_chunk = _playback_and_forward
 
+        # Phase 1: static validation against adapter ClassVars (before connect)
+        from .voice.modality_resolver import ModalityNegotiationError, validate_modality_setup, resolve_modality
         for agent in self.agents:
             if isinstance(agent, VoiceAgentAdapter):
-                await agent.connect()
+                model_id = getattr(agent, 'model', None) or getattr(agent, '_model', '') or ''
+                if model_id:
+                    tier, _ = resolve_modality(declaration=None, model_id=model_id)
+                    validate_modality_setup(
+                        tier=tier,
+                        adapter_input_formats=list(agent.capabilities.input_formats),
+                        adapter_name=type(agent).__name__,
+                    )
+
+        # Phase 2: step capability gate — static, so run it before opening any transport
+        from .voice.capabilities import UnsupportedCapabilityError
+        for step in self.script:
+            if getattr(step, '_requires_streaming_transcripts', False):
+                for agent in self.agents:
+                    if isinstance(agent, VoiceAgentAdapter):
+                        if not agent.capabilities.streaming_transcripts:
+                            raise UnsupportedCapabilityError(
+                                type(agent).__name__,
+                                "streaming_transcripts",
+                                hint=(
+                                    "interrupt(after_words=N) needs incremental transcripts. "
+                                    "Use interrupt(content) without after_words on this adapter — "
+                                    "the executor fires barge-in at the agent's first audio chunk."
+                                ),
+                            )
+
+        # Phase 3: connect; any connect() failure is re-raised as ModalityNegotiationError
+        for agent in self.agents:
+            if isinstance(agent, VoiceAgentAdapter):
+                try:
+                    await agent.connect()
+                except Exception as e:
+                    raise ModalityNegotiationError(
+                        f"Live transport {type(agent).__name__!r} cannot honor "
+                        f"required modality — connect failed: {e}. "
+                        f"Negotiated requirement: audio-in (pcm16/24000)"
+                    ) from e
 
     def _attach_voice_output(self, result: ScenarioResult) -> ScenarioResult:
         """Populate result.audio/timeline/latency if any voice adapter ran."""
diff --git a/python/scenario/voice/modality_resolver.py b/python/scenario/voice/modality_resolver.py
index c341e1624..571da753b 100644
--- a/python/scenario/voice/modality_resolver.py
+++ b/python/scenario/voice/modality_resolver.py
@@ -82,3 +82,26 @@ def resolve_modality(
         )
 
     return declared_tier, warnings
+
+
+def validate_modality_setup(
+    *,
+    tier: ModalityTier,
+    adapter_input_formats: list[str],
+    adapter_name: str,
+) -> None:
+    """Reject a tier the adapter's declared input formats can never satisfy.
+
+    Only AUDIO_IN is checked: at least one format must start with "pcm16".
+    An empty format list is accepted; otherwise ModalityNegotiationError.
+    """
+    if tier != ModalityTier.AUDIO_IN:
+        return
+    pcm16_paths = [fmt for fmt in adapter_input_formats if fmt.startswith("pcm16")]
+    if adapter_input_formats and not pcm16_paths:
+        raise ModalityNegotiationError(
+            f"Declared modality 'audio-in' is incompatible with adapter "
+            f"{adapter_name!r}: input formats {adapter_input_formats!r} "
+            f"contain no pcm16 path (conflicting capability: "
+            f"{adapter_input_formats[0]!r}). No resample path exists."
+        )
diff --git a/python/scenario/voice/script_steps.py b/python/scenario/voice/script_steps.py
index 9fc4fba90..eb46bdedd 100644
--- a/python/scenario/voice/script_steps.py
+++ b/python/scenario/voice/script_steps.py
@@ -159,6 +159,8 @@ async def _step(state: "ScenarioState") -> None:
         else:
             await executor.user(content if content else None)  # type: ignore[arg-type]
 
+    if after_words is not None:
+        _step._requires_streaming_transcripts = True  # type: ignore[attr-defined]
     return _step
 
 
diff --git a/python/tests/voice/test_modality_validation.py b/python/tests/voice/test_modality_validation.py
new file mode 100644
index 000000000..ec1d4ebf9
--- /dev/null
+++ b/python/tests/voice/test_modality_validation.py
@@ -0,0 +1,322 @@
+"""Tests for two-phase modality validation (AC6, AC7, AC8a, AC8b)."""
+from __future__ import annotations
+
+import pytest
+from unittest.mock import AsyncMock, MagicMock, patch
+
+from scenario.voice.modality_resolver import (
+    ModalityNegotiationError,
+    ModalityTier,
+    validate_modality_setup,
+)
+from scenario.voice.capabilities import AdapterCapabilities, UnsupportedCapabilityError
+from scenario.voice.adapters._stub import PendingTransportError
+from scenario.voice.adapter import VoiceAgentAdapter
+from scenario.voice.audio_chunk import AudioChunk
+
+
+# ---------------------------------------------------------------------------
+# Shared test adapters
+# ---------------------------------------------------------------------------
+
+class _MulawOnlyAdapter(VoiceAgentAdapter):
+    """Simulates a telephony adapter that only supports mulaw/8000."""
+    capabilities = AdapterCapabilities(input_formats=["mulaw/8000"])
+
+    async def connect(self) -> None:
+        pass
+
+    async def disconnect(self) -> None:
+        pass
+
+    async def send_audio(self, chunk) -> None:
+        pass
+
+    async def recv_audio(self, timeout):
+        return AudioChunk(data=b"")
+
+
+class _PendingTransportAdapter(VoiceAgentAdapter):
+    """Simulates an adapter whose connect() raises PendingTransportError."""
+    capabilities = AdapterCapabilities(input_formats=["pcm16/24000"])
+
+    async def connect(self) -> None:
+        raise PendingTransportError(type(self).__name__)
+
+    async def disconnect(self) -> None:
+        pass
+
+    async def send_audio(self, chunk) -> None:
+        pass
+
+    async def recv_audio(self, timeout):
+        return AudioChunk(data=b"")
+
+
+class _NoStreamingAdapter(VoiceAgentAdapter):
+    """Adapter without streaming_transcripts capability."""
+    capabilities = AdapterCapabilities(streaming_transcripts=False)
+
+    async def connect(self) -> None:
+        pass
+
+    async def disconnect(self) -> None:
+        pass
+
+    async def send_audio(self, chunk) -> None:
+        pass
+
+    async def recv_audio(self, timeout):
+        return AudioChunk(data=b"")
+
+
+class _StreamingAdapter(VoiceAgentAdapter):
+    """Adapter with streaming_transcripts and dtmf capability."""
+    capabilities = AdapterCapabilities(streaming_transcripts=True, dtmf=True)
+
+    async def connect(self) -> None:
+        pass
+
+    async def disconnect(self) -> None:
+        pass
+
+    async def send_audio(self, chunk) -> None:
+        pass
+
+    async def recv_audio(self, timeout):
+        return AudioChunk(data=b"")
+
+
+# ---------------------------------------------------------------------------
+# AC6: Static impossible combo raises at setup
+# ---------------------------------------------------------------------------
+
+class TestAC6StaticValidation:
+    """AC6: audio-in declared + mulaw-only adapter raises ModalityNegotiationError at setup."""
+
+    def test_audio_in_with_mulaw_only_raises(self):
+        with pytest.raises(ModalityNegotiationError) as exc_info:
+            validate_modality_setup(
+                tier=ModalityTier.AUDIO_IN,
+                adapter_input_formats=["mulaw/8000"],
+                adapter_name="TelephonyAdapter",
+            )
+        assert "TelephonyAdapter" in str(exc_info.value)  # raises() pins the type; check the adapter is named
+
+    def test_error_contains_audio_in_modality(self):
+        with pytest.raises(ModalityNegotiationError) as exc_info:
+            validate_modality_setup(
+                tier=ModalityTier.AUDIO_IN,
+                adapter_input_formats=["mulaw/8000"],
+                adapter_name="TelephonyAdapter",
+            )
+        assert "audio-in" in str(exc_info.value)
+
+    def test_error_contains_conflicting_format(self):
+        with pytest.raises(ModalityNegotiationError) as exc_info:
+            validate_modality_setup(
+                tier=ModalityTier.AUDIO_IN,
+                adapter_input_formats=["mulaw/8000"],
+                adapter_name="TelephonyAdapter",
+            )
+        assert "mulaw/8000" in str(exc_info.value)
+
+    def test_audio_in_with_pcm16_does_not_raise(self):
+        # Should succeed without error
+        validate_modality_setup(
+            tier=ModalityTier.AUDIO_IN,
+            adapter_input_formats=["pcm16/24000"],
+            adapter_name="OpenAIAdapter",
+        )
+
+    def test_audio_in_with_empty_formats_does_not_raise(self):
+        # Empty formats = adapter hasn't declared anything; don't block it
+        validate_modality_setup(
+            tier=ModalityTier.AUDIO_IN,
+            adapter_input_formats=[],
+            adapter_name="SomeAdapter",
+        )
+
+    def test_text_tier_with_mulaw_does_not_raise(self):
+        # Text tier doesn't require pcm16; no conflict
+        validate_modality_setup(
+            tier=ModalityTier.TEXT,
+            adapter_input_formats=["mulaw/8000"],
+            adapter_name="TelephonyAdapter",
+        )
+
+    def test_audio_in_with_mixed_formats_including_pcm16_does_not_raise(self):
+        # If any pcm16 path exists, it's compatible
+        validate_modality_setup(
+            tier=ModalityTier.AUDIO_IN,
+            adapter_input_formats=["mulaw/8000", "pcm16/24000"],
+            adapter_name="MixedAdapter",
+        )
+
+
+# ---------------------------------------------------------------------------
+# AC7: Live-transport failure at first-connect
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_ac7_live_transport_failure_raises_before_first_turn():
+    """AC7: PendingTransportError caught and re-raised as ModalityNegotiationError."""
+    from scenario.scenario_executor import ScenarioExecutor
+
+    adapter = _PendingTransportAdapter()
+    executor = ScenarioExecutor(
+        name="AC7 test",
+        description="test",
+        agents=[adapter],
+        script=[],
+    )
+
+    with pytest.raises(ModalityNegotiationError) as exc_info:
+        await executor._voice_connect_all()
+
+    err = exc_info.value
+    # The original transport failure must stay reachable through exception chaining
+    assert isinstance(err.__cause__, PendingTransportError)
+    assert "audio-in" in str(err)  # requirement token tells the user what was needed
+
+
+@pytest.mark.asyncio
+async def test_ac7_error_is_modality_negotiation_error_not_pending_transport():
+    """AC7: The re-raised exception is ModalityNegotiationError, not PendingTransportError."""
+    from scenario.scenario_executor import ScenarioExecutor
+
+    adapter = _PendingTransportAdapter()
+    executor = ScenarioExecutor(
+        name="AC7 type check",
+        description="test",
+        agents=[adapter],
+        script=[],
+    )
+
+    with pytest.raises(Exception) as exc_info:
+        await executor._voice_connect_all()
+
+    # Must NOT be a raw PendingTransportError — must be wrapped
+    assert type(exc_info.value) is not PendingTransportError
+    assert isinstance(exc_info.value, ModalityNegotiationError)
+
+
+# ---------------------------------------------------------------------------
+# AC8a: interrupt(after_words=N) gate fires at connect, not step execution
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_ac8a_interrupt_after_words_raises_at_connect_not_step_execution():
+    """AC8a: streaming_transcripts gate fires at connect, not mid-run."""
+    import scenario
+    from scenario.scenario_executor import ScenarioExecutor
+
+    adapter = _NoStreamingAdapter()
+    step = scenario.interrupt(content="hello", after_words=3)
+
+    executor = ScenarioExecutor(
+        name="AC8a test",
+        description="test",
+        agents=[adapter],
+        script=[step],
+    )
+
+    # _voice_connect_all() must raise before any turn executes
+    with pytest.raises(UnsupportedCapabilityError) as exc_info:
+        await executor._voice_connect_all()
+
+    err = exc_info.value
+    assert "streaming_transcripts" in str(err)
+
+
+def test_ac8a_step_is_tagged_with_requires_streaming_transcripts():
+    """The interrupt(after_words=N) step function carries the _requires_streaming_transcripts tag."""
+    # Plain sync test: building a step awaits nothing, so no event loop is needed.
+    import scenario
+
+    step_without = scenario.interrupt(content="hello")
+    step_with = scenario.interrupt(content="hello", after_words=3)
+
+    assert not getattr(step_without, "_requires_streaming_transcripts", False)
+    assert getattr(step_with, "_requires_streaming_transcripts", False) is True
+
+
+@pytest.mark.asyncio
+async def test_ac8a_interrupt_without_after_words_does_not_raise_at_connect():
+    """AC8a: interrupt without after_words does NOT raise at connect even on non-streaming adapter."""
+    import scenario
+    from scenario.scenario_executor import ScenarioExecutor
+
+    adapter = _NoStreamingAdapter()
+    step = scenario.interrupt(content="hello")  # no after_words
+
+    executor = ScenarioExecutor(
+        name="AC8a no-after-words",
+        description="test",
+        agents=[adapter],
+        script=[step],
+    )
+
+    # Should NOT raise — no streaming_transcripts requirement without after_words
+    await executor._voice_connect_all()
+    await executor._voice_disconnect_all()
+
+
+@pytest.mark.asyncio
+async def test_ac8a_interrupt_after_words_with_streaming_adapter_does_not_raise():
+    """AC8a: interrupt(after_words=N) on a streaming adapter passes the connect gate."""
+    import scenario
+    from scenario.scenario_executor import ScenarioExecutor
+
+    adapter = _StreamingAdapter()
+    step = scenario.interrupt(content="hello", after_words=3)
+
+    executor = ScenarioExecutor(
+        name="AC8a streaming ok",
+        description="test",
+        agents=[adapter],
+        script=[step],
+    )
+
+    # Should NOT raise — adapter supports streaming_transcripts
+    await executor._voice_connect_all()
+    await executor._voice_disconnect_all()
+
+
+# ---------------------------------------------------------------------------
+# AC8b: dtmf gate unchanged (regression)
+# ---------------------------------------------------------------------------
+
+class _FakeState:
+    """Minimal ScenarioState stand-in for unit-testing script steps."""
+
+    def __init__(self, agents):
+        self.agents = agents
+        self.messages = []
+        self._executor = type("E", (), {"agents": agents})()
+
+
+@pytest.mark.asyncio
+async def test_ac8b_dtmf_gate_unchanged():
+    """AC8b: dtmf gate still fires at step execution time (not at connect)."""
+    import scenario
+
+    class _NoCapAdapter(VoiceAgentAdapter):
+        capabilities = AdapterCapabilities(dtmf=False)
+
+        async def connect(self): pass
+        async def disconnect(self): pass
+        async def send_audio(self, chunk): pass
+        async def recv_audio(self, timeout): return AudioChunk(data=b"")
+
+    adapter = _NoCapAdapter()
+    step = scenario.dtmf("1234")
+
+    # dtmf step does NOT have _requires_streaming_transcripts — it must not raise at connect
+    assert not getattr(step, "_requires_streaming_transcripts", False)
+
+    # The error fires at step execution time
+    state = _FakeState([adapter])
+    with pytest.raises(UnsupportedCapabilityError) as exc_info:
+        await step(state)  # type: ignore[arg-type,misc]
+    assert "dtmf" in str(exc_info.value)

From 2f931280421f72cf41c93ad7f05bbbb5179e0aa4 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 00:45:41 +0000
Subject: [PATCH 05/16] =?UTF-8?q?feat(#666):=20public=20modality=3D=20para?=
 =?UTF-8?q?meter=20on=20simulator=20and=20judge=20=E2=80=94=20AC0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

UserSimulatorAgent(modality="audio-in") and JudgeAgent(modality="text") now accepted.
Declaration reaches resolve_modality() as the explicit declaration arg.
Documented in docstrings (user-facing).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/scenario/judge_agent.py           | 13 +++-
 python/scenario/user_simulator_agent.py  |  9 ++-
 python/tests/test_public_modality_api.py | 82 ++++++++++++++++++++++++
 3 files changed, 101 insertions(+), 3 deletions(-)
 create mode 100644 python/tests/test_public_modality_api.py

diff --git a/python/scenario/judge_agent.py b/python/scenario/judge_agent.py
index 9e7277da8..1d157e602 100644
--- a/python/scenario/judge_agent.py
+++ b/python/scenario/judge_agent.py
@@ -248,6 +248,7 @@ def __init__(
         include_audio: Optional[bool] = None,
         include_timeline: Optional[bool] = None,
         include_traces: Optional[bool] = None,
+        modality: Optional[str] = None,
         **extra_params,
     ):
         """
@@ -275,6 +276,13 @@ def __init__(
             max_discovery_steps: Maximum number of expand/grep tool calls the judge
                                 can make before being forced to return a verdict.
                                 Defaults to 10.
+            modality: Explicit modality declaration for this role. Accepted values:
+                     ``"audio-in"`` (LLM receives raw audio), ``"stt-bridge"``
+                     (audio transcribed to text before the LLM), or ``"text"``
+                     (no audio in the stack). Complementary to ``include_audio``:
+                     ``include_audio=True/False`` takes precedence; ``modality=``
+                     applies when ``include_audio`` is ``None``. When ``None``
+                     (default), the modality is auto-detected from litellm capabilities.
 
         Raises:
             Exception: If no model is configured either in parameters or global config
@@ -319,6 +327,7 @@ def __init__(
         self.include_audio = include_audio
         self.include_timeline = include_timeline
         self.include_traces = include_traces
+        self.modality = modality
 
         if model:
             self.model = model
@@ -375,8 +384,8 @@ def effective_include_audio(self, conversation_has_audio: bool) -> bool:
         if self.include_audio is not None:
             # Explicit override always wins (AC3c)
             return self.include_audio and conversation_has_audio
-        # Use resolver — no per-role declaration wired yet (AC0 is Bundle 6)
-        tier, warnings = resolve_modality(declaration=None, model_id=self.model or "")
+        # Use resolver with per-role declaration (AC0, Bundle 6)
+        tier, warnings = resolve_modality(declaration=self.modality, model_id=self.model or "")
         for w in warnings:
             logger.warning(w)
         return conversation_has_audio and (tier == ModalityTier.AUDIO_IN)
diff --git a/python/scenario/user_simulator_agent.py b/python/scenario/user_simulator_agent.py
index c20df6bf2..fa27afcfb 100644
--- a/python/scenario/user_simulator_agent.py
+++ b/python/scenario/user_simulator_agent.py
@@ -159,6 +159,7 @@ def __init__(
         persona: Optional[str] = None,
         audio_effects: Optional[List[Callable[[bytes], bytes]]] = None,
         interrupt_probability: float = 0.0,
+        modality: Optional[str] = None,
         **extra_params,
     ):
         """
@@ -177,6 +178,11 @@ def __init__(
                        If not provided, uses model defaults.
             system_prompt: Custom system prompt to override default user simulation behavior.
                           Use this to create specialized user personas or behaviors.
+            modality: Explicit modality declaration for this role. Accepted values:
+                     ``"audio-in"`` (LLM receives raw audio), ``"stt-bridge"``
+                     (audio transcribed to text before the LLM), or ``"text"``
+                     (no audio in the stack). When ``None`` (default), the modality
+                     is auto-detected from the model's litellm capabilities.
 
         Raises:
             Exception: If no model is configured either in parameters or global config
@@ -218,6 +224,7 @@ def __init__(
         if not 0.0 <= interrupt_probability <= 1.0:
             raise ValueError("interrupt_probability must be in [0, 1]")
         self.interrupt_probability = interrupt_probability
+        self.modality = modality
 
         if model:
             self.model = model
@@ -365,7 +372,7 @@ async def _generate_text(
 
         scenario = input.scenario_state
 
-        tier, _warnings = resolve_modality(declaration=None, model_id=self.model or "")
+        tier, _warnings = resolve_modality(declaration=self.modality, model_id=self.model or "")
         for w in _warnings:
             logger.warning(w)
 
diff --git a/python/tests/test_public_modality_api.py b/python/tests/test_public_modality_api.py
new file mode 100644
index 000000000..a39f6562c
--- /dev/null
+++ b/python/tests/test_public_modality_api.py
@@ -0,0 +1,82 @@
+"""AC0: per-role modality declaration via public API."""
+import inspect
+import pytest
+from unittest.mock import patch, MagicMock
+
+from scenario.voice.modality_resolver import ModalityTier
+
+
+def test_ac0_modality_parameter_documented():
+    """AC0: modality is in the __init__ signature and docstring of both agents."""
+    from scenario.user_simulator_agent import UserSimulatorAgent
+    from scenario.judge_agent import JudgeAgent
+
+    # "Documented" means both: accepted by the signature and described in the docstring.
+    for agent_cls in (UserSimulatorAgent, JudgeAgent):
+        assert 'modality' in inspect.signature(agent_cls.__init__).parameters
+        assert 'modality:' in (agent_cls.__init__.__doc__ or '')
+
+
+def test_ac0_no_modality_defaults_to_none_declaration():
+    """AC0: no modality= defaults to self.modality = None (advisory-only path)."""
+    from scenario.user_simulator_agent import UserSimulatorAgent
+    sim = UserSimulatorAgent(model="gpt-4o")
+    assert sim.modality is None
+
+
+def test_ac0_judge_modality_declaration_stored():
+    """AC0: modality= on JudgeAgent is stored on self.modality."""
+    from scenario.judge_agent import JudgeAgent
+    judge = JudgeAgent(model="gpt-4o", modality="text")
+    assert judge.modality == "text"
+
+
+def test_ac0_judge_modality_declaration_reaches_resolver():
+    """AC0: modality= on JudgeAgent reaches resolve_modality as declaration arg."""
+    from scenario.judge_agent import JudgeAgent
+    judge = JudgeAgent(model="gpt-4o", modality="text")
+
+    with patch('scenario.judge_agent.resolve_modality') as mock_resolver:
+        mock_resolver.return_value = (ModalityTier.TEXT, [])
+        result = judge.effective_include_audio(conversation_has_audio=True)
+        mock_resolver.assert_called_once_with(declaration="text", model_id="gpt-4o")
+        assert result is False  # TEXT tier -> no audio
+
+
+@pytest.mark.asyncio
+async def test_ac0_simulator_modality_declaration_reaches_resolver():
+    """AC0: modality= on UserSimulatorAgent reaches resolve_modality as declaration arg."""
+    from scenario.user_simulator_agent import UserSimulatorAgent
+    from scenario.types import AgentInput
+    from scenario.cache import context_scenario
+
+    sim = UserSimulatorAgent(model="gpt-4o", modality="audio-in")
+    assert sim.modality == "audio-in"
+
+    mock_scenario_state = MagicMock()
+    mock_scenario_state.description = "Test scenario"
+
+    agent_input = AgentInput(
+        thread_id="test",
+        messages=[],
+        new_messages=[],
+        scenario_state=mock_scenario_state,
+    )
+
+    mock_response = MagicMock()
+    mock_response.choices = [MagicMock()]
+    mock_response.choices[0].message.content = "hello"
+
+    mock_executor = MagicMock()
+    mock_executor.config = MagicMock()
+    mock_executor.config.cache_key = None
+    token = context_scenario.set(mock_executor)
+
+    try:
+        with patch('scenario.user_simulator_agent.resolve_modality') as mock_resolver, \
+             patch('scenario.user_simulator_agent.litellm.completion', return_value=mock_response):
+            mock_resolver.return_value = (ModalityTier.AUDIO_IN, [])
+            await sim._generate_text(agent_input)
+            mock_resolver.assert_called_once_with(declaration="audio-in", model_id="gpt-4o")
+    finally:
+        context_scenario.reset(token)

From ca3eca38038db13dc97ca271ffb80029d0e04e16 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 00:47:41 +0000
Subject: [PATCH 06/16] =?UTF-8?q?feat(#666):=20stamp=20resolved=20modality?=
 =?UTF-8?q?/tier=20per=20role=20as=20OTEL=20span=20attributes=20=E2=80=94?=
 =?UTF-8?q?=20AC5,=20AC5b?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

scenario.modality.<role>.resolved and scenario.modality.<role>.tier stamped on
root span at the start of each turn. Populated in run() from resolve_modality()
for UserSimulatorAgent (simulator) and JudgeAgent (judge).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/scenario/scenario_executor.py       |  24 ++-
 python/tests/voice/test_modality_stamps.py | 177 +++++++++++++++++++++
 2 files changed, 199 insertions(+), 2 deletions(-)
 create mode 100644 python/tests/voice/test_modality_stamps.py

diff --git a/python/scenario/scenario_executor.py b/python/scenario/scenario_executor.py
index 53c3591af..6c5909946 100644
--- a/python/scenario/scenario_executor.py
+++ b/python/scenario/scenario_executor.py
@@ -455,10 +455,14 @@ def _new_turn(self):
         ).__enter__()
 
         if self._trace.root_span is not None:
-            self._trace.root_span.set_attributes({
+            attrs = {
                 "langwatch.origin": "simulation",
                 "scenario.run_id": self._scenario_run_id,
-            })
+            }
+            for role, tier_value in getattr(self, '_modality_resolutions', {}).items():
+                attrs[f"scenario.modality.{role}.resolved"] = tier_value
+                attrs[f"scenario.modality.{role}.tier"] = tier_value
+            self._trace.root_span.set_attributes(attrs)
 
         self._pending_agents_on_turn = set(self.agents)
         self._pending_roles_on_turn = [
@@ -575,6 +579,22 @@ async def run(self) -> ScenarioResult:
         # Connect all voice adapters before script runs; disconnect in finally.
         await self._voice_connect_all()
 
+        # Resolve modality per role and store for span stamping.
+        from .voice.modality_resolver import resolve_modality
+        from .user_simulator_agent import UserSimulatorAgent
+        from .judge_agent import JudgeAgent
+
+        self._modality_resolutions: dict = {}  # role -> tier value string
+        # Checked in order, so an agent matching both types is stamped as the simulator.
+        role_by_type = ((UserSimulatorAgent, 'simulator'), (JudgeAgent, 'judge'))
+        for agent in self.agents:
+            role = next((name for cls, name in role_by_type if isinstance(agent, cls)), None)
+            if role is None:
+                continue
+            decl = getattr(agent, 'modality', None)
+            tier, _ = resolve_modality(declaration=decl, model_id=getattr(agent, 'model', '') or '')
+            self._modality_resolutions[role] = tier.value
+
         try:
             self._emit_run_started_event(scenario_run_id)
 
diff --git a/python/tests/voice/test_modality_stamps.py b/python/tests/voice/test_modality_stamps.py
new file mode 100644
index 000000000..93babb1da
--- /dev/null
+++ b/python/tests/voice/test_modality_stamps.py
@@ -0,0 +1,177 @@
+"""Tests for OTEL modality stamping (AC5, AC5b).
+
+Strategy: test _new_turn() directly with a pre-populated _modality_resolutions
+dict and a mocked langwatch.trace so no real tracing infrastructure is needed.
+"""
+from __future__ import annotations
+
+import pytest
+from unittest.mock import patch, MagicMock, AsyncMock
+
+from scenario.voice.modality_resolver import ModalityTier
+from scenario.scenario_executor import ScenarioExecutor
+
+
+def _make_executor() -> ScenarioExecutor:
+    """Minimal executor instance for stamping tests."""
+    return ScenarioExecutor(
+        name="test-stamps",
+        description="modality stamp test",
+        agents=[],
+    )
+
+
+def _new_turn_with_resolutions(
+    executor: ScenarioExecutor,
+    resolutions: dict,
+) -> dict:
+    """Call _new_turn() on the executor with given _modality_resolutions.
+
+    Mocks langwatch.trace so no real OTEL infrastructure is needed.
+    Returns the attrs dict captured from root_span.set_attributes().
+    """
+    captured: dict = {}
+    call_count = 0
+
+    def _make_trace_mock():
+        mock_span = MagicMock()
+
+        def _capture(attrs):
+            nonlocal call_count
+            call_count += 1
+            if call_count > 1:
+                # Only capture attrs from the explicit _new_turn() call
+                # (reset() makes the first call internally).
+                captured.update(attrs)
+
+        mock_span.set_attributes.side_effect = _capture
+
+        mock_trace = MagicMock()
+        mock_trace.root_span = mock_span
+        mock_trace.__enter__ = MagicMock(return_value=mock_trace)
+        mock_trace.__exit__ = MagicMock(return_value=False)
+        return mock_trace
+
+    with patch("scenario.scenario_executor.langwatch") as mock_lw:
+        mock_lw.trace.side_effect = lambda **kwargs: _make_trace_mock()
+
+        # reset() initialises _state (required by _new_turn) and calls _new_turn once.
+        executor.reset()
+
+        # Now set resolutions and call _new_turn() again to get the stamped attrs.
+        executor._scenario_run_id = "test-run-id"
+        executor._modality_resolutions = resolutions
+        executor._new_turn()
+
+    return captured
+
+
+@pytest.mark.asyncio
+async def test_ac5_modality_attributes_stamped_on_root_span():
+    """AC5: resolved modality and tier per role appear as span attributes."""
+    executor = _make_executor()
+    resolutions = {
+        "simulator": ModalityTier.AUDIO_IN.value,
+        "judge": ModalityTier.STT_BRIDGE.value,
+    }
+
+    captured = _new_turn_with_resolutions(executor, resolutions)
+
+    assert captured.get("scenario.modality.simulator.resolved") == "audio-in"
+    assert captured.get("scenario.modality.simulator.tier") == "audio-in"
+    assert captured.get("scenario.modality.judge.resolved") == "stt-bridge"
+    assert captured.get("scenario.modality.judge.tier") == "stt-bridge"
+
+
+@pytest.mark.asyncio
+async def test_ac5_degraded_run_has_different_tier():
+    """AC5: a degraded run (stt-bridge) carries a different tier than audio-in."""
+    executor = _make_executor()
+    resolutions = {
+        "simulator": ModalityTier.AUDIO_IN.value,
+        "judge": ModalityTier.STT_BRIDGE.value,
+    }
+
+    captured = _new_turn_with_resolutions(executor, resolutions)
+
+    sim_tier = captured.get("scenario.modality.simulator.tier")
+    judge_tier = captured.get("scenario.modality.judge.tier")
+    assert sim_tier != judge_tier, (
+        f"Expected different tiers for simulator ({sim_tier!r}) and judge ({judge_tier!r})"
+    )
+
+
+@pytest.mark.asyncio
+async def test_ac5b_stt_bridge_tier_stamped_correctly():
+    """AC5b: when resolver returns stt-bridge, the tier stamp reads stt-bridge."""
+    executor = _make_executor()
+    resolutions = {
+        "simulator": ModalityTier.STT_BRIDGE.value,
+    }
+
+    captured = _new_turn_with_resolutions(executor, resolutions)
+
+    assert captured.get("scenario.modality.simulator.tier") == "stt-bridge"
+
+
+def test_no_modality_resolutions_does_not_crash():
+    """Baseline: executor with no _modality_resolutions set still stamps core attrs."""
+    executor = _make_executor()
+    # Pass an empty mapping; the helper assigns it to executor._modality_resolutions as-is
+    captured = _new_turn_with_resolutions(executor, {})
+
+    assert "langwatch.origin" in captured
+    assert "scenario.run_id" in captured
+    # No modality keys expected
+    modality_keys = [k for k in captured if k.startswith("scenario.modality.")]
+    assert modality_keys == []
+
+
+def test_run_populates_modality_resolutions_for_simulator_and_judge():
+    """Unit test: the resolution loop in run() sets _modality_resolutions per role.
+
+    Tests the population logic directly, without running the full async run().
+    """
+    from scenario.user_simulator_agent import UserSimulatorAgent
+    from scenario.judge_agent import JudgeAgent
+    from scenario.voice.modality_resolver import resolve_modality
+
+    sim = UserSimulatorAgent(model="openai/gpt-4o")
+    judge = JudgeAgent(criteria=["test criterion"], model="openai/gpt-4o")
+
+    executor = ScenarioExecutor(
+        name="resolver-pop-test",
+        description="test resolve populates resolutions",
+        agents=[sim, judge],
+    )
+
+    _LITELLM_PATCH = "scenario.voice.modality_resolver._litellm_advisory"
+
+    # Replicate the population loop from run() exactly, under a controlled advisory.
+    with patch(_LITELLM_PATCH, return_value=False):
+        resolutions: dict = {}
+        for agent in executor.agents:
+            if isinstance(agent, UserSimulatorAgent):
+                decl = getattr(agent, 'modality', None)
+                tier, _ = resolve_modality(
+                    declaration=decl,
+                    model_id=getattr(agent, 'model', '') or '',
+                )
+                resolutions['simulator'] = tier.value
+            elif isinstance(agent, JudgeAgent):
+                decl = getattr(agent, 'modality', None)
+                tier, _ = resolve_modality(
+                    declaration=decl,
+                    model_id=getattr(agent, 'model', '') or '',
+                )
+                resolutions['judge'] = tier.value
+
+    assert "simulator" in resolutions, (
+        "_modality_resolutions must contain 'simulator' key"
+    )
+    assert "judge" in resolutions, (
+        "_modality_resolutions must contain 'judge' key"
+    )
+    # litellm advisory is False, no declaration → TEXT tier for both
+    assert resolutions["simulator"] == ModalityTier.TEXT.value
+    assert resolutions["judge"] == ModalityTier.TEXT.value

From b243d7da3d849406f7e25697c39e0043fdf14616 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 00:49:33 +0000
Subject: [PATCH 07/16] =?UTF-8?q?test(#666):=20verify=20capability=20matri?=
 =?UTF-8?q?x=20byte-identical=20when=20no=20capability=20field=20added=20?=
 =?UTF-8?q?=E2=80=94=20AC10b?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/tests/test_capability_matrix.py | 38 ++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 python/tests/test_capability_matrix.py

diff --git a/python/tests/test_capability_matrix.py b/python/tests/test_capability_matrix.py
new file mode 100644
index 000000000..19df7d67a
--- /dev/null
+++ b/python/tests/test_capability_matrix.py
@@ -0,0 +1,38 @@
+"""AC10b: capability matrix stays byte-identical when no AdapterCapabilities field is added.
+
+No new field was added to AdapterCapabilities in issue #666's changes, so
+re-running the generator must produce the exact same mdx file.
+"""
+from __future__ import annotations
+
+import os
+import subprocess
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).parent.parent.parent  # python/tests/ -> python/ -> repository root
+PYTHON_DIR = REPO_ROOT / "python"
+MDX_PATH = REPO_ROOT / "docs" / "docs" / "pages" / "_generated" / "voice" / "capability-matrix.mdx"
+
+
+def test_ac10b_capability_matrix_byte_identical_when_no_new_field():
+    """AC10b: regenerating the matrix leaves the committed mdx byte-identical."""
+    original_content = MDX_PATH.read_bytes()  # raw bytes: no newline/encoding translation
+    try:
+        result = subprocess.run(
+            ["uv", "run", "python", "scripts/gen_capability_matrix.py"],
+            cwd=PYTHON_DIR,
+            capture_output=True,
+            text=True,
+            env={**os.environ, "PYTHONPATH": str(PYTHON_DIR)},
+        )
+        assert result.returncode == 0, f"Generator failed: {result.stderr}"
+        new_content = MDX_PATH.read_bytes()
+    finally:
+        # Always restore the committed file, even if the generator fails or raises.
+        MDX_PATH.write_bytes(original_content)
+
+    assert new_content == original_content, (
+        "Capability matrix is not byte-identical after regeneration.\n"
+        "If AdapterCapabilities gained a new field, update gen_capability_matrix.py "
+        "COLUMNS and commit the regenerated mdx (AC10a)."
+    )

From e82baf8fa37b4e66224246af8b19d576048e666c Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 00:54:40 +0000
Subject: [PATCH 08/16] =?UTF-8?q?fix(#666):=20emit=20resolve=5Fmodality=20?=
 =?UTF-8?q?warnings=20in=20executor=20=E2=80=94=20sweep=20must-fix?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Capture and emit warnings from resolve_modality() calls at three
call sites (simulator setup, judge setup, voice agent setup) instead
of silently discarding them via underscore binding. The resolver
contract requires all warnings be logged.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/scenario/scenario_executor.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/python/scenario/scenario_executor.py b/python/scenario/scenario_executor.py
index 047c063d7..73e2a13c2 100644
--- a/python/scenario/scenario_executor.py
+++ b/python/scenario/scenario_executor.py
@@ -588,11 +588,15 @@ async def run(self) -> ScenarioResult:
         for agent in self.agents:
             if isinstance(agent, UserSimulatorAgent):
                 decl = getattr(agent, 'modality', None)
-                tier, _ = resolve_modality(declaration=decl, model_id=getattr(agent, 'model', '') or '')
+                tier, _mod_warnings = resolve_modality(declaration=decl, model_id=getattr(agent, 'model', '') or '')
+                for w in _mod_warnings:
+                    logger.warning(w)
                 self._modality_resolutions['simulator'] = tier.value
             elif isinstance(agent, JudgeAgent):
                 decl = getattr(agent, 'modality', None)
-                tier, _ = resolve_modality(declaration=decl, model_id=getattr(agent, 'model', '') or '')
+                tier, _mod_warnings = resolve_modality(declaration=decl, model_id=getattr(agent, 'model', '') or '')
+                for w in _mod_warnings:
+                    logger.warning(w)
                 self._modality_resolutions['judge'] = tier.value
 
         try:
@@ -750,7 +754,9 @@ def _playback_and_forward(chunk: Any) -> None:
             if isinstance(agent, VoiceAgentAdapter):
                 model_id = getattr(agent, 'model', None) or getattr(agent, '_model', '') or ''
                 if model_id:
-                    tier, _ = resolve_modality(declaration=None, model_id=model_id)
+                    tier, _mod_warnings = resolve_modality(declaration=None, model_id=model_id)
+                    for w in _mod_warnings:
+                        logger.warning(w)
                     validate_modality_setup(
                         tier=tier,
                         adapter_input_formats=list(agent.capabilities.input_formats),

From ae5219125ea6a6f8ac9f611e6410606705f3152d Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 08:00:39 +0000
Subject: [PATCH 09/16] chore(#666): remove unused mock imports flagged by
 code-quality bot

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/tests/voice/test_modality_stamps.py     | 2 +-
 python/tests/voice/test_modality_validation.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/tests/voice/test_modality_stamps.py b/python/tests/voice/test_modality_stamps.py
index 93babb1da..48c6bc55d 100644
--- a/python/tests/voice/test_modality_stamps.py
+++ b/python/tests/voice/test_modality_stamps.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 import pytest
-from unittest.mock import patch, MagicMock, AsyncMock
+from unittest.mock import patch, MagicMock
 
 from scenario.voice.modality_resolver import ModalityTier
 from scenario.scenario_executor import ScenarioExecutor
diff --git a/python/tests/voice/test_modality_validation.py b/python/tests/voice/test_modality_validation.py
index ec1d4ebf9..51f274927 100644
--- a/python/tests/voice/test_modality_validation.py
+++ b/python/tests/voice/test_modality_validation.py
@@ -2,7 +2,6 @@
 from __future__ import annotations
 
 import pytest
-from unittest.mock import AsyncMock, MagicMock, patch
 
 from scenario.voice.modality_resolver import (
     ModalityNegotiationError,

From dd7bacc7d4e0c448de72416f44015e7ed295854a Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 08:06:53 +0000
Subject: [PATCH 10/16] =?UTF-8?q?fix(#666):=20add=20@unit=20tag=20to=20unt?=
 =?UTF-8?q?agged=20feature=20scenario=20=E2=80=94=20fix=20pre-existing=20c?=
 =?UTF-8?q?ontract=20failure?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "Capability matrix is rendered into adapter docs" scenario in
voice-agents.feature was missing a required @unit/@integration/@e2e tag,
causing test_feature_file_contract tests to fail on main and on this PR.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 specs/voice-agents.feature | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/specs/voice-agents.feature b/specs/voice-agents.feature
index 673cc792c..58bc38243 100644
--- a/specs/voice-agents.feature
+++ b/specs/voice-agents.feature
@@ -976,7 +976,7 @@ Feature: Voice agent testing in Scenario SDK
     When scenario.dtmf("1") runs
     Then UnsupportedCapabilityError is raised naming the adapter and the "dtmf" capability
 
-  @docs
+  @unit @docs
   Scenario: Capability matrix is rendered into adapter docs
     Given the voice-agents documentation
     Then a capability matrix table lists every built-in adapter

From 02eca46776d374efeb045a61a1ec47ce9d1bbc51 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 08:13:36 +0000
Subject: [PATCH 11/16] fix(#666): suppress pre-existing pyright type errors in
 simulator tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Audio content dict literals in AgentInput tests are valid at runtime but
pyright can't narrow them to ChatCompletionMessageParam. Suppressed with
# type: ignore[arg-type] — same pattern already used in this file.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/tests/test_user_simulator_agent.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/tests/test_user_simulator_agent.py b/python/tests/test_user_simulator_agent.py
index 2c4210be2..6aa74f97c 100644
--- a/python/tests/test_user_simulator_agent.py
+++ b/python/tests/test_user_simulator_agent.py
@@ -134,7 +134,7 @@ async def test_audio_in_simulator_retains_audio_parts():
     agent_input = AgentInput(
         thread_id="test",
         messages=[
-            {"role": "assistant", "content": [audio_part, text_part]},
+            {"role": "assistant", "content": [audio_part, text_part]},  # type: ignore[arg-type]
         ],
         new_messages=[],
         scenario_state=mock_scenario_state,
@@ -200,7 +200,7 @@ async def test_text_simulator_strips_audio_with_placeholders():
         thread_id="test",
         messages=[
             # assistant turn with both audio and text — voiced agent turn
-            {"role": "assistant", "content": [audio_part, text_part]},
+            {"role": "assistant", "content": [audio_part, text_part]},  # type: ignore[arg-type]
             # user turn with audio only
             {"role": "user", "content": [{"type": "input_audio", "input_audio": {"data": "BBBB", "format": "wav"}}]},
         ],

From 30aeb1fd4f490548b6da0fbe10e3bf86bd5adde7 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 08:31:47 +0000
Subject: [PATCH 12/16] fix(#666): narrow AC7 exception catch to
 PendingTransportError only

Broad `except Exception` was masking network timeouts, auth errors, and
bugs as ModalityNegotiationError. Only PendingTransportError signals a
live-transport modality mismatch per AC7.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/scenario/scenario_executor.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/scenario/scenario_executor.py b/python/scenario/scenario_executor.py
index 73e2a13c2..7202665c5 100644
--- a/python/scenario/scenario_executor.py
+++ b/python/scenario/scenario_executor.py
@@ -764,11 +764,12 @@ def _playback_and_forward(chunk: Any) -> None:
                     )
 
         # Phase 2: connect with live-transport failure catching
+        from .voice.adapters._stub import PendingTransportError
         for agent in self.agents:
             if isinstance(agent, VoiceAgentAdapter):
                 try:
                     await agent.connect()
-                except Exception as e:
+                except PendingTransportError as e:
                     raise ModalityNegotiationError(
                         f"Live transport {type(agent).__name__!r} cannot honor "
                         f"required modality — connect failed: {e}. "

From 1447c5213c5efc91c7ee10d8897254b7c19951dd Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 08:57:16 +0000
Subject: [PATCH 13/16] =?UTF-8?q?fix(#666):=20address=20review=20=E2=80=94?=
 =?UTF-8?q?=20remove=20duplicate=20.resolved=20span=20attr,=20strengthen?=
 =?UTF-8?q?=20AC5b=20test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Drop `scenario.modality.{role}.resolved` stamp (was identical to `.tier`);
  keep only `scenario.modality.{role}.tier` which is the canonical key.
- Expand `test_ac5b_stt_bridge_tier_stamped_correctly` to exercise the full
  declaration → `resolve_modality()` → span-stamp path instead of bypassing
  the resolver via direct `_modality_resolutions` injection.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/scenario/scenario_executor.py       |  1 -
 python/tests/voice/test_modality_stamps.py | 28 ++++++++++++++++------
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/python/scenario/scenario_executor.py b/python/scenario/scenario_executor.py
index 7202665c5..8cad6df0a 100644
--- a/python/scenario/scenario_executor.py
+++ b/python/scenario/scenario_executor.py
@@ -460,7 +460,6 @@ def _new_turn(self):
                 "scenario.run_id": self._scenario_run_id,
             }
             for role, tier_value in getattr(self, '_modality_resolutions', {}).items():
-                attrs[f"scenario.modality.{role}.resolved"] = tier_value
                 attrs[f"scenario.modality.{role}.tier"] = tier_value
             self._trace.root_span.set_attributes(attrs)
 
diff --git a/python/tests/voice/test_modality_stamps.py b/python/tests/voice/test_modality_stamps.py
index 48c6bc55d..678db087c 100644
--- a/python/tests/voice/test_modality_stamps.py
+++ b/python/tests/voice/test_modality_stamps.py
@@ -68,7 +68,7 @@ def _capture(attrs):
 
 @pytest.mark.asyncio
 async def test_ac5_modality_attributes_stamped_on_root_span():
-    """AC5: resolved modality and tier per role appear as span attributes."""
+    """AC5: resolved tier per role appears as span attribute."""
     executor = _make_executor()
     resolutions = {
         "simulator": ModalityTier.AUDIO_IN.value,
@@ -77,9 +77,7 @@ async def test_ac5_modality_attributes_stamped_on_root_span():
 
     captured = _new_turn_with_resolutions(executor, resolutions)
 
-    assert captured.get("scenario.modality.simulator.resolved") == "audio-in"
     assert captured.get("scenario.modality.simulator.tier") == "audio-in"
-    assert captured.get("scenario.modality.judge.resolved") == "stt-bridge"
     assert captured.get("scenario.modality.judge.tier") == "stt-bridge"
 
 
@@ -103,12 +101,28 @@ async def test_ac5_degraded_run_has_different_tier():
 
 @pytest.mark.asyncio
 async def test_ac5b_stt_bridge_tier_stamped_correctly():
-    """AC5b: when resolver returns stt-bridge, the tier stamp reads stt-bridge."""
+    """AC5b: declaration 'stt-bridge' resolves through resolve_modality and stamps correctly.
+
+    Exercises the full path: declaration -> resolve_modality -> _modality_resolutions -> span stamp.
+    """
+    from unittest.mock import patch as mock_patch
+    from scenario.voice.modality_resolver import resolve_modality
+
     executor = _make_executor()
-    resolutions = {
-        "simulator": ModalityTier.STT_BRIDGE.value,
-    }
 
+    # Exercise resolve_modality with an explicit stt-bridge declaration.
+    # Patch litellm advisory so the test is deterministic (no network).
+    with mock_patch(
+        "scenario.voice.modality_resolver._litellm_advisory", return_value=False
+    ):
+        tier, warnings = resolve_modality(declaration="stt-bridge", model_id="openai/gpt-4o")
+
+    assert tier == ModalityTier.STT_BRIDGE, (
+        f"resolve_modality must return STT_BRIDGE for declaration='stt-bridge'; got {tier!r}"
+    )
+
+    # Feed the resolved tier into the executor and verify the span stamp.
+    resolutions = {"simulator": tier.value}
     captured = _new_turn_with_resolutions(executor, resolutions)
 
     assert captured.get("scenario.modality.simulator.tier") == "stt-bridge"

From a3a9f34522731fd177a6e584f8a18bae1a811157 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 19:38:08 +0000
Subject: [PATCH 14/16] fix(#666): stamp scenario.modality.<role>.resolved +
 transcribe_segments spy tests (AC5/AC5b/AC9)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three prove-it gaps closed:
- AC5: stamp scenario.modality.<role>.resolved alongside .tier in _new_turn() — feature
  spec requires both exact keys; previously only .tier was set.
- AC5b: add spy test asserting transcribe_segments is invoked with the judge's
  VoiceRecording when modality='stt-bridge', not just inferred from tier stamp.
- AC9: add spy test asserting transcribe_segments still runs for a text-modality
  gpt-4o judge after the substring-list to resolver change; confirms regression path.

All 59 AC-relevant tests green.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/scenario/scenario_executor.py       |   1 +
 python/tests/test_judge_agent.py           | 102 ++++++++++++++++++++-
 python/tests/voice/test_modality_stamps.py |   3 +
 3 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/python/scenario/scenario_executor.py b/python/scenario/scenario_executor.py
index 8cad6df0a..7202665c5 100644
--- a/python/scenario/scenario_executor.py
+++ b/python/scenario/scenario_executor.py
@@ -460,6 +460,7 @@ def _new_turn(self):
                 "scenario.run_id": self._scenario_run_id,
             }
             for role, tier_value in getattr(self, '_modality_resolutions', {}).items():
+                attrs[f"scenario.modality.{role}.resolved"] = tier_value
                 attrs[f"scenario.modality.{role}.tier"] = tier_value
             self._trace.root_span.set_attributes(attrs)
 
diff --git a/python/tests/test_judge_agent.py b/python/tests/test_judge_agent.py
index 7986bf008..20f086715 100644
--- a/python/tests/test_judge_agent.py
+++ b/python/tests/test_judge_agent.py
@@ -1,6 +1,6 @@
 import pytest
 from typing import Any, cast
-from unittest.mock import patch, MagicMock
+from unittest.mock import patch, MagicMock, AsyncMock
 from openai import OpenAI
 from scenario import JudgeAgent
 from scenario.config import ModelConfig, ScenarioConfig
@@ -456,3 +456,103 @@ def test_explicit_include_audio_false_wins():
         result = judge.effective_include_audio(conversation_has_audio=True)
     assert result is False
     mock_resolver.assert_not_called()
+
+
+# ---- AC9 / AC5b: transcribe_segments spy tests ----
+
+def _make_audio_agent_input(recording=None) -> AgentInput:
+    """AgentInput with one assistant message containing an input_audio part.
+
+    If recording is provided it is placed on scenario_state._executor._voice_recording
+    so that JudgeAgent._extract_recording() finds it.
+    """
+    audio_message = {
+        "role": "assistant",
+        "content": [{"type": "input_audio", "input_audio": {"data": "abc123"}}],
+    }
+    mock_executor = MagicMock()
+    mock_executor._voice_recording = recording
+    mock_state = MagicMock()
+    mock_state.description = "spy test"
+    mock_state.current_turn = 1
+    mock_state.config.max_turns = 5
+    mock_state._executor = mock_executor
+    return AgentInput(
+        thread_id="spy-test",
+        messages=[audio_message],
+        new_messages=[],
+        judgment_request=JudgmentRequest(),
+        scenario_state=mock_state,
+    )
+
+
+def _make_llm_mock_response() -> MagicMock:
+    """Minimal litellm response that makes judge.call() return without error."""
+    resp = MagicMock()
+    resp.choices = [MagicMock()]
+    resp.choices[0].message.tool_calls = [MagicMock()]
+    resp.choices[0].message.tool_calls[0].function.name = "finish_test"
+    resp.choices[0].message.tool_calls[0].function.arguments = (
+        '{"verdict": "success", "reasoning": "spy test", '
+        '"criteria": {"test_criterion": true}}'
+    )
+    return resp
+
+
+@pytest.mark.asyncio
+async def test_ac9_transcribe_segments_invoked_for_text_judge():
+    """AC9: transcribe_segments runs over VoiceRecording for a text-modality judge.
+
+    Confirms the post-hoc transcription path still executes after the resolver change:
+    gpt-4o (advisory=False, no declaration) → TEXT tier → transcribe_segments called.
+    """
+    from scenario.voice.recording import VoiceRecording
+
+    from scenario.voice.recording import VoiceRecording as _VR
+    recording = _VR(segments=[])
+    judge = JudgeAgent(criteria=["test criterion"], model="openai/gpt-4o")
+    agent_input = _make_audio_agent_input(recording=recording)
+
+    mock_cache_executor = MagicMock()
+    mock_cache_executor.config = MagicMock()
+    mock_cache_executor.config.cache_key = None
+    token = context_scenario.set(mock_cache_executor)
+
+    try:
+        with patch("scenario.judge_agent.resolve_modality", return_value=(ModalityTier.TEXT, [])), \
+             patch("scenario.judge_agent.transcribe_segments", new_callable=AsyncMock) as mock_ts, \
+             patch("scenario.judge_agent.litellm.completion", return_value=_make_llm_mock_response()):
+            await judge.call(agent_input)
+
+        mock_ts.assert_called_once_with(recording)
+    finally:
+        context_scenario.reset(token)
+
+
+@pytest.mark.asyncio
+async def test_ac5b_stt_bridge_judge_invokes_transcribe_segments():
+    """AC5b: judge with explicit modality='stt-bridge' invokes transcribe_segments.
+
+    stt-bridge tier → effective_include_audio=False → transcribe_segments called with recording.
+    """
+    from scenario.voice.recording import VoiceRecording
+
+    from scenario.voice.recording import VoiceRecording as _VR
+    recording = _VR(segments=[])
+    judge = JudgeAgent(criteria=["test criterion"], model="openai/gpt-4o", modality="stt-bridge")
+    agent_input = _make_audio_agent_input(recording=recording)
+
+    mock_cache_executor = MagicMock()
+    mock_cache_executor.config = MagicMock()
+    mock_cache_executor.config.cache_key = None
+    token = context_scenario.set(mock_cache_executor)
+
+    try:
+        with patch("scenario.voice.modality_resolver._litellm_advisory", return_value=False), \
+             patch("scenario.judge_agent.transcribe_segments", new_callable=AsyncMock) as mock_ts, \
+             patch("scenario.judge_agent.litellm.completion", return_value=_make_llm_mock_response()):
+            await judge.call(agent_input)
+
+        mock_ts.assert_called_once_with(recording)
+    finally:
+        context_scenario.reset(token)
diff --git a/python/tests/voice/test_modality_stamps.py b/python/tests/voice/test_modality_stamps.py
index 678db087c..21de7df82 100644
--- a/python/tests/voice/test_modality_stamps.py
+++ b/python/tests/voice/test_modality_stamps.py
@@ -78,7 +78,9 @@ async def test_ac5_modality_attributes_stamped_on_root_span():
     captured = _new_turn_with_resolutions(executor, resolutions)
 
     assert captured.get("scenario.modality.simulator.tier") == "audio-in"
+    assert captured.get("scenario.modality.simulator.resolved") == "audio-in"
     assert captured.get("scenario.modality.judge.tier") == "stt-bridge"
+    assert captured.get("scenario.modality.judge.resolved") == "stt-bridge"
 
 
 @pytest.mark.asyncio
@@ -126,6 +128,7 @@ async def test_ac5b_stt_bridge_tier_stamped_correctly():
     captured = _new_turn_with_resolutions(executor, resolutions)
 
     assert captured.get("scenario.modality.simulator.tier") == "stt-bridge"
+    assert captured.get("scenario.modality.simulator.resolved") == "stt-bridge"
 
 
 def test_no_modality_resolutions_does_not_crash():

From 0c84c38034695faa18d02f63907ce4094ab8f192 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Mon, 15 Jun 2026 19:49:27 +0000
Subject: [PATCH 15/16] =?UTF-8?q?fix(#666):=20fix=20pyright=20error=20in?=
 =?UTF-8?q?=20spy=20tests=20=E2=80=94=20cast=20audio=20message=20+=20remov?=
 =?UTF-8?q?e=20duplicate=20imports?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

messages=[audio_message] failed pyright because dict[str, Unknown] is not assignable
to ChatCompletionMessageParam; cast(Any, ...) matches the existing pattern at line 263.
Also removes the unused duplicate VoiceRecording import in both spy test functions.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/tests/test_judge_agent.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/python/tests/test_judge_agent.py b/python/tests/test_judge_agent.py
index 20f086715..056d73522 100644
--- a/python/tests/test_judge_agent.py
+++ b/python/tests/test_judge_agent.py
@@ -479,7 +479,7 @@ def _make_audio_agent_input(recording=None) -> AgentInput:
     mock_state._executor = mock_executor
     return AgentInput(
         thread_id="spy-test",
-        messages=[audio_message],
+        messages=cast(Any, [audio_message]),
         new_messages=[],
         judgment_request=JudgmentRequest(),
         scenario_state=mock_state,
@@ -506,8 +506,6 @@ async def test_ac9_transcribe_segments_invoked_for_text_judge():
     Confirms the post-hoc transcription path still executes after the resolver change:
     gpt-4o (advisory=False, no declaration) → TEXT tier → transcribe_segments called.
     """
-    from scenario.voice.recording import VoiceRecording
-
     from scenario.voice.recording import VoiceRecording as _VR
     recording = _VR(segments=[])
     judge = JudgeAgent(criteria=["test criterion"], model="openai/gpt-4o")
@@ -535,8 +533,6 @@ async def test_ac5b_stt_bridge_judge_invokes_transcribe_segments():
 
     stt-bridge tier → effective_include_audio=False → transcribe_segments called with recording.
     """
-    from scenario.voice.recording import VoiceRecording
-
     from scenario.voice.recording import VoiceRecording as _VR
     recording = _VR(segments=[])
     judge = JudgeAgent(criteria=["test criterion"], model="openai/gpt-4o", modality="stt-bridge")

From f85c8be31109bb375d66d4c9871639a8e09fae7c Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-10-0-3-222.eu-central-1.compute.internal>
Date: Tue, 16 Jun 2026 15:50:09 +0000
Subject: [PATCH 16/16] =?UTF-8?q?fix(#666):=20rename=20unused=20warnings?=
 =?UTF-8?q?=20=E2=86=92=20=5Fwarnings=20to=20satisfy=20Ruff=20RUF059?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/tests/voice/test_modality_stamps.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tests/voice/test_modality_stamps.py b/python/tests/voice/test_modality_stamps.py
index 21de7df82..32c0180f4 100644
--- a/python/tests/voice/test_modality_stamps.py
+++ b/python/tests/voice/test_modality_stamps.py
@@ -117,7 +117,7 @@ async def test_ac5b_stt_bridge_tier_stamped_correctly():
     with mock_patch(
         "scenario.voice.modality_resolver._litellm_advisory", return_value=False
     ):
-        tier, warnings = resolve_modality(declaration="stt-bridge", model_id="openai/gpt-4o")
+        tier, _warnings = resolve_modality(declaration="stt-bridge", model_id="openai/gpt-4o")
 
     assert tier == ModalityTier.STT_BRIDGE, (
         f"resolve_modality must return STT_BRIDGE for declaration='stt-bridge'; got {tier!r}"