Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
e275957
feat(#666): resolver core — per-role modality resolution with declara…
Jun 15, 2026
8ba9995
Merge Bundle 1: resolver core (AC4a, AC4b)
Jun 15, 2026
a29822f
feat(#666): conditional audio strip in simulator — AC1, AC2
Jun 15, 2026
8e4976b
feat(#666): replace judge substring audio detection with modality res…
Jun 15, 2026
60a350e
feat(#666): two-phase modality validation — AC6, AC7, AC8a
Jun 15, 2026
0ae04f5
Merge Bundle 2: simulator wiring (AC1, AC2)
Jun 15, 2026
b641a3f
Merge Bundle 3: judge wiring (AC3a, AC3b, AC3c, AC9)
Jun 15, 2026
51d5994
Merge Bundle 4: two-phase validation (AC6, AC7, AC8a, AC8b)
Jun 15, 2026
2f93128
feat(#666): public modality= parameter on simulator and judge — AC0
Jun 15, 2026
ca3eca3
feat(#666): stamp resolved modality/tier per role as OTEL span attrib…
Jun 15, 2026
d7489b4
Merge Bundle 5: OTEL modality stamps (AC5, AC5b)
Jun 15, 2026
9634b69
Merge Bundle 6: public modality= parameter (AC0)
Jun 15, 2026
b243d7d
test(#666): verify capability matrix byte-identical when no capabilit…
Jun 15, 2026
e82baf8
fix(#666): emit resolve_modality warnings in executor — sweep must-fix
Jun 15, 2026
ae52191
chore(#666): remove unused mock imports flagged by code-quality bot
Jun 15, 2026
dd7bacc
fix(#666): add @unit tag to untagged feature scenario — fix pre-exist…
Jun 15, 2026
02eca46
fix(#666): suppress pre-existing pyright type errors in simulator tests
Jun 15, 2026
30aeb1f
fix(#666): narrow AC7 exception catch to PendingTransportError only
Jun 15, 2026
1447c52
fix(#666): address review — remove duplicate .resolved span attr, str…
Jun 15, 2026
a3a9f34
fix(#666): stamp scenario.modality.<role>.resolved + transcribe_segme…
Jun 15, 2026
0c84c38
fix(#666): fix pyright error in spy tests — cast audio message + remo…
Jun 15, 2026
f85c8be
fix(#666): rename unused warnings → _warnings to satisfy Ruff RUF059
Jun 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 25 additions & 9 deletions python/scenario/judge_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from ._tracing import judge_span_collector, JudgeSpanCollector
from .types import AgentInput, AgentReturnTypes, AgentRole, ScenarioResult
from .voice._transcribe import transcribe_segments
from .voice.modality_resolver import ModalityTier, resolve_modality


logger = logging.getLogger("scenario")
Expand Down Expand Up @@ -247,6 +248,7 @@ def __init__(
include_audio: Optional[bool] = None,
include_timeline: Optional[bool] = None,
include_traces: Optional[bool] = None,
modality: Optional[str] = None,
**extra_params,
):
"""
Expand Down Expand Up @@ -274,6 +276,13 @@ def __init__(
max_discovery_steps: Maximum number of expand/grep tool calls the judge
can make before being forced to return a verdict.
Defaults to 10.
modality: Explicit modality declaration for this role. Accepted values:
``"audio-in"`` (LLM receives raw audio), ``"stt-bridge"``
(audio transcribed to text before the LLM), or ``"text"``
(no audio in the stack). Complementary to ``include_audio``:
``include_audio=True/False`` takes precedence; ``modality=``
applies when ``include_audio`` is ``None``. When ``None``
(default), the modality is auto-detected from litellm capabilities.

Raises:
Exception: If no model is configured either in parameters or global config
Expand Down Expand Up @@ -318,6 +327,7 @@ def __init__(
self.include_audio = include_audio
self.include_timeline = include_timeline
self.include_traces = include_traces
self.modality = modality

if model:
self.model = model
Expand Down Expand Up @@ -361,18 +371,24 @@ def __init__(
raise Exception(agent_not_configured_error_message("JudgeAgent"))

# --------------------------------------------- voice auto-detection (§4.3)
# Small single-purpose helpers; kept out of call() to preserve SRP.
_AUDIO_CAPABLE_MODEL_SUBSTRINGS = ("gpt-4o", "gemini-2.5", "gemini-2.0-flash")

def _model_supports_audio(self) -> bool:
m = (self.model or "").lower()
return any(s in m for s in self._AUDIO_CAPABLE_MODEL_SUBSTRINGS)

def effective_include_audio(self, conversation_has_audio: bool) -> bool:
"""Resolve include_audio: explicit wins, otherwise auto from model capability."""
"""Resolve include_audio: explicit wins, otherwise use modality resolver.

Intentional behavior change (Bundle 3 / AC3b):
Before: gpt-4o → audio-capable (substring match).
After: gpt-4o → text path (litellm advisory returns False).
Before: gpt-audio-mini → NOT audio-capable (not in list).
After: gpt-audio-mini → audio-capable (litellm advisory returns True).
The old substring list was wrong; the resolver is the source of truth.
"""
if self.include_audio is not None:
# Explicit override always wins (AC3c)
return self.include_audio and conversation_has_audio
return conversation_has_audio and self._model_supports_audio()
# Use resolver with per-role declaration (AC0, Bundle 6)
tier, warnings = resolve_modality(declaration=self.modality, model_id=self.model or "")
for w in warnings:
logger.warning(w)
return conversation_has_audio and (tier == ModalityTier.AUDIO_IN)

def effective_include_timeline(self, conversation_has_audio: bool) -> bool:
"""Default timeline True for voice, False for text — unless explicitly set."""
Expand Down
71 changes: 68 additions & 3 deletions python/scenario/scenario_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,10 +455,14 @@ def _new_turn(self):
).__enter__()

if self._trace.root_span is not None:
self._trace.root_span.set_attributes({
attrs = {
"langwatch.origin": "simulation",
"scenario.run_id": self._scenario_run_id,
})
}
for role, tier_value in getattr(self, '_modality_resolutions', {}).items():
attrs[f"scenario.modality.{role}.resolved"] = tier_value
attrs[f"scenario.modality.{role}.tier"] = tier_value
Comment thread
drewdrewthis marked this conversation as resolved.
self._trace.root_span.set_attributes(attrs)

self._pending_agents_on_turn = set(self.agents)
self._pending_roles_on_turn = [
Expand Down Expand Up @@ -575,6 +579,26 @@ async def run(self) -> ScenarioResult:
# Connect all voice adapters before script runs; disconnect in finally.
await self._voice_connect_all()

# Resolve modality per role and store for span stamping.
from .voice.modality_resolver import resolve_modality
from .user_simulator_agent import UserSimulatorAgent
from .judge_agent import JudgeAgent

self._modality_resolutions: dict = {} # role -> tier value string
for agent in self.agents:
if isinstance(agent, UserSimulatorAgent):
decl = getattr(agent, 'modality', None)
tier, _mod_warnings = resolve_modality(declaration=decl, model_id=getattr(agent, 'model', '') or '')
for w in _mod_warnings:
logger.warning(w)
self._modality_resolutions['simulator'] = tier.value
elif isinstance(agent, JudgeAgent):
decl = getattr(agent, 'modality', None)
tier, _mod_warnings = resolve_modality(declaration=decl, model_id=getattr(agent, 'model', '') or '')
for w in _mod_warnings:
logger.warning(w)
self._modality_resolutions['judge'] = tier.value

try:
self._emit_run_started_event(scenario_run_id)

Expand Down Expand Up @@ -724,9 +748,50 @@ def _playback_and_forward(chunk: Any) -> None:

self._on_audio_chunk = _playback_and_forward

# Phase 1: static validation against adapter ClassVars (before connect)
from .voice.modality_resolver import ModalityNegotiationError, validate_modality_setup, resolve_modality
for agent in self.agents:
if isinstance(agent, VoiceAgentAdapter):
model_id = getattr(agent, 'model', None) or getattr(agent, '_model', '') or ''
if model_id:
tier, _mod_warnings = resolve_modality(declaration=None, model_id=model_id)
for w in _mod_warnings:
logger.warning(w)
validate_modality_setup(
tier=tier,
adapter_input_formats=list(agent.capabilities.input_formats),
adapter_name=type(agent).__name__,
)

# Phase 2: connect with live-transport failure catching
from .voice.adapters._stub import PendingTransportError
for agent in self.agents:
if isinstance(agent, VoiceAgentAdapter):
await agent.connect()
try:
await agent.connect()
except PendingTransportError as e:
raise ModalityNegotiationError(
f"Live transport {type(agent).__name__!r} cannot honor "
f"required modality — connect failed: {e}. "
f"Negotiated requirement: audio-in (pcm16/24000)"
) from e
Comment thread
coderabbitai[bot] marked this conversation as resolved.

# Phase 3: validate script step requirements against connected adapter capabilities
from .voice.capabilities import UnsupportedCapabilityError
for step in self.script:
if getattr(step, '_requires_streaming_transcripts', False):
for agent in self.agents:
if isinstance(agent, VoiceAgentAdapter):
if not agent.capabilities.streaming_transcripts:
raise UnsupportedCapabilityError(
type(agent).__name__,
"streaming_transcripts",
hint=(
"interrupt(after_words=N) needs incremental transcripts. "
"Use interrupt(content) without after_words on this adapter — "
"the executor fires barge-in at the agent's first audio chunk."
),
)

def _attach_voice_output(self, result: ScenarioResult) -> ScenarioResult:
"""Populate result.audio/timeline/latency if any voice adapter ran."""
Expand Down
19 changes: 18 additions & 1 deletion python/scenario/user_simulator_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

from ._error_messages import agent_not_configured_error_message
from .types import AgentInput, AgentReturnTypes, AgentRole
from .voice.modality_resolver import ModalityTier, resolve_modality


logger = logging.getLogger("scenario")
Expand Down Expand Up @@ -158,6 +159,7 @@ def __init__(
persona: Optional[str] = None,
audio_effects: Optional[List[Callable[[bytes], bytes]]] = None,
interrupt_probability: float = 0.0,
modality: Optional[str] = None,
**extra_params,
):
"""
Expand All @@ -176,6 +178,11 @@ def __init__(
If not provided, uses model defaults.
system_prompt: Custom system prompt to override default user simulation behavior.
Use this to create specialized user personas or behaviors.
modality: Explicit modality declaration for this role. Accepted values:
``"audio-in"`` (LLM receives raw audio), ``"stt-bridge"``
(audio transcribed to text before the LLM), or ``"text"``
(no audio in the stack). When ``None`` (default), the modality
is auto-detected from the model's litellm capabilities.

Raises:
Exception: If no model is configured either in parameters or global config
Expand Down Expand Up @@ -217,6 +224,7 @@ def __init__(
if not 0.0 <= interrupt_probability <= 1.0:
raise ValueError("interrupt_probability must be in [0, 1]")
self.interrupt_probability = interrupt_probability
self.modality = modality

if model:
self.model = model
Expand Down Expand Up @@ -364,11 +372,20 @@ async def _generate_text(

scenario = input.scenario_state

tier, _warnings = resolve_modality(declaration=self.modality, model_id=self.model or "")
for w in _warnings:
logger.warning(w)

persona_block = (
f"\n\n<persona>\n{self.persona}\n</persona>\n"
if self.persona
else ""
)
_history = (
list(input.messages)
if tier == ModalityTier.AUDIO_IN
else _strip_audio_content(input.messages)
)
messages = [
{
"role": "system",
Expand Down Expand Up @@ -410,7 +427,7 @@ async def _generate_text(
{persona_block}"""),
},
{"role": "assistant", "content": "Hello, how can I help you today?"},
*_strip_audio_content(input.messages),
*_history,
]

# User to assistant role reversal
Expand Down
4 changes: 4 additions & 0 deletions python/scenario/voice/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
transcribe,
)
from ._transcribe import transcribe_segments
from .modality_resolver import ModalityNegotiationError, ModalityTier, resolve_modality
from .tts import register_tts_provider, synthesize
from .vad import WebRTCVadFallback

Expand Down Expand Up @@ -86,7 +87,10 @@
"extract_audio",
"get_stt_provider",
"message_has_audio",
"ModalityNegotiationError",
"ModalityTier",
"register_tts_provider",
"resolve_modality",
"set_stt_provider",
"silent_chunk",
"synthesize",
Expand Down
107 changes: 107 additions & 0 deletions python/scenario/voice/modality_resolver.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""Per-role voice modality resolution.

Declaration-first: explicit per-role modality beats litellm advisory.
Advisory is used as a hint only; mismatch emits a WARNING.
"""
from __future__ import annotations
import logging
from enum import Enum
from typing import Optional

logger = logging.getLogger(__name__)


class ModalityTier(str, Enum):
    """Closed set of modality tiers that a single role can resolve to.

    Members also subclass ``str``, so each member compares equal to its
    string value (``ModalityTier.AUDIO_IN == "audio-in"``), and
    ``ModalityTier(value)`` parses a declaration string, raising
    ``ValueError`` when the value is not one of the members below.
    """

    AUDIO_IN = "audio-in" # LLM receives raw audio parts
    STT_BRIDGE = "stt-bridge" # audio -> STT -> text before LLM
    TEXT = "text" # no audio in the stack


class ModalityNegotiationError(Exception):
    """Raised when a modality declaration cannot be honored.

    Within this module it is raised in two situations:

    * ``resolve_modality`` received a declaration string that is not a valid
      ``ModalityTier`` value; the message quotes the offending declaration
      and lists the valid values.
    * ``validate_modality_setup`` found that the ``'audio-in'`` tier was
      requested for an adapter whose input formats include no ``pcm16``
      entry; the message names the adapter, the declared modality and the
      conflicting input formats (e.g. ``'mulaw/8000'``).
    """


def _litellm_advisory(model_id: str) -> bool:
"""Return True if litellm believes model_id can ingest audio input."""
try:
import litellm.utils
return bool(litellm.utils.supports_audio_input(model=model_id))
except Exception:
return False


def resolve_modality(
    *,
    declaration: Optional[str],  # None = no explicit declaration
    model_id: str,
) -> tuple[ModalityTier, list[str]]:
    """Resolve the modality tier for a single role.

    Resolution is declaration-first; the litellm advisory is only a hint:

    - No declaration -> the litellm advisory decides (``AUDIO_IN`` when it
      reports audio-input support, otherwise ``TEXT``); no warning.
    - Declaration given and litellm agrees -> declared tier, no warning.
    - Declaration given and litellm disagrees -> declared tier plus one
      warning describing the mismatch.

    Args:
        declaration: One of the ``ModalityTier`` values (``"audio-in"``,
            ``"stt-bridge"``, ``"text"``), or ``None`` for auto-detection.
        model_id: Model identifier handed to the litellm advisory lookup.

    Returns:
        ``(tier, warnings)``. ``warnings`` holds human-readable strings the
        caller should emit via ``logger.warning()``; this function does not
        log them itself.

    Raises:
        ModalityNegotiationError: If ``declaration`` is not ``None`` and is
            not a valid ``ModalityTier`` value. The original ``ValueError``
            is attached as ``__cause__``.
    """
    if declaration is None:
        # Nothing declared: the advisory is the only signal available.
        advisory_tier = (
            ModalityTier.AUDIO_IN if _litellm_advisory(model_id) else ModalityTier.TEXT
        )
        return advisory_tier, []

    # Validate the declaration before consulting litellm so that an invalid
    # value fails fast without paying for the advisory lookup.
    try:
        declared_tier = ModalityTier(declaration)
    except ValueError as err:
        raise ModalityNegotiationError(
            f"Unknown modality declaration {declaration!r}; valid values: "
            + ", ".join(t.value for t in ModalityTier)
        ) from err

    advisory_audio = _litellm_advisory(model_id)
    declared_audio = declared_tier == ModalityTier.AUDIO_IN
    warnings: list[str] = []

    # The declaration always wins; a disagreement with litellm is only reported.
    if declared_audio and not advisory_audio:
        warnings.append(
            f"Model {model_id!r} declared modality 'audio-in' but litellm "
            "reports it does NOT support audio input. "
            "The declared modality 'audio-in' will be used. "
            "If this is wrong, remove the declaration or file a litellm issue."
        )
    elif not declared_audio and advisory_audio:
        warnings.append(
            f"Model {model_id!r} declared modality {declaration!r} but litellm "
            "reports it DOES support audio input. "
            f"The declared modality {declaration!r} will be used."
        )

    return declared_tier, warnings


def validate_modality_setup(
    *,
    tier: ModalityTier,
    adapter_input_formats: list[str],
    adapter_name: str,
) -> None:
    """Statically check that ``tier`` can be served by an adapter.

    Only the ``'audio-in'`` tier is constrained: it needs at least one input
    format whose name starts with ``"pcm16"``. An adapter that advertises
    formats but none of them pcm16 (for example only ``mulaw/*``) is rejected.
    An adapter that advertises no formats at all is not rejected here.

    Args:
        tier: The resolved modality tier for the role.
        adapter_input_formats: Input format names advertised by the adapter.
        adapter_name: Adapter name used in the error message.

    Raises:
        ModalityNegotiationError: If ``tier`` is ``AUDIO_IN`` and the
            non-empty format list contains no ``pcm16`` entry.
    """
    # Every tier other than audio-in is accepted without inspection.
    if tier != ModalityTier.AUDIO_IN:
        return

    pcm16_paths = tuple(
        fmt for fmt in adapter_input_formats if fmt.startswith("pcm16")
    )

    # Nothing advertised means nothing can be proven impossible up front.
    if not adapter_input_formats:
        return
    if pcm16_paths:
        return

    # Formats exist, yet none is pcm16-compatible: statically impossible.
    raise ModalityNegotiationError(
        f"Declared modality 'audio-in' is incompatible with adapter "
        f"{adapter_name!r}: input formats {adapter_input_formats!r} "
        f"contain no pcm16 path (conflicting capability: "
        f"{adapter_input_formats[0]!r}). No resample path exists."
    )
2 changes: 2 additions & 0 deletions python/scenario/voice/script_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ async def _step(state: "ScenarioState") -> None:
else:
await executor.user(content if content else None) # type: ignore[arg-type]

if after_words is not None:
_step._requires_streaming_transcripts = True # type: ignore[attr-defined]
return _step


Expand Down
Loading
Loading