Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 14 additions & 7 deletions python/scenario/judge_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,7 @@ async def call(
tool_choice=tool_choice,
spans=spans,
effective_criteria=effective_criteria,
input_messages=input.messages,
)

# Standard single-call path for small traces
Expand All @@ -649,7 +650,7 @@ async def call(
),
)

return self._parse_response(response, effective_criteria, messages)
return self._parse_response(response, effective_criteria, messages, input_messages=input.messages)

def _build_trace_digest(self, spans: Sequence[Any]) -> tuple[str, bool]:
"""
Expand Down Expand Up @@ -747,6 +748,7 @@ def _run_discovery_loop(
tool_choice: Any,
spans: Sequence[Any],
effective_criteria: List[str],
input_messages: Sequence[Any],
) -> AgentReturnTypes:
"""
Runs the multi-step discovery loop for large traces.
Expand Down Expand Up @@ -801,15 +803,15 @@ def _run_discovery_loop(
message = cast(Choices, response.choices[0]).message
if not message.tool_calls:
# No tool calls - try to parse as a response
return self._parse_response(response, effective_criteria, messages)
return self._parse_response(response, effective_criteria, messages, input_messages=input_messages)

# Check for terminal tool call
terminal_call = next(
(tc for tc in message.tool_calls if tc.function.name in terminal_tool_names),
None,
)
if terminal_call:
return self._parse_response(response, effective_criteria, messages)
return self._parse_response(response, effective_criteria, messages, input_messages=input_messages)

# Execute discovery tools and add results to messages
# Add the assistant message with tool calls
Expand Down Expand Up @@ -842,6 +844,7 @@ def _run_discovery_loop(
messages=messages,
tools=tools,
effective_criteria=effective_criteria,
input_messages=input_messages,
)

def _force_verdict(
Expand All @@ -850,6 +853,7 @@ def _force_verdict(
messages: List[dict],
tools: List[dict],
effective_criteria: List[str],
input_messages: Sequence[Any],
) -> AgentReturnTypes:
"""
Makes one final LLM call with tool_choice forced to finish_test.
Expand Down Expand Up @@ -897,7 +901,7 @@ def _force_verdict(
),
)
return self._parse_response(
forced_response, effective_criteria, rewritten_messages
forced_response, effective_criteria, rewritten_messages, input_messages=input_messages
)

def _execute_discovery_tool(self, tool_call: Any, spans: Sequence[Any]) -> str:
Expand Down Expand Up @@ -931,6 +935,8 @@ def _parse_response(
response: Any,
effective_criteria: List[str],
messages: List[dict],
*,
input_messages: Sequence[Any],
) -> AgentReturnTypes:
"""
Parses a litellm response into the appropriate return type.
Expand All @@ -940,7 +946,8 @@ def _parse_response(
Args:
response: The litellm ModelResponse.
effective_criteria: The criteria to evaluate against.
messages: The conversation messages (for inclusion in ScenarioResult).
messages: The judge's internal LLM messages (system prompt + transcript).
input_messages: The actual conversation messages to include in ScenarioResult.

Returns:
AgentReturnTypes: Either an empty list (continue) or ScenarioResult.
Expand Down Expand Up @@ -988,7 +995,7 @@ def _parse_response(

return ScenarioResult(
success=verdict == "success" and len(failed_criteria) == 0,
messages=cast(Any, messages),
messages=cast(Any, input_messages),
reasoning=reasoning,
passed_criteria=passed_criteria,
failed_criteria=failed_criteria,
Expand All @@ -1005,7 +1012,7 @@ def _parse_response(
)
return ScenarioResult(
success=False,
messages=cast(Any, messages),
messages=cast(Any, input_messages),
reasoning=(
"JudgeAgent: trace discovery did not converge on a "
"verdict within the step budget"
Expand Down
66 changes: 65 additions & 1 deletion python/tests/test_judge_agent.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pytest
from typing import Any
from typing import Any, cast
from unittest.mock import patch, MagicMock
from openai import OpenAI
from scenario import JudgeAgent
Expand Down Expand Up @@ -234,6 +234,70 @@ async def test_judge_is_last_message_on_final_turn(
ScenarioConfig.default_config = None


@pytest.mark.asyncio
async def test_judge_result_messages_is_conversation_not_judge_context():
    """Check that the judge's ScenarioResult carries the evaluated conversation.

    Regression for #221: in 0.7.15 ScenarioResult.messages held the judge's
    internal LLM messages (system prompt + transcript text) rather than
    input.messages, i.e. the real exchange between the user simulator and the
    agent under test.
    """
    ScenarioConfig.default_config = ScenarioConfig(default_model="openai/gpt-4")
    judge = JudgeAgent(criteria=["Agent replies helpfully"])

    # The conversation handed to the judge; the result must echo exactly this.
    real_conversation = [
        {"role": "user", "content": "Hello, what is the weather?"},
        {"role": "assistant", "content": "It is sunny today!"},
        {"role": "user", "content": "Thanks!"},
    ]

    state = MagicMock()
    state.description = "Weather query scenario"
    state.current_turn = 1
    state.config.max_turns = 10

    judge_input = AgentInput(
        thread_id="test",
        messages=cast(Any, real_conversation),
        new_messages=[],
        judgment_request=JudgmentRequest(),
        scenario_state=state,
    )

    # Build the fake completion bottom-up: a single finish_test tool call
    # wrapped in one choice, wrapped in the response object.
    finish_call = MagicMock()
    finish_call.function.name = "finish_test"
    finish_call.function.arguments = '{"verdict": "success", "reasoning": "Agent replied helpfully", "criteria": {"agent_replies_helpfully": "true"}}'

    only_choice = MagicMock()
    only_choice.message.tool_calls = [finish_call]

    llm_response = MagicMock()
    llm_response.choices = [only_choice]

    executor = MagicMock()
    executor.config = MagicMock()
    executor.config.cache_key = None
    token = context_scenario.set(executor)

    try:
        with patch(
            "scenario.judge_agent.litellm.completion", return_value=llm_response
        ):
            result = await judge.call(judge_input)

        from scenario.types import ScenarioResult

        assert isinstance(
            result, ScenarioResult
        ), "JudgeAgent should return ScenarioResult on finish_test"
        # Compare against the original conversation, not whatever prompt
        # messages the judge assembled for its own LLM call.
        assert result.messages == real_conversation, (
            "ScenarioResult.messages should be the actual conversation "
            f"(3 messages), got {len(result.messages)} messages: "
            f"{[m.get('role') for m in result.messages]}"
        )
    finally:
        # Undo the global state this test installed, pass or fail.
        context_scenario.reset(token)
        ScenarioConfig.default_config = None


@pytest.mark.asyncio
async def test_judge_includes_additional_context_in_prompt():
"""JudgmentRequest.context is injected into the judge's user message under <additional_context>.
Expand Down
3 changes: 2 additions & 1 deletion python/tests/test_judge_force_verdict_hardening.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ def fake_completion(**kwargs):
messages=messages,
tools=tools,
effective_criteria=["Agent works"],
input_messages=[],
)

assert isinstance(result, ScenarioResult)
Expand Down Expand Up @@ -210,7 +211,7 @@ def test_leaked_discovery_tool_returns_inconclusive_not_exception(self):
"expand_trace", {"span_ids": ["xx"]}, call_id="tc-leak"
)

result = agent._parse_response(leaked, ["A", "B"], messages=[])
result = agent._parse_response(leaked, ["A", "B"], messages=[], input_messages=[])

assert isinstance(result, ScenarioResult)
assert result.success is False
Expand Down
Loading