diff --git a/python/scenario/judge_agent.py b/python/scenario/judge_agent.py index 6691a3b98..4a890b5b4 100644 --- a/python/scenario/judge_agent.py +++ b/python/scenario/judge_agent.py @@ -631,6 +631,7 @@ async def call( tool_choice=tool_choice, spans=spans, effective_criteria=effective_criteria, + input_messages=input.messages, ) # Standard single-call path for small traces @@ -649,7 +650,7 @@ async def call( ), ) - return self._parse_response(response, effective_criteria, messages) + return self._parse_response(response, effective_criteria, messages, input_messages=input.messages) def _build_trace_digest(self, spans: Sequence[Any]) -> tuple[str, bool]: """ @@ -747,6 +748,7 @@ def _run_discovery_loop( tool_choice: Any, spans: Sequence[Any], effective_criteria: List[str], + input_messages: Sequence[Any], ) -> AgentReturnTypes: """ Runs the multi-step discovery loop for large traces. @@ -801,7 +803,7 @@ def _run_discovery_loop( message = cast(Choices, response.choices[0]).message if not message.tool_calls: # No tool calls - try to parse as a response - return self._parse_response(response, effective_criteria, messages) + return self._parse_response(response, effective_criteria, messages, input_messages=input_messages) # Check for terminal tool call terminal_call = next( @@ -809,7 +811,7 @@ def _run_discovery_loop( None, ) if terminal_call: - return self._parse_response(response, effective_criteria, messages) + return self._parse_response(response, effective_criteria, messages, input_messages=input_messages) # Execute discovery tools and add results to messages # Add the assistant message with tool calls @@ -842,6 +844,7 @@ def _run_discovery_loop( messages=messages, tools=tools, effective_criteria=effective_criteria, + input_messages=input_messages, ) def _force_verdict( @@ -850,6 +853,7 @@ def _force_verdict( messages: List[dict], tools: List[dict], effective_criteria: List[str], + input_messages: Sequence[Any], ) -> AgentReturnTypes: """ Makes one final LLM call with tool_choice forced to finish_test. @@ -897,7 +901,7 @@ def _force_verdict( ), ) return self._parse_response( - forced_response, effective_criteria, rewritten_messages + forced_response, effective_criteria, rewritten_messages, input_messages=input_messages ) def _execute_discovery_tool(self, tool_call: Any, spans: Sequence[Any]) -> str: @@ -931,6 +935,8 @@ def _parse_response( response: Any, effective_criteria: List[str], messages: List[dict], + *, + input_messages: Sequence[Any], ) -> AgentReturnTypes: """ Parses a litellm response into the appropriate return type. @@ -940,7 +946,8 @@ def _parse_response( Args: response: The litellm ModelResponse. effective_criteria: The criteria to evaluate against. - messages: The conversation messages (for inclusion in ScenarioResult). + messages: The judge's internal LLM messages (system prompt + transcript). + input_messages: The actual conversation messages to include in ScenarioResult. Returns: AgentReturnTypes: Either an empty list (continue) or ScenarioResult. @@ -988,7 +995,7 @@ def _parse_response( return ScenarioResult( success=verdict == "success" and len(failed_criteria) == 0, - messages=cast(Any, messages), + messages=cast(Any, input_messages), reasoning=reasoning, passed_criteria=passed_criteria, failed_criteria=failed_criteria, @@ -1005,7 +1012,7 @@ def _parse_response( ) return ScenarioResult( success=False, - messages=cast(Any, messages), + messages=cast(Any, input_messages), reasoning=( "JudgeAgent: trace discovery did not converge on a " "verdict within the step budget" diff --git a/python/tests/test_judge_agent.py b/python/tests/test_judge_agent.py index 564d2c046..01c4ddbd4 100644 --- a/python/tests/test_judge_agent.py +++ b/python/tests/test_judge_agent.py @@ -1,5 +1,5 @@ import pytest -from typing import Any +from typing import Any, cast from unittest.mock import patch, MagicMock from openai import OpenAI from scenario import JudgeAgent @@ -234,6 +234,70 @@ async def test_judge_is_last_message_on_final_turn( ScenarioConfig.default_config = None +@pytest.mark.asyncio +async def test_judge_result_messages_is_conversation_not_judge_context(): + """ScenarioResult.messages must contain the actual conversation, not the judge's internal context. + + Regression for #221: in 0.7.15 ScenarioResult.messages was set to the + judge's internal LLM messages (system prompt + transcript text) instead of + input.messages (the actual conversation between user-sim and agent under test). + """ + ScenarioConfig.default_config = ScenarioConfig(default_model="openai/gpt-4") + judge = JudgeAgent(criteria=["Agent replies helpfully"]) + + # This is the "real" conversation the judge is evaluating. + real_conversation = [ + {"role": "user", "content": "Hello, what is the weather?"}, + {"role": "assistant", "content": "It is sunny today!"}, + {"role": "user", "content": "Thanks!"}, + ] + + mock_scenario_state = MagicMock() + mock_scenario_state.description = "Weather query scenario" + mock_scenario_state.current_turn = 1 + mock_scenario_state.config.max_turns = 10 + + agent_input = AgentInput( + thread_id="test", + messages=cast(Any, real_conversation), + new_messages=[], + judgment_request=JudgmentRequest(), + scenario_state=mock_scenario_state, + ) + + mock_response = MagicMock() + mock_response.choices = [MagicMock()] + mock_response.choices[0].message.tool_calls = [MagicMock()] + mock_response.choices[0].message.tool_calls[0].function.name = "finish_test" + mock_response.choices[0].message.tool_calls[ + 0 + ].function.arguments = '{"verdict": "success", "reasoning": "Agent replied helpfully", "criteria": {"agent_replies_helpfully": "true"}}' + + mock_executor = MagicMock() + mock_executor.config = MagicMock() + mock_executor.config.cache_key = None + token = context_scenario.set(mock_executor) + + try: + with patch( + "scenario.judge_agent.litellm.completion", return_value=mock_response + ): + result = await judge.call(agent_input) + + from scenario.types import ScenarioResult + assert isinstance(result, ScenarioResult), "JudgeAgent should return ScenarioResult on finish_test" + # The returned messages must be the actual conversation, NOT the + # judge's internal context (system prompt + transcript text). + assert result.messages == real_conversation, ( + "ScenarioResult.messages should be the actual conversation " + f"(3 messages), got {len(result.messages)} messages: " + f"{[m.get('role') for m in result.messages]}" + ) + finally: + context_scenario.reset(token) + ScenarioConfig.default_config = None + + @pytest.mark.asyncio async def test_judge_includes_additional_context_in_prompt(): """JudgmentRequest.context is injected into the judge's user message under . diff --git a/python/tests/test_judge_force_verdict_hardening.py b/python/tests/test_judge_force_verdict_hardening.py index b83415761..ac4055912 100644 --- a/python/tests/test_judge_force_verdict_hardening.py +++ b/python/tests/test_judge_force_verdict_hardening.py @@ -168,6 +168,7 @@ def fake_completion(**kwargs): messages=messages, tools=tools, effective_criteria=["Agent works"], + input_messages=[], ) assert isinstance(result, ScenarioResult) @@ -210,7 +211,7 @@ def test_leaked_discovery_tool_returns_inconclusive_not_exception(self): "expand_trace", {"span_ids": ["xx"]}, call_id="tc-leak" ) - result = agent._parse_response(leaked, ["A", "B"], messages=[]) + result = agent._parse_response(leaked, ["A", "B"], messages=[], input_messages=[]) assert isinstance(result, ScenarioResult) assert result.success is False