langwatch · drewdrewthis · Jun 11, 2026 · May 25, 2026 · May 25, 2026 · Jun 10, 2026
diff --git a/python/scenario/judge_agent.py b/python/scenario/judge_agent.py
@@ -631,6 +631,7 @@ async def call(
                 tool_choice=tool_choice,
                 spans=spans,
                 effective_criteria=effective_criteria,
+                input_messages=input.messages,
             )
 
         # Standard single-call path for small traces
@@ -649,7 +650,7 @@ async def call(
             ),
         )
 
-        return self._parse_response(response, effective_criteria, messages)
+        return self._parse_response(response, effective_criteria, messages, input_messages=input.messages)
 
     def _build_trace_digest(self, spans: Sequence[Any]) -> tuple[str, bool]:
         """
@@ -747,6 +748,7 @@ def _run_discovery_loop(
         tool_choice: Any,
         spans: Sequence[Any],
         effective_criteria: List[str],
+        input_messages: Sequence[Any],
     ) -> AgentReturnTypes:
         """
         Runs the multi-step discovery loop for large traces.
@@ -801,15 +803,15 @@ def _run_discovery_loop(
             message = cast(Choices, response.choices[0]).message
             if not message.tool_calls:
                 # No tool calls - try to parse as a response
-                return self._parse_response(response, effective_criteria, messages)
+                return self._parse_response(response, effective_criteria, messages, input_messages=input_messages)
 
             # Check for terminal tool call
             terminal_call = next(
                 (tc for tc in message.tool_calls if tc.function.name in terminal_tool_names),
                 None,
             )
             if terminal_call:
-                return self._parse_response(response, effective_criteria, messages)
+                return self._parse_response(response, effective_criteria, messages, input_messages=input_messages)
 
             # Execute discovery tools and add results to messages
             # Add the assistant message with tool calls
@@ -842,6 +844,7 @@ def _run_discovery_loop(
             messages=messages,
             tools=tools,
             effective_criteria=effective_criteria,
+            input_messages=input_messages,
         )
 
     def _force_verdict(
@@ -850,6 +853,7 @@ def _force_verdict(
         messages: List[dict],
         tools: List[dict],
         effective_criteria: List[str],
+        input_messages: Sequence[Any],
     ) -> AgentReturnTypes:
         """
         Makes one final LLM call with tool_choice forced to finish_test.
@@ -897,7 +901,7 @@ def _force_verdict(
             ),
         )
         return self._parse_response(
-            forced_response, effective_criteria, rewritten_messages
+            forced_response, effective_criteria, rewritten_messages, input_messages=input_messages
         )
 
     def _execute_discovery_tool(self, tool_call: Any, spans: Sequence[Any]) -> str:
@@ -931,6 +935,8 @@ def _parse_response(
         response: Any,
         effective_criteria: List[str],
         messages: List[dict],
+        *,
+        input_messages: Sequence[Any],
     ) -> AgentReturnTypes:
         """
         Parses a litellm response into the appropriate return type.
@@ -940,7 +946,8 @@ def _parse_response(
         Args:
             response: The litellm ModelResponse.
             effective_criteria: The criteria to evaluate against.
-            messages: The conversation messages (for inclusion in ScenarioResult).
+            messages: The judge's internal LLM messages (system prompt + transcript).
+            input_messages: The actual conversation messages to include in ScenarioResult.
 
         Returns:
             AgentReturnTypes: Either an empty list (continue) or ScenarioResult.
@@ -988,7 +995,7 @@ def _parse_response(
 
                 return ScenarioResult(
                     success=verdict == "success" and len(failed_criteria) == 0,
-                    messages=cast(Any, messages),
+                    messages=cast(Any, input_messages),
                     reasoning=reasoning,
                     passed_criteria=passed_criteria,
                     failed_criteria=failed_criteria,
@@ -1005,7 +1012,7 @@ def _parse_response(
             )
             return ScenarioResult(
                 success=False,
-                messages=cast(Any, messages),
+                messages=cast(Any, input_messages),
                 reasoning=(
                     "JudgeAgent: trace discovery did not converge on a "
                     "verdict within the step budget"

diff --git a/python/tests/test_judge_agent.py b/python/tests/test_judge_agent.py
@@ -1,5 +1,5 @@
 import pytest
-from typing import Any
+from typing import Any, cast
 from unittest.mock import patch, MagicMock
 from openai import OpenAI
 from scenario import JudgeAgent
@@ -234,6 +234,70 @@ async def test_judge_is_last_message_on_final_turn(
         ScenarioConfig.default_config = None
 
 
+@pytest.mark.asyncio
+async def test_judge_result_messages_is_conversation_not_judge_context():
+    """ScenarioResult.messages must contain the actual conversation, not the judge's internal context.
+
+    Regression for #221: in 0.7.15 ScenarioResult.messages was set to the
+    judge's internal LLM messages (system prompt + transcript text) instead of
+    input.messages (the actual conversation between user-sim and agent under test).
+    """
+    ScenarioConfig.default_config = ScenarioConfig(default_model="openai/gpt-4")
+    judge = JudgeAgent(criteria=["Agent replies helpfully"])
+
+    # Conversation handed to the judge as AgentInput.messages; the result must echo exactly this list.
+    real_conversation = [
+        {"role": "user", "content": "Hello, what is the weather?"},
+        {"role": "assistant", "content": "It is sunny today!"},
+        {"role": "user", "content": "Thanks!"},
+    ]
+
+    mock_scenario_state = MagicMock()
+    mock_scenario_state.description = "Weather query scenario"
+    mock_scenario_state.current_turn = 1
+    mock_scenario_state.config.max_turns = 10
+
+    agent_input = AgentInput(
+        thread_id="test",
+        messages=cast(Any, real_conversation),
+        new_messages=[],
+        judgment_request=JudgmentRequest(),
+        scenario_state=mock_scenario_state,
+    )
+    # Stubbed completion response: a single finish_test tool call carrying a success verdict.
+    mock_response = MagicMock()
+    mock_response.choices = [MagicMock()]
+    mock_response.choices[0].message.tool_calls = [MagicMock()]
+    mock_response.choices[0].message.tool_calls[0].function.name = "finish_test"
+    mock_response.choices[0].message.tool_calls[
+        0
+    ].function.arguments = '{"verdict": "success", "reasoning": "Agent replied helpfully", "criteria": {"agent_replies_helpfully": "true"}}'
+    # Install a mock executor (config.cache_key = None) in context_scenario; the token is reset in finally.
+    mock_executor = MagicMock()
+    mock_executor.config = MagicMock()
+    mock_executor.config.cache_key = None
+    token = context_scenario.set(mock_executor)
+    # Patch litellm.completion as referenced from scenario.judge_agent so it returns the stubbed response.
+    try:
+        with patch(
+            "scenario.judge_agent.litellm.completion", return_value=mock_response
+        ):
+            result = await judge.call(agent_input)
+
+            from scenario.types import ScenarioResult
+            assert isinstance(result, ScenarioResult), "JudgeAgent should return ScenarioResult on finish_test"
+            # The returned messages must be the actual conversation, NOT the
+            # judge's internal context (system prompt + transcript text).
+            assert result.messages == real_conversation, (
+                "ScenarioResult.messages should be the actual conversation "
+                f"(3 messages), got {len(result.messages)} messages: "
+                f"{[m.get('role') for m in result.messages]}"
+            )
+    finally:
+        context_scenario.reset(token)
+        ScenarioConfig.default_config = None
+
+
 @pytest.mark.asyncio
 async def test_judge_includes_additional_context_in_prompt():
     """JudgmentRequest.context is injected into the judge's user message under <additional_context>.

diff --git a/python/tests/test_judge_force_verdict_hardening.py b/python/tests/test_judge_force_verdict_hardening.py
@@ -168,6 +168,7 @@ def fake_completion(**kwargs):
                 messages=messages,
                 tools=tools,
                 effective_criteria=["Agent works"],
+                input_messages=[],
             )
 
         assert isinstance(result, ScenarioResult)
@@ -210,7 +211,7 @@ def test_leaked_discovery_tool_returns_inconclusive_not_exception(self):
             "expand_trace", {"span_ids": ["xx"]}, call_id="tc-leak"
         )
 
-        result = agent._parse_response(leaked, ["A", "B"], messages=[])
+        result = agent._parse_response(leaked, ["A", "B"], messages=[], input_messages=[])
 
         assert isinstance(result, ScenarioResult)
         assert result.success is False