More tests, more fixes

replayio · Jan 24, 2025 · 274973b · 274973b
1 parent 954e435
commit 274973b
Show file tree

Hide file tree

Showing 10 changed files with 270 additions and 66 deletions.
diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -369,7 +369,10 @@ def step(self, state: State) -> Action:
         params['tools'] = self.tools
         if self.mock_function_calling:
             params['mock_function_calling'] = True
-        # logger.debug(f'#######\nCodeActAgent.step: messages:\n{json.dumps(params)}\n\n#######\n')
+
+        # # Debug log the raw input to the LLM:
+        # logger.debug(f'#######\nCodeActAgent.step: RAW LLM INPUT:\n{repr(params)}\n\n#######\n')
+
         response = self.llm.completion(**params)
         actions = codeact_function_calling.response_to_actions(response, state)
         for action in actions:

diff --git a/openhands/core/schema/replay.py b/openhands/core/schema/replay.py
@@ -2,12 +2,19 @@
 
 
 class ReplayPhase(str, Enum):
+    """All Replay phases that an agent can be in."""
+
     Normal = 'normal'
-    """The agent is not doing anything related to Replay.
+    """The agent does not have access to a recording.
     """
+
     Analysis = 'analysis'
-    """The agent is analyzing a recording.
+    """The agent uses initial-analysis data and dedicated tools to analyze a Replay recording.
     """
+    ConfirmAnalysis = 'confirm_analysis'
+    """The agent is confirming the analysis.
+    """
+
     Edit = 'edit'
-    """The agent is editing the code.
+    """The agent finally edits the code.
     """
diff --git a/openhands/replay/replay_phases.py b/openhands/replay/replay_phases.py
@@ -97,21 +97,35 @@ def update_phase(new_phase: ReplayPhase, state: State, agent: Agent):
 # ReplayStateMachine.
 # ###########################################################################
 
-replay_state_transitions: dict[ReplayPhase, tuple[ReplayPhase, ...] | None] = {
-    ReplayPhase.Normal: None,
-    ReplayPhase.Analysis: (ReplayPhase.Edit,),
-    ReplayPhase.Edit: None,
+# Transition edges controlled by the agent.
+replay_agent_state_transitions: dict[ReplayPhase, tuple[ReplayPhase, ...]] = {
+    ReplayPhase.Analysis: (ReplayPhase.ConfirmAnalysis,),
+    ReplayPhase.ConfirmAnalysis: (ReplayPhase.Edit,),
 }
 
 
 # This machine represents state transitions that the agent can make.
-class ReplayStateMachine:
+class ReplayAgentStateMachine:
+    edges: list[tuple[ReplayPhase, ReplayPhase]]
+    """List of all edges."""
+
+    forward_edges: dict[ReplayPhase, list[ReplayPhase]]
+    """Adjacency list of forward edges."""
+
+    reverse_edges: dict[ReplayPhase, list[ReplayPhase]]
+    """Adjacency list of reverse edges."""
+
     def __init__(self):
-        self.forward_edges: dict[ReplayPhase, list[ReplayPhase] | None] = {
+        self.forward_edges = {
             phase: list(targets) if targets is not None else None
-            for phase, targets in replay_state_transitions.items()
+            for phase, targets in replay_agent_state_transitions.items()
         }
-        self.reverse_edges: dict[ReplayPhase, list[ReplayPhase]] = {}
+        self.edges = [
+            (source, target)
+            for source, targets in self.forward_edges.items()
+            for target in targets
+        ]
+        self.reverse_edges = {}
 
         for source, targets in self.forward_edges.items():
             if targets:
@@ -151,8 +165,8 @@ def get_child_phases(self, phase: ReplayPhase) -> list[ReplayPhase] | None:
         return self.forward_edges.get(phase, list())
 
 
-replay_state_machine = ReplayStateMachine()
+replay_agent_state_machine = ReplayAgentStateMachine()
 
 
 def get_next_agent_replay_phase(phase: ReplayPhase) -> ReplayPhase | None:
-    return replay_state_machine.get_unique_child_phase(phase)
+    return replay_agent_state_machine.get_unique_child_phase(phase)
diff --git a/openhands/replay/replay_prompts.py b/openhands/replay/replay_prompts.py
@@ -35,17 +35,25 @@ class PromptMap(TypedDict):
 
 
 phase_prompts: dict[ReplayPhase, PromptMap] = {
-    ReplayPhase.Edit: {
+    ReplayPhase.ConfirmAnalysis: {
         'enter': """
-You have concluded the analysis.
+You have submitted a draft of the hypothesis.
 
-IMPORTANT: NOW review, then implement the hypothesized changes using tools. The code is available in the workspace. Start by answering these questions:
+IMPORTANT: NOW review the hypothesis. When reviewing, PAY ATTENTION to this:
   1. What is the goal of the investigation according to the initial prompt and initial analysis? IMPORTANT. PAY ATTENTION TO THIS. THIS IS THE ENTRY POINT OF EVERYTHING.
   2. Given (1), is the hypothesis's `problem` description correct? Does it match the goal of the investigation?
   3. Do the `editSuggestions` actually address the issue?
-  4. Rephrase the hypothesis so that it is consistent and correct.
+
+Rephrase and confirm a revised hypothesis.
+"""
+    },
+    ReplayPhase.Edit: {
+        'enter': """
+You have confirmed the analysis.
+
+IMPORTANT: NOW implement the confirmed suggestions. The code is available in the workspace.
 """
-    }
+    },
 }
 
 

diff --git a/openhands/replay/replay_tools.py b/openhands/replay/replay_tools.py
@@ -12,17 +12,20 @@
 from openhands.controller.state.state import State
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.schema.replay import ReplayPhase
+from openhands.events.action.action import Action
+from openhands.events.action.empty import NullAction
 from openhands.events.action.replay import (
-    ReplayAction,
     ReplayPhaseUpdateAction,
     ReplayToolCmdRunAction,
 )
-from openhands.replay.replay_phases import get_next_agent_replay_phase
 
 
 class ReplayToolType(Enum):
     Analysis = 'analysis'
+    """Tools that allow analyzing a Replay recording."""
+
     PhaseTransition = 'phase_transition'
+    """Tools that trigger a phase transition. Phase transitions are generally used to improve self-consistency."""
 
 
 class ReplayTool(ChatCompletionToolParam):
@@ -45,16 +48,19 @@ def replay_analysis_tool(name: str, description: str, parameters: dict) -> Repla
 
 
 class ReplayPhaseTransitionTool(ReplayTool):
+    edges: list[tuple[ReplayPhase, ReplayPhase]]
     replay_tool_type = ReplayToolType.PhaseTransition
-    new_phase: ReplayPhase
 
 
 def replay_phase_tool(
-    new_phase: ReplayPhase, name: str, description: str, parameters: dict
+    edges: list[tuple[ReplayPhase, ReplayPhase]],
+    name: str,
+    description: str,
+    parameters: dict,
 ):
     tool = ReplayPhaseTransitionTool(
+        edges=edges,
         replay_tool_type=ReplayToolType.PhaseTransition,
-        new_phase=new_phase,
         type='function',
         function=ChatCompletionToolParamFunctionChunk(
             name=name,
@@ -127,27 +133,65 @@ def replay_phase_tool(
 
 replay_phase_transition_tools: list[ReplayPhaseTransitionTool] = [
     replay_phase_tool(
-        ReplayPhase.Edit,
+        [
+            (ReplayPhase.Analysis, ReplayPhase.ConfirmAnalysis),
+        ],
         'submit',
-        """Conclude your analysis.""",
+        """Submit your analysis.""",
         {
             'type': 'object',
             'properties': {
                 'problem': {
                     'type': 'string',
-                    'description': 'One-sentence explanation of the core problem that this will solve.',
+                    'description': 'One-sentence explanation of the core problem, according to user requirements and initial-analysis.',
                 },
                 'rootCauseHypothesis': {'type': 'string'},
+                'badCode': {
+                    'type': 'string',
+                    'description': 'Show exactly which code requires changing, according to the user requirements and initial-analysis.',
+                },
+            },
+            'required': ['problem', 'rootCauseHypothesis', 'badCode'],
+        },
+    ),
+    replay_phase_tool(
+        [
+            (ReplayPhase.ConfirmAnalysis, ReplayPhase.Edit),
+        ],
+        'confirm',
+        """Confirm your analysis and suggest specific code changes.""",
+        {
+            'type': 'object',
+            'properties': {
+                'problem': {
+                    'type': 'string',
+                    'description': 'One-sentence explanation of the core problem, according to user requirements and initial-analysis.',
+                },
+                'rootCauseHypothesis': {'type': 'string'},
+                'badCode': {
+                    'type': 'string',
+                    'description': 'Show exactly which code requires changing, according to the user requirements and initial-analysis.',
+                },
                 'editSuggestions': {
                     'type': 'string',
                     'description': 'Provide suggestions to fix the bug, if you know enough about the code that requires modification.',
                 },
             },
-            'required': ['problem', 'rootCauseHypothesis'],
+            'required': ['problem', 'rootCauseHypothesis', 'editSuggestions'],
         },
-    )
+    ),
 ]
 
+replay_phase_transition_tools_by_from_phase = {
+    phase: [
+        t
+        for t in replay_phase_transition_tools
+        for edge in t['edges']
+        if edge[0] == phase
+    ]
+    for phase in {edge[0] for t in replay_phase_transition_tools for edge in t['edges']}
+}
+
 # ###########################################################################
 # Bookkeeping + utilities.
 # ###########################################################################
@@ -181,22 +225,25 @@ def is_replay_tool(
 # ###########################################################################
 
 
-def get_replay_transition_tool_for_current_phase(
-    current_phase: ReplayPhase, name: str | None = None
+def get_replay_transition_tools(current_phase: ReplayPhase) -> list[ReplayTool] | None:
+    phase_tools = replay_phase_transition_tools_by_from_phase.get(current_phase, None)
+    if not phase_tools:
+        return None
+    assert len(phase_tools)
+    return phase_tools
+
+
+def get_replay_transition_tool(
+    current_phase: ReplayPhase, name: str
 ) -> ReplayTool | None:
-    next_phase = get_next_agent_replay_phase(current_phase)
-    if next_phase:
-        transition_tools = [
-            t
-            for t in replay_phase_transition_tools
-            if t['new_phase'] == next_phase
-            and (not name or t['function']['name'] == name)
-        ]
-        assert len(
-            transition_tools
-        ), f'replay_phase_transition_tools is missing tools for new_phase: {next_phase}'
-        return transition_tools[0]
-    return None
+    tools = get_replay_transition_tools(current_phase)
+    if not tools:
+        return None
+    matching = [t for t in tools if t['function']['name'] == name]
+    assert (
+        len(matching) == 1
+    ), f'replay_phase_transition_tools did not get unique matching tool for phase {current_phase} and name {name}'
+    return matching[0]
 
 
 def get_replay_tools(
@@ -206,14 +253,16 @@ def get_replay_tools(
         tools = default_tools
     elif replay_phase == ReplayPhase.Analysis:
         tools = list(replay_analysis_tools)
+    elif replay_phase == ReplayPhase.ConfirmAnalysis:
+        tools = list(replay_analysis_tools)
     elif replay_phase == ReplayPhase.Edit:
         tools = default_tools + list(replay_analysis_tools)
     else:
         raise ValueError(f'Unhandled ReplayPhase in get_tools: {replay_phase}')
 
-    next_phase_tool = get_replay_transition_tool_for_current_phase(replay_phase)
-    if next_phase_tool:
-        tools.append(next_phase_tool)
+    next_phase_tools = get_replay_transition_tools(replay_phase)
+    if next_phase_tools:
+        tools += next_phase_tools
 
     return tools
 
@@ -225,11 +274,11 @@ def get_replay_tools(
 
 def handle_replay_tool_call(
     tool_call: ChatCompletionMessageToolCall, arguments: dict, state: State
-) -> ReplayAction:
+) -> Action:
     logger.info(
         f'[REPLAY] TOOL_CALL {tool_call.function.name} - arguments: {json.dumps(arguments, indent=2)}'
     )
-    action: ReplayAction
+    action: Action
     name = tool_call.function.name
     if is_replay_tool(name, ReplayToolType.Analysis):
         if name == 'inspect-data':
@@ -248,14 +297,24 @@ def handle_replay_tool_call(
             )
     elif is_replay_tool(name, ReplayToolType.PhaseTransition):
         # Request a phase change.
-        tool = get_replay_transition_tool_for_current_phase(state.replay_phase, name)
-        assert tool, f'[REPLAY] Missing ReplayPhaseTransitionTool for {state.replay_phase} in Replay tool_call: {tool_call.function.name}'
-        new_phase = tool['new_phase']
+        tool = get_replay_transition_tool(state.replay_phase, name)
+        assert tool, f'[REPLAY] Missing ReplayPhaseTransitionTool for {state.replay_phase} in Replay tool_call({tool_call.function.name})'
+        new_phase = next(
+            (
+                to_phase
+                for [from_phase, to_phase] in tool['edges']
+                if from_phase == state.replay_phase
+            ),
+            None,
+        )
         assert (
             new_phase
         ), f'[REPLAY] Missing new_phase in Replay tool_call: {tool_call.function.name}'
         action = ReplayPhaseUpdateAction(
             new_phase=new_phase, info=json.dumps(arguments)
         )
+    else:
+        # NOTE: This is a weird bug where Claude sometimes might call a tool that it *had* but does not have anymore.
+        action = NullAction()
     assert action, f'[REPLAY] Unhandled Replay tool_call: {tool_call.function.name}'
     return action
diff --git a/replay_benchmarks/bolt/run-bolt.sh b/replay_benchmarks/bolt/run-bolt.sh
@@ -65,13 +65,11 @@ else
 fi
 
 # Config overrides + sanity checks.
-set -x
 export DEBUG=1
 # export REPLAY_DEV_MODE=1
 export REPLAY_ENABLE_TOOL_CACHE=1
 export WORKSPACE_BASE="$WORKSPACE_ROOT"
 export LLM_MODEL="anthropic/claude-3-5-sonnet-20241022"
-set +x
 
 if [[ -z "$LLM_API_KEY" ]]; then
     if [[ -z "$ANTHROPIC_API_KEY" ]]; then

diff --git a/tests/unit/replay/replay_test_util.py b/tests/unit/replay/replay_test_util.py
@@ -0,0 +1,40 @@
+import enum
+from typing import Collection, TypeVar
+
+from openhands.core.schema.replay import ReplayPhase
+from openhands.replay.replay_phases import (
+    replay_agent_state_machine,
+)
+
+T = TypeVar('T', bound=enum.Enum)
+
+
+def format_enums(enums: Collection[T]) -> set[str]:
+    """Convert any collection of enums to readable set of names."""
+    return {e.name for e in enums}
+
+
+def format_edge(edge: tuple[ReplayPhase, ReplayPhase]) -> str:
+    """Format state transition edge as 'from → to'."""
+    from_state, to_state = edge
+    return f'{from_state.name} → {to_state.name}'
+
+
+# The given edges should form a partition of the set of all agent edges.
+def assert_edges_partition(
+    edges: list[tuple[ReplayPhase, ReplayPhase]],
+) -> None:
+    """Verify tool edges form a partition of agent edges."""
+    # Get ground truth
+    all_agent_edges = replay_agent_state_machine.edges
+
+    # Convert to sorted lists of formatted strings for comparison
+    tool_edge_strs = sorted(f'{f.name} → {t.name}' for [f, t] in edges)
+    agent_edge_strs = sorted(f'{f.name} → {t.name}' for [f, t] in all_agent_edges)
+
+    # Check for duplicates in tool edges
+    if len(set(tool_edge_strs)) != len(tool_edge_strs):
+        raise AssertionError('Tool edges contain duplicates')
+
+    # Check lists match exactly
+    assert tool_edge_strs == agent_edge_strs