WIP: tool refactoring

replayio · Jan 21, 2025 · 8d6649c
1 parent f7e3d8c
commit 8d6649c
Show file tree

Hide file tree

Showing 8 changed files with 285 additions and 249 deletions.
diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -40,7 +40,7 @@
 )
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
-from openhands.replay.replay_commands import replay_enhance_action
+from openhands.replay.replay_initial_analysis import replay_enhance_action
 from openhands.replay.replay_state_machine import (
     get_replay_observation_message,
 )
@@ -327,7 +327,7 @@ def replay_phase_changed(self, phase: ReplayDebuggingPhase) -> None:
             codeact_enable_jupyter=self.config.codeact_enable_jupyter,
             codeact_enable_llm_editor=self.config.codeact_enable_llm_editor,
             codeact_enable_replay=self.config.codeact_enable_replay,
-            codeact_replay_phase=phase,
+            replay_phase=phase,
         )
         logger.debug(
             f'[REPLAY] CodeActAgent.replay_phase_changed({phase}).'

diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py
@@ -26,171 +26,13 @@
     IPythonRunCellAction,
     MessageAction,
 )
-from openhands.events.action.replay import (
-    ReplayPhaseUpdateAction,
-    ReplayToolCmdRunAction,
-)
 from openhands.events.tool import ToolCallMetadata
-
-# ---------------------------------------------------------
-# Tool: inspect-data
-# ---------------------------------------------------------
-_REPLAY_INSPECT_DATA_DESCRIPTION = """
-Explains value, data flow and origin information for `expression` at `point`.
-IMPORTANT: Prefer using inspect-data over inspect-point.
-"""
-
-ReplayInspectDataTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='inspect-data',
-        description=_REPLAY_INSPECT_DATA_DESCRIPTION.strip(),
-        parameters={
-            'type': 'object',
-            'properties': {
-                'expression': {
-                    'type': 'string',
-                    'description': 'A valid JS expression. IMPORTANT: First pick the best expression. If the expression is an object: Prefer "array[0]" over "array" and "o.x" over "o" to get closer to the origin and creation site of important data points. Prefer nested object over primitive expressions.',
-                },
-                'point': {
-                    'type': 'string',
-                    'description': 'The point at which to inspect the runtime. The first point comes from the `thisPoint` in the Initial analysis.',
-                },
-                'explanation': {
-                    'type': 'string',
-                    'description': 'Give a concise explanation as to why you take this investigative step.',
-                },
-                'explanation_source': {
-                    'type': 'string',
-                    'description': 'Explain which data you saw in the previous analysis results that informs this step.',
-                },
-            },
-            'required': ['expression', 'point', 'explanation', 'explanation_source'],
-        },
-    ),
-)
-
-# ---------------------------------------------------------
-# Tool: inspect-point
-# ---------------------------------------------------------
-_REPLAY_INSPECT_POINT_DESCRIPTION = """
-Explains dynamic control flow and data flow dependencies of the code at `point`.
-Use this tool instead of `inspect-data` only when you don't have a specific data point to investigate.
-"""
-
-ReplayInspectPointTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='inspect-point',
-        description=_REPLAY_INSPECT_POINT_DESCRIPTION.strip(),
-        parameters={
-            'type': 'object',
-            'properties': {
-                'point': {'type': 'string'},
-            },
-            'required': ['point'],
-        },
-    ),
-)
-
-# ---------------------------------------------------------
-# Tool: SubmitHypothesis
-# TODO: Divide this into multiple steps -
-#   1. The first submission must be as simple as possible to take little computational effort from the analysis steps.
-#   2. The second submission, after analysis has already concluded, must be as complete as possible.
-# ---------------------------------------------------------
-# _REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
-# Your investigation has yielded a complete thin slice from symptom to root cause,
-# enough proof to let the `CodeEdit` agent take over to fix the bug.
-# DO NOT GUESS. You must provide exact code in the exact right location to fix this bug,
-# based on evidence you have gathered.
-# """
-
-# ReplaySubmitHypothesisTool = ChatCompletionToolParam(
-#     type='function',
-#     function=ChatCompletionToolParamFunctionChunk(
-#         name='submit-hypothesis',
-#         description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
-#         parameters={
-#             'type': 'object',
-#             'properties': {
-#                 'rootCauseHypothesis': {'type': 'string'},
-#                 'thinSlice': {
-#                     'type': 'array',
-#                     'items': {
-#                         'type': 'object',
-#                         'properties': {
-#                             'point': {'type': 'string'},
-#                             'code': {'type': 'string'},
-#                             'role': {'type': 'string'},
-#                         },
-#                         'required': ['point', 'code', 'role'],
-#                     },
-#                 },
-#                 'modifications': {
-#                     'type': 'array',
-#                     'items': {
-#                         'type': 'object',
-#                         'properties': {
-#                             'kind': {
-#                                 'type': 'string',
-#                                 'enum': ['add', 'remove', 'modify'],
-#                             },
-#                             'newCode': {'type': 'string'},
-#                             'oldCode': {'type': 'string'},
-#                             'location': {'type': 'string'},
-#                             'point': {'type': 'string'},
-#                             # NOTE: Even though, we really want the `line` here, it will lead to much worse performance because the agent has a hard time computing correct line numbers from its point-based investigation.
-#                             # Instead of requiring a line number, the final fix will be more involved, as explained in the issue.
-#                             # see: https://linear.app/replay/issue/PRO-939/use-tools-data-flow-analysis-for-10608#comment-3b7ae176
-#                             # 'line': {'type': 'number'},
-#                             'briefExplanation': {'type': 'string'},
-#                             'verificationProof': {'type': 'string'},
-#                         },
-#                         'required': [
-#                             'kind',
-#                             'location',
-#                             'briefExplanation',
-#                             # 'line',
-#                             'verificationProof',
-#                         ],
-#                     },
-#                 },
-#             },
-#             'required': ['rootCauseHypothesis', 'thinSlice', 'modifications'],
-#         },
-#     ),
-# )
-_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
-# Use this tool to conclude your analysis and move on to code editing.
-# """
-
-ReplaySubmitHypothesisTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='submit-hypothesis',
-        description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
-        parameters={
-            'type': 'object',
-            'properties': {
-                'problem': {
-                    'type': 'string',
-                    'description': 'One-sentence explanation of the core problem that this will solve.',
-                },
-                'rootCauseHypothesis': {'type': 'string'},
-                'editSuggestions': {
-                    'type': 'string',
-                    'description': 'Provide suggestions to fix the bug, if you know enough about the code that requires modification.',
-                },
-            },
-            'required': ['rootCauseHypothesis'],
-        },
-    ),
+from openhands.replay.replay_tools import (
+    get_replay_tools,
+    handle_replay_tool_call,
+    is_replay_tool,
 )
 
-REPLAY_TOOLS = ['inspect-data', 'inspect-point', 'submit-hypothesis']
-
-
 # ---------------------------------------------------------
 # OH default tools.
 # ---------------------------------------------------------
@@ -631,36 +473,8 @@ def response_to_actions(response: ModelResponse, state: State) -> list[Action]:
                 ) from e
             if tool_call.function.name == 'execute_bash':
                 action = CmdRunAction(**arguments)
-            elif tool_call.function.name in REPLAY_TOOLS:
-                logger.info(
-                    f'[REPLAY] TOOL_CALL {tool_call.function.name} - arguments: {json.dumps(arguments, indent=2)}'
-                )
-                if tool_call.function.name == 'inspect-data':
-                    # Remove explanation props.
-                    arguments = {
-                        k: v for k, v in arguments.items() if 'explanation' not in k
-                    }
-                    action = ReplayToolCmdRunAction(
-                        command_name='inspect-data',
-                        command_args=arguments
-                        | {'recordingId': state.replay_recording_id},
-                    )
-                elif tool_call.function.name == 'inspect-point':
-                    # if arguments['expression'] == 'wiredRules':   # hackfix for 10608 experiment
-                    #     raise FunctionCallValidationError(f'wiredRules is irrelevant to the problem. Try something else.')
-                    action = ReplayToolCmdRunAction(
-                        command_name='inspect-point',
-                        command_args=arguments
-                        | {'recordingId': state.replay_recording_id},
-                    )
-                elif tool_call.function.name == 'submit-hypothesis':
-                    action = ReplayPhaseUpdateAction(
-                        new_phase=ReplayDebuggingPhase.Edit, info=json.dumps(arguments)
-                    )
-                else:
-                    raise ValueError(
-                        f'Unknown Replay tool. Make sure to add them all to REPLAY_TOOLS: {tool_call.function.name}'
-                    )
+            elif is_replay_tool(tool_call.function.name):
+                action = handle_replay_tool_call(tool_call, arguments, state)
             elif tool_call.function.name == 'execute_ipython_cell':
                 action = IPythonRunCellAction(**arguments)
             elif tool_call.function.name == 'delegate_to_browsing_agent':
@@ -727,31 +541,18 @@ def get_tools(
     codeact_enable_llm_editor: bool = False,
     codeact_enable_jupyter: bool = False,
     codeact_enable_replay: bool = False,
-    codeact_replay_phase: ReplayDebuggingPhase = ReplayDebuggingPhase.Normal,
+    replay_phase: ReplayDebuggingPhase = ReplayDebuggingPhase.Normal,
 ) -> list[ChatCompletionToolParam]:
     default_tools = get_default_tools(
         codeact_enable_browsing,
         codeact_enable_llm_editor,
         codeact_enable_jupyter,
     )
-    if not codeact_enable_replay or codeact_replay_phase == ReplayDebuggingPhase.Normal:
+    if not codeact_enable_replay or replay_phase == ReplayDebuggingPhase.Normal:
         # Use the default tools when not in a Replay-specific phase.
         return default_tools
 
     if codeact_enable_replay:
-        analysis_tools = [
-            ReplayInspectDataTool,
-            ReplayInspectPointTool,
-        ]
-        if codeact_replay_phase == ReplayDebuggingPhase.Analysis:
-            # Analysis tools only. This phase is concluded upon submit-hypothesis.
-            tools = analysis_tools + [ReplaySubmitHypothesisTool]
-        elif codeact_replay_phase == ReplayDebuggingPhase.Edit:
-            # Combine default and analysis tools.
-            tools = default_tools + analysis_tools
-        else:
-            raise ValueError(
-                f'Unhandled ReplayDebuggingPhase in get_tools: {codeact_replay_phase}'
-            )
+        tools = get_replay_tools(replay_phase, default_tools)
 
     return tools
diff --git a/openhands/events/action/replay.py b/openhands/events/action/replay.py
@@ -11,9 +11,14 @@
 )
 
 
+@dataclass
+class ReplayAction(Action):
+    pass
+
+
 # NOTE: We need the same class twice because a lot of the agent logic is based on isinstance checks.
 @dataclass
-class ReplayCmdRunActionBase(Action):
+class ReplayCmdRunActionBase(ReplayAction):
     # Name of the command in @replayapi/cli.
     command_name: str
 
@@ -62,7 +67,7 @@ class ReplayToolCmdRunAction(ReplayCmdRunActionBase):
 
 
 @dataclass
-class ReplayPhaseUpdateAction(Action):
+class ReplayPhaseUpdateAction(ReplayAction):
     new_phase: ReplayDebuggingPhase
 
     thought: str = ''

diff --git a/openhands/replay/replay_commands.py → openhands/replay/replay_initial_analysis.py b/openhands/replay/replay_commands.py → openhands/replay/replay_initial_analysis.py
@@ -1,6 +1,6 @@
 import json
 import re
-from typing import Any, cast
+from typing import Any, Tuple, cast
 
 from openhands.controller.state.state import State
 from openhands.core.logger import openhands_logger as logger
@@ -9,7 +9,7 @@
 from openhands.events.action.replay import ReplayInternalCmdRunAction
 from openhands.events.observation.replay import ReplayInternalCmdOutputObservation
 from openhands.replay.replay_prompts import replay_prompt_phase_analysis
-from openhands.replay.replay_types import AnalysisToolMetadata, AnnotateResult
+from openhands.replay.replay_types import AnalysisToolMetadata
 
 
 def scan_recording_id(issue: str) -> str | None:
@@ -72,20 +72,23 @@ def safe_parse_json(text: str) -> dict[str, Any] | None:
         return None
 
 
-def split_metadata(result):
+def split_metadata(result: dict) -> Tuple[AnalysisToolMetadata, dict]:
     if 'metadata' not in result:
         return {}, result
-    metadata = result['metadata']
+    metadata = cast(AnalysisToolMetadata, result['metadata'])
     data = dict(result)
     del data['metadata']
     return metadata, data
 
 
-def handle_replay_internal_command_observation(
+def on_replay_internal_command_observation(
     state: State, observation: ReplayInternalCmdOutputObservation
 ) -> AnalysisToolMetadata | None:
     """
-    Enhance the user prompt with the results of the replay analysis.
+    Handles the result of an internally sent command (i.e. not an agent tool call or a user action).
+
+    NOTE: Currently, the only internal command is the initial-analysis command.
+    Enhances the user prompt with the results of the initial analysis.
     Returns the metadata needed for the agent to switch to analysis tools.
     """
     enhance_action_id = state.extra_data.get('replay_enhance_prompt_id')
@@ -103,19 +106,19 @@ def handle_replay_internal_command_observation(
         state.extra_data['replay_enhance_observed'] = True
 
         # Deserialize stringified result.
-        result: AnnotateResult = cast(
-            AnnotateResult, safe_parse_json(observation.content)
-        )
+        result = safe_parse_json(observation.content)
 
         # Get metadata and enhance prompt.
         if result and 'metadata' in result:
             # initial-analysis provides metadata needed for tool use.
             metadata, command_result = split_metadata(result)
-            replay_prompt_phase_analysis(command_result, user_message)
+            user_message.content = replay_prompt_phase_analysis(
+                command_result, user_message.content
+            )
             return metadata
         else:
             logger.warning(
-                f'[REPLAY] Replay command result cannot be interpreted. Observed content: {str(observation.content)}'
+                f'[REPLAY] Replay command result missing metadata. Observed content: {str(observation.content)}'
             )
 
     return None