From 8d6649c7e71e6e1ba049395724f6ce2e8aab2aaa Mon Sep 17 00:00:00 2001
From: "D. Seifert" <Domiii@users.noreply.github.com>
Date: Tue, 21 Jan 2025 21:48:30 +0800
Subject: [PATCH] WIP: tool refactoring

---
 .../agenthub/codeact_agent/codeact_agent.py   |   4 +-
 .../codeact_agent/function_calling.py         | 217 +---------------
 openhands/events/action/replay.py             |   9 +-
 ...commands.py => replay_initial_analysis.py} |  25 +-
 openhands/replay/replay_prompts.py            |  24 +-
 openhands/replay/replay_state_machine.py      |  10 +-
 openhands/replay/replay_tools.py              | 236 ++++++++++++++++++
 openhands/replay/replay_types.py              |   9 -
 8 files changed, 285 insertions(+), 249 deletions(-)
 rename openhands/replay/{replay_commands.py => replay_initial_analysis.py} (82%)
 create mode 100644 openhands/replay/replay_tools.py

diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
index 4e04f87443f1..7453c9df2d41 100644
--- a/openhands/agenthub/codeact_agent/codeact_agent.py
+++ b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -40,7 +40,7 @@
 )
 from openhands.events.serialization.event import truncate_content
 from openhands.llm.llm import LLM
-from openhands.replay.replay_commands import replay_enhance_action
+from openhands.replay.replay_initial_analysis import replay_enhance_action
 from openhands.replay.replay_state_machine import (
     get_replay_observation_message,
 )
@@ -327,7 +327,7 @@ def replay_phase_changed(self, phase: ReplayDebuggingPhase) -> None:
             codeact_enable_jupyter=self.config.codeact_enable_jupyter,
             codeact_enable_llm_editor=self.config.codeact_enable_llm_editor,
             codeact_enable_replay=self.config.codeact_enable_replay,
-            codeact_replay_phase=phase,
+            replay_phase=phase,
         )
         logger.debug(
             f'[REPLAY] CodeActAgent.replay_phase_changed({phase}).'
diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py
index 2fbd34f71282..16a7d1af848d 100644
--- a/openhands/agenthub/codeact_agent/function_calling.py
+++ b/openhands/agenthub/codeact_agent/function_calling.py
@@ -26,171 +26,13 @@
     IPythonRunCellAction,
     MessageAction,
 )
-from openhands.events.action.replay import (
-    ReplayPhaseUpdateAction,
-    ReplayToolCmdRunAction,
-)
 from openhands.events.tool import ToolCallMetadata
-
-# ---------------------------------------------------------
-# Tool: inspect-data
-# ---------------------------------------------------------
-_REPLAY_INSPECT_DATA_DESCRIPTION = """
-Explains value, data flow and origin information for `expression` at `point`.
-IMPORTANT: Prefer using inspect-data over inspect-point.
-"""
-
-ReplayInspectDataTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='inspect-data',
-        description=_REPLAY_INSPECT_DATA_DESCRIPTION.strip(),
-        parameters={
-            'type': 'object',
-            'properties': {
-                'expression': {
-                    'type': 'string',
-                    'description': 'A valid JS expression. IMPORTANT: First pick the best expression. If the expression is an object: Prefer "array[0]" over "array" and "o.x" over "o" to get closer to the origin and creation site of important data points. Prefer nested object over primitive expressions.',
-                },
-                'point': {
-                    'type': 'string',
-                    'description': 'The point at which to inspect the runtime. The first point comes from the `thisPoint` in the Initial analysis.',
-                },
-                'explanation': {
-                    'type': 'string',
-                    'description': 'Give a concise explanation as to why you take this investigative step.',
-                },
-                'explanation_source': {
-                    'type': 'string',
-                    'description': 'Explain which data you saw in the previous analysis results that informs this step.',
-                },
-            },
-            'required': ['expression', 'point', 'explanation', 'explanation_source'],
-        },
-    ),
-)
-
-# ---------------------------------------------------------
-# Tool: inspect-point
-# ---------------------------------------------------------
-_REPLAY_INSPECT_POINT_DESCRIPTION = """
-Explains dynamic control flow and data flow dependencies of the code at `point`.
-Use this tool instead of `inspect-data` only when you don't have a specific data point to investigate.
-"""
-
-ReplayInspectPointTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='inspect-point',
-        description=_REPLAY_INSPECT_POINT_DESCRIPTION.strip(),
-        parameters={
-            'type': 'object',
-            'properties': {
-                'point': {'type': 'string'},
-            },
-            'required': ['point'],
-        },
-    ),
-)
-
-# ---------------------------------------------------------
-# Tool: SubmitHypothesis
-# TODO: Divide this into multiple steps -
-#   1. The first submission must be as simple as possible to take little computational effort from the analysis steps.
-#   2. The second submission, after analysis has already concluded, must be as complete as possible.
-# ---------------------------------------------------------
-# _REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
-# Your investigation has yielded a complete thin slice from symptom to root cause,
-# enough proof to let the `CodeEdit` agent take over to fix the bug.
-# DO NOT GUESS. You must provide exact code in the exact right location to fix this bug,
-# based on evidence you have gathered.
-# """
-
-# ReplaySubmitHypothesisTool = ChatCompletionToolParam(
-#     type='function',
-#     function=ChatCompletionToolParamFunctionChunk(
-#         name='submit-hypothesis',
-#         description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
-#         parameters={
-#             'type': 'object',
-#             'properties': {
-#                 'rootCauseHypothesis': {'type': 'string'},
-#                 'thinSlice': {
-#                     'type': 'array',
-#                     'items': {
-#                         'type': 'object',
-#                         'properties': {
-#                             'point': {'type': 'string'},
-#                             'code': {'type': 'string'},
-#                             'role': {'type': 'string'},
-#                         },
-#                         'required': ['point', 'code', 'role'],
-#                     },
-#                 },
-#                 'modifications': {
-#                     'type': 'array',
-#                     'items': {
-#                         'type': 'object',
-#                         'properties': {
-#                             'kind': {
-#                                 'type': 'string',
-#                                 'enum': ['add', 'remove', 'modify'],
-#                             },
-#                             'newCode': {'type': 'string'},
-#                             'oldCode': {'type': 'string'},
-#                             'location': {'type': 'string'},
-#                             'point': {'type': 'string'},
-#                             # NOTE: Even though, we really want the `line` here, it will lead to much worse performance because the agent has a hard time computing correct line numbers from its point-based investigation.
-#                             # Instead of requiring a line number, the final fix will be more involved, as explained in the issue.
-#                             # see: https://linear.app/replay/issue/PRO-939/use-tools-data-flow-analysis-for-10608#comment-3b7ae176
-#                             # 'line': {'type': 'number'},
-#                             'briefExplanation': {'type': 'string'},
-#                             'verificationProof': {'type': 'string'},
-#                         },
-#                         'required': [
-#                             'kind',
-#                             'location',
-#                             'briefExplanation',
-#                             # 'line',
-#                             'verificationProof',
-#                         ],
-#                     },
-#                 },
-#             },
-#             'required': ['rootCauseHypothesis', 'thinSlice', 'modifications'],
-#         },
-#     ),
-# )
-_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
-# Use this tool to conclude your analysis and move on to code editing.
-# """
-
-ReplaySubmitHypothesisTool = ChatCompletionToolParam(
-    type='function',
-    function=ChatCompletionToolParamFunctionChunk(
-        name='submit-hypothesis',
-        description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
-        parameters={
-            'type': 'object',
-            'properties': {
-                'problem': {
-                    'type': 'string',
-                    'description': 'One-sentence explanation of the core problem that this will solve.',
-                },
-                'rootCauseHypothesis': {'type': 'string'},
-                'editSuggestions': {
-                    'type': 'string',
-                    'description': 'Provide suggestions to fix the bug, if you know enough about the code that requires modification.',
-                },
-            },
-            'required': ['rootCauseHypothesis'],
-        },
-    ),
+from openhands.replay.replay_tools import (
+    get_replay_tools,
+    handle_replay_tool_call,
+    is_replay_tool,
 )
 
-REPLAY_TOOLS = ['inspect-data', 'inspect-point', 'submit-hypothesis']
-
-
 # ---------------------------------------------------------
 # OH default tools.
 # ---------------------------------------------------------
@@ -631,36 +473,8 @@ def response_to_actions(response: ModelResponse, state: State) -> list[Action]:
                 ) from e
             if tool_call.function.name == 'execute_bash':
                 action = CmdRunAction(**arguments)
-            elif tool_call.function.name in REPLAY_TOOLS:
-                logger.info(
-                    f'[REPLAY] TOOL_CALL {tool_call.function.name} - arguments: {json.dumps(arguments, indent=2)}'
-                )
-                if tool_call.function.name == 'inspect-data':
-                    # Remove explanation props.
-                    arguments = {
-                        k: v for k, v in arguments.items() if 'explanation' not in k
-                    }
-                    action = ReplayToolCmdRunAction(
-                        command_name='inspect-data',
-                        command_args=arguments
-                        | {'recordingId': state.replay_recording_id},
-                    )
-                elif tool_call.function.name == 'inspect-point':
-                    # if arguments['expression'] == 'wiredRules':   # hackfix for 10608 experiment
-                    #     raise FunctionCallValidationError(f'wiredRules is irrelevant to the problem. Try something else.')
-                    action = ReplayToolCmdRunAction(
-                        command_name='inspect-point',
-                        command_args=arguments
-                        | {'recordingId': state.replay_recording_id},
-                    )
-                elif tool_call.function.name == 'submit-hypothesis':
-                    action = ReplayPhaseUpdateAction(
-                        new_phase=ReplayDebuggingPhase.Edit, info=json.dumps(arguments)
-                    )
-                else:
-                    raise ValueError(
-                        f'Unknown Replay tool. Make sure to add them all to REPLAY_TOOLS: {tool_call.function.name}'
-                    )
+            elif is_replay_tool(tool_call.function.name):
+                action = handle_replay_tool_call(tool_call, arguments, state)
             elif tool_call.function.name == 'execute_ipython_cell':
                 action = IPythonRunCellAction(**arguments)
             elif tool_call.function.name == 'delegate_to_browsing_agent':
@@ -727,31 +541,18 @@ def get_tools(
     codeact_enable_llm_editor: bool = False,
     codeact_enable_jupyter: bool = False,
     codeact_enable_replay: bool = False,
-    codeact_replay_phase: ReplayDebuggingPhase = ReplayDebuggingPhase.Normal,
+    replay_phase: ReplayDebuggingPhase = ReplayDebuggingPhase.Normal,
 ) -> list[ChatCompletionToolParam]:
     default_tools = get_default_tools(
         codeact_enable_browsing,
         codeact_enable_llm_editor,
         codeact_enable_jupyter,
     )
-    if not codeact_enable_replay or codeact_replay_phase == ReplayDebuggingPhase.Normal:
+    if not codeact_enable_replay or replay_phase == ReplayDebuggingPhase.Normal:
         # Use the default tools when not in a Replay-specific phase.
         return default_tools
 
     if codeact_enable_replay:
-        analysis_tools = [
-            ReplayInspectDataTool,
-            ReplayInspectPointTool,
-        ]
-        if codeact_replay_phase == ReplayDebuggingPhase.Analysis:
-            # Analysis tools only. This phase is concluded upon submit-hypothesis.
-            tools = analysis_tools + [ReplaySubmitHypothesisTool]
-        elif codeact_replay_phase == ReplayDebuggingPhase.Edit:
-            # Combine default and analysis tools.
-            tools = default_tools + analysis_tools
-        else:
-            raise ValueError(
-                f'Unhandled ReplayDebuggingPhase in get_tools: {codeact_replay_phase}'
-            )
+        tools = get_replay_tools(replay_phase, default_tools)
 
     return tools
diff --git a/openhands/events/action/replay.py b/openhands/events/action/replay.py
index 0d0c2d0983cc..885b8f8eab1f 100644
--- a/openhands/events/action/replay.py
+++ b/openhands/events/action/replay.py
@@ -11,9 +11,14 @@
 )
 
 
+@dataclass
+class ReplayAction(Action):
+    pass
+
+
 # NOTE: We need the same class twice because a lot of the agent logic is based on isinstance checks.
 @dataclass
-class ReplayCmdRunActionBase(Action):
+class ReplayCmdRunActionBase(ReplayAction):
     # Name of the command in @replayapi/cli.
     command_name: str
 
@@ -62,7 +67,7 @@ class ReplayToolCmdRunAction(ReplayCmdRunActionBase):
 
 
 @dataclass
-class ReplayPhaseUpdateAction(Action):
+class ReplayPhaseUpdateAction(ReplayAction):
     new_phase: ReplayDebuggingPhase
 
     thought: str = ''
diff --git a/openhands/replay/replay_commands.py b/openhands/replay/replay_initial_analysis.py
similarity index 82%
rename from openhands/replay/replay_commands.py
rename to openhands/replay/replay_initial_analysis.py
index a5c037a69ff1..fe6fa1c6fa40 100644
--- a/openhands/replay/replay_commands.py
+++ b/openhands/replay/replay_initial_analysis.py
@@ -1,6 +1,6 @@
 import json
 import re
-from typing import Any, cast
+from typing import Any, Tuple, cast
 
 from openhands.controller.state.state import State
 from openhands.core.logger import openhands_logger as logger
@@ -9,7 +9,7 @@
 from openhands.events.action.replay import ReplayInternalCmdRunAction
 from openhands.events.observation.replay import ReplayInternalCmdOutputObservation
 from openhands.replay.replay_prompts import replay_prompt_phase_analysis
-from openhands.replay.replay_types import AnalysisToolMetadata, AnnotateResult
+from openhands.replay.replay_types import AnalysisToolMetadata
 
 
 def scan_recording_id(issue: str) -> str | None:
@@ -72,20 +72,23 @@ def safe_parse_json(text: str) -> dict[str, Any] | None:
         return None
 
 
-def split_metadata(result):
+def split_metadata(result: dict) -> Tuple[AnalysisToolMetadata, dict]:
     if 'metadata' not in result:
         return {}, result
-    metadata = result['metadata']
+    metadata = cast(AnalysisToolMetadata, result['metadata'])
     data = dict(result)
     del data['metadata']
     return metadata, data
 
 
-def handle_replay_internal_command_observation(
+def on_replay_internal_command_observation(
     state: State, observation: ReplayInternalCmdOutputObservation
 ) -> AnalysisToolMetadata | None:
     """
-    Enhance the user prompt with the results of the replay analysis.
+    Handle result for an internally sent command (not agent tool use or user action).
+
+    NOTE: Currently, the only internal command is the initial-analysis command.
+    Enhance the user prompt with the results of the initial analysis.
     Returns the metadata needed for the agent to switch to analysis tools.
     """
     enhance_action_id = state.extra_data.get('replay_enhance_prompt_id')
@@ -103,19 +106,19 @@ def handle_replay_internal_command_observation(
         state.extra_data['replay_enhance_observed'] = True
 
         # Deserialize stringified result.
-        result: AnnotateResult = cast(
-            AnnotateResult, safe_parse_json(observation.content)
-        )
+        result = safe_parse_json(observation.content)
 
         # Get metadata and enhance prompt.
         if result and 'metadata' in result:
             # initial-analysis provides metadata needed for tool use.
             metadata, command_result = split_metadata(result)
-            replay_prompt_phase_analysis(command_result, user_message)
+            user_message.content = replay_prompt_phase_analysis(
+                command_result, user_message.content
+            )
             return metadata
         else:
             logger.warning(
-                f'[REPLAY] Replay command result cannot be interpreted. Observed content: {str(observation.content)}'
+                f'[REPLAY] Replay command result missing metadata. Observed content: {str(observation.content)}'
             )
 
     return None
diff --git a/openhands/replay/replay_prompts.py b/openhands/replay/replay_prompts.py
index c022249971b4..860c3ed3b155 100644
--- a/openhands/replay/replay_prompts.py
+++ b/openhands/replay/replay_prompts.py
@@ -1,19 +1,19 @@
 import json
 
 from openhands.core.logger import openhands_logger as logger
-from openhands.events.action.message import MessageAction
-from openhands.replay.replay_types import AnnotateResult
+from openhands.events.observation.replay import ReplayPhaseUpdateObservation
 
 
-def enhance_prompt(user_message: MessageAction, prefix: str, suffix: str):
+def enhance_prompt(prompt: str, prefix: str, suffix: str) -> str:
     if prefix != '':
-        user_message.content = f'{prefix}\n\n{user_message.content}'
+        prompt = f'{prefix}\n\n{prompt}'
     if suffix != '':
-        user_message.content = f'{user_message.content}\n\n{suffix}'
-    logger.info(f'[REPLAY] Enhanced user prompt:\n{user_message.content}')
+        prompt = f'{prompt}\n\n{suffix}'
+    logger.info(f'[REPLAY] Enhanced prompt:\n{prompt}')
+    return prompt
 
 
-def replay_prompt_phase_analysis(command_result: dict, user_message: MessageAction):
+def replay_prompt_phase_analysis(command_result: dict, prompt: str) -> str:
     prefix = ''
     suffix = """
 # Instructions
@@ -26,12 +26,10 @@ def replay_prompt_phase_analysis(command_result: dict, user_message: MessageActi
 
 # Initial Analysis
 """ + json.dumps(command_result, indent=2)
-    return enhance_prompt(user_message, prefix, suffix)
+    return enhance_prompt(prompt, prefix, suffix)
 
 
-def replay_prompt_phase_analysis_legacy(
-    command_result: AnnotateResult, user_message: MessageAction
-):
+def replay_prompt_phase_analysis_legacy(command_result: dict, prompt: str) -> str:
     # Old workflow: initial-analysis left hints in form of source code annotations.
     annotated_repo_path = command_result.get('annotatedRepo', '')
     comment_text = command_result.get('commentText', '')
@@ -61,10 +59,10 @@ def replay_prompt_phase_analysis_legacy(
 
     suffix = ''
 
-    return enhance_prompt(user_message, prefix, suffix)
+    return enhance_prompt(prompt, prefix, suffix)
 
 
-def replay_prompt_phase_edit():
+def replay_prompt_phase_edit(obs: ReplayPhaseUpdateObservation) -> str:
     # Tell the agent to stop analyzing and start editing:
     return """
 You have concluded the analysis.
diff --git a/openhands/replay/replay_state_machine.py b/openhands/replay/replay_state_machine.py
index 68f45b93be25..e8a9fab4b643 100644
--- a/openhands/replay/replay_state_machine.py
+++ b/openhands/replay/replay_state_machine.py
@@ -10,7 +10,9 @@
     ReplayToolCmdOutputObservation,
 )
 from openhands.events.serialization.event import truncate_content
-from openhands.replay.replay_commands import handle_replay_internal_command_observation
+from openhands.replay.replay_initial_analysis import (
+    on_replay_internal_command_observation,
+)
 from openhands.replay.replay_prompts import replay_prompt_phase_edit
 
 
@@ -18,7 +20,7 @@ def on_replay_observation(obs: ReplayObservation, state: State, agent: Agent) ->
     """Handle the observation."""
     if isinstance(obs, ReplayInternalCmdOutputObservation):
         # NOTE: Currently, the only internal command is the initial-analysis command.
-        analysis_tool_metadata = handle_replay_internal_command_observation(state, obs)
+        analysis_tool_metadata = on_replay_internal_command_observation(state, obs)
         if analysis_tool_metadata:
             # Start analysis phase
             state.replay_recording_id = analysis_tool_metadata['recordingId']
@@ -52,10 +54,10 @@ def get_replay_observation_message(
     elif isinstance(obs, ReplayPhaseUpdateObservation):
         new_phase = obs.new_phase
         if new_phase == ReplayDebuggingPhase.Edit:
-            text = replay_prompt_phase_edit()
-            message = Message(role='user', content=[TextContent(text=text)])
+            text = replay_prompt_phase_edit(obs)
         else:
             raise NotImplementedError(f'Unhandled ReplayPhaseUpdateAction: {new_phase}')
+        message = Message(role='user', content=[TextContent(text=text)])
     else:
         raise NotImplementedError(
             f"Unhandled observation type: {obs.__class__.__name__} ({getattr(obs, 'observation', None)})"
diff --git a/openhands/replay/replay_tools.py b/openhands/replay/replay_tools.py
new file mode 100644
index 000000000000..820e98efb0ca
--- /dev/null
+++ b/openhands/replay/replay_tools.py
@@ -0,0 +1,236 @@
+"""Replay tool definitions and the function calling implementation for them.
+
+This is similar to the functionality of `CodeActResponseParser`.
+"""
+
+import json
+
+from litellm import (
+    ChatCompletionMessageToolCall,
+    ChatCompletionToolParam,
+    ChatCompletionToolParamFunctionChunk,
+)
+
+from openhands.controller.state.state import State
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.schema.replay import ReplayDebuggingPhase
+from openhands.events.action.replay import (
+    ReplayAction,
+    ReplayPhaseUpdateAction,
+    ReplayToolCmdRunAction,
+)
+
+
+class ReplayTool(ChatCompletionToolParam):
+    """A `ChatCompletionToolParam` that describes a Replay-provided tool."""
+
+
+def replay_tool(**kwargs):
+    function_chunk = ChatCompletionToolParamFunctionChunk(**kwargs)
+    return ReplayTool(type='function', function=function_chunk)
+
+
+# ---------------------------------------------------------
+# Tool: inspect-data
+# ---------------------------------------------------------
+_REPLAY_INSPECT_DATA_DESCRIPTION = """
+Explains value, data flow and origin information for `expression` at `point`.
+IMPORTANT: Prefer using inspect-data over inspect-point.
+"""
+
+ReplayInspectDataTool = replay_tool(
+    name='inspect-data',
+    description=_REPLAY_INSPECT_DATA_DESCRIPTION.strip(),
+    parameters={
+        'type': 'object',
+        'properties': {
+            'expression': {
+                'type': 'string',
+                'description': 'A valid JS expression. IMPORTANT: First pick the best expression. If the expression is an object: Prefer "array[0]" over "array" and "o.x" over "o" to get closer to the origin and creation site of important data points. Prefer nested object over primitive expressions.',
+            },
+            'point': {
+                'type': 'string',
+                'description': 'The point at which to inspect the runtime. The first point comes from the `thisPoint` in the Initial analysis.',
+            },
+            'explanation': {
+                'type': 'string',
+                'description': 'Give a concise explanation as to why you take this investigative step.',
+            },
+            'explanation_source': {
+                'type': 'string',
+                'description': 'Explain which data you saw in the previous analysis results that informs this step.',
+            },
+        },
+        'required': ['expression', 'point', 'explanation', 'explanation_source'],
+    },
+)
+
+# ---------------------------------------------------------
+# Tool: inspect-point
+# ---------------------------------------------------------
+_REPLAY_INSPECT_POINT_DESCRIPTION = """
+Explains dynamic control flow and data flow dependencies of the code at `point`.
+Use this tool instead of `inspect-data` only when you don't have a specific data point to investigate.
+"""
+
+ReplayInspectPointTool = replay_tool(
+    name='inspect-point',
+    description=_REPLAY_INSPECT_POINT_DESCRIPTION.strip(),
+    parameters={
+        'type': 'object',
+        'properties': {
+            'point': {'type': 'string'},
+        },
+        'required': ['point'],
+    },
+)
+
+# ---------------------------------------------------------
+# Tool: SubmitHypothesis
+# TODO: Divide this into multiple steps -
+#   1. The first submission must be as simple as possible to take little computational effort from the analysis steps.
+#   2. The second submission, after analysis has already concluded, must be as complete as possible.
+# ---------------------------------------------------------
+# _REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
+# Your investigation has yielded a complete thin slice from symptom to root cause,
+# enough proof to let the `CodeEdit` agent take over to fix the bug.
+# DO NOT GUESS. You must provide exact code in the exact right location to fix this bug,
+# based on evidence you have gathered.
+# """
+
+# ReplaySubmitHypothesisTool = ReplayToolDefinition(
+#     name='submit-hypothesis',
+#     description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
+#     parameters={
+#         'type': 'object',
+#         'properties': {
+#             'rootCauseHypothesis': {'type': 'string'},
+#             'thinSlice': {
+#                 'type': 'array',
+#                 'items': {
+#                     'type': 'object',
+#                     'properties': {
+#                         'point': {'type': 'string'},
+#                         'code': {'type': 'string'},
+#                         'role': {'type': 'string'},
+#                     },
+#                     'required': ['point', 'code', 'role'],
+#                 },
+#             },
+#             'modifications': {
+#                 'type': 'array',
+#                 'items': {
+#                     'type': 'object',
+#                     'properties': {
+#                         'kind': {
+#                             'type': 'string',
+#                             'enum': ['add', 'remove', 'modify'],
+#                         },
+#                         'newCode': {'type': 'string'},
+#                         'oldCode': {'type': 'string'},
+#                         'location': {'type': 'string'},
+#                         'point': {'type': 'string'},
+#                         # NOTE: Even though, we really want the `line` here, it will lead to much worse performance because the agent has a hard time computing correct line numbers from its point-based investigation.
+#                         # Instead of requiring a line number, the final fix will be more involved, as explained in the issue.
+#                         # see: https://linear.app/replay/issue/PRO-939/use-tools-data-flow-analysis-for-10608#comment-3b7ae176
+#                         # 'line': {'type': 'number'},
+#                         'briefExplanation': {'type': 'string'},
+#                         'verificationProof': {'type': 'string'},
+#                     },
+#                     'required': [
+#                         'kind',
+#                         'location',
+#                         'briefExplanation',
+#                         # 'line',
+#                         'verificationProof',
+#                     ],
+#                 },
+#             },
+#         },
+#         'required': ['rootCauseHypothesis', 'thinSlice', 'modifications'],
+#     },
+# )
+_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
+Use this tool to conclude your analysis and move on to code editing.
+"""
+
+ReplaySubmitHypothesisTool = replay_tool(
+    name='submit-hypothesis',
+    description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
+    parameters={
+        'type': 'object',
+        'properties': {
+            'problem': {
+                'type': 'string',
+                'description': 'One-sentence explanation of the core problem that this will solve.',
+            },
+            'rootCauseHypothesis': {'type': 'string'},
+            'editSuggestions': {
+                'type': 'string',
+                'description': 'Provide suggestions to fix the bug, if you know enough about the code that requires modification.',
+            },
+        },
+        'required': ['rootCauseHypothesis'],
+    },
+)
+
+# All tool definitions Replay exposes to the LLM.
+replay_tools: list[ReplayTool] = [
+    ReplayInspectDataTool, ReplayInspectPointTool, ReplaySubmitHypothesisTool
+]
+# Tools are TypedDicts (plain dicts at runtime): use key access, not attributes.
+replay_tool_names: set[str] = {t['function']['name'] for t in replay_tools}
+
+
+def is_replay_tool(tool_name: str) -> bool:
+    return tool_name in replay_tool_names
+
+
+def handle_replay_tool_call(
+    tool_call: ChatCompletionMessageToolCall, arguments: dict, state: State
+) -> ReplayAction:
+    """Translate a Replay tool call from the LLM into the matching action.
+
+    Raises ValueError when no handler exists for the tool name.
+    """
+    name = tool_call.function.name
+    logger.info(
+        f'[REPLAY] TOOL_CALL {name} - arguments: {json.dumps(arguments, indent=2)}'
+    )
+    if name == 'inspect-data':
+        # Remove explanation props.
+        arguments = {k: v for k, v in arguments.items() if 'explanation' not in k}
+        return ReplayToolCmdRunAction(
+            command_name='inspect-data',
+            command_args=arguments | {'recordingId': state.replay_recording_id},
+        )
+    if name == 'inspect-point':
+        return ReplayToolCmdRunAction(
+            command_name='inspect-point',
+            command_args=arguments | {'recordingId': state.replay_recording_id},
+        )
+    if name == 'submit-hypothesis':
+        return ReplayPhaseUpdateAction(
+            new_phase=ReplayDebuggingPhase.Edit, info=json.dumps(arguments)
+        )
+    raise ValueError(
+        f'Unknown Replay tool. Make sure to add them all to replay_tools: {name}'
+    )
+
+
+def get_replay_tools(
+    replay_phase: ReplayDebuggingPhase, default_tools: list[ChatCompletionToolParam]
+) -> list[ChatCompletionToolParam]:
+    """Select the tools offered to the agent in the given Replay phase.
+
+    Analysis: inspect tools plus submit-hypothesis. Edit: `default_tools` plus
+    the inspect tools. Any other phase raises ValueError.
+    """
+    inspect_tools = [ReplayInspectDataTool, ReplayInspectPointTool]
+    if replay_phase == ReplayDebuggingPhase.Analysis:
+        # Analysis tools only. This phase is concluded upon submit-hypothesis.
+        return inspect_tools + [ReplaySubmitHypothesisTool]
+    if replay_phase == ReplayDebuggingPhase.Edit:
+        # Combine default and analysis tools.
+        return default_tools + inspect_tools
+    raise ValueError(f'Unhandled ReplayDebuggingPhase in get_tools: {replay_phase}')
diff --git a/openhands/replay/replay_types.py b/openhands/replay/replay_types.py
index 2628da9e414f..213bc6d8ded0 100644
--- a/openhands/replay/replay_types.py
+++ b/openhands/replay/replay_types.py
@@ -8,12 +8,3 @@ class AnnotatedLocation(TypedDict, total=False):
 
 class AnalysisToolMetadata(TypedDict, total=False):
     recordingId: str
-
-
-class AnnotateResult(TypedDict, total=False):
-    point: str
-    commentText: str | None
-    annotatedRepo: str | None
-    annotatedLocations: list[AnnotatedLocation] | None
-    pointLocation: str | None
-    metadata: AnalysisToolMetadata | None