Added bolt samples (#15)

Domiii · web-flow · commit bef88b0de874 · 2025-01-19T03:04:23.000+08:00
diff --git a/openhands/agenthub/codeact_agent/codeact_agent.py b/openhands/agenthub/codeact_agent/codeact_agent.py
@@ -1,4 +1,3 @@
-import json
 import os
 from collections import deque
 
@@ -269,7 +268,7 @@ def get_observation_message(
             new_phase = obs.new_phase
             if new_phase == ReplayDebuggingPhase.Edit:
                 # Tell the agent to stop analyzing and start editing:
-                text = "You have concluded the analysis. Review, then implement the hypothesized changes using the edit tools available to you. The code is available in the workspace. Don't stop. Fix the bug."
+                text = "You have concluded the analysis.\nIMPORTANT: NOW review, then implement the hypothesized changes using tools. The code is available in the workspace.\nIMPORTANT: Don't stop. Fix the bug.\nIMPORTANT: Don't stop. Fix the bug."
                 message = Message(role='user', content=[TextContent(text=text)])
             else:
                 raise NotImplementedError(
@@ -348,7 +347,8 @@ def replay_phase_changed(self, phase: ReplayDebuggingPhase) -> None:
             codeact_replay_phase=phase,
         )
         logger.debug(
-            f'[REPLAY] CodeActAgent.replay_phase_changed({phase}). New tools: {json.dumps(self.tools, indent=2)}'
+            f'[REPLAY] CodeActAgent.replay_phase_changed({phase}).'
+            # f'New tools: {json.dumps(self.tools, indent=2)}'
         )
 
     def step(self, state: State) -> Action:
diff --git a/openhands/agenthub/codeact_agent/function_calling.py b/openhands/agenthub/codeact_agent/function_calling.py
@@ -95,13 +95,75 @@
 
 # ---------------------------------------------------------
 # Tool: SubmitHypothesis
+# TODO: Split this into two submissions:
+#   1. The first submission must be as simple as possible, so that it takes little computational effort away from the analysis steps.
+#   2. The second submission, made after the analysis has concluded, must be as complete as possible.
 # ---------------------------------------------------------
+# _REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
+# Your investigation has yielded a complete thin slice from symptom to root cause,
+# enough proof to let the `CodeEdit` agent take over to fix the bug.
+# DO NOT GUESS. You must provide exact code in the exact right location to fix this bug,
+# based on evidence you have gathered.
+# """
+
+# ReplaySubmitHypothesisTool = ChatCompletionToolParam(
+#     type='function',
+#     function=ChatCompletionToolParamFunctionChunk(
+#         name='submit-hypothesis',
+#         description=_REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION.strip(),
+#         parameters={
+#             'type': 'object',
+#             'properties': {
+#                 'rootCauseHypothesis': {'type': 'string'},
+#                 'thinSlice': {
+#                     'type': 'array',
+#                     'items': {
+#                         'type': 'object',
+#                         'properties': {
+#                             'point': {'type': 'string'},
+#                             'code': {'type': 'string'},
+#                             'role': {'type': 'string'},
+#                         },
+#                         'required': ['point', 'code', 'role'],
+#                     },
+#                 },
+#                 'modifications': {
+#                     'type': 'array',
+#                     'items': {
+#                         'type': 'object',
+#                         'properties': {
+#                             'kind': {
+#                                 'type': 'string',
+#                                 'enum': ['add', 'remove', 'modify'],
+#                             },
+#                             'newCode': {'type': 'string'},
+#                             'oldCode': {'type': 'string'},
+#                             'location': {'type': 'string'},
+#                             'point': {'type': 'string'},
+#                             # NOTE: Even though we really want the `line` here, requiring it leads to much worse performance because the agent has a hard time computing correct line numbers from its point-based investigation.
+#                             # Instead of requiring a line number, the final fix will be more involved, as explained in the issue.
+#                             # see: https://linear.app/replay/issue/PRO-939/use-tools-data-flow-analysis-for-10608#comment-3b7ae176
+#                             # 'line': {'type': 'number'},
+#                             'briefExplanation': {'type': 'string'},
+#                             'verificationProof': {'type': 'string'},
+#                         },
+#                         'required': [
+#                             'kind',
+#                             'location',
+#                             'briefExplanation',
+#                             # 'line',
+#                             'verificationProof',
+#                         ],
+#                     },
+#                 },
+#             },
+#             'required': ['rootCauseHypothesis', 'thinSlice', 'modifications'],
+#         },
+#     ),
+# )
 _REPLAY_SUBMIT_HYPOTHESIS_DESCRIPTION = """
-Your investigation has yielded a complete thin slice from symptom to root cause,
-enough proof to let the `CodeEdit` agent take over to fix the bug.
-DO NOT GUESS. You must provide exact code in the exact right location to fix this bug,
-based on evidence you have gathered.
-"""
+Use this tool to conclude your analysis and move on to code editing.
+"""
 
 ReplaySubmitHypothesisTool = ChatCompletionToolParam(
     type='function',
@@ -112,49 +174,12 @@
             'type': 'object',
             'properties': {
                 'rootCauseHypothesis': {'type': 'string'},
-                'thinSlice': {
-                    'type': 'array',
-                    'items': {
-                        'type': 'object',
-                        'properties': {
-                            'point': {'type': 'string'},
-                            'code': {'type': 'string'},
-                            'role': {'type': 'string'},
-                        },
-                        'required': ['point', 'code', 'role'],
-                    },
-                },
-                'modifications': {
-                    'type': 'array',
-                    'items': {
-                        'type': 'object',
-                        'properties': {
-                            'kind': {
-                                'type': 'string',
-                                'enum': ['add', 'remove', 'modify'],
-                            },
-                            'newCode': {'type': 'string'},
-                            'oldCode': {'type': 'string'},
-                            'location': {'type': 'string'},
-                            'point': {'type': 'string'},
-                            # NOTE: Even though, we really want the `line` here, it will lead to much worse performance because the agent has a hard time computing correct line numbers from its point-based investigation.
-                            # Instead of requiring a line number, the final fix will be more involved, as explained in the issue.
-                            # see: https://linear.app/replay/issue/PRO-939/use-tools-data-flow-analysis-for-10608#comment-3b7ae176
-                            # 'line': {'type': 'number'},
-                            'briefExplanation': {'type': 'string'},
-                            'verificationProof': {'type': 'string'},
-                        },
-                        'required': [
-                            'kind',
-                            'location',
-                            'briefExplanation',
-                            # 'line',
-                            'verificationProof',
-                        ],
-                    },
+                'editSuggestions': {
+                    'type': 'string',
+                    'description': 'Provide suggestions to fix the bug, if you know enough about the code that requires modification.',
                 },
             },
-            'required': ['rootCauseHypothesis', 'thinSlice', 'modifications'],
+            'required': ['rootCauseHypothesis'],
         },
     ),
 )
@@ -626,7 +651,7 @@ def response_to_actions(response: ModelResponse, state: State) -> list[Action]:
                     )
                 elif tool_call.function.name == 'submit-hypothesis':
                     action = ReplayPhaseUpdateAction(
-                        new_phase=ReplayDebuggingPhase.Edit
+                        new_phase=ReplayDebuggingPhase.Edit, info=json.dumps(arguments)
                     )
                 else:
                     raise ValueError(
@@ -713,11 +738,10 @@ def get_tools(
         analysis_tools = [
             ReplayInspectDataTool,
             ReplayInspectPointTool,
-            ReplaySubmitHypothesisTool,
         ]
         if codeact_replay_phase == ReplayDebuggingPhase.Analysis:
             # Analysis tools only. This phase is concluded upon submit-hypothesis.
-            tools = analysis_tools
+            tools = analysis_tools + [ReplaySubmitHypothesisTool]
         elif codeact_replay_phase == ReplayDebuggingPhase.Edit:
             # Combine default and analysis tools.
             tools = default_tools + analysis_tools
diff --git a/openhands/core/main.py b/openhands/core/main.py
@@ -284,5 +284,6 @@ def generate_sid(config: AppConfig, session_name: str | None = None) -> str:
             config=config,
             initial_user_action=initial_user_action,
             sid=sid,
+            exit_on_message=True,
         )
     )
diff --git a/openhands/events/action/replay.py b/openhands/events/action/replay.py
@@ -66,6 +66,7 @@ class ReplayPhaseUpdateAction(Action):
     new_phase: ReplayDebuggingPhase
 
     thought: str = ''
+    info: str = ''
 
     action: str = ActionType.REPLAY_UPDATE_PHASE
     runnable: ClassVar[bool] = True
@@ -77,5 +78,5 @@ def message(self) -> str:
         return f'{self.__class__.__name__}: {self.new_phase}'
 
     def __str__(self) -> str:
-        ret = f'{self.message}'
+        ret = f'[{self.message}] {self.info}'
         return ret
diff --git a/openhands/events/replay.py b/openhands/events/replay.py
@@ -28,15 +28,6 @@ def command_annotate_execution_points(
     thought: str, is_workspace_repo: bool
 ) -> ReplayInternalCmdRunAction:
     command_input: dict[str, Any] = dict()
-    if is_workspace_repo:
-        # NOTE: In the resolver workflow, the workdir path is equal to the repo path:
-        #    1. We should not append the repo name to the path.
-        #    2. The resolver also already hard-reset the repo, so forceDelete is not necessary.
-        command_input['isWorkspaceRepoPath'] = True
-        command_input['forceDelete'] = False
-    else:
-        command_input['isWorkspaceRepoPath'] = False
-        command_input['forceDelete'] = True
     command_input['prompt'] = thought
 
     action = ReplayInternalCmdRunAction(
diff --git a/openhands/runtime/base.py b/openhands/runtime/base.py
@@ -140,10 +140,14 @@ def setup_initial_env(self) -> None:
         if self.config.sandbox.runtime_startup_env_vars:
             self.add_env_vars(self.config.sandbox.runtime_startup_env_vars)
 
-        logger.debug('Maybe adding replay env vars')
         if self.config.replay.api_key:
-            self.add_env_vars({'REPLAY_API_KEY': self.config.replay.api_key})
-            logger.debug('Added REPLAY_API_KEY to environment')
+            self.add_env_vars(
+                {
+                    'REPLAY_API_KEY': self.config.replay.api_key,
+                    'REPLAY_DEV_MODE': os.environ.get('REPLAY_DEV_MODE', ''),
+                    'REPLAY_ENABLE_TOOL_CACHE': os.environ.get('REPLAY_ENABLE_TOOL_CACHE', ''),
+                }
+            )
         if self.config.replay.dir:
             self.add_env_vars({'REPLAY_DIR': self.config.replay.dir})
             logger.debug('Added REPLAY_DIR to environment')
diff --git a/openhands/runtime/replay/replay_cli.py b/openhands/runtime/replay/replay_cli.py
@@ -30,9 +30,9 @@ async def run_action(
             command_args['recordingId'] = action.recording_id
         if action.session_id != '':
             command_args['sessionId'] = action.session_id
-        if action.command_name == 'initial-analysis':
-            # Hardcode the path for now. We won't need it in the long run.
-            command_args['workspacePath'] = self.bash_session.workdir
+        # if action.command_name == 'initial-analysis':
+        #     # Hardcode the path for now. We won't need it in the long run.
+        #     command_args['workspacePath'] = self.bash_session.workdir
 
         with (
             tempfile.NamedTemporaryFile(
diff --git a/replay_benchmarks/bolt/945/prompt.md b/replay_benchmarks/bolt/945/prompt.md
@@ -0,0 +1,3 @@
+The deviations from the median for the total network data transferred are rendered incorrectly.  They render as 0.0% when they should be something else.
+
+Bug recording: https://app.replay.io/recording/replay-of-localhost8040--3e0a8f68-14e6-4809-bc72-dea0e0374c77?&point=25312447185420620431941338721681417
diff --git a/replay_benchmarks/bolt/945/source_code.zip b/replay_benchmarks/bolt/945/source_code.zip
diff --git a/replay_benchmarks/bolt/946/prompt.md b/replay_benchmarks/bolt/946/prompt.md
@@ -0,0 +1,8 @@
+The following bug has been discovered in a web app inside a browser.
+NOTE: Networking errors might be caused by CORS problems. Their fix usually requires server middleware.
+
+This web app should render the results of a single prompt provided to two different models.
+The UI renders: `anthropic API error: Connection error`
+Fix it.
+
+Bug recording: https://app.replay.io/recording/replay-of-localhost8040--f9dd7902-5188-45d2-9d2a-aa4d2e674592?point=29531188383192074852669071103623174&time=13949.366224168605&viewMode=dev
diff --git a/replay_benchmarks/bolt/946/source_code.zip b/replay_benchmarks/bolt/946/source_code.zip
diff --git a/replay_benchmarks/bolt/951/prompt.md b/replay_benchmarks/bolt/951/prompt.md
@@ -0,0 +1,3 @@
+The React app renders a blank screen.
+
+Bug recording: https://app.replay.io/recording/replay-of-localhost8040--43a890bc-6f37-47e0-ba47-4d04827e4e44
diff --git a/replay_benchmarks/bolt/951/source_code.zip b/replay_benchmarks/bolt/951/source_code.zip
diff --git a/replay_benchmarks/bolt/run-bolt.sh b/replay_benchmarks/bolt/run-bolt.sh
@@ -0,0 +1,107 @@
+#!/usr/bin/env bash
+# Copyright 2020-2025 Record Replay Inc.
+#
+# Runs `openhands.core.main` on a single bolt benchmark instance.
+#
+# Usage: run-bolt.sh <instance-id> [prompt-name]
+#   <instance-id>  Name of a sub-folder next to this script.
+#   [prompt-name]  Basename (without ".md") of the prompt file inside the
+#                  instance folder. Defaults to "prompt".
+#
+# Environment:
+#   TMP_DIR                          Scratch root; defaults to /tmp.
+#   LLM_API_KEY / ANTHROPIC_API_KEY  At least one must be set.
+set -e
+
+if [[ -z "$1" ]]; then
+    echo "Usage: $0 <instance-id> [prompt-name]"
+    exit 1
+fi
+INSTANCE_ID="$1"
+PROMPT_NAME="$2"
+
+THIS_DIR="$(dirname "$0")"
+# NOTE(review): this resolves to the parent of the script's folder; confirm
+# that it is the directory the agent is meant to be launched from.
+OH_ROOT="$THIS_DIR/.."
+OH_ROOT="$(node -e 'console.log(require("path").resolve(process.argv[1]))' "$OH_ROOT")"
+if [[ -z "$TMP_DIR" ]]; then
+    TMP_DIR="/tmp"
+fi
+TARGET_FOLDER="$TMP_DIR/bolt/$INSTANCE_ID"
+WORKSPACE_ROOT="$TARGET_FOLDER/workspace"
+INSTANCE_DIR="$THIS_DIR/$INSTANCE_ID"
+
+if [[ ! -d "$INSTANCE_DIR" ]]; then
+    echo -e "Instance directory \"$INSTANCE_DIR\" not found.\n"
+    echo -e "Available instance folders:\n"
+    # List all sub folders
+    ls -1 -d "$THIS_DIR"/*/
+    echo -e "\n"
+    exit 1
+fi
+
+
+# Load prompt.
+if [[ -z "$PROMPT_NAME" ]]; then
+    PROMPT_NAME="prompt"
+fi
+PROMPT_FILE="$INSTANCE_DIR/$PROMPT_NAME.md"
+if [[ ! -f "$PROMPT_FILE" ]]; then
+    echo "Prompt file \"$PROMPT_FILE\" not found."
+    exit 1
+fi
+PROMPT=$(cat "$PROMPT_FILE")
+if [[ -z "$PROMPT" ]]; then
+    echo "Prompt file found but was empty."
+    exit 1
+fi
+
+# (Re-load) source files.
+SOURCE_ZIP_FILE="$INSTANCE_DIR/source_code.zip"
+# Quoted so a path containing spaces or glob characters cannot make
+# `rm -rf` hit unintended targets.
+rm -rf "$WORKSPACE_ROOT"
+mkdir -p "$WORKSPACE_ROOT"
+if [[ -f "$SOURCE_ZIP_FILE" ]]; then
+    unzip -q "$SOURCE_ZIP_FILE" -d "$WORKSPACE_ROOT"
+    # If it only contains a single folder called "project", move it up.
+    # NOTE(review): `*` skips dotfiles, so hidden entries inside project/ are
+    # removed by the `rm -rf` below instead of being moved; confirm intended.
+    if [ -d "$WORKSPACE_ROOT/project" ] && [ $(ls -A "$WORKSPACE_ROOT" | wc -l) -eq 1 ]; then
+        mv "$WORKSPACE_ROOT/project"/* "$WORKSPACE_ROOT"
+        rm -rf "$WORKSPACE_ROOT/project"
+    fi
+    pushd "$WORKSPACE_ROOT" > /dev/null
+    git init > /dev/null
+    git add -A > /dev/null
+    git commit -am "initial commit" > /dev/null
+    popd > /dev/null
+    echo "Workspace has been set up and git initialized."
+else
+    echo "Running analysis WITHOUT source code..."
+fi
+
+# Config overrides + sanity checks.
+export DEBUG=1
+export REPLAY_DEV_MODE=1
+export REPLAY_ENABLE_TOOL_CACHE=1
+export WORKSPACE_BASE="$WORKSPACE_ROOT"
+export LLM_MODEL="anthropic/claude-3-5-sonnet-20241022"
+if [[ -z "$LLM_API_KEY" ]]; then
+    if [[ -z "$ANTHROPIC_API_KEY" ]]; then
+        echo "LLM_API_KEY or ANTHROPIC_API_KEY environment variable must be set."
+        exit 1
+    fi
+    export LLM_API_KEY="$ANTHROPIC_API_KEY"
+fi
+
+# Logging.
+LOG_FILE="$TARGET_FOLDER/default.log"
+echo "WORKSPACE_ROOT: \"$WORKSPACE_ROOT\""
+echo "Logging to \"$LOG_FILE\"..."
+
+# GO.
+cd "$OH_ROOT"
+poetry run python -m openhands.core.main -t "$PROMPT" \
+    > "$LOG_FILE" 2>&1
diff --git a/replay_benchmarks/devtools/resolver_10608.sh b/replay_benchmarks/devtools/resolver_10608.sh
diff --git a/replay_benchmarks/devtools/resolver_10609.sh b/replay_benchmarks/devtools/resolver_10609.sh

Original file line number	Diff line number	Diff line change
`@@ -284,5 +284,6 @@ def generate_sid(config: AppConfig, session_name: str \| None = None) -> str:`
`284`	`284`	`config=config,`
`285`	`285`	`initial_user_action=initial_user_action,`
`286`	`286`	`sid=sid,`
	`287`	`+ exit_on_message=True,`
`287`	`288`	`)`
`288`	`289`	`)`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+The deviations from the median for the total network data transferred are rendered incorrectly. They render as 0.0% when they should be something else.`
	`2`	`+`
	`3`	`+Bug recording: https://app.replay.io/recording/replay-of-localhost8040--3e0a8f68-14e6-4809-bc72-dea0e0374c77?&point=25312447185420620431941338721681417`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+The React app renders a blank screen.`
	`2`	`+`
	`3`	`+Bug recording: https://app.replay.io/recording/replay-of-localhost8040--43a890bc-6f37-47e0-ba47-4d04827e4e44`