Skip to content

Commit

Permalink
More tests, more fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Domiii committed Jan 24, 2025
1 parent 954e435 commit 274973b
Show file tree
Hide file tree
Showing 10 changed files with 270 additions and 66 deletions.
5 changes: 4 additions & 1 deletion openhands/agenthub/codeact_agent/codeact_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,10 @@ def step(self, state: State) -> Action:
params['tools'] = self.tools
if self.mock_function_calling:
params['mock_function_calling'] = True
# logger.debug(f'#######\nCodeActAgent.step: messages:\n{json.dumps(params)}\n\n#######\n')

# # Debug log the raw input to the LLM:
# logger.debug(f'#######\nCodeActAgent.step: RAW LLM INPUT:\n{repr(params)}\n\n#######\n')

response = self.llm.completion(**params)
actions = codeact_function_calling.response_to_actions(response, state)
for action in actions:
Expand Down
13 changes: 10 additions & 3 deletions openhands/core/schema/replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,19 @@


class ReplayPhase(str, Enum):
"""All Replay phases that an agent can be in."""

Normal = 'normal'
"""The agent is not doing anything related to Replay.
"""The agent does not have access to a recording.
"""

Analysis = 'analysis'
"""The agent is analyzing a recording.
"""The agent uses initial-analysis data and dedicated tools to analyze a Replay recording.
"""
ConfirmAnalysis = 'confirm_analysis'
"""The agent is confirming the analysis.
"""

Edit = 'edit'
"""The agent is editing the code.
"""The agent finally edits the code.
"""
34 changes: 24 additions & 10 deletions openhands/replay/replay_phases.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,21 +97,35 @@ def update_phase(new_phase: ReplayPhase, state: State, agent: Agent):
# ReplayStateMachine.
# ###########################################################################

replay_state_transitions: dict[ReplayPhase, tuple[ReplayPhase, ...] | None] = {
ReplayPhase.Normal: None,
ReplayPhase.Analysis: (ReplayPhase.Edit,),
ReplayPhase.Edit: None,
# Transition edges controlled by the agent.
replay_agent_state_transitions: dict[ReplayPhase, tuple[ReplayPhase, ...]] = {
ReplayPhase.Analysis: (ReplayPhase.ConfirmAnalysis,),
ReplayPhase.ConfirmAnalysis: (ReplayPhase.Edit,),
}


# This machine represents state transitions that the agent can make.
class ReplayStateMachine:
class ReplayAgentStateMachine:
edges: list[tuple[ReplayPhase, ReplayPhase]]
"""List of all edges."""

forward_edges: dict[ReplayPhase, list[ReplayPhase]]
"""Adjacency list of forward edges."""

reverse_edges: dict[ReplayPhase, list[ReplayPhase]]
"""Adjacency list of reverse edges."""

def __init__(self):
self.forward_edges: dict[ReplayPhase, list[ReplayPhase] | None] = {
self.forward_edges = {
phase: list(targets) if targets is not None else None
for phase, targets in replay_state_transitions.items()
for phase, targets in replay_agent_state_transitions.items()
}
self.reverse_edges: dict[ReplayPhase, list[ReplayPhase]] = {}
self.edges = [
(source, target)
for source, targets in self.forward_edges.items()
for target in targets
]
self.reverse_edges = {}

for source, targets in self.forward_edges.items():
if targets:
Expand Down Expand Up @@ -151,8 +165,8 @@ def get_child_phases(self, phase: ReplayPhase) -> list[ReplayPhase] | None:
return self.forward_edges.get(phase, list())


replay_state_machine = ReplayStateMachine()
replay_agent_state_machine = ReplayAgentStateMachine()


def get_next_agent_replay_phase(phase: ReplayPhase) -> ReplayPhase | None:
return replay_state_machine.get_unique_child_phase(phase)
return replay_agent_state_machine.get_unique_child_phase(phase)
18 changes: 13 additions & 5 deletions openhands/replay/replay_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,25 @@ class PromptMap(TypedDict):


phase_prompts: dict[ReplayPhase, PromptMap] = {
ReplayPhase.Edit: {
ReplayPhase.ConfirmAnalysis: {
'enter': """
You have concluded the analysis.
You have submitted a draft of the hypothesis.
IMPORTANT: NOW review, then implement the hypothesized changes using tools. The code is available in the workspace. Start by answering these questions:
IMPORTANT: NOW review the hypothesis. When reviewing, PAY ATTENTION to this:
1. What is the goal of the investigation according to the initial prompt and initial analysis? IMPORTANT. PAY ATTENTION TO THIS. THIS IS THE ENTRY POINT OF EVERYTHING.
2. Given (1), is the hypothesis's `problem` description correct? Does it match the goal of the investigation?
3. Do the `editSuggestions` actually address the issue?
4. Rephrase the hypothesis so that it is consistent and correct.
Rephrase and confirm a revised hypothesis.
"""
},
ReplayPhase.Edit: {
'enter': """
You have confirmed the analysis.
IMPORTANT: NOW implement the confirmed suggestions. The code is available in the workspace.
"""
}
},
}


Expand Down
125 changes: 92 additions & 33 deletions openhands/replay/replay_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,20 @@
from openhands.controller.state.state import State
from openhands.core.logger import openhands_logger as logger
from openhands.core.schema.replay import ReplayPhase
from openhands.events.action.action import Action
from openhands.events.action.empty import NullAction
from openhands.events.action.replay import (
ReplayAction,
ReplayPhaseUpdateAction,
ReplayToolCmdRunAction,
)
from openhands.replay.replay_phases import get_next_agent_replay_phase


class ReplayToolType(Enum):
Analysis = 'analysis'
"""Tools that allow analyzing a Replay recording."""

PhaseTransition = 'phase_transition'
"""Tools that trigger a phase transition. Phase transitions are generally used to improve self-consistency."""


class ReplayTool(ChatCompletionToolParam):
Expand All @@ -45,16 +48,19 @@ def replay_analysis_tool(name: str, description: str, parameters: dict) -> Repla


class ReplayPhaseTransitionTool(ReplayTool):
edges: list[tuple[ReplayPhase, ReplayPhase]]
replay_tool_type = ReplayToolType.PhaseTransition
new_phase: ReplayPhase


def replay_phase_tool(
new_phase: ReplayPhase, name: str, description: str, parameters: dict
edges: list[tuple[ReplayPhase, ReplayPhase]],
name: str,
description: str,
parameters: dict,
):
tool = ReplayPhaseTransitionTool(
edges=edges,
replay_tool_type=ReplayToolType.PhaseTransition,
new_phase=new_phase,
type='function',
function=ChatCompletionToolParamFunctionChunk(
name=name,
Expand Down Expand Up @@ -127,27 +133,65 @@ def replay_phase_tool(

replay_phase_transition_tools: list[ReplayPhaseTransitionTool] = [
replay_phase_tool(
ReplayPhase.Edit,
[
(ReplayPhase.Analysis, ReplayPhase.ConfirmAnalysis),
],
'submit',
"""Conclude your analysis.""",
"""Submit your analysis.""",
{
'type': 'object',
'properties': {
'problem': {
'type': 'string',
'description': 'One-sentence explanation of the core problem that this will solve.',
'description': 'One-sentence explanation of the core problem, according to user requirements and initial-analysis.',
},
'rootCauseHypothesis': {'type': 'string'},
'badCode': {
'type': 'string',
'description': 'Show exactly which code requires changing, according to the user requirements and initial-analysis.',
},
},
'required': ['problem', 'rootCauseHypothesis', 'badCode'],
},
),
replay_phase_tool(
[
(ReplayPhase.ConfirmAnalysis, ReplayPhase.Edit),
],
'confirm',
"""Confirm your analysis and suggest specific code changes.""",
{
'type': 'object',
'properties': {
'problem': {
'type': 'string',
'description': 'One-sentence explanation of the core problem, according to user requirements and initial-analysis.',
},
'rootCauseHypothesis': {'type': 'string'},
'badCode': {
'type': 'string',
'description': 'Show exactly which code requires changing, according to the user requirements and initial-analysis.',
},
'editSuggestions': {
'type': 'string',
'description': 'Provide suggestions to fix the bug, if you know enough about the code that requires modification.',
},
},
'required': ['problem', 'rootCauseHypothesis'],
'required': ['problem', 'rootCauseHypothesis', 'editSuggestions'],
},
)
),
]

replay_phase_transition_tools_by_from_phase = {
phase: [
t
for t in replay_phase_transition_tools
for edge in t['edges']
if edge[0] == phase
]
for phase in {edge[0] for t in replay_phase_transition_tools for edge in t['edges']}
}

# ###########################################################################
# Bookkeeping + utilities.
# ###########################################################################
Expand Down Expand Up @@ -181,22 +225,25 @@ def is_replay_tool(
# ###########################################################################


def get_replay_transition_tool_for_current_phase(
current_phase: ReplayPhase, name: str | None = None
def get_replay_transition_tools(current_phase: ReplayPhase) -> list[ReplayTool] | None:
phase_tools = replay_phase_transition_tools_by_from_phase.get(current_phase, None)
if not phase_tools:
return None
assert len(phase_tools)
return phase_tools


def get_replay_transition_tool(
current_phase: ReplayPhase, name: str
) -> ReplayTool | None:
next_phase = get_next_agent_replay_phase(current_phase)
if next_phase:
transition_tools = [
t
for t in replay_phase_transition_tools
if t['new_phase'] == next_phase
and (not name or t['function']['name'] == name)
]
assert len(
transition_tools
), f'replay_phase_transition_tools is missing tools for new_phase: {next_phase}'
return transition_tools[0]
return None
tools = get_replay_transition_tools(current_phase)
if not tools:
return None
matching = [t for t in tools if t['function']['name'] == name]
assert (
len(matching) == 1
), f'replay_phase_transition_tools did not get unique matching tool for phase {current_phase} and name {name}'
return matching[0]


def get_replay_tools(
Expand All @@ -206,14 +253,16 @@ def get_replay_tools(
tools = default_tools
elif replay_phase == ReplayPhase.Analysis:
tools = list(replay_analysis_tools)
elif replay_phase == ReplayPhase.ConfirmAnalysis:
tools = list(replay_analysis_tools)
elif replay_phase == ReplayPhase.Edit:
tools = default_tools + list(replay_analysis_tools)
else:
raise ValueError(f'Unhandled ReplayPhase in get_tools: {replay_phase}')

next_phase_tool = get_replay_transition_tool_for_current_phase(replay_phase)
if next_phase_tool:
tools.append(next_phase_tool)
next_phase_tools = get_replay_transition_tools(replay_phase)
if next_phase_tools:
tools += next_phase_tools

return tools

Expand All @@ -225,11 +274,11 @@ def get_replay_tools(

def handle_replay_tool_call(
tool_call: ChatCompletionMessageToolCall, arguments: dict, state: State
) -> ReplayAction:
) -> Action:
logger.info(
f'[REPLAY] TOOL_CALL {tool_call.function.name} - arguments: {json.dumps(arguments, indent=2)}'
)
action: ReplayAction
action: Action
name = tool_call.function.name
if is_replay_tool(name, ReplayToolType.Analysis):
if name == 'inspect-data':
Expand All @@ -248,14 +297,24 @@ def handle_replay_tool_call(
)
elif is_replay_tool(name, ReplayToolType.PhaseTransition):
# Request a phase change.
tool = get_replay_transition_tool_for_current_phase(state.replay_phase, name)
assert tool, f'[REPLAY] Missing ReplayPhaseTransitionTool for {state.replay_phase} in Replay tool_call: {tool_call.function.name}'
new_phase = tool['new_phase']
tool = get_replay_transition_tool(state.replay_phase, name)
assert tool, f'[REPLAY] Missing ReplayPhaseTransitionTool for {state.replay_phase} in Replay tool_call({tool_call.function.name})'
new_phase = next(
(
to_phase
for [from_phase, to_phase] in tool['edges']
if from_phase == state.replay_phase
),
None,
)
assert (
new_phase
), f'[REPLAY] Missing new_phase in Replay tool_call: {tool_call.function.name}'
action = ReplayPhaseUpdateAction(
new_phase=new_phase, info=json.dumps(arguments)
)
else:
# NOTE: This is a weird bug where Claude sometimes might call a tool that it *had* but does not have anymore.
action = NullAction()
assert action, f'[REPLAY] Unhandled Replay tool_call: {tool_call.function.name}'
return action
2 changes: 0 additions & 2 deletions replay_benchmarks/bolt/run-bolt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,11 @@ else
fi

# Config overrides + sanity checks.
set -x
export DEBUG=1
# export REPLAY_DEV_MODE=1
export REPLAY_ENABLE_TOOL_CACHE=1
export WORKSPACE_BASE="$WORKSPACE_ROOT"
export LLM_MODEL="anthropic/claude-3-5-sonnet-20241022"
set +x

if [[ -z "$LLM_API_KEY" ]]; then
if [[ -z "$ANTHROPIC_API_KEY" ]]; then
Expand Down
40 changes: 40 additions & 0 deletions tests/unit/replay/replay_test_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import enum
from typing import Collection, TypeVar

from openhands.core.schema.replay import ReplayPhase
from openhands.replay.replay_phases import (
replay_agent_state_machine,
)

T = TypeVar('T', bound=enum.Enum)


def format_enums(enums: Collection[T]) -> set[str]:
"""Convert any collection of enums to readable set of names."""
return {e.name for e in enums}


def format_edge(edge: tuple[ReplayPhase, ReplayPhase]) -> str:
"""Format state transition edge as 'from → to'."""
from_state, to_state = edge
return f'{from_state.name}{to_state.name}'


# The given edges should form a partition of the set of all agent edges.
def assert_edges_partition(
edges: list[tuple[ReplayPhase, ReplayPhase]],
) -> None:
"""Verify tool edges form a partition of agent edges."""
# Get ground truth
all_agent_edges = replay_agent_state_machine.edges

# Convert to sorted lists of formatted strings for comparison
tool_edge_strs = sorted(f'{f.name}{t.name}' for [f, t] in edges)
agent_edge_strs = sorted(f'{f.name}{t.name}' for [f, t] in all_agent_edges)

# Check for duplicates in tool edges
if len(set(tool_edge_strs)) != len(tool_edge_strs):
raise AssertionError('Tool edges contain duplicates')

# Check lists match exactly
assert tool_edge_strs == agent_edge_strs
Loading

0 comments on commit 274973b

Please sign in to comment.