Skip to content

Commit cf56307

Browse files
VascoSch92 and allhands-bot authored
feat(sdk): add /goal SDK core (judge-driven goal-completion loop) (#3769)
Co-authored-by: allhands-bot <allhands-bot@users.noreply.github.com>
1 parent 63e9e77 commit cf56307

13 files changed

Lines changed: 845 additions & 0 deletions

File tree

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
"""The /goal command: pursue an objective until a judge LLM confirms it is done.
2+
3+
A plain ``conversation.run()`` stops as soon as the agent *thinks* it is
4+
finished. The ``/goal`` loop is stricter: after each run it asks a second
5+
"judge" LLM to audit the transcript for authoritative evidence -- file
6+
contents, command output, test results -- that the objective is *provably*
7+
complete. If something is still missing, it re-prompts the agent with the
8+
judge's feedback and runs again, until the goal is genuinely done or a hard
9+
iteration cap is reached.
10+
11+
That makes it a good fit for verifiable objectives like "make the tests pass":
12+
the agent cannot finish just by claiming success; the judge has to see green
13+
output first.
14+
15+
Key concepts demonstrated:
16+
1. ``run_goal(conversation, objective, judge_llm, max_iterations=...)`` drives
17+
the conversation from the outside, re-prompting until the judge is satisfied.
18+
2. A second, independent "judge" LLM grades completion -- separate from the
19+
agent that does the work.
20+
3. The returned ``GoalOutcome`` reports whether the goal ended ``"complete"`` or
21+
``"capped"``, how many audit rounds it took, and the judge's final verdict.
22+
23+
Because ``run_goal`` drives the conversation you pass in (it does not fork or
24+
spin up a sidecar), every turn -- objective, agent work, judge-driven followups
25+
-- lands in the same ``conversation.state.events`` history. It therefore
26+
composes with whatever agent, tools, or critic you already have.
27+
"""
28+
29+
import os
30+
import tempfile
31+
32+
from openhands.sdk import LLM, Agent, Conversation, Tool
33+
from openhands.sdk.conversation.goal import run_goal
34+
from openhands.tools.file_editor import FileEditorTool
35+
from openhands.tools.terminal import TerminalTool
36+
37+
38+
# Two LLM instances on the same model: one performs the task, the other audits
# it. Giving them distinct ``usage_id`` values keeps their metrics separate so
# the costs can be reported individually or summed at the end.
model = os.getenv("LLM_MODEL", "gpt-5.5")
api_key = os.getenv("LLM_API_KEY")
base_url = os.getenv("LLM_BASE_URL")
agent_llm = LLM(
    usage_id="agent",
    model=model,
    api_key=api_key,
    base_url=base_url,
)
judge_llm = LLM(
    usage_id="goal-judge",
    model=model,
    api_key=api_key,
    base_url=base_url,
)

# The worker agent gets a terminal and a file editor.
agent_tools = [Tool(name=tool.name) for tool in (TerminalTool, FileEditorTool)]
agent = Agent(llm=agent_llm, tools=agent_tools)

# Run in a fresh scratch directory; its path is printed in the report below.
workspace = tempfile.mkdtemp(prefix="goal_demo_")
conversation = Conversation(agent=agent, workspace=workspace)

# A verifiable objective: the completion criterion is spelled out as a command
# whose passing output the judge can look for in the transcript.
objective = (
    "Create mathx.py with an add(a, b) function and test_mathx.py with a pytest "
    "test for it. The goal is complete only when `python -m pytest -q` passes."
)

# Drive the conversation toward the objective, with at most three audit rounds.
outcome = run_goal(conversation, objective, judge_llm, max_iterations=3)

# Assemble the summary first, then print it in one go.
separator = "=" * 70
report = [
    "",
    separator,
    f"Goal {outcome.status} after {outcome.iterations} audit round(s).",
    f"Judge score: {outcome.verdict.score:.2f}",
]
if outcome.verdict.missing:
    report.append(f"Still missing: {outcome.verdict.missing}")
report.extend([f"Workspace: {workspace}", separator])
print("\n".join(report))

# Report cost (agent work + judge audits).
cost = agent_llm.metrics.accumulated_cost + judge_llm.metrics.accumulated_cost
print(f"EXAMPLE_COST: {cost}")
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""The ``/goal`` command: judge-driven, self-continuing goal completion.
2+
3+
A conversation-level command (not a critic) that drives the agent toward an
4+
objective: it sends the objective, runs the agent, judges completion with a
5+
second LLM, and re-prompts until the goal is done or a cap is reached.
6+
7+
The decision logic lives in :class:`GoalController` (transport-agnostic, no
8+
I/O); :func:`run_goal` is a thin synchronous driver over it. An async
9+
agent-server task can reuse the same controller with its own I/O loop.
10+
11+
Usage::
12+
13+
from openhands.sdk.conversation.goal import run_goal
14+
15+
outcome = run_goal(conversation, "make pytest pass for mathx.py", judge_llm)
16+
"""
17+
18+
from openhands.sdk.conversation.goal.controller import (
19+
GoalContinue,
20+
GoalController,
21+
GoalDone,
22+
GoalOutcome,
23+
GoalStatus,
24+
GoalStatusName,
25+
GoalStep,
26+
)
27+
from openhands.sdk.conversation.goal.judge import GoalVerdict, judge_goal
28+
from openhands.sdk.conversation.goal.runner import run_goal
29+
30+
31+
# Public API of the ``goal`` package, re-exported from its submodules.
__all__ = [
    # From ``controller``: decision types, status models, and the controller.
    "GoalContinue",
    "GoalController",
    "GoalDone",
    "GoalOutcome",
    "GoalStatus",
    "GoalStatusName",
    "GoalStep",
    # From ``judge``: the verdict model and the evaluator function.
    "GoalVerdict",
    "judge_goal",
    # From ``runner``: the synchronous driver.
    "run_goal",
]
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
"""The transport-agnostic brain of the ``/goal`` loop.
2+
3+
``GoalController`` decides -- after each agent run finishes -- whether to
4+
continue (with a followup message) or stop (with a ``GoalOutcome``). It performs
5+
NO I/O: a *driver* (the sync ``run_goal``, or an async agent-server task) owns
6+
sending messages and running the agent; the controller only judges and decides.
7+
That split lets the sync and async drivers share identical decision logic.
8+
"""
9+
10+
from collections.abc import Sequence
11+
from typing import Literal
12+
13+
from pydantic import BaseModel, Field
14+
15+
from openhands.sdk.conversation.goal.judge import GoalVerdict, judge_goal
16+
from openhands.sdk.conversation.goal.prompts import FOLLOWUP_PROMPT
17+
from openhands.sdk.event import Event
18+
from openhands.sdk.llm import LLM
19+
from openhands.sdk.logger import get_logger
20+
21+
22+
logger = get_logger(__name__)
23+
24+
25+
class GoalOutcome(BaseModel):
    """Result of a ``/goal`` loop.

    ``status`` distinguishes genuine completion from hitting the iteration cap,
    so a driver never has to guess whether a silent finish meant success.
    """

    # "complete": the judge confirmed the objective.
    # "capped": the iteration limit was reached before the judge was satisfied.
    status: Literal["complete", "capped"]
    iterations: int = Field(
        ge=1,
        description="Number of audit rounds performed.",
    )
    # Verdict produced by the final audit round.
    verdict: GoalVerdict
35+
36+
37+
GoalStatusName = Literal[
    "running",
    "complete",
    "capped",
    "interrupted",
]
"""Lifecycle state of a ``/goal`` loop."""
39+
40+
41+
class GoalStatus(BaseModel):
    """Live status of a ``/goal`` loop, for a UI progress chip.

    The agent server publishes this as the ``value`` of a
    ``ConversationStateUpdateEvent`` with ``key="goal"`` at each lifecycle point
    (start, each round, and the terminal/interrupted state).
    """

    active: bool = Field(
        description="Whether the goal loop is still running.",
    )
    # One of the ``GoalStatusName`` lifecycle states.
    status: GoalStatusName
    iteration: int = Field(
        ge=0,
        description="Audit rounds completed so far.",
    )
    # Upper bound on audit rounds; must be at least 1.
    max_iterations: int = Field(ge=1)
    # The objective the loop is pursuing.
    objective: str
    verdict: GoalVerdict | None = Field(
        default=None,
        description="Last judge verdict; set once the loop ends.",
    )
57+
58+
59+
class GoalContinue(BaseModel):
    """Decision to keep going: send ``followup`` before the next run."""

    # Message the driver should send to the agent before running it again.
    followup: str
63+
64+
65+
class GoalDone(BaseModel):
    """Decision to stop: the loop finished with ``outcome``."""

    # Final result of the loop (status, round count, and last verdict).
    outcome: GoalOutcome
69+
70+
71+
# Either "keep going" (``GoalContinue``) or "stop" (``GoalDone``).
GoalStep = GoalContinue | GoalDone
"""One decision returned by :meth:`GoalController.on_run_finished`."""
73+
74+
75+
class GoalController:
    """Decision maker for the ``/goal`` loop; performs no message I/O itself.

    A driver asks :meth:`start` for the opening message, sends it, runs the
    agent, and then calls :meth:`on_run_finished` after every run to learn
    whether to send a followup or stop. Round counting and enforcement of the
    ``max_iterations`` cap both live here, which keeps drivers trivial.
    """

    def __init__(
        self, objective: str, judge_llm: LLM, *, max_iterations: int = 10
    ) -> None:
        """Validate the configuration and reset the round counter.

        Args:
            objective: The goal to pursue; must contain non-whitespace text.
            judge_llm: The LLM handed to ``judge_goal`` for each audit.
            max_iterations: Maximum number of audit rounds (at least 1).

        Raises:
            ValueError: If ``objective`` is blank or ``max_iterations`` is
                less than 1. The objective is checked first.
        """
        if not objective.strip():
            raise ValueError("Goal objective must not be empty.")
        if max_iterations < 1:
            raise ValueError("max_iterations must be >= 1.")
        self.objective = objective
        self.judge_llm = judge_llm
        self.max_iterations = max_iterations
        # Number of audit rounds performed so far.
        self.iteration = 0

    def start(self) -> str:
        """Return the first message a driver should send (the objective)."""
        return self.objective

    def on_run_finished(self, events: Sequence[Event]) -> GoalStep:
        """Audit the latest run and return the next decision.

        Bumps the round counter, asks the judge LLM for a verdict on
        ``events``, and logs the result.

        Args:
            events: The conversation events to hand to the judge.

        Returns:
            :class:`GoalDone` with status ``"complete"`` when the verdict says
            so, :class:`GoalDone` with status ``"capped"`` when the round limit
            has been reached, and otherwise :class:`GoalContinue` carrying a
            followup built from the verdict's ``missing`` text.
        """
        self.iteration += 1
        verdict = judge_goal(self.judge_llm, self.objective, events)
        logger.info(
            "Goal audit %d/%d: score=%.2f complete=%s",
            self.iteration,
            self.max_iterations,
            verdict.score,
            verdict.complete,
        )

        # Completion wins over the cap: a goal finished on the last allowed
        # round is reported as "complete", not "capped".
        if verdict.complete:
            return self._finished("complete", verdict)
        if self.iteration >= self.max_iterations:
            return self._finished("capped", verdict)

        # Fall back to a generic hint when the judge left ``missing`` empty.
        still_missing = verdict.missing or "Some requirements are not yet verified."
        return GoalContinue(
            followup=FOLLOWUP_PROMPT.format(
                iteration=self.iteration, missing=still_missing
            )
        )

    def _finished(
        self, status: Literal["complete", "capped"], verdict: GoalVerdict
    ) -> GoalDone:
        """Wrap a terminal ``status`` and ``verdict`` in a :class:`GoalDone`."""
        return GoalDone(
            outcome=GoalOutcome(
                status=status, iterations=self.iteration, verdict=verdict
            )
        )
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
"""LLM judge that decides whether a ``/goal`` objective is complete.
2+
3+
This is the reusable kernel of the goal feature: a pure
4+
``objective + transcript -> verdict`` evaluator with no dependency on the
5+
critic machinery. The ``/goal`` runner uses it to drive continuation, but it
6+
can equally back a status command, a stop hook, or a server endpoint.
7+
"""
8+
9+
import contextlib
10+
import json
11+
import re
12+
from collections.abc import Sequence
13+
from typing import Any
14+
15+
from pydantic import BaseModel, Field
16+
17+
from openhands.sdk.conversation.goal.prompts import JUDGE_PROMPT
18+
from openhands.sdk.event import Event, LLMConvertibleEvent
19+
from openhands.sdk.llm import LLM, Message, TextContent, content_to_str
20+
from openhands.sdk.logger import get_logger
21+
22+
23+
logger = get_logger(__name__)
24+
25+
26+
class GoalVerdict(BaseModel):
    """The judge's verdict on whether the objective is complete."""

    # Constrained to the closed interval [0.0, 1.0].
    score: float = Field(
        description="Probability (0-1) that the full objective is provably done.",
        ge=0.0,
        le=1.0,
    )
    complete: bool = Field(
        description="Whether the judge considers the objective complete.",
    )
    # Defaults to the empty string.
    missing: str = Field(
        description="Concise description of what remains, or empty if complete.",
        default="",
    )
41+
42+
43+
def judge_goal(judge_llm: LLM, objective: str, events: Sequence[Event]) -> GoalVerdict:
    """Ask the judge LLM whether ``objective`` is complete given the transcript.

    Args:
        judge_llm: The LLM that grades completion.
        objective: The goal to audit against.
        events: Conversation events; only ``LLMConvertibleEvent`` instances
            are rendered into the transcript, the rest are dropped.

    Returns:
        The parsed ``GoalVerdict``. If the judge's reply cannot be parsed, the
        verdict is a conservative "not complete" with score 0.0, so the caller
        keeps working rather than falsely finishing.
    """
    llm_events = [event for event in events if isinstance(event, LLMConvertibleEvent)]
    audit_prompt = JUDGE_PROMPT.format(
        objective=objective,
        transcript=_render_transcript(llm_events),
    )

    # Only the final verdict text is needed. A streaming LLM is swapped for a
    # non-streaming copy so that reusing a streaming agent LLM as the judge
    # does not trip completion()'s requirement of an on_token callback when
    # stream=True. The caller's instance is left untouched.
    grader = (
        judge_llm.model_copy(update={"stream": False})
        if judge_llm.stream
        else judge_llm
    )
    request = Message(role="user", content=[TextContent(text=audit_prompt)])
    reply = grader.completion(messages=[request])

    verdict = _parse_verdict(reply.message)
    logger.debug("judge_goal verdict: %s", verdict)
    return verdict
71+
72+
73+
def _render_transcript(events: Sequence[LLMConvertibleEvent]) -> str:
    """Flatten events into ``role: text`` paragraphs separated by blank lines.

    Messages with the ``system`` role are skipped: the agent's system prompt is
    large (~thousands of tokens) and carries no goal-specific evidence, so
    including it would only inflate the judge's token cost on every audit.
    Messages whose text is empty after stripping are skipped as well.
    """
    paragraphs: list[str] = []
    for message in LLMConvertibleEvent.events_to_messages(list(events)):
        if message.role == "system":
            continue
        body = "\n".join(content_to_str(message.content)).strip()
        if body:
            paragraphs.append(f"{message.role}: {body}")
    return "\n\n".join(paragraphs)
87+
88+
89+
def _parse_verdict(message: Message) -> GoalVerdict:
    """Normalize the judge response into a GoalVerdict, conservatively.

    The reply text is parsed as JSON, first as a whole and then, failing
    that, as the outermost ``{...}`` span found inside it. Anything that is
    not a JSON object yields a "not complete" verdict with score 0.0.

    Field handling errs on the side of "keep working":

    * ``score`` is coerced to a float and clamped to [0.0, 1.0]; values that
      cannot be converted, or that are NaN/infinite, become 0.0.
    * ``complete`` is true only for an unambiguous affirmative (see
      :func:`_coerce_complete`). When the key is absent it defaults to
      ``score >= 1.0``.
    * ``missing`` is stringified; a falsy value becomes the empty string.

    Args:
        message: The judge LLM's reply message.

    Returns:
        The normalized verdict.
    """
    raw = "\n".join(content_to_str(message.content)).strip()

    data: dict[str, Any] | None = None
    candidates = [raw]
    # Fallback for replies that wrap the JSON object in prose or code fences.
    block = re.search(r"\{.*\}", raw, re.DOTALL)
    if block:
        candidates.append(block.group(0))
    for candidate in candidates:
        with contextlib.suppress(json.JSONDecodeError):
            parsed = json.loads(candidate)
            if isinstance(parsed, dict):
                data = parsed
                break

    if data is None:
        logger.warning("judge_goal: could not parse verdict: %r", raw)
        return GoalVerdict(
            score=0.0, complete=False, missing="Judge verdict could not be parsed."
        )

    score = _coerce_score(data.get("score", 0.0))
    if "complete" in data:
        complete = _coerce_complete(data["complete"])
    else:
        complete = score >= 1.0

    return GoalVerdict(
        score=score,
        complete=complete,
        missing=str(data.get("missing") or ""),
    )


def _coerce_score(value: object) -> float:
    """Convert a raw judge score to a float in [0.0, 1.0], or 0.0 if unusable."""
    # Local import: the module's import block does not bring in ``math``.
    import math

    try:
        score = float(value)  # type: ignore[arg-type]
    except (TypeError, ValueError, OverflowError):
        # OverflowError: json.loads can yield ints too large for a float.
        return 0.0
    # ``json.loads`` accepts NaN/Infinity, and ``min(1.0, nan)`` returns 1.0,
    # so a NaN would otherwise clamp to a perfect score.
    if not math.isfinite(score):
        return 0.0
    return max(0.0, min(1.0, score))


def _coerce_complete(value: object) -> bool:
    """Interpret a raw ``complete`` value; only clear affirmatives are true.

    Plain ``bool(value)`` would treat the string ``"false"`` (or any other
    non-empty value) as true and end the goal loop prematurely.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        return value.strip().lower() in ("true", "yes", "1")
    if isinstance(value, (int, float)):
        return value == 1
    return False

0 commit comments

Comments
 (0)