OpenHands · VascoSch92 · Jun 18, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/.pr/README.md b/.pr/README.md
@@ -0,0 +1,64 @@
+# `/goal` shared-history demo
+
+Proves that the `/goal` loop writes into the **same** conversation history as the
+main chat: it drives the `Conversation` you pass in and does **not** fork or
+create a sidecar conversation.
+
+## Run
+
+```bash
+# Deterministic, no network (scripted TestLLMs) — always works:
+uv run python .pr/goal_shared_history.py
+
+# Real agent doing real work (creates files, runs pytest) — opt in explicitly:
+GOAL_DEMO_REAL=1 LLM_API_KEY=sk-... LLM_MODEL=gpt-5.5 \
+    uv run python .pr/goal_shared_history.py
+```
+
+## What to look for
+
+The script sends a normal "main conversation" message, then runs `run_goal(...)`
+on the **same** `Conversation`. The `PROOF` section at the end shows:
+
+```
+same conversation id .............. True
+only one Conversation object ...... True (no fork was created)
+event log GREW in place ........... 3 -> 7
+main-convo events still present ... True
+goal objective is in THIS log ..... True
+goal outcome ...................... complete (after 2 round(s))
+```
+
+That is, the goal's objective, the agent's work, the judge-driven follow-ups, and
+the completion are all appended to the **one** `conversation.state.events` log
+under the **one** `conversation.id` — alongside (not replacing) the main-convo events.
+
+## Seeing what the LLM is doing
+
+The demo passes `visualizer=None` to keep the proof output clean. To watch the
+agent's activity:
+
+- **Live**: drop `visualizer=None` (the default is `DefaultConversationVisualizer`),
+  and every event — messages, tool calls, observations — prints as it happens.
+- **After the fact**: the script ends with a `REPLAY` section that renders the
+  saved history through the visualizer. Because every turn is persisted in
+  `conversation.state.events`, you can replay it any time:
+
+  ```python
+  from openhands.sdk.conversation.visualizer import DefaultConversationVisualizer
+  viz = DefaultConversationVisualizer()
+  for event in conversation.state.events:
+      viz.on_event(event)
+  ```
+
+In the deterministic run (the default; `GOAL_DEMO_REAL` not set to `1`) the agent
+emits only scripted text, so you see only messages. In real mode (`GOAL_DEMO_REAL=1`)
+you also see the actual terminal commands, file edits, and `pytest` output the agent runs.
+
+## How this maps to the agent server
+
+`run_goal` (used here) and the agent server's `EventService.start_goal` use the
+same mechanism: they drive a single `Conversation`/`_conversation`, so every
+event lands in that conversation's shared log and streams to subscribers. A
+`POST /conversations/{id}/goal` endpoint runs the loop in the background on the
+**existing** conversation — same history as the main chat.
diff --git a/.pr/goal_shared_history.py b/.pr/goal_shared_history.py
@@ -0,0 +1,164 @@
+"""Runnable proof that the ``/goal`` loop writes into the SAME conversation history.
+
+What it does:
+  1. Sends a normal "main conversation" message and runs the agent.
+  2. Runs a ``/goal`` loop on the *same* ``Conversation`` object.
+  3. Prints the single shared event log and checks that the main-conversation
+     events are still there, untouched, with the goal's objective / agent work /
+     judge-driven followups / completion appended after them.
+
+The point: ``run_goal`` drives the conversation you pass in (it does not fork or
+spin up a sidecar), so everything lands in one ``conversation.state.events`` log
+under one ``conversation.id``. The agent-server ``EventService.start_goal`` uses
+the same mechanism on its single ``_conversation``, so this proves the property
+both paths rely on.
+
+Run it two ways:
+  # Deterministic, no network (scripted TestLLMs) -- always works, quick check:
+  uv run python .pr/goal_shared_history.py
+
+  # Real agent doing real work (creates files, runs pytest) -- opt in explicitly:
+  GOAL_DEMO_REAL=1 LLM_API_KEY=sk-... LLM_MODEL=gpt-5.5 \
+      uv run python .pr/goal_shared_history.py
+"""
+
+import os
+import tempfile
+
+from openhands.sdk import LLM, Agent, Conversation, Tool
+from openhands.sdk.conversation.goal import run_goal
+from openhands.sdk.conversation.visualizer import DefaultConversationVisualizer
+from openhands.sdk.event import LLMConvertibleEvent
+from openhands.sdk.llm import Message, TextContent, content_to_str
+from openhands.sdk.testing import TestLLM
+from openhands.tools.file_editor import FileEditorTool
+from openhands.tools.terminal import TerminalTool
+
+
+def dump_history(conversation, title: str) -> list:
+    """Print the conversation's full event log and return its events."""
+    events = list(conversation.state.events)
+    print(f"\n===== {title} =====")
+    print(f"conversation id : {conversation.id}")
+    print(f"total events    : {len(events)}")
+    for i, ev in enumerate(events):
+        if isinstance(ev, LLMConvertibleEvent):
+            text = " ".join(content_to_str(ev.to_llm_message().content))
+            text = text.strip().replace("\n", " ")
+            print(f"  [{i:>2}] {ev.to_llm_message().role:<9} {text[:96]}")
+        else:
+            print(f"  [{i:>2}] {type(ev).__name__}")
+    return events
+
+
+def _scripted(*texts: str, usage_id: str) -> TestLLM:
+    return TestLLM.from_messages(
+        [Message(role="assistant", content=[TextContent(text=t)]) for t in texts],
+        usage_id=usage_id,
+    )
+
+
+def build(real: bool):
+    """Return (agent, judge_llm, main_message, objective, max_iterations)."""
+    if real:
+        llm = LLM(
+            usage_id="agent",
+            model=os.getenv("LLM_MODEL", "gpt-5.5"),
+            api_key=os.getenv("LLM_API_KEY"),
+            base_url=os.getenv("LLM_BASE_URL"),
+        )
+        agent = Agent(
+            llm=llm,
+            tools=[Tool(name=TerminalTool.name), Tool(name=FileEditorTool.name)],
+        )
+        judge_llm = llm.model_copy(update={"usage_id": "goal-judge"})
+        objective = (
+            "Create mathx.py with an add(a, b) function and test_mathx.py with a "
+            "pytest test for it. The goal is complete only when `python -m pytest "
+            "-q` passes. Finish each turn with the finish tool."
+        )
+        return (
+            agent,
+            judge_llm,
+            "Say hello and tell me which directory you are in.",
+            objective,
+            5,
+        )
+
+    # Deterministic path: scripted agent (one content-only reply per run) + a
+    # judge that says "not done" once, then "done".
+    agent = Agent(
+        llm=_scripted(
+            "Hello! I am working in the demo workspace.",  # main turn
+            "I drafted mathx.py and a pytest for it.",  # goal round 1
+            "Fixed it -- mathx.py and test_mathx.py now pass.",  # goal round 2
+            usage_id="agent",
+        ),
+        tools=[],
+    )
+    judge_llm = _scripted(
+        '{"score": 0.3, "complete": false, "missing": "tests not passing yet"}',
+        '{"score": 1.0, "complete": true, "missing": ""}',
+        usage_id="goal-judge",
+    )
+    return agent, judge_llm, "Say hello.", "Make `pytest` pass for mathx.py.", 5
+
+
+def main() -> None:
+    """Run the shared-history demo end to end and print the proof.
+
+    Sends one main-conversation message, runs ``run_goal`` on the same
+    ``Conversation``, prints the event log before and after, prints the
+    ``PROOF`` summary, and finally replays every event through a
+    ``DefaultConversationVisualizer``.
+    """
+    # Real mode is explicit opt-in so the deterministic demo always works,
+    # even when a (possibly stale) LLM_API_KEY is present in the environment.
+    real = os.getenv("GOAL_DEMO_REAL") == "1"
+    print(f"mode: {'REAL LLM' if real else 'DETERMINISTIC (scripted TestLLM)'}")
+
+    agent, judge_llm, main_message, objective, max_iters = build(real)
+    # NOTE: the temp dir is not removed here; its path is printed after the proof.
+    workspace = tempfile.mkdtemp(prefix="goal-demo-")
+    # visualizer=None keeps the output focused on the proof below.
+    # The same temp dir is used as both workspace and persistence_dir.
+    conversation = Conversation(
+        agent=agent, workspace=workspace, visualizer=None, persistence_dir=workspace
+    )
+    convo_id = conversation.id
+
+    # 1) A normal "main conversation" turn.
+    conversation.send_message(main_message)
+    conversation.run()
+    main_events = dump_history(conversation, "AFTER MAIN CONVERSATION TURN")
+    # Snapshot the ids now so the post-goal log can be compared against them.
+    main_ids = [ev.id for ev in main_events]
+
+    # 2) A /goal loop on the SAME conversation object.
+    print(f"\n>>> running /goal: {objective}\n")
+    outcome = run_goal(conversation, objective, judge_llm, max_iterations=max_iters)
+
+    all_events = dump_history(conversation, "AFTER /goal LOOP (SAME CONVERSATION)")
+    all_ids = [ev.id for ev in all_events]
+
+    # 3) Prove it is one shared history.
+    # Only the first 20 characters of the objective are searched for.
+    objective_in_log = any(
+        objective[:20] in " ".join(content_to_str(ev.to_llm_message().content))
+        for ev in all_events
+        if isinstance(ev, LLMConvertibleEvent)
+    )
+    print("\n===== PROOF (shared history) =====")
+    print(f"same conversation id .............. {conversation.id == convo_id}")
+    # NOTE(review): the next line is a literal string, not a computed check.
+    print("only one Conversation object ...... True (no fork was created)")
+    print(f"event log GREW in place ........... {len(main_ids)} -> {len(all_ids)}")
+    # True only if the post-goal log starts with exactly the pre-goal ids, in order.
+    print(f"main-convo events still present ... {all_ids[: len(main_ids)] == main_ids}")
+    print(f"goal objective is in THIS log ..... {objective_in_log}")
+    print(
+        f"goal outcome ...................... {outcome.status} "
+        f"(after {outcome.iterations} round(s))"
+    )
+    print(f"\nworkspace: {workspace}")
+
+    # Visualize the whole thing AFTER the fact. Because every turn (main + goal)
+    # is persisted in conversation.state.events, we can replay the conversation
+    # through the SDK's visualizer at any time -- here, after the run finished.
+    # (For LIVE output instead, drop `visualizer=None` above; the default
+    # DefaultConversationVisualizer then prints each event as it happens.)
+    print("\n===== REPLAY (visualizing the saved conversation) =====")
+    visualizer = DefaultConversationVisualizer()
+    for event in conversation.state.events:
+        visualizer.on_event(event)
+
+
+# Run the demo only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/openhands-agent-server/openhands/agent_server/conversation_router.py b/openhands-agent-server/openhands/agent_server/conversation_router.py
@@ -34,6 +34,7 @@
     SetConfirmationPolicyRequest,
     SetSecurityAnalyzerRequest,
     StartConversationRequest,
+    StartGoalRequest,
     Success,
     UpdateConversationRequest,
     UpdateSecretsRequest,
@@ -286,6 +287,90 @@ async def run_conversation(
     return Success()
 
 
+@conversation_router.post(
+    "/{conversation_id}/goal",
+    responses={
+        404: {"description": "Item not found"},
+        409: {"description": "Conversation or goal is already running"},
+    },
+)
+async def start_goal_conversation(
+    conversation_id: UUID,
+    request: StartGoalRequest,
+    conversation_service: ConversationService = Depends(get_conversation_service),
+) -> Success:
+    """Start a ``/goal`` driver loop in the background on the conversation.
+
+    Drives the agent toward ``objective``, judging completion after each run and
+    re-prompting until done or ``max_iterations``. All work lands in the same
+    conversation history and event stream as the main chat (it is not a fork).
+    """
+    event_service = await conversation_service.get_event_service(conversation_id)
+    if event_service is None:
+        raise HTTPException(status.HTTP_404_NOT_FOUND)
+
+    try:
+        await event_service.start_goal(
+            request.objective, max_iterations=request.max_iterations
+        )
+    except ValueError as e:
+        message = str(e)
+        if message in ("conversation_already_running", "goal_already_running"):
+            raise HTTPException(
+                status_code=status.HTTP_409_CONFLICT,
+                detail="Conversation or goal already running.",
+            )
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=message)
+
+    return Success()
+
+
+@conversation_router.post(
+    "/{conversation_id}/goal/stop",
+    responses={404: {"description": "Item not found"}},
+)
+async def stop_goal_conversation(
+    conversation_id: UUID,
+    conversation_service: ConversationService = Depends(get_conversation_service),
+) -> Success:
+    """Stop a running ``/goal`` loop. The goal records a resumable state."""
+    event_service = await conversation_service.get_event_service(conversation_id)
+    if event_service is None:
+        raise HTTPException(status.HTTP_404_NOT_FOUND)
+    await event_service.stop_goal()
+    return Success()
+
+
+@conversation_router.post(
+    "/{conversation_id}/goal/resume",
+    responses={
+        404: {"description": "Item not found"},
+        409: {"description": "Conversation or goal is already running"},
+    },
+)
+async def resume_goal_conversation(
+    conversation_id: UUID,
+    conversation_service: ConversationService = Depends(get_conversation_service),
+) -> Success:
+    """Resume a previously interrupted ``/goal`` loop from where it left off."""
+    event_service = await conversation_service.get_event_service(conversation_id)
+    if event_service is None:
+        raise HTTPException(status.HTTP_404_NOT_FOUND)
+
+    try:
+        await event_service.resume_goal()
+    except ValueError as e:
+        message = str(e)
+        if message in ("conversation_already_running", "goal_already_running"):
+            raise HTTPException(
+                status_code=status.HTTP_409_CONFLICT,
+                detail="Conversation or goal already running.",
+            )
+        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=message)
+
+    return Success()
+
+
 @conversation_router.post(
     "/{conversation_id}/secrets", responses={404: {"description": "Item not found"}}
 )