aden-hive
diff --git a/core/framework/agents/hive_coder/nodes/__init__.py b/core/framework/agents/hive_coder/nodes/__init__.py
Lines changed: 59 additions & 13 deletions
diff --git a/core/framework/agents/hive_coder/reference/anti_patterns.md b/core/framework/agents/hive_coder/reference/anti_patterns.md
Lines changed: 4 additions & 0 deletions
diff --git a/core/framework/agents/hive_coder/reference/framework_guide.md b/core/framework/agents/hive_coder/reference/framework_guide.md
Lines changed: 1 addition & 1 deletion
diff --git a/core/framework/agents/hive_coder/reference/gcu_guide.md b/core/framework/agents/hive_coder/reference/gcu_guide.md
Lines changed: 119 additions & 0 deletions
diff --git a/core/framework/config.py b/core/framework/config.py
Lines changed: 5 additions & 0 deletions
diff --git a/core/framework/credentials/validation.py b/core/framework/credentials/validation.py
Lines changed: 1 addition & 5 deletions
diff --git a/core/framework/graph/conversation.py b/core/framework/graph/conversation.py
Lines changed: 26 additions & 5 deletions
diff --git a/core/framework/graph/conversation_judge.py b/core/framework/graph/conversation_judge.py
Lines changed: 32 additions & 7 deletions
@@ -7,19 +7,38 @@
 # Load reference docs at import time so they're always in the system prompt.
 # No voluntary read_file() calls needed — the LLM gets everything upfront.
 _ref_dir = Path(__file__).parent.parent / "reference"
+# Decode explicitly as UTF-8: the guides contain non-ASCII punctuation, and
+# Path.read_text() without an encoding falls back to the locale's encoding.
 _framework_guide = (_ref_dir / "framework_guide.md").read_text(encoding="utf-8")
 _file_templates = (_ref_dir / "file_templates.md").read_text(encoding="utf-8")
 _anti_patterns = (_ref_dir / "anti_patterns.md").read_text(encoding="utf-8")
+_gcu_guide_path = _ref_dir / "gcu_guide.md"
+# The GCU guide is optional: when the file is absent the appendix text is empty.
+_gcu_guide = _gcu_guide_path.read_text(encoding="utf-8") if _gcu_guide_path.exists() else ""
+
+
+def _is_gcu_enabled() -> bool:
+    """Report whether the GCU (browser automation) feature is switched on.
+
+    The flag is read through ``framework.config.get_gcu_enabled``, which is
+    imported inside the function body rather than at module import time.
+
+    Returns:
+        Whatever ``get_gcu_enabled()`` returns, or ``False`` when importing
+        or calling it raises any ``Exception`` (best-effort: a failure to
+        read the flag is treated as "disabled").
+    """
+    try:
+        from framework.config import get_gcu_enabled
+
+        enabled = get_gcu_enabled()
+    except Exception:
+        enabled = False
+    return enabled
+
+
+def _build_appendices() -> str:
+    """Assemble the reference appendices shared by the coding nodes' prompts.
+
+    The framework guide, file templates, and anti-patterns sections are
+    always present, each under its own ``# Appendix: ...`` heading.  The GCU
+    browser automation guide is appended only when ``_is_gcu_enabled()`` is
+    true and the guide text is non-empty.
+
+    Returns:
+        The concatenated appendix text.
+    """
+    sections = [
+        "\n\n# Appendix: Framework Reference\n\n",
+        _framework_guide,
+        "\n\n# Appendix: File Templates\n\n",
+        _file_templates,
+        "\n\n# Appendix: Anti-Patterns\n\n",
+        _anti_patterns,
+    ]
+    if _is_gcu_enabled() and _gcu_guide:
+        sections.append("\n\n# Appendix: GCU Browser Automation Guide\n\n")
+        sections.append(_gcu_guide)
+    return "".join(sections)
+
 
 # Shared appendices — appended to every coding node's system prompt.
-_appendices = (
-    "\n\n# Appendix: Framework Reference\n\n"
-    + _framework_guide
-    + "\n\n# Appendix: File Templates\n\n"
-    + _file_templates
-    + "\n\n# Appendix: Anti-Patterns\n\n"
-    + _anti_patterns
-)
+_appendices = _build_appendices()
 
 # Tools available to both coder (worker) and queen.
 _SHARED_TOOLS = [
@@ -391,7 +410,10 @@
 **Node rules**:
 - **2-4 nodes MAX.** Never exceed 4. Merge thin nodes aggressively.
 - A node with 0 tools is NOT a real node — merge it.
-- node_type always "event_loop"
+- node_type "event_loop" for all regular graph nodes. Use "gcu" ONLY for
+  browser automation subagents (see GCU appendix). GCU nodes MUST be in a
+  parent node's sub_agents list, NEVER connected via edges, and NEVER used
+  as entry/terminal nodes.
 - max_node_visits default is 0 (unbounded) — correct for forever-alive. \
 Only set >0 in one-shot agents with bounded feedback loops.
 - Feedback inputs: nullable_output_keys
@@ -539,6 +561,11 @@
 this session. If a worker is already loaded, it is automatically unloaded \
 first. Call after building and validating an agent to make it available \
 immediately.
+
+## Credentials
+- list_credentials(credential_id?) — List all authorized credentials in the \
+local store. Returns IDs, aliases, status, and identity metadata (never \
+secrets). Optionally filter by credential_id.
 """
 
 _queen_behavior = """
@@ -589,14 +616,29 @@
 - For tasks matching the worker's goal, call start_worker(task).
 - For everything else, do it directly.
 
+## When the user clicks Run (external event notification)
+When you receive an event that the user clicked Run:
+- If the worker started successfully, briefly acknowledge it — do NOT \
+repeat the full status. The user can see the graph is running.
+- If the worker failed to start (credential or structural error), \
+explain the problem clearly and help fix it. For credential errors, \
+guide the user to set up the missing credentials. For structural \
+issues, offer to fix the agent graph directly.
+
 ## When worker is running:
-- If the user asks about progress, call get_worker_status().
+- If the user asks about progress, call get_worker_status() ONCE and \
+report the result. Do NOT poll in a loop.
+- NEVER call get_worker_status() repeatedly without user input in between. \
+The worker will surface results through client-facing nodes. You do not \
+need to monitor it. One check per user request is enough.
 - If the user has a concern or instruction for the worker, call \
 inject_worker_message(content) to relay it.
 - You can still do coding tasks directly while the worker runs.
 - If an escalation ticket arrives from the judge, assess severity:
   - Low/transient: acknowledge silently, do not disturb the user.
   - High/critical: notify the user with a brief analysis and suggested action.
+- After starting the worker or checking its status, WAIT for the user's \
+next message. Do not take autonomous actions unless the user asks.
 
 ## When worker asks user a question:
 - The system will route the user's response directly to the worker. \
@@ -778,6 +820,8 @@
         "notify_operator",
         # Agent loading
         "load_built_agent",
+        # Credentials
+        "list_credentials",
     ],
     system_prompt=(
         "You are the Queen — the user's primary interface. You are a coding agent "
@@ -803,6 +847,8 @@
     "notify_operator",
     # Agent loading
     "load_built_agent",
+    # Credentials
+    "list_credentials",
 ]
 
 __all__ = [
 
@@ -105,3 +105,7 @@ def test_research_routes_back_to_interact(self):
 23. **Forgetting sys.path setup in conftest.py** — Tests need `exports/` and `core/` on sys.path.
 
 24. **Not using auto_responder for client-facing nodes** — Tests with client-facing nodes hang without an auto-responder that injects input. But note: even WITH auto_responder, forever-alive agents still hang because the graph never terminates. Auto-responder only helps for agents with terminal nodes.
+
+25. **Manually wiring browser tools on event_loop nodes** — If the agent needs browser automation, use `node_type="gcu"`, which auto-includes all browser tools and prepends best-practices guidance. Do NOT manually list browser tool names on event_loop nodes — the names may not exist in the MCP server, and a hand-written list may be incomplete. See the GCU Guide appendix.
+
+26. **Using GCU nodes as regular graph nodes** — GCU nodes (`node_type="gcu"`) are exclusively subagents. They must ONLY appear in a parent node's `sub_agents=["gcu-node-id"]` list and be invoked via `delegate_to_sub_agent()`. They must NEVER be connected via edges, used as entry nodes, or used as terminal nodes. If a GCU node appears as an edge source or target, the graph will fail pre-load validation.
@@ -72,7 +72,7 @@ goal = Goal(
 | id | str | required | kebab-case identifier |
 | name | str | required | Display name |
 | description | str | required | What the node does |
-| node_type | str | required | Always `"event_loop"` |
+| node_type | str | required | `"event_loop"` or `"gcu"` (browser automation — see GCU Guide appendix) |
 | input_keys | list[str] | required | Memory keys this node reads |
 | output_keys | list[str] | required | Memory keys this node writes via set_output |
 | system_prompt | str | "" | LLM instructions |
 
@@ -0,0 +1,119 @@
+# GCU Browser Automation Guide
+
+## When to Use GCU Nodes
+
+Use `node_type="gcu"` when:
+- The user's workflow requires **navigating real websites** (scraping, form-filling, social media interaction, testing web UIs)
+- The task involves **dynamic/JS-rendered pages** that `web_scrape` cannot handle (SPAs, infinite scroll, login-gated content)
+- The agent needs to **interact with a website** — clicking, typing, scrolling, selecting, uploading files
+
+Do NOT use GCU for:
+- Static content that `web_scrape` handles fine
+- API-accessible data (use the API directly)
+- PDF/file processing
+- Anything that doesn't require a browser UI
+
+## What GCU Nodes Are
+
+- `node_type="gcu"` — a declarative enhancement over `event_loop`
+- Framework auto-prepends browser best-practices system prompt
+- Framework auto-includes all 31 browser tools from `gcu-tools` MCP server
+- Same underlying `EventLoopNode` class — no new imports needed
+- `tools=[]` is correct — tools are auto-populated at runtime
+
+## GCU Architecture Pattern
+
+GCU nodes are **subagents** — invoked via `delegate_to_sub_agent()`, not connected via edges.
+
+- Primary nodes (`event_loop`, client-facing) orchestrate; GCU nodes do browser work
+- Parent node declares `sub_agents=["gcu-node-id"]` and calls `delegate_to_sub_agent(agent_id="gcu-node-id", task="...")`
+- GCU nodes set `max_node_visits=1` (single execution per delegation), `client_facing=False`
+- GCU nodes use `output_keys=["result"]` and return structured JSON via `set_output("result", ...)`
+
+## GCU Node Definition Template
+
+```python
+gcu_browser_node = NodeSpec(
+    id="gcu-browser-worker",
+    name="Browser Worker",
+    description="Browser subagent that does X.",
+    node_type="gcu",
+    client_facing=False,
+    max_node_visits=1,
+    input_keys=[],
+    output_keys=["result"],
+    tools=[],  # Auto-populated with all browser tools
+    system_prompt="""\
+You are a browser agent. Your job: [specific task].
+
+## Workflow
+1. browser_start (only if no browser is running yet)
+2. browser_open(url=TARGET_URL) — note the returned targetId
+3. browser_snapshot to read the page
+4. [task-specific steps]
+5. set_output("result", JSON)
+
+## Output format
+set_output("result", JSON) with:
+- [field]: [type and description]
+""",
+)
+```
+
+## Parent Node Template (orchestrating GCU subagents)
+
+```python
+orchestrator_node = NodeSpec(
+    id="orchestrator",
+    ...
+    node_type="event_loop",
+    sub_agents=["gcu-browser-worker"],
+    system_prompt="""\
+...
+delegate_to_sub_agent(
+    agent_id="gcu-browser-worker",
+    task="Navigate to [URL]. Do [specific task]. Return JSON with [fields]."
+)
+...
+""",
+    tools=[],  # Orchestrator doesn't need browser tools
+)
+```
+
+## mcp_servers.json with GCU
+
+```json
+{
+  "hive-tools": { ... },
+  "gcu-tools": {
+    "transport": "stdio",
+    "command": "uv",
+    "args": ["run", "python", "-m", "gcu.server", "--stdio"],
+    "cwd": "../../tools",
+    "description": "GCU tools for browser automation"
+  }
+}
+```
+
+Note: `gcu-tools` is auto-added if any node uses `node_type="gcu"`, but including it explicitly is fine.
+
+## GCU System Prompt Best Practices
+
+Key rules to bake into GCU node prompts:
+
+- Prefer `browser_snapshot` over `browser_get_text("body")` — compact accessibility tree vs 100KB+ raw HTML
+- Always `browser_wait` after navigation
+- Use large scroll amounts (~2000-5000) for lazy-loaded content
+- For spillover files, use `run_command` with grep, not `read_file`
+- If auth wall detected, report immediately — don't attempt login
+- Keep tool calls per turn ≤10
+- Tab isolation: when browser is already running, use `browser_open(background=true)` and pass `target_id` to every call
+
+## GCU Anti-Patterns
+
+- Using `browser_screenshot` to read text (use `browser_snapshot`)
+- Re-navigating after scrolling (resets scroll position)
+- Attempting login on auth walls
+- Forgetting `target_id` in multi-tab scenarios
+- Putting browser tools directly on `event_loop` nodes instead of using GCU subagent pattern
+- Making GCU nodes `client_facing=True` (they should be autonomous subagents)
@@ -90,6 +90,11 @@ def get_api_key() -> str | None:
     return None
 
 
+def get_gcu_enabled() -> bool:
+    """Return whether GCU (browser automation) is enabled in user config.
+
+    Looks up the top-level ``"gcu_enabled"`` key of the hive config and
+    returns its value as stored; the result is ``False`` when the key is
+    absent.
+    """
+    hive_config = get_hive_config()
+    return hive_config.get("gcu_enabled", False)
+
+
 def get_api_base() -> str | None:
     """Return the api_base URL for OpenAI-compatible endpoints, if configured."""
     llm = get_hive_config().get("llm", {})
 
@@ -159,11 +159,7 @@ def format_error_message(self) -> str:
                     f"  {c.env_var} for {_label(c)}"
                     f"\n    Connect this integration at hive.adenhq.com first."
                 )
-        lines.append(
-            "\nTo fix: run /hive-credentials in Claude Code."
-            "\nIf you've already set up credentials, "
-            "restart your terminal to load them."
-        )
+        lines.append("\nIf you've already set up credentials, restart your terminal to load them.")
         return "\n".join(lines)
 
 
 
@@ -107,17 +107,38 @@ def _extract_spillover_filename(content: str) -> str | None:
 def _compact_tool_calls(tool_calls: list[dict[str, Any]]) -> list[dict[str, Any]]:
     """Truncate tool_call arguments to save context tokens during compaction.
 
-    Preserves ``id``, ``type``, and ``function.name`` exactly.  Truncates
-    ``function.arguments`` (a JSON string) to at most ``_TC_ARG_LIMIT`` chars
-    so that large payloads (e.g. set_output with full findings) don't survive
-    compaction and defeat the purpose of context reduction.
+    Preserves ``id``, ``type``, and ``function.name`` exactly.  When arguments
+    exceed ``_TC_ARG_LIMIT``, replaces the full JSON string with a compact
+    **valid** JSON summary.  The Anthropic API parses tool_call arguments and
+    rejects requests with malformed JSON (e.g. unterminated strings), so we
+    must never produce broken JSON here.
     """
     compact = []
     for tc in tool_calls:
         func = tc.get("function", {})
         args = func.get("arguments", "")
         if len(args) > _TC_ARG_LIMIT:
-            args = args[:_TC_ARG_LIMIT] + "…[truncated]"
+            # Build a valid JSON summary instead of slicing mid-string.
+            # Try to extract top-level keys for a meaningful preview.
+            try:
+                parsed = json.loads(args)
+                if isinstance(parsed, dict):
+                    # Preserve key names, truncate values
+                    summary_parts = []
+                    for k, v in parsed.items():
+                        v_str = str(v)
+                        if len(v_str) > 60:
+                            v_str = v_str[:60] + "..."
+                        summary_parts.append(f"{k}={v_str}")
+                    summary = ", ".join(summary_parts)
+                    if len(summary) > _TC_ARG_LIMIT:
+                        summary = summary[:_TC_ARG_LIMIT] + "..."
+                    args = json.dumps({"_compacted": summary})
+                else:
+                    args = json.dumps({"_compacted": str(parsed)[:_TC_ARG_LIMIT]})
+            except (json.JSONDecodeError, TypeError):
+                # Args were already invalid JSON — wrap the preview safely
+                args = json.dumps({"_compacted": args[:_TC_ARG_LIMIT]})
         compact.append(
             {
                 "id": tc.get("id", ""),
 
@@ -103,7 +103,12 @@ async def evaluate_phase_completion(
 
 
 def _extract_recent_context(conversation: NodeConversation, max_messages: int = 10) -> str:
-    """Extract recent conversation messages for evaluation."""
+    """Extract recent conversation messages for evaluation.
+
+    Includes tool-call summaries from assistant messages so the judge
+    can see what tools were invoked (especially set_output values) even
+    when the assistant message body is empty.
+    """
     messages = conversation.messages
     recent = messages[-max_messages:] if len(messages) > max_messages else messages
 
@@ -112,8 +117,24 @@ def _extract_recent_context(conversation: NodeConversation, max_messages: int =
         role = msg.role.upper()
         content = msg.content or ""
         # Truncate long tool results
-        if msg.role == "tool" and len(content) > 200:
-            content = content[:200] + "..."
+        if msg.role == "tool" and len(content) > 500:
+            content = content[:500] + "..."
+        # For assistant messages with empty content but tool_calls,
+        # summarise the tool calls so the judge knows what happened.
+        if msg.role == "assistant" and not content.strip():
+            tool_calls = getattr(msg, "tool_calls", None)
+            if tool_calls:
+                tc_parts = []
+                for tc in tool_calls:
+                    fn = tc.get("function", {}) if isinstance(tc, dict) else {}
+                    name = fn.get("name", "")
+                    args = fn.get("arguments", "")
+                    if name == "set_output":
+                        # Show the value so the judge can evaluate content quality
+                        tc_parts.append(f"  called {name}({args[:1000]})")
+                    else:
+                        tc_parts.append(f"  called {name}(...)")
+                content = "Tool calls:\n" + "\n".join(tc_parts)
         if content.strip():
             parts.append(f"[{role}]: {content.strip()}")
 
@@ -125,6 +146,10 @@ def _format_outputs(accumulator_state: dict[str, Any]) -> str:
 
     Lists and dicts get structural formatting so the judge can assess
     quantity and structure, not just a truncated stringification.
+
+    String values are given a generous limit (2000 chars) so the judge
+    can verify substantive content (e.g. a research brief with key
+    questions, scope boundaries, and deliverables).
     """
     if not accumulator_state:
         return "(none)"
@@ -144,12 +169,12 @@ def _format_outputs(accumulator_state: dict[str, Any]) -> str:
                 val_str += f"\n    ... and {len(value) - 8} more"
         elif isinstance(value, dict):
             val_str = str(value)
-            if len(val_str) > 400:
-                val_str = val_str[:400] + "..."
+            if len(val_str) > 2000:
+                val_str = val_str[:2000] + "..."
         else:
             val_str = str(value)
-            if len(val_str) > 300:
-                val_str = val_str[:300] + "..."
+            if len(val_str) > 2000:
+                val_str = val_str[:2000] + "..."
         parts.append(f"  {key}: {val_str}")
     return "\n".join(parts)