Skip to content

Commit 8a0cf5e

Browse files
Merge remote-tracking branch 'origin/feature/queen-worker-comm' into feature/queen-worker-comm
2 parents 0bfbf1e + 69218d5 commit 8a0cf5e

File tree

9 files changed

+872
-79
lines changed

9 files changed

+872
-79
lines changed

core/framework/graph/conversation.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -107,17 +107,38 @@ def _extract_spillover_filename(content: str) -> str | None:
107107
def _compact_tool_calls(tool_calls: list[dict[str, Any]]) -> list[dict[str, Any]]:
108108
"""Truncate tool_call arguments to save context tokens during compaction.
109109
110-
Preserves ``id``, ``type``, and ``function.name`` exactly. Truncates
111-
``function.arguments`` (a JSON string) to at most ``_TC_ARG_LIMIT`` chars
112-
so that large payloads (e.g. set_output with full findings) don't survive
113-
compaction and defeat the purpose of context reduction.
110+
Preserves ``id``, ``type``, and ``function.name`` exactly. When arguments
111+
exceed ``_TC_ARG_LIMIT``, replaces the full JSON string with a compact
112+
**valid** JSON summary. The Anthropic API parses tool_call arguments and
113+
rejects requests with malformed JSON (e.g. unterminated strings), so we
114+
must never produce broken JSON here.
114115
"""
115116
compact = []
116117
for tc in tool_calls:
117118
func = tc.get("function", {})
118119
args = func.get("arguments", "")
119120
if len(args) > _TC_ARG_LIMIT:
120-
args = args[:_TC_ARG_LIMIT] + "…[truncated]"
121+
# Build a valid JSON summary instead of slicing mid-string.
122+
# Try to extract top-level keys for a meaningful preview.
123+
try:
124+
parsed = json.loads(args)
125+
if isinstance(parsed, dict):
126+
# Preserve key names, truncate values
127+
summary_parts = []
128+
for k, v in parsed.items():
129+
v_str = str(v)
130+
if len(v_str) > 60:
131+
v_str = v_str[:60] + "..."
132+
summary_parts.append(f"{k}={v_str}")
133+
summary = ", ".join(summary_parts)
134+
if len(summary) > _TC_ARG_LIMIT:
135+
summary = summary[:_TC_ARG_LIMIT] + "..."
136+
args = json.dumps({"_compacted": summary})
137+
else:
138+
args = json.dumps({"_compacted": str(parsed)[:_TC_ARG_LIMIT]})
139+
except (json.JSONDecodeError, TypeError):
140+
# Args were already invalid JSON — wrap the preview safely
141+
args = json.dumps({"_compacted": args[:_TC_ARG_LIMIT]})
121142
compact.append(
122143
{
123144
"id": tc.get("id", ""),

core/framework/graph/edge.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,10 @@ class AsyncEntryPointSpec(BaseModel):
338338
max_concurrent: int = Field(
339339
default=10, description="Maximum concurrent executions for this entry point"
340340
)
341+
max_resurrections: int = Field(
342+
default=3,
343+
description="Auto-restart on non-fatal failure (0 to disable)",
344+
)
341345

342346
model_config = {"extra": "allow"}
343347

core/framework/graph/event_loop_node.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,7 @@ async def execute(self, ctx: NodeContext) -> NodeResult:
511511
# 5. Stall / doom loop detection state (restored from cursor if resuming)
512512
recent_responses: list[str] = _restored_recent_responses
513513
recent_tool_fingerprints: list[list[tuple[str, str]]] = _restored_tool_fingerprints
514+
_consecutive_empty_turns: int = 0
514515

515516
# 6. Main loop
516517
for iteration in range(start_iteration, self._config.max_iterations):
@@ -649,6 +650,22 @@ async def execute(self, ctx: NodeContext) -> NodeResult:
649650
error=str(e)[:500],
650651
execution_id=execution_id,
651652
)
653+
654+
# For malformed tool call errors, inject feedback into
655+
# the conversation before retrying. Retrying with the
656+
# same messages is futile — the LLM will reproduce the
657+
# same truncated JSON. The nudge tells it to shorten
658+
# its arguments.
659+
error_str = str(e).lower()
660+
if "failed to parse tool call" in error_str:
661+
await conversation.add_user_message(
662+
"[System: Your previous tool call had malformed "
663+
"JSON arguments (likely truncated). Keep your "
664+
"tool call arguments shorter and simpler. Do NOT "
665+
"repeat the same long argument — summarize or "
666+
"split into multiple calls.]"
667+
)
668+
652669
await asyncio.sleep(delay)
653670
continue # retry same iteration
654671

@@ -774,6 +791,57 @@ async def execute(self, ctx: NodeContext) -> NodeResult:
774791
latency_ms=latency_ms,
775792
conversation=conversation if _is_continuous else None,
776793
)
794+
else:
795+
# Ghost empty stream: LLM returned nothing and outputs
796+
# are still missing. The conversation hasn't changed, so
797+
# repeating the same call will produce the same empty
798+
# result. Inject a nudge to break the cycle.
799+
_consecutive_empty_turns += 1
800+
logger.warning(
801+
"[%s] iter=%d: empty response with missing outputs %s (consecutive=%d)",
802+
node_id,
803+
iteration,
804+
missing,
805+
_consecutive_empty_turns,
806+
)
807+
if _consecutive_empty_turns >= self._config.stall_detection_threshold:
808+
# Persistent ghost stream — fail the node.
809+
error_msg = (
810+
f"Ghost empty stream: {_consecutive_empty_turns} "
811+
f"consecutive empty responses with missing "
812+
f"outputs {missing}"
813+
)
814+
latency_ms = int((time.time() - start_time) * 1000)
815+
if ctx.runtime_logger:
816+
ctx.runtime_logger.log_node_complete(
817+
node_id=node_id,
818+
node_name=ctx.node_spec.name,
819+
node_type="event_loop",
820+
success=False,
821+
error=error_msg,
822+
total_steps=iteration + 1,
823+
tokens_used=total_input_tokens + total_output_tokens,
824+
input_tokens=total_input_tokens,
825+
output_tokens=total_output_tokens,
826+
latency_ms=latency_ms,
827+
exit_status="ghost_stream",
828+
accept_count=_accept_count,
829+
retry_count=_retry_count,
830+
escalate_count=_escalate_count,
831+
continue_count=_continue_count,
832+
)
833+
raise RuntimeError(error_msg)
834+
# First nudge — inject a system message to break the
835+
# empty-response cycle.
836+
await conversation.add_user_message(
837+
"[System: Your response was empty. You have required "
838+
f"outputs that are not yet set: {missing}. Review "
839+
"your task and call the appropriate tools to make "
840+
"progress.]"
841+
)
842+
continue
843+
else:
844+
_consecutive_empty_turns = 0
777845

778846
# 6f. Stall detection
779847
recent_responses.append(assistant_text)
@@ -2502,6 +2570,7 @@ def _is_transient_error(exc: BaseException) -> bool:
25022570
"service unavailable",
25032571
"bad gateway",
25042572
"overloaded",
2573+
"failed to parse tool call",
25052574
]
25062575
return any(kw in error_str for kw in transient_keywords)
25072576

core/framework/llm/litellm.py

Lines changed: 28 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,11 @@ def _is_stream_transient_error(exc: BaseException) -> bool:
237237
238238
Transient errors (recoverable=True): network issues, server errors, timeouts.
239239
Permanent errors (recoverable=False): auth, bad request, context window, etc.
240+
241+
NOTE: "Failed to parse tool call arguments" (malformed LLM output) is NOT
242+
transient at the stream level — retrying with the same messages produces the
243+
same malformed output. This error is handled at the EventLoopNode level
244+
where the conversation can be modified before retrying.
240245
"""
241246
try:
242247
from litellm.exceptions import (
@@ -917,30 +922,6 @@ async def stream(
917922
# and we skip the retry path — nothing was yielded in vain.)
918923
has_content = accumulated_text or tool_calls_acc
919924
if not has_content:
920-
# If the conversation ends with an assistant or tool
921-
# message, an empty stream is expected — the LLM has
922-
# nothing new to say. Don't burn retries on this;
923-
# let the caller (EventLoopNode) decide what to do.
924-
# Typical case: client_facing node where the LLM set
925-
# all outputs via set_output tool calls, and the tool
926-
# results are the last messages.
927-
last_role = next(
928-
(m["role"] for m in reversed(full_messages) if m.get("role") != "system"),
929-
None,
930-
)
931-
if last_role in ("assistant", "tool"):
932-
logger.warning(
933-
"[stream] %s returned empty stream after %s message "
934-
"(no text, no tool calls). Treating as a no-op turn. "
935-
"If this repeats, the agent may be stuck — check for "
936-
"ghost empty assistant messages in conversation history.",
937-
self.model,
938-
last_role,
939-
)
940-
for event in tail_events:
941-
yield event
942-
return
943-
944925
# finish_reason=length means the model exhausted
945926
# max_tokens before producing content. Retrying with
946927
# the same max_tokens will never help.
@@ -958,10 +939,16 @@ async def stream(
958939
yield event
959940
return
960941

961-
# Empty stream after a user message — use short fixed
962-
# retries, not the rate-limit backoff. This is likely
963-
# a deterministic conversation-structure issue, so long
964-
# exponential waits don't help.
942+
# Empty stream — always retry regardless of last message
943+
# role. Ghost empty streams after tool results are NOT
944+
# expected no-ops; they create infinite loops when the
945+
# conversation doesn't change between iterations.
946+
# After retries, return the empty result and let the
947+
# caller (EventLoopNode) decide how to handle it.
948+
last_role = next(
949+
(m["role"] for m in reversed(full_messages) if m.get("role") != "system"),
950+
None,
951+
)
965952
if attempt < EMPTY_STREAM_MAX_RETRIES:
966953
token_count, token_method = _estimate_tokens(
967954
self.model,
@@ -974,7 +961,8 @@ async def stream(
974961
attempt=attempt,
975962
)
976963
logger.warning(
977-
f"[stream-retry] {self.model} returned empty stream — "
964+
f"[stream-retry] {self.model} returned empty stream "
965+
f"after {last_role} message — "
978966
f"~{token_count} tokens ({token_method}). "
979967
f"Request dumped to: {dump_path}. "
980968
f"Retrying in {EMPTY_STREAM_RETRY_DELAY}s "
@@ -983,7 +971,17 @@ async def stream(
983971
await asyncio.sleep(EMPTY_STREAM_RETRY_DELAY)
984972
continue
985973

986-
# Success (or final attempt) — flush remaining events.
974+
# All retries exhausted — log and return the empty
975+
# result. EventLoopNode's empty response guard will
976+
# accept if all outputs are set, or handle the ghost
977+
# stream case if outputs are still missing.
978+
logger.error(
979+
f"[stream] {self.model} returned empty stream after "
980+
f"{EMPTY_STREAM_MAX_RETRIES} retries "
981+
f"(last_role={last_role}). Returning empty result."
982+
)
983+
984+
# Success (or empty after exhausted retries) — flush events.
987985
for event in tail_events:
988986
yield event
989987
return

core/framework/runner/runner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1274,6 +1274,7 @@ def _setup_agent_runtime(
12741274
isolation_level=async_ep.isolation_level,
12751275
priority=async_ep.priority,
12761276
max_concurrent=async_ep.max_concurrent,
1277+
max_resurrections=async_ep.max_resurrections,
12771278
)
12781279
entry_points.append(ep)
12791280

core/framework/runtime/event_bus.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,9 @@ class EventType(StrEnum):
130130
WORKER_ESCALATION_TICKET = "worker_escalation_ticket"
131131
QUEEN_INTERVENTION_REQUESTED = "queen_intervention_requested"
132132

133+
# Execution resurrection (auto-restart on non-fatal failure)
134+
EXECUTION_RESURRECTED = "execution_resurrected"
135+
133136
# Worker lifecycle (session manager → frontend)
134137
WORKER_LOADED = "worker_loaded"
135138
CREDENTIALS_REQUIRED = "credentials_required"

0 commit comments

Comments (0)