fix(chat): repair dangling tool_use/tool_result pairs in replayed history

swaroopvarma2359 · swaroopvarma1 · commit e003517089a2 · 2026-06-12T12:43:29.000+05:30
A crash or cancel between the assistant-row persist and the tool-result
persist leaves a dangling tool_use in chat history; window truncation
(CHAT_HISTORY_REPLAY_LIMIT takes the last N rows) can orphan leading
tool_result rows. Both shapes make every later turn fail with provider
400s — the session is bricked.

repair_dangling_tool_uses() makes replayed history provider-safe in both
directions: every unmatched tool_use id gets a synthetic error result,
and orphan tool_result messages (no preceding tool_use in the window,
e.g. leading rows after truncation) are dropped. Duplicate answers for
one id are deduped, first answer wins. Applied on the /message replay
path; no-op on well-formed history.
diff --git a/app/ai/voice/agents/breeze_buddy/chat/block_codec.py b/app/ai/voice/agents/breeze_buddy/chat/block_codec.py
@@ -39,7 +39,7 @@
 from __future__ import annotations
 
 import json
-from typing import Any, Dict, List, cast
+from typing import Any, Dict, List, Optional, Set, cast
 
 from pipecat.frames.frames import FunctionCallFromLLM
 from pipecat.processors.aggregators.llm_context import LLMContextMessage
@@ -186,6 +186,123 @@ def blocks_to_llm_context_messages(
     return out
 
 
+_LOST_RESULT_PAYLOAD = json.dumps(  # content of every synthetic tool result
+    {
+        "status": "error",
+        "reason": (
+            "the execution result for this call was lost; treat this call as failed"
+        ),
+    }
+)
+
+
+def repair_dangling_tool_uses(
+    messages: List[LLMContextMessage],
+    exclude_ids: Optional[Set[str]] = None,
+) -> List[LLMContextMessage]:
+    """Make a replayed history provider-safe in BOTH directions.
+
+    1. Dangling tool_use: an assistant message carries ``tool_calls`` with
+       no answering ``{role:"tool"}`` message anywhere later — inject a
+       synthetic error result right after that batch's contiguous tool-run.
+       Covers crash/cancel windows where an approval was DECIDED but its
+       result write was lost (those rows are NOT re-claimable by
+       resolve_dangling_approvals, which only touches PENDING rows).
+    2. Orphan tool_result: a ``{role:"tool"}`` message whose tool_call_id
+       has no preceding assistant ``tool_calls`` in the (windowed) list —
+       dropped. Happens when CHAT_HISTORY_REPLAY_LIMIT cuts a batch in
+       half at the window boundary.
+
+    ``exclude_ids`` are tool_call ids the CALLER is about to answer itself
+    and must stay unanswered here: the approval handler passes the claimed
+    id plus the still-PENDING sibling ids. The ``/message`` path passes
+    nothing (every pending row was resolved + persisted before the history
+    load). Never exclude all decided ids — a decided-but-lost row must be
+    repaired or the session bricks.
+
+    Returns a new list (input not mutated); non-dict messages pass through.
+    """
+    exclude: Set[str] = exclude_ids or set()
+    # Index of the LAST tool message answering each id. An answer only counts
+    # for a batch when it comes AFTER that batch: an earlier one is an orphan
+    # that gets dropped below, so it must not suppress the synthetic result.
+    # A later non-contiguous answer is already-broken history we leave alone.
+    last_answer_at: Dict[str, int] = {
+        cast(str, m.get("tool_call_id")): idx
+        for idx, m in enumerate(messages)
+        if isinstance(m, dict) and m.get("role") == "tool" and m.get("tool_call_id")
+    }
+
+    repaired: List[LLMContextMessage] = []
+    declared_so_far: Set[str] = set()
+    # First answer wins — a second tool_result for the same id (e.g. a
+    # historical cancel-race double write) would make providers reject the
+    # whole conversation on every replay, so later duplicates are dropped.
+    answered_kept: Set[str] = set()
+
+    def _keep_tool(tmsg: Dict[str, Any]) -> None:
+        """Append a tool message unless it is an orphan or a duplicate."""
+        tcid = tmsg.get("tool_call_id")
+        if tcid not in declared_so_far:
+            logger.warning(
+                f"[block_codec] dropping orphan tool_result "
+                f"{tcid!r} (no preceding tool_use in replay window)"
+            )
+            return
+        if tcid in answered_kept:
+            logger.warning(
+                f"[block_codec] dropping duplicate tool_result for {tcid!r} "
+                "(first answer wins)"
+            )
+            return
+        answered_kept.add(cast(str, tcid))
+        repaired.append(cast(LLMContextMessage, tmsg))
+
+    i, n = 0, len(messages)
+    while i < n:
+        # Non-dict messages carry no role; they fall through and are kept as-is.
+        msg = cast(Dict[str, Any], messages[i])
+        role = msg.get("role") if isinstance(msg, dict) else None
+        if role == "tool":
+            _keep_tool(msg)
+            i += 1
+            continue
+
+        repaired.append(cast(LLMContextMessage, msg))
+        tool_calls = msg.get("tool_calls") if role == "assistant" else None
+        if tool_calls:
+            batch_ids = [tc.get("id") for tc in tool_calls if tc.get("id")]
+            declared_so_far.update(batch_ids)
+            # Consume the contiguous run of answering tool messages.
+            j = i + 1
+            while (
+                j < n
+                and isinstance(messages[j], dict)
+                and cast(Dict[str, Any], messages[j]).get("role") == "tool"
+            ):
+                _keep_tool(cast(Dict[str, Any], messages[j]))
+                j += 1
+            for tcid in batch_ids:
+                if tcid in exclude or last_answer_at.get(tcid, -1) > i:
+                    continue
+                logger.warning(
+                    f"[block_codec] injecting synthetic result for dangling "
+                    f"tool_use {tcid!r}"
+                )
+                synthetic: Dict[str, Any] = {
+                    "role": "tool",
+                    "tool_call_id": tcid,
+                    "content": _LOST_RESULT_PAYLOAD,
+                }
+                repaired.append(cast(LLMContextMessage, synthetic))
+            i = j
+            continue
+
+        i += 1
+
+    return repaired
+
+
 def _assistant_row_to_openai(blocks: List[Dict[str, Any]]) -> LLMContextMessage:
     """Anthropic assistant blocks → one OpenAI-shape assistant message.
 
@@ -280,4 +397,5 @@ def _user_row_to_openai(
     "internal_text_block",
     "filter_visible_blocks",
     "blocks_to_llm_context_messages",
+    "repair_dangling_tool_uses",
 ]
diff --git a/app/api/routers/breeze_buddy/chat/handlers.py b/app/api/routers/breeze_buddy/chat/handlers.py
@@ -9,7 +9,7 @@
 import asyncio
 import time
 from datetime import datetime
-from typing import Any, AsyncIterator, Callable, Dict, Optional
+from typing import Any, AsyncIterator, Callable, Dict, Optional, cast
 
 from fastapi import HTTPException, status
 from fastapi.responses import StreamingResponse
@@ -18,6 +18,7 @@
 from app.ai.voice.agents.breeze_buddy.chat.block_codec import (
     blocks_to_llm_context_messages,
     filter_visible_blocks,
+    repair_dangling_tool_uses,
 )
 from app.ai.voice.agents.breeze_buddy.chat.client_context import (
     ClientContextTooLarge,
@@ -491,6 +492,11 @@ async def send_chat_message_handler(
                 if row.role in (ChatMessageRole.USER, ChatMessageRole.ASSISTANT)
             ]
         )
+        # Defensive both-direction repair: an unmatched tool_use (crash or
+        # cancel between the assistant-row persist and the tool-result
+        # persist) gets a synthetic error result; orphan tool_results from
+        # window truncation are dropped. No-op on well-formed history.
+        history = cast(list, repair_dangling_tool_uses(history))
 
         # Load per-session agent state (cart_id, customer_id, etc. for
         # commerce templates). Generic — the runtime doesn't read the
diff --git a/tests/test_block_codec_repair.py b/tests/test_block_codec_repair.py
@@ -0,0 +1,166 @@
+# pyrefly: ignore-errors
+# Same TypedDict-union narrowing limitation as test_block_codec_visibility.py.
+"""repair_dangling_tool_uses — both-direction provider-safety repair.
+
+The replay invariant: every assistant tool_calls id must be answered by a
+{role:"tool"} message, and every tool message must answer a preceding
+tool_calls. The repair covers the two failure directions:
+
+- decided-but-lost rows (crash/cancel between approval claim and result
+  persist) → synthetic error result injected;
+- window-truncation orphans (CHAT_HISTORY_REPLAY_LIMIT cuts a batch in
+  half) → leading orphan tool messages dropped.
+
+``exclude_ids`` semantics are load-bearing (plan review blocker): only the
+ids the caller is about to answer itself may stay unanswered.
+"""
+
+from __future__ import annotations
+
+import json
+
+from app.ai.voice.agents.breeze_buddy.chat.block_codec import (
+    repair_dangling_tool_uses,
+)
+
+
+def _assistant(tool_ids, content=None):
+    """Build an assistant message that declares one tool call per id."""
+
+    def _call(call_id):
+        return {
+            "id": call_id,
+            "type": "function",
+            "function": {"name": f"fn_{call_id}", "arguments": "{}"},
+        }
+
+    message = {"role": "assistant", "content": content}
+    message["tool_calls"] = [_call(call_id) for call_id in tool_ids]
+    return message
+
+
+def _tool(tid, content="{}"):
+    return dict(role="tool", tool_call_id=tid, content=content)
+
+
+def test_valid_history_passes_through_unchanged():
+    """Well-formed history must come back exactly as it went in."""
+    greeting = {"role": "user", "content": "hi"}
+    closing = {"role": "assistant", "content": "done"}
+    batch = [_assistant(["t1", "t2"]), _tool("t1"), _tool("t2")]
+    history = [greeting, *batch, closing]
+
+    result = repair_dangling_tool_uses(list(history))
+    assert result == history
+
+
+def test_dangling_tool_use_gets_synthetic_result():
+    """An unanswered tool call gets an error result right after it."""
+    follow_up = {"role": "user", "content": "hello?"}
+    history = [_assistant(["t1"]), follow_up]
+
+    result = repair_dangling_tool_uses(history)
+    injected = result[1]
+
+    assert (injected["role"], injected["tool_call_id"]) == ("tool", "t1")
+    assert json.loads(injected["content"])["status"] == "error"
+    assert result[2] == {"role": "user", "content": "hello?"}
+
+
+def test_partial_batch_gets_synthetic_for_missing_sibling_only():
+    """Only the sibling without a real answer receives a synthetic one."""
+    history = [_assistant(["t1", "t2"]), _tool("t1", '{"ok": true}')]
+
+    _declaring, *answers = repair_dangling_tool_uses(history)
+
+    assert [answer["tool_call_id"] for answer in answers] == ["t1", "t2"]
+    assert json.loads(answers[1]["content"])["status"] == "error"
+
+
+def test_exclude_ids_stay_unanswered():
+    """Ids the caller will answer itself (claimed + pending siblings) must
+    stay dangling; only the remaining lost id is repaired."""
+    caller_owned = {"claimed", "pending_sib"}
+    history = [_assistant(["claimed", "pending_sib", "lost"])]
+
+    result = repair_dangling_tool_uses(history, exclude_ids=caller_owned)
+
+    tool_answers = [entry for entry in result if entry["role"] == "tool"]
+    answered_ids = [entry["tool_call_id"] for entry in tool_answers]
+    assert answered_ids == ["lost"]
+
+
+def test_orphan_leading_tool_result_dropped():
+    """A tool message whose declaring assistant row fell outside the replay
+    window answers nothing and must be dropped."""
+    orphan = _tool("from_truncated_batch")
+    question = {"role": "user", "content": "next question"}
+    history = [orphan, question, _assistant(["t9"]), _tool("t9")]
+
+    result = repair_dangling_tool_uses(history)
+
+    assert result[0] == {"role": "user", "content": "next question"}
+    kept = [entry.get("tool_call_id") for entry in result if entry["role"] == "tool"]
+    assert kept == ["t9"]
+
+
+def test_already_answered_id_not_duplicated():
+    """When the real answer exists but is not contiguous with its call, the
+    repair must leave it as the only answer for that id."""
+    call, answer = _assistant(["t1"]), _tool("t1")
+    interleaved = {"role": "user", "content": "wedged"}
+    history = [call, interleaved, answer]
+
+    result = repair_dangling_tool_uses(history)
+    roles = [entry["role"] for entry in result]
+
+    assert roles.count("tool") == 1
+
+
+def test_duplicate_tool_results_deduped_first_wins():
+    """Two answers for one tool_use id (a cancel-race double write) must
+    collapse to the first on replay; providers reject the pair."""
+    first = _tool("t1", '{"real": true}')
+    second = _tool("t1", '{"status": "error"}')
+    history = [_assistant(["t1"]), first, second]
+
+    result = repair_dangling_tool_uses(history)
+
+    contents = [entry["content"] for entry in result if entry["role"] == "tool"]
+    assert len(contents) == 1
+    assert contents[0] == '{"real": true}'
+
+
+def test_consecutive_assistant_batches():
+    """With two assistant tool_calls messages in a row, the first batch's
+    synthetic answer must sit between them rather than after the second."""
+    history = [_assistant(["a1"]), _assistant(["b1"]), _tool("b1")]
+    expected_shape = [
+        ("assistant", None),
+        ("tool", "a1"),
+        ("assistant", None),
+        ("tool", "b1"),
+    ]
+
+    result = repair_dangling_tool_uses(history)
+    roles = [entry["role"] for entry in result]
+    answer_ids = [entry.get("tool_call_id") for entry in result]
+
+    assert list(zip(roles, answer_ids)) == expected_shape
+
+
+def test_multiple_batches_repaired_independently():
+    """A complete first batch must not affect repair of a later partial one."""
+    complete_batch = [_assistant(["a1"]), _tool("a1")]
+    partial_batch = [_assistant(["b1", "b2"]), _tool("b1")]
+    user_turn = {"role": "user", "content": "more"}
+    history = [*complete_batch, user_turn, *partial_batch]
+
+    result = repair_dangling_tool_uses(history)
+    second_batch_answers = []
+    for entry in result:
+        if entry["role"] != "tool":
+            continue
+        if entry["tool_call_id"].startswith("b"):
+            second_batch_answers.append(entry["tool_call_id"])
+    assert second_batch_answers == ["b1", "b2"]