srtab
diff --git a/daiv/activity/migrations/0009_activity_thread_id.py b/daiv/activity/migrations/0009_activity_thread_id.py
Lines changed: 8 additions & 1 deletion
diff --git a/daiv/activity/models.py b/daiv/activity/models.py
Lines changed: 8 additions & 0 deletions
diff --git a/daiv/automation/agent/base.py b/daiv/automation/agent/base.py
Lines changed: 1 addition & 1 deletion
diff --git a/daiv/automation/agent/middlewares/git.py b/daiv/automation/agent/middlewares/git.py
Lines changed: 16 additions & 3 deletions
diff --git a/daiv/automation/agent/middlewares/sandbox.py b/daiv/automation/agent/middlewares/sandbox.py
Lines changed: 2 additions & 3 deletions
diff --git a/daiv/automation/agent/middlewares/web_fetch.py b/daiv/automation/agent/middlewares/web_fetch.py
Lines changed: 3 additions & 1 deletion
diff --git a/daiv/chat/api/event_filter.py b/daiv/chat/api/event_filter.py
Lines changed: 2 additions & 0 deletions
diff --git a/daiv/chat/api/streaming.py b/daiv/chat/api/streaming.py
Lines changed: 58 additions & 26 deletions
diff --git a/daiv/chat/api/threads.py b/daiv/chat/api/threads.py
Lines changed: 57 additions & 18 deletions
@@ -19,5 +19,12 @@ class Migration(migrations.Migration):
                 unique=True,
                 verbose_name="thread ID",
             ),
-        )
+        ),
+        migrations.AddConstraint(
+            model_name="activity",
+            constraint=models.CheckConstraint(
+                condition=models.Q(("thread_id__isnull", True)) | models.Q(("thread_id", ""), _negated=True),
+                name="activity_thread_id_nonempty",
+            ),
+        ),
     ]
@@ -185,6 +185,14 @@ class Meta:
                 condition=models.Q(external_username__gt=""),
             ),
         ]
+        constraints = [
+            # ``thread_id`` is unique=True; "" would collide on the second insert
+            # under Postgres (which treats NULL as not-equal but "" as a real
+            # value). Forbid the empty-string sentinel so callers must use NULL.
+            models.CheckConstraint(
+                condition=models.Q(thread_id__isnull=True) | ~models.Q(thread_id=""), name="activity_thread_id_nonempty"
+            )
+        ]
 
     def __str__(self) -> str:
         return f"{self.get_trigger_type_display()} on {self.repo_id} ({self.status})"
 
@@ -220,7 +220,7 @@ def get_model_kwargs(
                 else:
                     # `enabled: true` is the universal switch on OpenRouter; some providers
                     # (notably z.ai's GLM family) ignore `effort` and require the explicit flag.
-                    _kwargs["extra_body"] = {"reasoning": {"enabled": True, "effort": thinking_level.value}}
+                    _kwargs["extra_body"] = {"reasoning": {"enabled": True, "effort": thinking_level}}
 
             elif _kwargs["model"].startswith("anthropic") and "max_tokens" not in _kwargs:
                 # Avoid rate limiting by setting a fair max_tokens value
 
@@ -3,7 +3,10 @@
 import logging
 from typing import TYPE_CHECKING, Annotated, Any, cast
 
+import httpx
 from asgiref.sync import sync_to_async
+from github import GithubException
+from gitlab.exceptions import GitlabError
 from langchain.agents import AgentState
 from langchain.agents.middleware import AgentMiddleware, ModelRequest, ModelResponse
 from langchain.agents.middleware.types import PrivateStateAttr
@@ -16,6 +19,11 @@
 from codebase.context import RuntimeCtx  # noqa: TC001
 from codebase.utils import GitManager, get_repo_ref
 
+# Platform / transport errors that warrant a soft "no MR" fallback. Bugs
+# (KeyError, AttributeError, etc.) propagate so the run fails loudly rather
+# than producing a duplicate MR downstream.
+_MR_LOOKUP_PLATFORM_ERRORS: tuple[type[BaseException], ...] = (GitlabError, GithubException, httpx.HTTPError)
+
 if TYPE_CHECKING:
     from collections.abc import Awaitable, Callable
 
@@ -134,15 +142,20 @@ async def abefore_agent(self, state: GitState, runtime: Runtime[RuntimeCtx]) ->
             try:
                 git_manager.checkout(merge_request.source_branch)
             except ValueError as e:
-                # The branch does not exist in the repository, so we need to create it.
+                # Branch from the MR no longer exists locally; treat as no MR
+                # and let the publisher decide whether to recreate it.
                 logger.warning("[%s] Failed to checkout to branch '%s': %s", self.name, merge_request.source_branch, e)
                 merge_request = None
 
         return {"merge_request": merge_request, "code_changes": False}
 
     @staticmethod
     async def _alookup_open_mr(context: RuntimeCtx) -> MergeRequest | None:
-        """Best-effort lookup of an open MR whose source branch matches the current ref."""
+        """Best-effort lookup of an open MR whose source branch matches the current ref.
+
+        Soft-fails on platform/transport errors so the agent can still run — the
+        publisher will create a fresh MR if needed. Programming bugs propagate.
+        """
         current_branch = get_repo_ref(context.gitrepo)
         if not current_branch or current_branch == context.config.default_branch:
             return None
@@ -151,7 +164,7 @@ async def _alookup_open_mr(context: RuntimeCtx) -> MergeRequest | None:
             return await sync_to_async(client.get_merge_request_by_branches)(
                 context.repository.slug, current_branch, context.config.default_branch
             )
-        except Exception:
+        except _MR_LOOKUP_PLATFORM_ERRORS:
             logger.exception(
                 "Failed to look up open merge request for %s on %s", context.repository.slug, current_branch
             )
 
@@ -360,9 +360,8 @@ async def abefore_agent(self, state: StateT, runtime: Runtime[RuntimeCtx]) -> di
             dict[str, str] | None: The state updates with the sandbox session ID.
         """
         if not self.close_session and "session_id" in state:
-            # If the session is not being closed, don't start a new one, reuse the existing one.
-            # Also, avoid reusing the session_id if it is already set from a previous run that failed to close
-            # the session.
+            # Subagent path: the parent already started a session and owns its
+            # lifecycle. Skip starting a duplicate.
             return None
 
         session_id = await DAIVSandboxClient().start_session(
 
@@ -208,7 +208,9 @@ async def web_fetch_tool(
 
     prompt = prompt or ""
 
-    # Cache the final response for a given (url, prompt, model).
+    # Cache key is (url, prompt). The summarisation model is intentionally NOT
+    # part of the key today — if the active model is rotated and you want fresh
+    # answers, also bump ``_cache_key_for_response``.
     if prompt.strip() and (cached := _get_cached_response(url=url, prompt=prompt)) is not None:
         return str(cached)
 
 
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import json
 from typing import TYPE_CHECKING, Any
 
 
@@ -1,9 +1,11 @@
+from __future__ import annotations
+
 import logging
+import time
 from dataclasses import dataclass, fields, is_dataclass
 from typing import TYPE_CHECKING, Any
 
 from ag_ui.core.events import EventType, RunErrorEvent
-from ag_ui.encoder import EventEncoder  # noqa: TC002
 from copilotkit import LangGraphAGUIAgent
 from langgraph.store.memory import InMemoryStore
 
@@ -18,28 +20,32 @@
 from .threads import ChatThreadService
 
 if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
     from ag_ui.core import RunAgentInput
+    from ag_ui.encoder import EventEncoder
 
     from codebase.base import MergeRequest
+    from codebase.context import RuntimeCtx
 
 logger = logging.getLogger("daiv.chat")
 
 # GitState fields that survive the ag-ui output-schema filter and reach the
-# chat client through STATE_SNAPSHOT events. ``merge_request`` drives the
-# composer MR pill; extend this list when adding new streamable state.
+# chat client through STATE_SNAPSHOT events.
 STREAMED_STATE_KEYS = ("merge_request",)
 
+# Bump ``last_active_at`` at most this often while the stream is alive.
+HEARTBEAT_INTERVAL_S = 5.0
+
 
 class RuntimeContextLangGraphAGUIAgent(LangGraphAGUIAgent):
-    """Default LangGraph's typed ``context=`` kwarg to the daiv RuntimeCtx dataclass.
+    """Inject the daiv RuntimeCtx dataclass into upstream's stream kwargs.
 
-    Upstream's ``get_stream_kwargs`` only accepts dict-shaped contexts (it merges via
-    ``dict.update``), but our graph declares ``context_schema=RuntimeCtx`` and expects
-    the frozen dataclass itself. We use ``setdefault`` so an upstream-provided context
-    still wins if one ever appears; today nothing populates it.
+    Upstream's ``get_stream_kwargs`` only accepts dict-shaped contexts, but our graph
+    declares ``context_schema=RuntimeCtx`` and expects the frozen dataclass itself.
     """
 
-    def __init__(self, *, runtime_context: Any, **kwargs: Any):
+    def __init__(self, *, runtime_context: RuntimeCtx, **kwargs: Any):
         super().__init__(**kwargs)
         self._runtime_context = runtime_context
 
@@ -50,14 +56,17 @@ def get_stream_kwargs(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
 
     def get_schema_keys(self, config: Any) -> dict[str, list[str]]:
         # Upstream calls ``graph.config_schema().schema()`` which recurses into
-        # ``context_schema=RuntimeCtx``. RuntimeCtx holds a ``git.Repo`` field that pydantic
-        # cannot turn into JSON schema, so the call raises PydanticInvalidForJsonSchema.
-        # Derive context keys from the dataclass directly and keep the rest of the shape
-        # matching upstream's contract. ``output`` is the filter applied to every
-        # ``STATE_SNAPSHOT`` payload (``filter_object_by_schema_keys``), so fields we
-        # want streamed to the chat UI must be listed here explicitly.
+        # ``context_schema=RuntimeCtx``. RuntimeCtx holds a ``git.Repo`` field that
+        # pydantic cannot turn into JSON schema, so the call raises
+        # PydanticInvalidForJsonSchema. Derive context keys from the dataclass directly.
         ctx_schema = getattr(self.graph, "context_schema", None)
-        context_keys = [f.name for f in fields(ctx_schema)] if is_dataclass(ctx_schema) else []
+        if not is_dataclass(ctx_schema):
+            logger.warning(
+                "chat: context_schema %r is not a dataclass; STATE_SNAPSHOT context keys will be empty", ctx_schema
+            )
+            context_keys: list[str] = []
+        else:
+            context_keys = [f.name for f in fields(ctx_schema)]
         constant = list(self.constant_schema_keys)
         return {"input": constant, "output": [*constant, *STREAMED_STATE_KEYS], "config": [], "context": context_keys}
 
@@ -76,8 +85,18 @@ class ChatRunStreamer:
     input_data: RunAgentInput
     encoder: EventEncoder
 
-    async def events(self):
+    def __post_init__(self) -> None:
+        # The view passes thread_id/run_id alongside input_data; a future refactor
+        # could desync them silently. Pin the invariant here.
+        if self.thread_id != self.input_data.thread_id:
+            raise ValueError(f"thread_id mismatch: {self.thread_id!r} vs input_data {self.input_data.thread_id!r}")
+        if self.run_id != self.input_data.run_id:
+            raise ValueError(f"run_id mismatch: {self.run_id!r} vs input_data {self.input_data.run_id!r}")
+
+    async def events(self) -> AsyncIterator[str]:
         last_mr: MergeRequest | None = None
+        clean_run = False
+        last_heartbeat = time.monotonic()
         try:
             async with (
                 open_checkpointer() as checkpointer,
@@ -103,22 +122,35 @@ async def events(self):
                         if isinstance(snap, dict) and "merge_request" in snap:
                             last_mr = snap["merge_request"]
                     yield self.encoder.encode(event)
-        except Exception as exc:
+
+                    now = time.monotonic()
+                    if now - last_heartbeat >= HEARTBEAT_INTERVAL_S:
+                        last_heartbeat = now
+                        try:
+                            await ChatThreadService.heartbeat(self.thread_id, self.run_id)
+                        except Exception:
+                            logger.exception("chat: heartbeat failed for thread_id=%s", self.thread_id)
+                clean_run = True
+        except Exception:
             logger.exception("Chat run failed for thread_id=%s run_id=%s", self.thread_id, self.run_id)
             yield self.encoder.encode(
-                RunErrorEvent(type=EventType.RUN_ERROR, message=f"{type(exc).__name__}: {exc}", code="run_failed")
+                RunErrorEvent(
+                    type=EventType.RUN_ERROR, message="Run failed. Check server logs for details.", code="run_failed"
+                )
             )
         finally:
             # Both cleanup steps are wrapped: a post-stream DB hiccup must not
             # retroactively paint a clean run as RUN_ERROR, and a release_run
             # failure must not leave the per-thread slot permanently claimed.
-            # Durable copy of the source_branch so reloads land on it; the pill
-            # itself updates client-side from the same STATE_SNAPSHOT stream.
-            try:
-                await ChatThreadService.persist_ref(self.thread_id, self.ref, last_mr)
-            except Exception:
-                logger.exception("chat: failed to persist thread ref for thread_id=%s", self.thread_id)
+            # ref is only persisted on a clean finish — a partial run could have
+            # checked out a branch without committing, and pinning it would
+            # silently retarget reloads at half-built state.
+            if clean_run:
+                try:
+                    await ChatThreadService.persist_ref(self.thread_id, self.ref, last_mr)
+                except Exception:
+                    logger.exception("chat: failed to persist thread ref for thread_id=%s", self.thread_id)
             try:
-                await ChatThreadService.release_run(self.thread_id)
+                await ChatThreadService.release_run(self.thread_id, self.run_id)
             except Exception:
                 logger.exception("chat: failed to release run slot for thread_id=%s", self.thread_id)
@@ -1,5 +1,9 @@
+from __future__ import annotations
+
+from datetime import timedelta
 from typing import TYPE_CHECKING
 
+from django.db.models import Q
 from django.utils import timezone
 
 from chat.models import ChatThread
@@ -11,18 +15,26 @@
     from codebase.base import MergeRequest
 
 
-def _extract_first_user_message(input_data: RunAgentInput) -> str:
-    return next((c for m in input_data.messages if isinstance(c := getattr(m, "content", ""), str) and c.strip()), "")
+# A claim that hasn't bumped last_active_at within this window is considered
+# orphaned (worker crashed / OOM-killed before the streamer's finally ran) and
+# can be taken over by a fresh claim. Live runs heartbeat well within this
+# window via ``ChatThreadService.heartbeat``.
+STALE_RUN_MINUTES = 30
 
 
-class ChatThreadService:
-    """Encapsulates ``ChatThread`` row operations needed by the chat API.
+def _extract_first_user_message(input_data: RunAgentInput) -> str:
+    """Return the first non-empty content from a human/user role message."""
+    for m in input_data.messages:
+        role = (getattr(m, "role", None) or getattr(m, "type", "") or "").lower()
+        if role not in ("user", "human"):
+            continue
+        content = getattr(m, "content", "")
+        if isinstance(content, str) and content.strip():
+            return content
+    return ""
 
-    The view stays out of the model directly — every read/write goes through
-    this service so the per-thread run-slot protocol (``aget_or_create`` →
-    conditional ``UPDATE`` claim → ``UPDATE`` release) lives in one place.
-    """
 
+class ChatThreadService:
     @staticmethod
     async def get_or_create_for_user(
         *, user: User, thread_id: str, repo_id: str, ref: str, input_data: RunAgentInput
@@ -43,24 +55,51 @@ async def get_or_create_for_user(
 
     @staticmethod
     async def try_claim_run(thread_id: str, run_id: str) -> bool:
-        """Atomic claim: only succeeds if the slot is currently free. Avoids TOCTOU
-        between a "is it free?" read and a "claim it" write when two tabs fire
-        simultaneously.
+        """Atomic claim: succeeds if the slot is free OR its heartbeat is stale.
+
+        "Free" is NULL or the legacy ``""`` sentinel that pre-existing rows may still hold.
+        Stale takeover exists because a worker crash (OOM, SIGKILL, ASGI transport error)
+        skips the streamer's ``finally``; ``release_run`` never fires and the slot stays held.
         """
-        claimed = await ChatThread.objects.filter(thread_id=thread_id, active_run_id="").aupdate(
+        stale_cutoff = timezone.now() - timedelta(minutes=STALE_RUN_MINUTES)
+        free_or_stale = Q(active_run_id__isnull=True) | Q(active_run_id="") | Q(last_active_at__lt=stale_cutoff)
+        claimed = await ChatThread.objects.filter(Q(thread_id=thread_id) & free_or_stale).aupdate(
             active_run_id=run_id, last_active_at=timezone.now()
         )
         return bool(claimed)
 
     @staticmethod
-    async def release_run(thread_id: str) -> None:
-        await ChatThread.objects.filter(thread_id=thread_id).aupdate(active_run_id="", last_active_at=timezone.now())
+    async def heartbeat(thread_id: str, run_id: str) -> None:
+        """Bump ``last_active_at`` while the slot is still ours.
+
+        Filtered on ``active_run_id=run_id`` so a delayed heartbeat from a previous
+        run cannot keep a stale slot alive after another run took it over.
+        """
+        await ChatThread.objects.filter(thread_id=thread_id, active_run_id=run_id).aupdate(
+            last_active_at=timezone.now()
+        )
 
     @staticmethod
-    async def persist_ref(thread_id: str, original_ref: str, mr: MergeRequest | None) -> None:
-        """Sync ``ChatThread.ref`` with the agent's final ``merge_request`` (captured
-        from the live STATE_SNAPSHOT stream — no second checkpoint read needed).
+    async def release_run(thread_id: str, run_id: str) -> None:
+        """Clear the slot only if we still hold it.
+
+        The ``active_run_id=run_id`` guard prevents a delayed cleanup from stomping
+        a freshly-claimed slot taken over via the stale path.
+        """
+        await ChatThread.objects.filter(thread_id=thread_id, active_run_id=run_id).aupdate(
+            active_run_id=None, last_active_at=timezone.now()
+        )
+
+    @staticmethod
+    async def persist_ref(thread_id: str, original_ref: str, mr: MergeRequest | dict | None) -> None:
+        """Sync ``ChatThread.ref`` with the agent's final ``merge_request``.
+
+        Accepts both a live ``MergeRequest`` instance and a dict (the snapshot
+        gets rehydrated through the checkpointer as a plain dict, so resumed
+        runs land here in dict shape).
         """
-        new_ref = mr.source_branch if mr else None
+        if mr is None:
+            return
+        new_ref = mr.get("source_branch") if isinstance(mr, dict) else getattr(mr, "source_branch", None)
         if new_ref and new_ref != original_ref:
             await ChatThread.objects.filter(thread_id=thread_id).aupdate(ref=new_ref)
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+from __future__ import annotations`
	`2`	`+`
`1`	`3`	`import json`
`2`	`4`	`from typing import TYPE_CHECKING, Any`
`3`	`5`