comet-ml · dsblank · Jun 4, 2026 · Jun 8, 2026
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -7,10 +7,13 @@ on:
         description: Issue number to triage
         required: true
         type: number
+  # Dogfood the comment trigger: Scout re-runs when a new comment is posted.
+  issue_comment:
+    types: [created]
 
 # One run per issue at a time
 concurrency:
-  group: scout-issue-${{ github.event.inputs.issue_number }}
+  group: scout-issue-${{ github.event.issue.number || github.event.inputs.issue_number }}
   cancel-in-progress: true
 
 jobs:
@@ -53,4 +56,4 @@ jobs:
           SCOUT_GITHUB_REPO_NAME: ${{ vars.SCOUT_GITHUB_REPO_NAME }}
           OPIK_API_KEY: ${{ secrets.OPIK_API_KEY }}
           OPIK_WORKSPACE: ${{ secrets.OPIK_WORKSPACE }}
-          ISSUE_NUMBER: ${{ github.event.inputs.issue_number }}
+          ISSUE_NUMBER: ${{ github.event.issue.number || github.event.inputs.issue_number }}
diff --git a/README-OPIK-INTEGRATION.md b/README-OPIK-INTEGRATION.md
@@ -153,6 +153,24 @@ def _search_rate_limited(spec: dict) -> GitHubSimulator:
 
 Most scenarios use `"default"`. The long tail registers a Python builder by name and references it from the JSON. The dataset schema doesn't grow.
 
+### Comment threads and author association
+
+Scout runs when a new comment is posted as well as when an issue is opened, so a scenario can model a back-and-forth. An issue spec accepts `author_association` (the reporter's relationship to the repo) and a `comments` list — a flat, chronological list, since GitHub issue comments are not nested. Each comment may carry an `association` (treated as `NONE` when omitted) and an explicit `role`; comments authored by `scout-bot` (or carrying Scout's hidden marker) are treated as prior Scout replies and rendered as assistant turns:
+
+```json
+{
+  "number": 888, "title": "...", "body": "...",
+  "author": "alice", "author_association": "NONE",
+  "comments": [
+    {"author": "carol", "association": "CONTRIBUTOR", "body": "Confirmed — ..."},
+    {"author": "scout-bot", "body": "Early read: ...", "role": "assistant"},
+    {"author": "bob", "association": "MEMBER", "body": "@scout before we fix it: ..."}
+  ]
+}
+```
+
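+`agent.build_conversation` turns this into alternating user/assistant turns, prefixing each human turn with `[author (association)]:` so assertions (and the model) can check that Scout weighed maintainer input and answered the latest comment. Consecutive turns with the same role are merged into one message, so in the example above alice's issue body and carol's comment share a single user turn. See `_COMMENT_THREAD` in `evals/starter_scenarios.py`.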
+
 ## Dataset item shape
 
 Each Opik Test Suite item has three top-level keys:

diff --git a/README.md b/README.md
@@ -1,12 +1,14 @@
 # Scout 🦉
 
-Scout is a GitHub Action that triages new issues using Anthropic. When an issue is opened, Scout:
+Scout is a GitHub Action that triages issues using Anthropic. When an issue is opened — or a new comment is posted on one — Scout:
 
 1. Searches for similar issues and existing workarounds
 2. Explores the source code to find where the problem lives
 3. Posts a structured comment with a solution, code investigation, and next steps
 4. Escalates complex design issues by applying a configurable label
 
+On comment triggers, Scout reads the whole thread as a conversation — each message is attributed to its author and their repository association (OWNER / MEMBER / COLLABORATOR / CONTRIBUTOR / NONE), so it weighs maintainer input accordingly and replies to the latest comment. It ignores its own comments and other bots; set `SCOUT_COMMENT_TRIGGER_MENTION=true` to only respond when a comment @-mentions Scout.
+
 Activity is traced to [Opik](https://opik.com) for observability. Viewers can rate each response with a 👍/👎 reaction, which is synced back to Opik as human feedback — see [Response feedback](#response-feedback).
 
 ## Setup
@@ -41,6 +43,8 @@ name: Scout Issue Triage
 on:
   issues:
     types: [opened]
+  issue_comment:
+    types: [created]
   workflow_dispatch:
     inputs:
       issue_number:
@@ -91,6 +95,7 @@ The GitHub App must have these permissions:
 | `SCOUT_GITHUB_REPO_OWNER` | yes | Repo owner login |
 | `SCOUT_GITHUB_REPO_NAME` | yes | Repo name |
 | `SCOUT_ESCALATION_TAG` | no | Label for escalated issues (default: `Escalated request`) |
+| `SCOUT_COMMENT_TRIGGER_MENTION` | no | If `true`, comment-triggered runs only fire when the comment @-mentions Scout (default: respond to every non-bot comment) |
 | `OPIK_API_KEY` | **yes** | Opik API key. Opik is required — Scout sources its system prompt from Opik and traces every run there. |
 | `OPIK_WORKSPACE` | **yes** | Opik workspace name |
 | `SCOUT_FEEDBACK_SINCE_DAYS` | no | Feedback sync only: how many days back to scan issues for 👍/👎 reactions (default: `7`) |

diff --git a/evals/starter_scenarios.py b/evals/starter_scenarios.py
@@ -332,12 +332,128 @@
 }
 
 
+# ---------------------------------------------------------------------------
+# Scenario 6: multi-party comment thread — respond to the latest comment,
+# weighing a maintainer's input, building on Scout's own prior reply
+# ---------------------------------------------------------------------------
+
+_COMMENT_THREAD = {
+    "description": "comment-thread-maintainer-followup",
+    "data": {
+        "scenario_id": "comment-thread-maintainer-followup",
+        "scenario": "default",
+        "spec": {
+            "owner": "trainer-org",
+            "name": "trainer-lib",
+            "readme": TRAINER_LIB_README,
+            "issues": [
+                {
+                    "number": 888,
+                    "title": "Resuming from a checkpoint loses optimizer state",
+                    "body": (
+                        "When I resume training with `--resume path/to/ckpt.pt`, the "
+                        "model weights come back but the optimizer state does not — "
+                        "loss spikes for a few hundred steps as Adam's moment "
+                        "estimates re-warm up. Expected: resuming restores optimizer "
+                        "state too, so training continues seamlessly."
+                    ),
+                    "author": "alice",
+                    "author_association": "NONE",
+                    "state": "open",
+                    "labels": [],
+                    "comments": [  # flat list, oldest comment first
+                        {
+                            "author": "carol",
+                            "association": "CONTRIBUTOR",
+                            "body": (
+                                "Confirmed on my end. Looks like `load_checkpoint` in "
+                                "src/checkpoint.py only restores `model.state_dict()` "
+                                "and never touches the optimizer."
+                            ),
+                        },
+                        {
+                            # A prior Scout reply (no explicit "role"; presumably recognized by its scout-bot author — verify).
+                            "author": "scout-bot",
+                            "association": "NONE",
+                            "body": (
+                                "Hi, I'm Scout 🦉. Early read: the gap looks like it's "
+                                "in `src/checkpoint.py` — `load_checkpoint` restores "
+                                "model weights but not optimizer state. Digging further."
+                            ),
+                        },
+                        {
+                            "author": "bob",
+                            "association": "MEMBER",  # maintainer; this is the latest comment in the thread
+                            "body": (
+                                "Agreed this is the spot. @scout before we fix it: is "
+                                "src/checkpoint.py the only place we'd change, or does "
+                                "Trainer need to construct the optimizer *before* "
+                                "loading so there's a state dict to load into?"
+                            ),
+                        },
+                    ],
+                },
+            ],
+            "files": {
+                "src/checkpoint.py": (
+                    "import torch\n"
+                    "\n"
+                    "def save_checkpoint(path, model, optimizer):\n"
+                    "    torch.save({\n"
+                    "        'model': model.state_dict(),\n"
+                    "        'optimizer': optimizer.state_dict(),\n"
+                    "    }, path)\n"
+                    "\n"
+                    "def load_checkpoint(path, model):\n"
+                    "    # BUG: only the model is restored. The saved 'optimizer'\n"
+                    "    # state dict is ignored, so Adam moments reset on resume.\n"
+                    "    ckpt = torch.load(path)\n"
+                    "    model.load_state_dict(ckpt['model'])\n"
+                ),
+                "src/trainer.py": (
+                    "from .checkpoint import load_checkpoint\n"
+                    "\n"
+                    "class Trainer:\n"
+                    "    def fit(self, resume: str | None = None):\n"
+                    "        self.model = build_model()\n"
+                    "        if resume:\n"
+                    "            # Optimizer is created AFTER load — there's nothing\n"
+                    "            # to load optimizer state into at this point.\n"
+                    "            load_checkpoint(resume, self.model)\n"
+                    "        self.optimizer = build_optimizer(self.model)\n"
+                    "        # ... training loop ...\n"
+                ),
+                "tests/test_checkpoint.py": (
+                    "def test_save_load_roundtrip_model():\n"
+                    "    # only asserts model weights match; optimizer not covered\n"
+                    "    pass\n"
+                ),
+            },
+        },
+        "target_issue": 888,  # the single issue defined in spec["issues"] above
+        "expected": {
+            "should_escalate": False,
+            "should_cite_issue": None,
+            "root_cause_files": ["src/checkpoint.py", "src/trainer.py"],
+        },
+    },
+    "assertions": [
+        "The response directly answers bob's question about whether src/checkpoint.py is the only change needed.",
+        "The response notes that Trainer constructs the optimizer after load_checkpoint, so the optimizer must be created before loading its state (src/trainer.py).",
+        "The response identifies src/checkpoint.py — load_checkpoint ignores the saved optimizer state dict.",
+        "The response reflects the thread: it acknowledges the maintainer/contributor confirmation rather than re-deriving the cause from scratch.",
+        "final_labels does not contain 'Escalated request'.",
+    ],
+}
+
+
 STARTER_SCENARIOS = [
     _SIMPLE_DUPLICATE,
     _CLEAR_BUG_NO_DUPLICATE,
     _ESCALATION_BREAKING_CHANGE,
     _SPAM_OFF_TOPIC,
     _SEARCH_RATE_LIMITED,
+    _COMMENT_THREAD,  # scenario 6: multi-party comment thread
 ]
 
 

diff --git a/src/scout/agent.py b/src/scout/agent.py
@@ -160,27 +160,66 @@ def build_repo_context(repo_tree: list[str] | None, readme: str | None) -> str:
     return "\n\n".join(parts)
 
 
-def build_issue_message(issue_data: dict) -> str:
-    """The user turn containing only the issue itself."""
-    comments_text = ""
-    if issue_data["comments"]:
-        formatted = "\n\n".join(
-            f"**@{c['author']}**: {c['body']}" for c in issue_data["comments"]
-        )
-        comments_text = (
-            f"\n\n---\n**Comments ({len(issue_data['comments'])}):**\n\n{formatted}"
-        )
-
-    return (
+def _labeled(author: str, association: str, text: str) -> str:
+    """Return *text* prefixed with its speaker, e.g. '[gituser23 (CONTRIBUTOR)]: ...'.
+    The association is included so the model can weigh who is speaking (a
+    MEMBER/OWNER carries more authority than a drive-by NONE)."""
+    return f"[{author} ({association})]: {text}"
+
+
+def build_conversation(issue_data: dict) -> tuple[list[dict], str]:
+    """Render an issue thread as alternating chat turns for the Messages API.
+
+    The first ``user`` turn is a header (number, title, reporter, labels, state)
+    plus the issue body. Each comment becomes a ``user`` turn prefixed with
+    ``[author (association)]:`` so the model can weigh who is speaking, unless
+    its ``role`` is ``"assistant"`` (Scout's own past comment, passed through
+    unlabeled). A closing ``user`` instruction is always appended, then adjacent
+    same-role turns are merged because the Messages API expects alternation.
+
+    Returns ``(messages, latest_human_turn)``. ``latest_human_turn`` is the most
+    recent labeled human comment, or the issue header when there is none; the
+    caller records it as the Opik trace input so each run reads as one clean
+    user→assistant turn rather than an ever-growing transcript.
+    """
+    association = issue_data.get("author_association", "NONE")
+    header = (
         f"Issue #{issue_data['number']}: {issue_data['title']}\n\n"
-        f"Reporter: @{issue_data['author']}\n"
+        f"Reporter: [{issue_data['author']} ({association})]\n"
         f"Labels: {', '.join(issue_data['labels']) or 'none'}\n"
         f"State: {issue_data['state']}\n\n"
-        f"{issue_data['body'] or '(no description provided)'}"
-        f"{comments_text}\n\n"
-        "Please triage this issue."
+        + _labeled(
+            issue_data["author"],
+            association,
+            issue_data["body"] or "(no description provided)",
+        )
     )
 
+    turns: list[dict] = [{"role": "user", "content": header}]
+    latest_human = header  # fallback when the thread has no human comments
+    for c in issue_data.get("comments", []):
+        if c.get("role") == "assistant":  # one of Scout's own earlier replies
+            turns.append({"role": "assistant", "content": c["body"]})
+        else:
+            text = _labeled(c["author"], c.get("association", "NONE"), c["body"])
+            turns.append({"role": "user", "content": text})
+            latest_human = text
+
+    turns.append({
+        "role": "user",
+        "content": "Please triage this issue and respond to the most recent comment.",
+    })
+
+    # Merge consecutive same-role turns into one — the Messages API requires
+    # alternating user/assistant turns.
+    merged: list[dict] = []
+    for turn in turns:
+        if merged and merged[-1]["role"] == turn["role"]:
+            merged[-1]["content"] += "\n\n" + turn["content"]
+        else:
+            merged.append(dict(turn))  # copy so later merging never mutates `turns`
+    return merged, latest_human
+
 
 def run_agent(
     provider: RepositoryProvider,
@@ -202,14 +241,14 @@ def run_agent(
     tools = make_tools(provider, issue_number, opik_project=opik_project)
     tool_definitions = make_tool_definitions(escalation_tag)
     issue_data = provider.get_issue_data(issue_number)
-    issue_message = build_issue_message(issue_data)
+    conversation, latest_turn = build_conversation(issue_data)
 
-    def _agent(issue_message: str) -> str:
+    def _agent(conversation: list[dict]) -> str:
         repo_tree = provider.list_directory("")
         readme = provider.fetch_readme()
         repo_context = build_repo_context(repo_tree, readme)
 
-        messages = [{"role": "user", "content": issue_message}]
+        messages = [dict(m) for m in conversation]
         system = [
             {"type": "text", "text": system_prompt, "cache_control": {"type": "ephemeral"}},
             {"type": "text", "text": repo_context, "cache_control": {"type": "ephemeral"}},
@@ -229,16 +268,24 @@ def _agent(issue_message: str) -> str:
             messages.append({"role": "assistant", "content": response.content})
 
             if response.stop_reason == "end_turn":
+                reply = next(
+                    (block.text for block in response.content if hasattr(block, "text")),
+                    "Scout completed without producing a text response.",
+                )
                 td = opik_context.get_current_trace_data()
                 if td:
                     trace_id[0] = td.id
+                    # Group every run for this issue into one Opik thread, and
+                    # record just this turn (latest human message in → Scout reply
+                    # out) so the thread reads as a clean dialogue instead of an
+                    # ever-growing transcript. The model still sees the full
+                    # conversation above; only the recorded trace I/O is scoped.
                     opik_context.update_current_trace(
-                        thread_id=f"issue-{repo_owner}-{repo_name}-{issue_number}"
+                        thread_id=f"issue-{repo_owner}-{repo_name}-{issue_number}",
+                        input={"latest_comment": latest_turn},
+                        output={"response": reply},
                     )
-                for block in response.content:
-                    if hasattr(block, "text"):
-                        return block.text
-                return "Scout completed without producing a text response."
+                return reply
 
             if response.stop_reason == "tool_use":
                 tool_results = []
@@ -271,8 +318,8 @@ def _agent(issue_message: str) -> str:
             project_name=opik_project,
             tags=["scout-repo-agent"],
         )(_agent)
-        text = tracked(issue_message)
+        text = tracked(conversation)
     else:
-        text = _agent(issue_message)
+        text = _agent(conversation)
 
     return text, trace_id[0]
diff --git a/src/scout/markers.py b/src/scout/markers.py
@@ -0,0 +1,20 @@
+"""Shared marker constant for Scout's hidden comment signature.
+
+Scout stamps a hidden HTML marker into every comment it posts. The full marker
+(written by scout.triage._feedback_marker) is::
+
+    <!-- scout-feedback trace_id=<uuid> -->
+
+Two consumers key off it:
+  - scout.feedback maps a comment's 👍/👎 reactions back to its Opik trace
+    (it parses the trace id with scout.feedback.MARKER_RE), and
+  - the triage agent recognizes its own past comments in a thread so it can
+    render them as assistant turns and skip re-triaging when one triggers a run.
+
+SCOUT_COMMENT_MARKER is the trace-id-independent prefix; a substring test against
+it is enough to tell "this comment was written by Scout". Keep it in sync with
+_feedback_marker and scout.feedback.MARKER_RE.
+"""
+from __future__ import annotations
+
+SCOUT_COMMENT_MARKER = "<!-- scout-feedback"  # prefix only: left unterminated so it matches any trace_id