comet-ml · LeoRoccoBreedt · Jun 5, 2026 · Jun 5, 2026 · Jun 8, 2026 · Jun 9, 2026
diff --git a/.gitignore b/.gitignore
@@ -8,4 +8,9 @@ __pycache__/
 # Packaging artifacts (pip install -e . / python -m build)
 build/
 dist/
-*.egg-info/
+*.egg-info/
+
+# Generated files
+*.csv
+*.lock
+*.json
diff --git a/optimisation/metrics.py b/optimisation/metrics.py
@@ -0,0 +1,190 @@
+"""Metric functions for Scout prompt optimisation.
+
+All metrics share the same invoke_agent output contract:
+    llm_output = JSON string {"comment": str, "escalated": bool}
+
+Phases:
+    Phase 1 — flag_only_metric      (escalation flag only, no LLM judge)
+    Phase 2 — triage_accuracy       (flag 0.6 + reply quality judge 0.4)
+"""
+from __future__ import annotations
+
+import json
+import os
+import sys
+
+_repo_root = os.path.join(os.path.dirname(__file__), "..")
+sys.path.insert(0, _repo_root)
+sys.path.insert(0, os.path.join(_repo_root, "src"))
+
+import litellm  # noqa: E402
+from opik.evaluation.metrics import AnswerRelevance  # noqa: E402
+from opik.evaluation.metrics.score_result import ScoreResult  # noqa: E402
+
+from scout.triage import OPIK_PROJECT, SCOUT_ESCALATION_TAG  # noqa: E402
+
+# Model identifier used for the LLM-as-judge calls in this module: it is passed to
+# litellm.completion in _reply_quality and to the AnswerRelevance metric below.
+JUDGE_MODEL = "anthropic/claude-haiku-4-5-20251001"
+
+# Template for the reply-quality judge. It is filled with str.format using the
+# fields `issue`, `reply` and `escalated`; the doubled braces on the final line
+# are format escapes that render as a literal JSON example for the judge.
+JUDGE_PROMPT = """\
+You are evaluating a GitHub issue reply written by Scout, an AI triage agent.
+
+Scout's role is to:
+- Help the issue author understand whether their issue is a bug or not
+- Clarify the code behaviour where possible
+- NOT suggest fixes or code changes
+- Ask for reproduction steps if a bug is suspected
+- Link to relevant documentation when helpful
+- Always introduce itself as Scout with a friendly tone
+- Escalate only when the issue requires a major design decision, breaking API change,
+  or architectural discussion needing maintainer consensus
+
+---
+
+GitHub Issue:
+{issue}
+
+Scout's Reply:
+{reply}
+
+Escalation Applied: {escalated}
+
+---
+
+Score the reply from 0.0 to 1.0:
+- 0.0: fails a hard rule (no Scout intro, suggests fixes, wrong tone, escalation contradicts reply)
+- 0.5: meets hard rules but vague — missing repro steps or docs when clearly needed
+- 0.75: solid reply with minor gaps
+- 1.0: excellent — clear, friendly, on-scope, correctly escalated, repro steps/docs where appropriate
+
+Return JSON only: {{"score": float, "reason": "one sentence"}}
+"""
+
+# Single AnswerRelevance instance, constructed when this module is imported and
+# reused by answer_relevance() for every call.
+_answer_relevance_metric = AnswerRelevance(
+    model=JUDGE_MODEL,
+    project_name=OPIK_PROJECT,
+    require_context=False,
+)
+
+
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+
+def _parse_output(llm_output: str) -> tuple[str, bool]:
+    """Parse invoke_agent output into ``(comment, escalated)``.
+
+    The expected input is a JSON object ``{"comment": str, "escalated": bool}``.
+    Anything else is treated as a plain-text reply: malformed JSON, valid JSON
+    that is not an object (a bare number, string, list or ``null``), or an
+    object missing either key. In that case the raw text is returned as the
+    comment and escalation is inferred from a case-insensitive search for
+    ``SCOUT_ESCALATION_TAG`` in that text.
+
+    Args:
+        llm_output: Raw string returned by the agent. ``None`` is tolerated
+            and handled like an empty string.
+
+    Returns:
+        A ``(comment, escalated)`` tuple.
+    """
+    raw = "" if llm_output is None else llm_output
+    try:
+        parsed = json.loads(raw)
+    except json.JSONDecodeError:
+        parsed = None
+
+    # Only a JSON object carrying both keys satisfies the contract. Checking the
+    # type explicitly avoids the TypeError that subscripting a list, string,
+    # number or None with a string key would raise.
+    if isinstance(parsed, dict) and "comment" in parsed and "escalated" in parsed:
+        return parsed["comment"], bool(parsed["escalated"])
+
+    return raw, SCOUT_ESCALATION_TAG.lower() in raw.lower()
+
+
+def _expected_escalation(dataset_item: dict) -> bool | None:
+    """Return the expected escalation flag for a dataset item, if it has one.
+
+    The label is read from ``expected.should_escalate``, looked up under the
+    item's ``data`` entry when that entry is present and not ``None``, and on
+    the item itself otherwise.
+
+    Args:
+        dataset_item: Dataset row, either flat or with the fields nested
+            under a ``data`` key.
+
+    Returns:
+        The expected flag as a bool, or ``None`` when ``expected`` or
+        ``should_escalate`` is absent or ``None``.
+    """
+    data = dataset_item.get("data")
+    if data is None:
+        data = dataset_item
+
+    # An explicit null for "expected" means "no label", not an error.
+    expected = data.get("expected")
+    if expected is None:
+        return None
+
+    val = expected.get("should_escalate")
+    if val is None:
+        return None
+    if isinstance(val, str):
+        # bool("false") is True; treat the usual textual spellings of false as False.
+        return val.strip().lower() not in {"", "false", "0", "no"}
+    return bool(val)
+
+
+# ---------------------------------------------------------------------------
+# Phase 1 — flag accuracy only
+# ---------------------------------------------------------------------------
+
+def flag_only_metric(dataset_item: dict, llm_output: str) -> ScoreResult:
+    """Phase 1 metric: score only whether the escalation flag matches the label.
+
+    Produces a ``flag_accuracy`` result of 1.0 on a match and 0.0 on a
+    mismatch. An item without an expected flag scores 1.0 and the agent output
+    is not parsed. No LLM judge is called.
+    """
+    expected_flag = _expected_escalation(dataset_item)
+    if expected_flag is None:
+        return ScoreResult(name="flag_accuracy", value=1.0, reason="No expected flag — skipped.")
+
+    actual_flag = _parse_output(llm_output)[1]
+    if actual_flag == expected_flag:
+        return ScoreResult(name="flag_accuracy", value=1.0, reason="Flag correct.")
+    return ScoreResult(
+        name="flag_accuracy",
+        value=0.0,
+        reason=f"Flag wrong — expected escalate={expected_flag}.",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Phase 2 — triage accuracy (flag + reply quality)
+# ---------------------------------------------------------------------------
+
+def _reply_quality(issue: str, reply: str, escalated: bool) -> ScoreResult:
+    """Score Scout's reply with an LLM judge running on JUDGE_MODEL.
+
+    Fills JUDGE_PROMPT with the issue, the reply and the escalation flag, sends
+    it through ``litellm.completion`` and parses the ``{"score", "reason"}``
+    JSON object from the response.
+
+    Args:
+        issue: Text of the GitHub issue shown to the judge.
+        reply: Scout's reply to be scored.
+        escalated: Whether Scout applied the escalation tag.
+
+    Returns:
+        A ``reply_quality`` ScoreResult. The judge's score is clamped to
+        ``[0.0, 1.0]``. If the response cannot be parsed (not JSON, not an
+        object, missing or non-numeric ``score``) the result is 0.0 with a
+        reason describing the failure, so one bad judge response does not
+        abort the surrounding run.
+
+    Raises:
+        Whatever ``litellm.completion`` raises; call errors are not caught here.
+    """
+    prompt = JUDGE_PROMPT.format(issue=issue, reply=reply, escalated=escalated)
+    response = litellm.completion(
+        model=JUDGE_MODEL,
+        messages=[{"role": "user", "content": prompt}],
+    )
+    content = response.choices[0].message.content or ""
+    # Strip a surrounding Markdown code fence, if present, before parsing.
+    content = content.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()
+
+    try:
+        result = json.loads(content)
+        if not isinstance(result, dict):
+            raise TypeError("judge response is not a JSON object")
+        score = float(result["score"])
+    except (json.JSONDecodeError, KeyError, TypeError, ValueError) as exc:
+        return ScoreResult(
+            name="reply_quality",
+            value=0.0,
+            reason=f"Judge response could not be parsed ({type(exc).__name__}).",
+        )
+
+    return ScoreResult(
+        name="reply_quality",
+        # Keep the value inside the range the judge prompt asks for.
+        value=min(1.0, max(0.0, score)),
+        reason=str(result.get("reason", "")),
+    )
+
+
+def triage_accuracy(dataset_item: dict, llm_output: str) -> ScoreResult:
+    """Phase 2 metric: 0.6 x escalation-flag accuracy + 0.4 x judged reply quality.
+
+    The flag part is 1.0 when the item has no expected flag or the flags match,
+    and 0.0 otherwise. The reply part comes from the LLM judge, which is given
+    the item's top-level ``issue_message`` (empty string when absent).
+    """
+    reply_text, actual_flag = _parse_output(llm_output)
+    expected_flag = _expected_escalation(dataset_item)
+
+    if expected_flag is None:
+        flag_score, flag_reason = 1.0, "No expected flag."
+    elif actual_flag == expected_flag:
+        flag_score, flag_reason = 1.0, "Flag correct."
+    else:
+        flag_score, flag_reason = 0.0, f"Flag wrong — expected escalate={expected_flag}."
+
+    issue_text = dataset_item.get("issue_message", "")
+    judged = _reply_quality(issue_text, reply_text, actual_flag)
+
+    return ScoreResult(
+        name="triage_accuracy",
+        value=(flag_score * 0.6) + (judged.value * 0.4),
+        reason=f"{flag_reason} Reply: {judged.reason}",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Legacy metrics (kept for reference and future phases)
+# ---------------------------------------------------------------------------
+
+def escalation_accuracy(dataset_item: dict, llm_output: str) -> float:
+    """Return 1.0 when the escalation decision matches the label, else 0.0.
+
+    Items without an expected flag score 1.0 and the output is not parsed.
+    """
+    expected_flag = _expected_escalation(dataset_item)
+    if expected_flag is None:
+        return 1.0
+    actual_flag = _parse_output(llm_output)[1]
+    return float(actual_flag == expected_flag)
+
+
+def answer_relevance(dataset_item: dict, llm_output: str) -> float:
+    """Score the reply comment against the issue with the AnswerRelevance metric.
+
+    Reads the issue text from the item's ``issue_message`` key (required) and
+    returns the ``value`` of the metric's result.
+    """
+    reply_text = _parse_output(llm_output)[0]
+    issue_text = dataset_item["issue_message"]
+    return _answer_relevance_metric.score(input=issue_text, output=reply_text).value
+
+
+def scout_quality(dataset_item: dict, llm_output: str) -> float:
+    """Blend structural completeness and escalation accuracy, 50% each.
+
+    Structure is the fraction of the three required headings found in the
+    comment. Escalation is 1.0 when the item has no expected flag or the flags
+    agree, and 0.0 otherwise.
+    """
+    reply_text, actual_flag = _parse_output(llm_output)
+
+    headings = ("## Solution", "## Code Investigation", "## Next Steps")
+    found = [heading for heading in headings if heading in reply_text]
+    structure_score = len(found) / len(headings)
+
+    expected_flag = _expected_escalation(dataset_item)
+    if expected_flag is None or actual_flag == expected_flag:
+        escalation_score = 1.0
+    else:
+        escalation_score = 0.0
+
+    return 0.5 * structure_score + 0.5 * escalation_score
diff --git a/optimisation/run_prompt_optimisation.py b/optimisation/run_prompt_optimisation.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+"""Optimise the Scout system prompt against the scout-issues-with-github-sim dataset.
+
+Uses the Opik chat prompt named by SCOUT_OPIK_PROMPT_NAME as the starting point,
+runs each dataset item through the full Scout agent loop with the simulated GitHub
+environment, and scores each run with triage_accuracy (escalation flag 0.6 + reply
+quality judge 0.4). The results are displayed for review; the best prompt is not
+saved back automatically and has to be promoted manually to a new version of the
+same chat prompt in Opik.
+
+Env vars (on top of the normal Scout config in .env):
+    SCOUT_OPIK_PROMPT_NAME        — chat prompt to optimise (default: scout-triage-system-prompt-initial)
+    SCOUT_OFFLINE_DATASET_NAME    — Opik dataset name (default: scout-triage-optimisation-runs-v2)
+"""
+from __future__ import annotations
+
+import json
+import logging
+import os
+import sys
+from typing import Any
+
+# Stub env vars required by scout.triage at import time.
+# setdefault only fills in a value when the variable is not already present in
+# the process environment, so values exported by the caller are kept.
+# NOTE(review): these stubs are applied before load_dotenv() runs further down.
+# If load_dotenv keeps its default of not overriding existing variables, a value
+# for any of these keys that exists only in .env would lose to the stub — confirm
+# this is intended.
+os.environ.setdefault("ISSUE_NUMBER", "1")
+os.environ.setdefault("GITHUB_TOKEN", "unused")
+os.environ.setdefault("SCOUT_GITHUB_REPO_OWNER", "comet-ml")
+os.environ.setdefault("SCOUT_GITHUB_REPO_NAME", "scout-test-repo")
+
+_repo_root = os.path.join(os.path.dirname(__file__), "..")
+sys.path.insert(0, _repo_root)
+sys.path.insert(0, os.path.join(_repo_root, "src"))
+
+import opik  # noqa: E402
+from dotenv import load_dotenv  # noqa: E402
+from opik_optimizer import ChatPrompt, MetaPromptOptimizer  # noqa: E402
+from opik_optimizer.agents.optimizable_agent import OptimizableAgent  # noqa: E402
+from opik_optimizer.utils.prompt_library import PromptLibrary  # noqa: E402
+
+load_dotenv()
+
+from scout.agent import make_client, run_agent  # noqa: E402
+from scout.providers.scenarios import build  # noqa: E402
+from scout.triage import (  # noqa: E402
+    ANTHROPIC_API_KEY,
+    MAX_TOKENS,
+    MODEL,
+    OPIK_PROJECT,
+    SCOUT_ESCALATION_TAG,
+)
+from metrics import triage_accuracy  # noqa: E402
+
+# Configure root logging once for this script; messages are printed as "LEVEL: text".
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+logger = logging.getLogger(__name__)
+
+# Dataset and chat-prompt names. Each is read from the environment and falls
+# back to the literal given here when the variable is unset.
+DATASET_NAME = os.environ.get("SCOUT_OFFLINE_DATASET_NAME", "scout-triage-optimisation-runs-v2")
+PROMPT_NAME = os.environ.get("SCOUT_OPIK_PROMPT_NAME", "scout-triage-system-prompt-initial")
+
+
+class ScoutAgent(OptimizableAgent):
+    """OptimizableAgent that runs Scout end-to-end against the simulated GitHub environment.
+
+    One client is created per agent instance and reused for every dataset item.
+    """
+
+    def __init__(self, **kwargs: Any) -> None:
+        """Forward all keyword arguments to the base class and create the client."""
+        super().__init__(**kwargs)
+        self._client = make_client(ANTHROPIC_API_KEY, opik_project=OPIK_PROJECT)
+
+    def invoke_agent(
+        self,
+        prompts: dict[str, ChatPrompt],
+        dataset_item: dict[str, Any],
+        allow_tool_use: bool = False,
+        seed: int | None = None,
+    ) -> str:
+        """Run Scout on one dataset item and report its reply and escalation flag.
+
+        Only the first prompt in ``prompts`` is used, and of its messages only
+        the first one with role ``system``. ``allow_tool_use`` and ``seed`` are
+        accepted but not used by this implementation.
+
+        Returns:
+            A JSON string ``{"comment": str, "escalated": bool}``, or an empty
+            string when the candidate has no non-empty system message.
+        """
+        candidate = next(iter(prompts.values()))
+
+        # Take the content of the first system message; stay empty if there is none.
+        system_prompt = ""
+        for message in candidate.get_messages(dataset_item):
+            if message.get("role") == "system":
+                system_prompt = message["content"]
+                break
+        if not system_prompt:
+            logger.warning("invoke_agent: no system prompt in candidate — skipping")
+            return ""
+
+        # Fields may sit under a "data" key or directly on the item.
+        item = dataset_item.get("data", dataset_item)
+        scenario_name = item.get("scenario", "default")
+        scenario_spec = item["spec"]
+        issue_number = int(item["target_issue"])
+
+        sim = build(scenario_name, scenario_spec)
+        reply, _ = run_agent(
+            sim,
+            issue_number,
+            client=self._client,
+            system_prompt=system_prompt,
+            escalation_tag=SCOUT_ESCALATION_TAG,
+            repo_owner=sim.owner,
+            repo_name=sim.name,
+            opik_project=OPIK_PROJECT,
+            model=MODEL,
+            max_tokens=MAX_TOKENS,
+        )
+
+        # Escalation is read back from the simulated issue's labels after the run.
+        labels = sim.get_issue_data(issue_number).get("labels", [])
+        payload = {"comment": reply or "", "escalated": SCOUT_ESCALATION_TAG in labels}
+        return json.dumps(payload)
+
+
+def _scout_reasoning_override(prompts: PromptLibrary) -> None:
+    """Append Scout-specific task context to the ``reasoning_system`` prompt.
+
+    Stands in for enable_context dataset sampling: the meta-LLM is told about
+    the task domain in prose, without dataset fields being advertised as
+    template variables.
+    """
+    scout_context = """
+
+Task context: You are optimising the system prompt for Scout, a GitHub issue triage agent.
+Scout receives a GitHub issue (title, body, author, labels) and must:
+1. Decide whether to escalate (true) or not (false)
+2. Write a reply to the issue author
+
+Escalation means: the issue requires a major design decision, breaking API change, or
+architectural discussion that needs maintainer consensus.
+No escalation means: bugs, feature requests, duplicate reports, spam — things Scout
+can investigate and respond to directly.
+
+The metric scoring Scout evaluates both escalation accuracy (60%) and reply quality (40%).
+A high-quality reply: introduces Scout by name, uses a friendly tone, does NOT suggest
+code fixes, asks for repro steps when a bug is suspected, and is consistent with the
+escalation decision.
+
+Scout has access to tools that explore the repository codebase and search existing issues.
+It does NOT use template variables in its prompt — do not add placeholders like {data} or
+{issue_message} to the prompt you generate.
+"""
+    base_prompt = prompts.get("reasoning_system")
+    prompts.set("reasoning_system", base_prompt + scout_context)
+
+
+def main() -> None:
+    """Load the dataset and starting prompt from Opik, run the optimiser, show the result.
+
+    Exits with an error message when the chat prompt named by PROMPT_NAME is
+    not found. Nothing is written back to Opik here; the closing log line asks
+    the operator to promote the best prompt manually.
+    """
+    client = opik.Opik()
+    dataset = client.get_dataset(DATASET_NAME)
+
+    stored_prompt = client.get_chat_prompt(name=PROMPT_NAME, project_name=OPIK_PROJECT)
+    if stored_prompt is None:
+        sys.exit(f"ERROR: Opik chat prompt {PROMPT_NAME!r} not found in project {OPIK_PROJECT!r}.")
+
+    initial_prompt = ChatPrompt(messages=stored_prompt.template)
+
+    optimizer_settings: dict[str, Any] = {
+        "model": f"anthropic/{MODEL}",
+        "model_parameters": {"temperature": 0.0},
+        "prompts_per_round": 4,
+        "n_threads": 4,
+        # Task context is injected through the prompt override instead.
+        "enable_context": False,
+        "prompt_overrides": _scout_reasoning_override,
+        "seed": 42,
+        "skip_perfect_score": False,
+    }
+    optimizer = MetaPromptOptimizer(**optimizer_settings)
+
+    logger.info("Starting optimisation: prompt=%r  dataset=%r", PROMPT_NAME, DATASET_NAME)
+
+    agent = ScoutAgent(project_name=OPIK_PROJECT)
+    result = optimizer.optimize_prompt(
+        prompt=initial_prompt,
+        dataset=dataset,
+        metric=triage_accuracy,
+        agent=agent,
+        n_samples=10,
+        project_name=OPIK_PROJECT,
+        max_trials=2,
+    )
+    result.display()
+
+    logger.info(
+        "Review results in the Opik dashboard and manually promote the best prompt "
+        "to a new version of %r if you decide it is better.",
+        PROMPT_NAME,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
     "requests>=2.28",
     "opik>=1.0",
     "python-dotenv>=1.0",
+    "opik-optimizer>=3.1.0",
 ]
 
 [project.optional-dependencies]