feat(optimisation): implement phase 2 triage_accuracy metric with LLM judge

LeoRoccoBreedt · claude · LeoRoccoBreedt · commit 44213c367ad5 · 2026-06-10T15:47:50.000+02:00
- Extract all metrics into optimisation/metrics.py to keep main script clean
- Add triage_accuracy metric: flag accuracy (0.6) + reply quality judge (0.4)
- Add _reply_quality helper using claude-haiku for cost-efficient judging
- Strip markdown code fences from judge response before JSON parsing
- Update _scout_reasoning_override to include reply quality context
- Switch main() from flag_only_metric to triage_accuracy for phase 2

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/optimisation/metrics.py b/optimisation/metrics.py
@@ -0,0 +1,190 @@
+"""Metric functions for Scout prompt optimisation.
+
+All metrics share the same invoke_agent output contract:
+    llm_output = JSON string {"comment": str, "escalated": bool}
+
+Phases:
+    Phase 1 — flag_only_metric      (escalation flag only, no LLM judge)
+    Phase 2 — triage_accuracy       (flag 0.6 + reply quality judge 0.4)
+"""
+from __future__ import annotations
+
+import json
+import os
+import sys
+
+_repo_root = os.path.join(os.path.dirname(__file__), "..")
+sys.path.insert(0, _repo_root)
+sys.path.insert(0, os.path.join(_repo_root, "src"))
+
+import litellm  # noqa: E402
+from opik.evaluation.metrics import AnswerRelevance  # noqa: E402
+from opik.evaluation.metrics.score_result import ScoreResult  # noqa: E402
+
+from scout.triage import OPIK_PROJECT, SCOUT_ESCALATION_TAG  # noqa: E402
+
+JUDGE_MODEL = "anthropic/claude-haiku-4-5-20251001"  # used by _reply_quality and AnswerRelevance
+# Judge prompt template: filled with str.format(issue=, reply=, escalated=); literal braces are doubled.
+JUDGE_PROMPT = """\
+You are evaluating a GitHub issue reply written by Scout, an AI triage agent.
+
+Scout's role is to:
+- Help the issue author understand whether their issue is a bug or not
+- Clarify the code behaviour where possible
+- NOT suggest fixes or code changes
+- Ask for reproduction steps if a bug is suspected
+- Link to relevant documentation when helpful
+- Always introduce itself as Scout with a friendly tone
+- Escalate only when the issue requires a major design decision, breaking API change,
+  or architectural discussion needing maintainer consensus
+
+---
+
+GitHub Issue:
+{issue}
+
+Scout's Reply:
+{reply}
+
+Escalation Applied: {escalated}
+
+---
+
+Score the reply from 0.0 to 1.0:
+- 0.0: fails a hard rule (no Scout intro, suggests fixes, wrong tone, escalation contradicts reply)
+- 0.5: meets hard rules but vague — missing repro steps or docs when clearly needed
+- 0.75: solid reply with minor gaps
+- 1.0: excellent — clear, friendly, on-scope, correctly escalated, repro steps/docs where appropriate
+
+Return JSON only: {{"score": float, "reason": "one sentence"}}
+"""
+
+_answer_relevance_metric = AnswerRelevance(  # module-level instance reused by answer_relevance()
+    model=JUDGE_MODEL,
+    project_name=OPIK_PROJECT,
+    require_context=False,
+)
+
+
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+
+def _parse_output(llm_output: str) -> tuple[str, bool]:
+    """Parse invoke_agent JSON output into (comment, escalated).
+
+    Falls back to plain string + tag-in-text detection if JSON is malformed.
+    """
+    try:
+        parsed = json.loads(llm_output)
+        return parsed["comment"], bool(parsed["escalated"])
+    except (json.JSONDecodeError, KeyError):
+        return llm_output, SCOUT_ESCALATION_TAG.lower() in llm_output.lower()
+
+
+def _expected_escalation(dataset_item: dict) -> bool | None:
+    """Return the expected escalation bool, or None if not present in the item."""
+    data = dataset_item.get("data", dataset_item)
+    expected = data.get("expected", {})
+    val = expected.get("should_escalate")
+    return bool(val) if val is not None else None
+
+
+# ---------------------------------------------------------------------------
+# Phase 1 — flag accuracy only
+# ---------------------------------------------------------------------------
+
+def flag_only_metric(dataset_item: dict, llm_output: str) -> ScoreResult:
+    """Escalation flag correctness only. No LLM judge call."""
+    should_escalate = _expected_escalation(dataset_item)
+    if should_escalate is None:
+        return ScoreResult(name="flag_accuracy", value=1.0, reason="No expected flag — skipped.")
+
+    _, output_escalated = _parse_output(llm_output)
+    correct = output_escalated == should_escalate
+    return ScoreResult(
+        name="flag_accuracy",
+        value=1.0 if correct else 0.0,
+        reason="Flag correct." if correct else f"Flag wrong — expected escalate={should_escalate}.",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Phase 2 — triage accuracy (flag + reply quality)
+# ---------------------------------------------------------------------------
+
+def _reply_quality(issue: str, reply: str, escalated: bool) -> ScoreResult:
+    """LLM-as-judge for Scout's reply. Uses JUDGE_MODEL (Haiku) to keep costs low."""
+    prompt = JUDGE_PROMPT.format(issue=issue, reply=reply, escalated=escalated)
+    response = litellm.completion(
+        model=JUDGE_MODEL,
+        messages=[{"role": "user", "content": prompt}],
+    )
+    content = response.choices[0].message.content or ""
+    content = content.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()
+    result = json.loads(content)
+    return ScoreResult(
+        name="reply_quality",
+        value=float(result["score"]),
+        reason=result["reason"],
+    )
+
+
+def triage_accuracy(dataset_item: dict, llm_output: str) -> ScoreResult:
+    """Phase 2 — flag accuracy (0.6) + reply quality judge (0.4)."""
+    comment, output_escalated = _parse_output(llm_output)
+    should_escalate = _expected_escalation(dataset_item)
+
+    flag_score = 1.0
+    flag_reason = "No expected flag."
+    if should_escalate is not None:
+        flag_score = 1.0 if output_escalated == should_escalate else 0.0
+        flag_reason = "Flag correct." if flag_score == 1.0 else f"Flag wrong — expected escalate={should_escalate}."
+
+    issue = dataset_item.get("issue_message", "")
+    reply_result = _reply_quality(issue, comment, output_escalated)
+
+    combined = (flag_score * 0.6) + (reply_result.value * 0.4)
+    return ScoreResult(
+        name="triage_accuracy",
+        value=combined,
+        reason=f"{flag_reason} Reply: {reply_result.reason}",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Legacy metrics (kept for reference and future phases)
+# ---------------------------------------------------------------------------
+
+def escalation_accuracy(dataset_item: dict, llm_output: str) -> float:
+    """Score 1.0 if escalation decision matches expected, 0.0 otherwise."""
+    should_escalate = _expected_escalation(dataset_item)
+    if should_escalate is None:
+        return 1.0
+    _, output_escalated = _parse_output(llm_output)
+    return 1.0 if output_escalated == should_escalate else 0.0
+
+
+def answer_relevance(dataset_item: dict, llm_output: str) -> float:
+    """AnswerRelevance score for the reply comment."""
+    comment, _ = _parse_output(llm_output)
+    result = _answer_relevance_metric.score(
+        input=dataset_item["issue_message"],
+        output=comment,
+    )
+    return result.value
+
+
+def scout_quality(dataset_item: dict, llm_output: str) -> float:
+    """Combined metric: structural completeness (50%) + escalation accuracy (50%)."""
+    comment, output_escalated = _parse_output(llm_output)
+
+    required_sections = ["## Solution", "## Code Investigation", "## Next Steps"]
+    structure_score = sum(s in comment for s in required_sections) / len(required_sections)
+
+    should_escalate = _expected_escalation(dataset_item)
+    escalation_score = 1.0 if should_escalate is None else (
+        1.0 if output_escalated == should_escalate else 0.0
+    )
+
+    return 0.5 * structure_score + 0.5 * escalation_score
diff --git a/optimisation/run_prompt_optimisation.py b/optimisation/run_prompt_optimisation.py
@@ -29,8 +29,6 @@
 sys.path.insert(0, os.path.join(_repo_root, "src"))
 
 import opik  # noqa: E402
-from opik.evaluation.metrics import AnswerRelevance  # noqa: E402
-from opik.evaluation.metrics.score_result import ScoreResult  # noqa: E402
 from dotenv import load_dotenv  # noqa: E402
 from opik_optimizer import ChatPrompt, MetaPromptOptimizer  # noqa: E402
 from opik_optimizer.agents.optimizable_agent import OptimizableAgent  # noqa: E402
@@ -47,6 +45,7 @@
     OPIK_PROJECT,
     SCOUT_ESCALATION_TAG,
 )
+from metrics import triage_accuracy  # noqa: E402
 
 logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
 logger = logging.getLogger(__name__)
@@ -101,90 +100,6 @@ def invoke_agent(
         return json.dumps({"comment": comment or "", "escalated": escalated})
 
 
-def _parse_output(llm_output: str) -> tuple[str, bool]:
-    """Parse invoke_agent output into (comment, escalated).
-
-    invoke_agent returns JSON with {"comment": str, "escalated": bool}.
-    Falls back to plain string + tag-in-text detection for safety.
-    """
-    try:
-        parsed = json.loads(llm_output)
-        return parsed["comment"], bool(parsed["escalated"])
-    except (json.JSONDecodeError, KeyError):
-        return llm_output, SCOUT_ESCALATION_TAG.lower() in llm_output.lower()
-
-
-def escalation_accuracy(dataset_item: dict, llm_output: str) -> float:
-    """Score 1.0 if escalation decision matches expected, 0.0 otherwise.
-
-    Items without an expected.should_escalate field score 1.0 so they don't
-    dilute the signal.
-    """
-    data = dataset_item.get("data", dataset_item)
-    expected = data.get("expected", {})
-
-    if "should_escalate" not in expected:
-        return 1.0
-
-    _, output_escalated = _parse_output(llm_output)
-    return 1.0 if output_escalated == expected["should_escalate"] else 0.0
-
-_answer_relevance_metric = AnswerRelevance(
-    model="anthropic/claude-haiku-4-5-20251001",
-    project_name=OPIK_PROJECT,
-    require_context=False,
-)
-
-
-def answer_relevance(dataset_item: dict, llm_output: str) -> float:
-    comment, _ = _parse_output(llm_output)
-    result = _answer_relevance_metric.score(
-        input=dataset_item["issue_message"],
-        output=comment,
-    )
-    return result.value
-
-
-def scout_quality(dataset_item: dict, llm_output: str) -> float:
-    """Combined metric: structural completeness (50%) + escalation accuracy (50%)."""
-    comment, output_escalated = _parse_output(llm_output)
-
-    required_sections = ["## Solution", "## Code Investigation", "## Next Steps"]
-    structure_score = sum(s in comment for s in required_sections) / len(required_sections)
-
-    data = dataset_item.get("data", dataset_item)
-    expected = data.get("expected", {})
-    if "should_escalate" in expected:
-        escalation_score = 1.0 if output_escalated == expected["should_escalate"] else 0.0
-    else:
-        escalation_score = 1.0
-
-    return 0.5 * structure_score + 0.5 * escalation_score
-
-
-def flag_only_metric(dataset_item: dict, llm_output: str) -> ScoreResult:
-    """Phase 1 — escalation flag correctness only. No LLM judge call.
-
-    Reads escalation state from the simulator label (via JSON output from
-    invoke_agent) — the ground truth for whether apply_label() was called.
-    """
-    data = dataset_item.get("data", dataset_item)
-    expected = data.get("expected", {})
-
-    if "should_escalate" not in expected:
-        return ScoreResult(name="flag_accuracy", value=1.0, reason="No expected flag — skipped.")
-
-    should_escalate: bool = expected["should_escalate"]
-    _, output_escalated = _parse_output(llm_output)
-    correct = output_escalated == should_escalate
-
-    return ScoreResult(
-        name="flag_accuracy",
-        value=1.0 if correct else 0.0,
-        reason="Flag correct." if correct else f"Flag wrong — expected escalate={should_escalate}.",
-    )
-
-
 def _scout_reasoning_override(prompts: PromptLibrary) -> None:
     """Inject Scout-specific task context into the meta-LLM's reasoning prompt.
 
@@ -206,6 +121,11 @@ def _scout_reasoning_override(prompts: PromptLibrary) -> None:
 No escalation means: bugs, feature requests, duplicate reports, spam — things Scout
 can investigate and respond to directly.
 
+The metric scoring Scout evaluates both escalation accuracy (60%) and reply quality (40%).
+A high-quality reply: introduces Scout by name, uses a friendly tone, does NOT suggest
+code fixes, asks for repro steps when a bug is suspected, and is consistent with the
+escalation decision.
+
 Scout has access to tools that explore the repository codebase and search existing issues.
 It does NOT use template variables in its prompt — do not add placeholders like {data} or
 {issue_message} to the prompt you generate.
@@ -242,7 +162,7 @@ def main() -> None:
     result = optimizer.optimize_prompt(
         prompt=initial_prompt,
         dataset=dataset,
-        metric=flag_only_metric,
+        metric=triage_accuracy,
         agent=ScoutAgent(project_name=OPIK_PROJECT),
         n_samples=10,
         project_name=OPIK_PROJECT,