feat(optimisation): implement phase 1 flag-only metric with correct escalation detection

LeoRoccoBreedt · claude · LeoRoccoBreedt · commit d6f4696d23ac · 2026-06-10T15:47:50.000+02:00
- Switch metric to flag_only_metric (ScoreResult) for phase 1
- invoke_agent returns JSON with comment + escalated bool
- Escalation detected via sim.get_issue_data labels post-run, not comment text
- Add _parse_output helper used by all metrics to parse invoke_agent output
- Set enable_context=False + prompt_overrides to prevent {data}/{issue_message} placeholders
- Remove auto-save of best prompt to Opik — user promotes manually

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/optimisation/run_prompt_optimisation.py b/optimisation/run_prompt_optimisation.py
@@ -12,6 +12,7 @@
 """
 from __future__ import annotations
 
+import json
 import logging
 import os
 import sys
@@ -33,6 +34,7 @@
 from dotenv import load_dotenv  # noqa: E402
 from opik_optimizer import ChatPrompt, MetaPromptOptimizer  # noqa: E402
 from opik_optimizer.agents.optimizable_agent import OptimizableAgent  # noqa: E402
+from opik_optimizer.utils.prompt_library import PromptLibrary  # noqa: E402
 
 load_dotenv()
 
@@ -69,8 +71,6 @@ def invoke_agent(
     ) -> str:
         prompt = next(iter(prompts.values()))
 
-        # get_messages handles both system= and messages= formats and substitutes
-        # {issue_message} from dataset_item automatically.
         messages = prompt.get_messages(dataset_item)
         system_prompt = next(
             (m["content"] for m in messages if m.get("role") == "system"), ""
@@ -97,7 +97,21 @@ def invoke_agent(
             model=MODEL,
             max_tokens=MAX_TOKENS,
         )
-        return comment or ""
+        escalated = SCOUT_ESCALATION_TAG in sim.get_issue_data(target).get("labels", [])
+        return json.dumps({"comment": comment or "", "escalated": escalated})
+
+
+def _parse_output(llm_output: str) -> tuple[str, bool]:
+    """Parse invoke_agent output into a (comment, escalated) tuple.
+
+    invoke_agent returns JSON with {"comment": str, "escalated": bool}.
+    Any other input (plain text, non-object JSON) falls back to tag-in-text detection.
+    """
+    try:
+        parsed = json.loads(llm_output)
+        return parsed["comment"], bool(parsed["escalated"])
+    except (json.JSONDecodeError, KeyError, TypeError):  # TypeError: JSON that is not an object
+        return llm_output, SCOUT_ESCALATION_TAG.lower() in llm_output.lower()
 
 
 def escalation_accuracy(dataset_item: dict, llm_output: str) -> float:
@@ -112,10 +126,8 @@ def escalation_accuracy(dataset_item: dict, llm_output: str) -> float:
     if "should_escalate" not in expected:
         return 1.0
 
-    should_escalate: bool = expected["should_escalate"]
-    output_escalated = SCOUT_ESCALATION_TAG.lower() in llm_output.lower()
-
-    return 1.0 if output_escalated == should_escalate else 0.0
+    _, output_escalated = _parse_output(llm_output)
+    return 1.0 if output_escalated == expected["should_escalate"] else 0.0
 
 _answer_relevance_metric = AnswerRelevance(
     model="anthropic/claude-haiku-4-5-20251001",
@@ -125,23 +137,25 @@ def escalation_accuracy(dataset_item: dict, llm_output: str) -> float:
 
 
 def answer_relevance(dataset_item: dict, llm_output: str) -> float:
+    comment, _ = _parse_output(llm_output)
     result = _answer_relevance_metric.score(
         input=dataset_item["issue_message"],
-        output=llm_output,
+        output=comment,
     )
     return result.value
 
 
 def scout_quality(dataset_item: dict, llm_output: str) -> float:
     """Combined metric: structural completeness (50%) + escalation accuracy (50%)."""
+    comment, output_escalated = _parse_output(llm_output)
+
     required_sections = ["## Solution", "## Code Investigation", "## Next Steps"]
-    structure_score = sum(s in llm_output for s in required_sections) / len(required_sections)
+    structure_score = sum(s in comment for s in required_sections) / len(required_sections)
 
     data = dataset_item.get("data", dataset_item)
     expected = data.get("expected", {})
     if "should_escalate" in expected:
-        escalated = SCOUT_ESCALATION_TAG.lower() in llm_output.lower()
-        escalation_score = 1.0 if escalated == expected["should_escalate"] else 0.0
+        escalation_score = 1.0 if output_escalated == expected["should_escalate"] else 0.0
     else:
         escalation_score = 1.0
 
@@ -151,19 +165,17 @@ def scout_quality(dataset_item: dict, llm_output: str) -> float:
 def flag_only_metric(dataset_item: dict, llm_output: str) -> ScoreResult:
     """Phase 1 — escalation flag correctness only. No LLM judge call.
 
-    Maps the actual dataset shape (data.expected.should_escalate) and plain-string
-    agent output (escalation detected via SCOUT_ESCALATION_TAG) to the dev-plan
-    flag_only_metric interface.
+    Reads escalation state from the simulator label (via JSON output from
+    invoke_agent) — the ground truth for whether apply_label() was called.
     """
     data = dataset_item.get("data", dataset_item)
     expected = data.get("expected", {})
 
     if "should_escalate" not in expected:
-        # No ground truth for this item — treat as correct so it doesn't dilute signal.
         return ScoreResult(name="flag_accuracy", value=1.0, reason="No expected flag — skipped.")
 
     should_escalate: bool = expected["should_escalate"]
-    output_escalated = SCOUT_ESCALATION_TAG.lower() in llm_output.lower()
+    _, output_escalated = _parse_output(llm_output)
     correct = output_escalated == should_escalate
 
     return ScoreResult(
@@ -173,6 +185,34 @@ def flag_only_metric(dataset_item: dict, llm_output: str) -> ScoreResult:
     )
 
 
+def _scout_reasoning_override(prompts: PromptLibrary) -> None:
+    """Append Scout-specific task context to the existing "reasoning_system" prompt.
+
+    Stands in for enable_context dataset sampling (disabled in main) so the optimizer
+    learns the task domain without dataset fields being advertised as template variables.
+    """
+    original = prompts.get("reasoning_system")  # NOTE(review): assumes get() returns a str — confirm
+    prompts.set(
+        "reasoning_system",
+        original + """
+
+Task context: You are optimising the system prompt for Scout, a GitHub issue triage agent.
+Scout receives a GitHub issue (title, body, author, labels) and must:
+1. Decide whether to escalate (true) or not (false)
+2. Write a reply to the issue author
+
+Escalation means: the issue requires a major design decision, breaking API change, or
+architectural discussion that needs maintainer consensus.
+No escalation means: bugs, feature requests, duplicate reports, spam — things Scout
+can investigate and respond to directly.
+
+Scout has access to tools that explore the repository codebase and search existing issues.
+It does NOT use template variables in its prompt — do not add placeholders like {data} or
+{issue_message} to the prompt you generate.
+""",
+    )
+
+
 def main() -> None:
     opik_client = opik.Opik()
     dataset = opik_client.get_dataset(DATASET_NAME)
@@ -184,16 +224,15 @@ def main() -> None:
     if chat_prompt_obj is None:
         sys.exit(f"ERROR: Opik chat prompt {PROMPT_NAME!r} not found in project {OPIK_PROJECT!r}.")
 
-    initial_prompt = ChatPrompt(
-        messages=[*chat_prompt_obj.template, {"role": "user", "content": "{issue_message}"}]
-    )
+    initial_prompt = ChatPrompt(messages=chat_prompt_obj.template)
 
     optimizer = MetaPromptOptimizer(
         model=f"anthropic/{MODEL}",
         model_parameters={"temperature": 0.0},
         prompts_per_round=4,
         n_threads=4,
-        enable_context=True,
+        enable_context=False,
+        prompt_overrides=_scout_reasoning_override,
         seed=42,
         skip_perfect_score=False,
     )
@@ -212,18 +251,11 @@ def main() -> None:
 
     result.display()
 
-    # Save the best prompt back to Opik — strip the user template, keep only system messages.
-    best_prompt = result.prompt if isinstance(result.prompt, ChatPrompt) else list(result.prompt.values())[0]
-    all_messages = best_prompt.messages or (
-        [{"role": "system", "content": best_prompt.system}] if best_prompt.system else []
-    )
-    system_messages = [m for m in all_messages if m.get("role") == "system"]
-    opik_client.create_chat_prompt(
-        name=PROMPT_NAME,
-        messages=system_messages,
-        project_name=OPIK_PROJECT,
+    logger.info(
+        "Review results in the Opik dashboard and manually promote the best prompt "
+        "to a new version of %r if you decide it is better.",
+        PROMPT_NAME,
     )
-    logger.info("Best prompt saved to Opik under %r as a new version.", PROMPT_NAME)
 
 
 if __name__ == "__main__":