
# Commit b45de09

**Inspect AI integration (#7)**

* feat(inspect): add Inspect AI adapters
  - add `skill_solver` and `step_scorer` in `tk.llmbda.inspect`
  - expose optional `[inspect]` extra; also install in dev group
  - alias stdlib `inspect` in `__init__.py` to avoid submodule shadowing
  - document integration, install, and viewer in README
  - ignore `logs/` and `.pytest_cache/`
* refactor(examples): split triage into skill/main/scoring
  - `examples/triage/skill.py` holds the reusable Skill definition
  - `main.py` runs and prints traces; `scoring.py` runs an Inspect eval
  - `scoring.py` demos `step_scorer`, custom `@scorer`, custom `@metric`
  - target is a `[intent, priority]` list consumed by scorers
  - TODO: add an LLM-graded scorer for a non-deterministic judge path
* fixed crap code
Parent: `0076cc8`
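One item in the commit message above, aliasing the stdlib `inspect` in `__init__.py` so the new `tk.llmbda.inspect` submodule does not shadow it, might look like this minimal sketch (the alias name is an assumption, not the committed code):

```python
# tk/llmbda/__init__.py (sketch only; the committed code may differ)
import inspect as _stdlib_inspect  # bind the standard library module first

# Once this package gains a submodule named `inspect`, the attribute
# `tk.llmbda.inspect` resolves to that submodule; code inside the package
# keeps a working handle on the standard library via `_stdlib_inspect`.
```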

9 files changed · 539 additions & 96 deletions


## .gitignore

Lines changed: 2 additions & 0 deletions
```diff
@@ -10,3 +10,5 @@ build/
 _version.py
 uv.lock
 .coverage
+logs/
+.pytest_cache/
```

## README.md

Lines changed: 48 additions & 1 deletion
````diff
@@ -188,12 +188,59 @@ uv run examples/date_extraction.py
 uv run examples/calendar_booking.py
 
 # support triage: extraction, classification, validation loop
-uv run examples/support_triage.py
+uv run examples/triage/main.py
+
+# same skill, scored step-by-step with Inspect AI (see section below)
+uv run examples/triage/scoring.py
 
 # all 20 use cases in one file (no external deps)
 uv run examples/showcase.py
 ```
 
+## Inspect AI integration
+
+Score individual steps of a skill with [Inspect AI](https://inspect.aisi.org.uk/) scorers.
+
+- `skill_solver(skill)` wraps a skill as an Inspect `Solver`. Final `result.value` becomes the completion; full trace lands in `state.metadata["llmbda.trace"]`.
+- `step_scorer(name, inner)` adapts any Inspect scorer to read a named step's value instead of the final completion.
+
+```python
+from inspect_ai import Task
+from inspect_ai.scorer import match, model_graded_qa
+from tk.llmbda.inspect import skill_solver, step_scorer
+
+Task(
+    dataset=tickets,
+    solver=skill_solver(support_triage),
+    scorer=[
+        step_scorer("λ::identifiers", match(location="any")),
+        step_scorer("ψ::draft", model_graded_qa()),
+        match(),  # final completion
+    ],
+)
+```
+
+- `entry=` on `skill_solver` customises how the skill input is extracted from `TaskState` (default: `s.input_text`).
+- `project=` on `step_scorer` stringifies non-str step values before the inner scorer sees them (default: `str`; pass `json.dumps` for dicts).
+- Metrics are inherited from the inner scorer; override with `metrics=[...]`.
+
+### Install and run
+
+- **As a library user:** `pip install tk-llmbda[inspect]` — the `inspect` extra pulls in `inspect-ai`.
+- **In this repo:** `inspect-ai` is already in the dev dependency group, so `uv sync` installs it automatically.
+
+A runnable end-to-end example lives in `examples/triage/scoring.py` (the skill itself is defined in `examples/triage/skill.py` and reused by `main.py`):
+
+```bash
+# run the skill + Inspect eval (scripted model, no API keys needed)
+uv run examples/triage/scoring.py
+
+# browse per-sample traces, scorer breakdowns, and step values in the viewer
+uv run inspect view
+```
+
+Every `inspect_eval(...)` call writes a `.eval` log file under `./logs/` which `inspect view` picks up automatically.
+
 ## Development
 
 ```bash
````
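The `project=` hook from the README bullets above deserves a concrete usage sketch. The step name `"λ::identifiers"` is borrowed from the README example; the dict shape in the comment is an assumption:

```python
import json

from inspect_ai.scorer import match

from tk.llmbda.inspect import step_scorer

# Suppose the identifiers step yields a dict such as
# {"account_ids": ["ACME-42"], "invoice_ids": []} (assumed shape).
# json.dumps projects it to a stable string before match() compares
# it against the sample target.
identifiers_match = step_scorer(
    "λ::identifiers",       # step name from the README example
    match(location="any"),  # inner scorer sees only the projected string
    project=json.dumps,     # stringify the non-str step value
)
```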

## examples/triage/main.py

Lines changed: 31 additions & 0 deletions
```python
# %% [markdown]
# # Support triage: deterministic extraction, scripted LLM steps, repair loop
#
# Runs `support_triage` from `skill.py` on the bundled tickets. See
# `scoring.py` for the Inspect AI evaluation of the same skill.

# %%
import json

from skill import TICKETS, run_skill, support_triage

# %% [markdown]
# ## Run the skill on every ticket

# %%
for ticket in TICKETS:
    result = run_skill(support_triage, ticket)
    print(f"\n{ticket['id']} · {ticket['subject']}")
    print(f"resolved_by: {result.resolved_by}")
    print(json.dumps(result.value, indent=2))
    print(f"validation: {result.metadata}")

# %% [markdown]
# ## Inspect one trace

# %%
result = run_skill(support_triage, TICKETS[1])
for name, step_result in result.trace.items():
    print(f"\n{name}")
    print(f"value: {json.dumps(step_result.value, indent=2)}")
    print(f"metadata: {json.dumps(step_result.metadata, indent=2)}")
```

## examples/triage/scoring.py

Lines changed: 200 additions & 0 deletions
```python
# %%
"""Inspect AI scoring for the support triage skill.

Run: uv run python examples/triage/scoring.py
With LLM grader:
    INSPECT_GRADER=openai/gpt-4o-mini uv run python examples/triage/scoring.py
View logs: uv run inspect view (from examples/triage/)
"""

import os

from inspect_ai import Task
from inspect_ai import eval as inspect_eval
from inspect_ai.dataset import Sample
from inspect_ai.log import EvalLog
from inspect_ai.scorer import (
    Metric,
    Score,
    Target,
    accuracy,
    match,
    mean,
    metric,
    model_graded_qa,
    scorer,
    stderr,
)
from inspect_ai.solver import TaskState
from skill import CLASSIFY, DRAFT, IDENTIFIERS, SUMMARIZE, TICKETS, support_triage

from tk.llmbda.inspect import skill_solver, step_scorer

# %%
EXPECTED = {
    "SUP-1001": ("billing_refund", "P2"),
    "SUP-1002": ("production_incident", "P0"),
    "SUP-1003": ("account_access", "P1"),  # mis-expects P1 so one cell fails
}

EVAL_SAMPLES = [
    Sample(
        id=t["id"],
        input=t["subject"],
        target=list(EXPECTED[t["id"]]),
        metadata={"ticket": t},
    )
    for t in TICKETS
]


# %%
def _trace(state: TaskState) -> dict:
    return (state.metadata or {}).get("llmbda.trace", {})


classify_matches_intent = step_scorer(
    CLASSIFY,
    match(location="exact"),
    project=lambda v: v["intent"],
)


@scorer(metrics=[accuracy(), stderr()])
def draft_priority_scorer():
    async def score(state: TaskState, target: Target) -> Score:
        got = _trace(state)[DRAFT].value["priority"]
        want = target[1]
        return Score(
            value="C" if got == want else "I",
            answer=got,
            explanation=f"expected {want!r}, got {got!r}",
        )

    return score


# %% LLM-graded reply quality (requires INSPECT_GRADER env var)
REPLY_QUALITY_TEMPLATE = """\
You are evaluating a customer support reply for quality.

[BEGIN DATA]
***
[Customer request]: {question}
***
[Support reply]: {answer}
***
[END DATA]

Grade the reply as CORRECT if it:
- Acknowledges the customer's specific issue
- Is professional and actionable
- Requests missing information when identifiers are absent

Grade as INCORRECT if the reply is generic, dismissive, or ignores
key details from the request.

{instructions}
"""

# Default to None so the `is not None` check below cannot raise a
# NameError when no grader model is configured.
draft_reply_quality = None
if grader := os.environ.get(gradevar := "INSPECT_GRADER"):
    g = model_graded_qa(template=REPLY_QUALITY_TEMPLATE, model=grader)
    draft_reply_quality = step_scorer(DRAFT, g, project=lambda v: v["customer_reply"])
else:
    print(f"[W] set {gradevar} to run the model judge")


# %% Heuristic reply scorer — partial credit (0.0 / 0.5 / 1.0)
_ISSUE_KW = ["refund", "charge", "outage", "escalat", "access", "restore"]


@scorer(metrics=[mean(), stderr()])
def draft_reply_heuristic():
    async def score(state: TaskState, target: Target) -> Score:  # noqa: ARG001
        tr = _trace(state)
        reply = tr[DRAFT].value.get("customer_reply", "").lower()
        missing_ids = not tr[IDENTIFIERS].value["account_ids"]
        ack = 0.5 if any(kw in reply for kw in _ISSUE_KW) else 0.0
        info = (0.5 if "account" in reply else 0.0) if missing_ids else 0.5
        pts = ack + info
        reasons = []
        if ack:
            reasons.append("acknowledges issue")
        else:
            reasons.append("generic reply")
        if missing_ids:
            msg = "requests missing id" if info else "missing id not requested"
            reasons.append(msg)
        else:
            reasons.append("no missing ids")
        return Score(value=pts, answer=reply, explanation="; ".join(reasons))

    return score


# %%
@metric
def strict_accuracy() -> Metric:
    def m(scores: list) -> float:
        if not scores:
            return 0.0
        return sum(float(s.score.value) == 1.0 for s in scores) / len(scores)

    return m


@scorer(metrics=[accuracy(), stderr(), strict_accuracy()])
def final_status_scorer():
    async def score(state: TaskState, target: Target) -> Score:  # noqa: ARG001
        status = _trace(state)[SUMMARIZE].value.get("status")
        return Score(
            value="C" if status == "validated" else "I",
            answer=str(status),
            explanation=f"status={status!r}",
        )

    return score


# %%
_scorers = [
    classify_matches_intent,
    draft_priority_scorer(),
    draft_reply_heuristic(),
    final_status_scorer(),
]
if draft_reply_quality is not None:
    _scorers.insert(2, draft_reply_quality)

eval_task = Task(
    name="support_triage_eval",
    dataset=EVAL_SAMPLES,
    solver=skill_solver(support_triage, entry=lambda s: s.metadata["ticket"]),
    scorer=_scorers,
)

eval_logs = inspect_eval(eval_task, model="none/none", display="none")
assert isinstance((log := eval_logs[0]), EvalLog), f"{log=}"  # noqa: RUF018

# %%
print(f"status: {log.status}")
if log.status != "success":
    if log.error:
        print(f"error: {log.error.message}")
        if log.error.traceback:
            print(log.error.traceback)
    raise SystemExit(1)

assert log.results is not None
for sr in log.results.scores:
    print(f"\n{sr.name}")
    for name, mr in sr.metrics.items():
        print(f"  {name:16s} = {mr.value:.3f}")

# %%
assert log.samples is not None
for sample in log.samples:
    print(f"\n{sample.id}")
    assert sample.scores is not None
    for name, sc in sample.scores.items():
        print(f"  {name:28s} {sc.value} ({sc.explanation})")
```
