Skip to content

Commit 0cca841

Browse files
committed
Continuous rubric scoring + gradient UI
Replace majority-vote + 3-of-5 pass threshold with raw judge fractions: each criterion score is the mean of judge votes, each sample score is the mean of criterion scores, and each model score is the mean of sample scores. No thresholds anywhere.

UI and list_evaluations read the same aggregate, so the top-line number matches everywhere. UI cell colors switch from red/yellow/green threshold bands to a continuous red->yellow->green HSL gradient. The sample grid drops the pass/fail checkmark in favor of a colored percentage.

Side quality-of-life changes:
- .env.keys is re-read on each tool call, so adding OPENAI/ANTHROPIC/GOOGLE keys no longer needs an MCP restart
- "All Evaluations" is a plain <a href>, so it navigates reliably when the target route matches the current one
1 parent d92a976 commit 0cca841

61 files changed

Lines changed: 374 additions & 316 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

backend/api/compare.py

Lines changed: 61 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
33
Reads pre-computed JSON from S3/disk. The JSON is built once when an eval
44
completes (see backend.core.eval_results.precompute_eval_results).
5+
6+
Live progress for in-flight evaluations is served by /api/compare/progress,
7+
not by these endpoints.
58
"""
69

710
import logging
@@ -27,137 +30,17 @@ async def _get_user_id(request: Request) -> str:
2730

2831
@router.get("/groups")
2932
async def get_comparison_groups(user_id: str = Depends(_get_user_id)):
30-
"""List evaluation comparison groups for the user.
31-
32-
Serves from pre-computed cache (fast). Merges in running evals
33-
from log headers so they appear without waiting for completion.
34-
"""
35-
from eval_mcp.core.eval_results import _read_log_headers, _build_groups_from_headers
36-
37-
# Serve cached completed evals (instant)
33+
"""List evaluation comparison groups for the user, served from the pre-computed cache."""
3834
cached = load_eval_groups(user_id)
39-
cached_groups = cached.get("groups", []) if cached else []
40-
cached_ids = {g["id"] for g in cached_groups}
41-
42-
# Find running evals not in cache
43-
log_dir = get_user_log_dir(user_id)
44-
headers = await _read_log_headers(log_dir)
45-
started_headers = [h for h in headers if h.get("status") == "started"]
46-
47-
if not started_headers:
48-
if cached_groups:
49-
return cached
50-
# No cache and no running — build fresh
51-
await precompute_eval_results(user_id)
52-
return load_eval_groups(user_id) or {"groups": []}
53-
54-
# Build groups from started headers only, merge with cache
55-
all_data = _build_groups_from_headers(started_headers)
56-
new_groups = [g for g in all_data.get("groups", []) if g["id"] not in cached_ids]
57-
58-
merged = new_groups + cached_groups
59-
merged.sort(key=lambda g: g.get("created", ""), reverse=True)
60-
return {"groups": merged}
35+
if cached:
36+
return cached
37+
await precompute_eval_results(user_id)
38+
return load_eval_groups(user_id) or {"groups": []}
6139

6240

6341
@router.get("/detail")
6442
async def get_comparison_detail(group_id: str, user_id: str = Depends(_get_user_id)):
6543
"""Get full comparison data for a specific evaluation group."""
66-
from eval_mcp.core.eval_results import _read_log_headers, _build_groups_from_headers
67-
68-
# For running evals, read partial results directly (skip cache)
69-
log_dir = get_user_log_dir(user_id)
70-
headers = await _read_log_headers(log_dir)
71-
group_headers = [h for h in headers if (h.get("run_id") or h["file"]) == group_id]
72-
if group_headers and any(h.get("status") == "started" for h in group_headers):
73-
import asyncio
74-
from functools import partial
75-
from inspect_ai.log import read_eval_log_sample_summaries
76-
77-
models = list(dict.fromkeys(h["model"] for h in group_headers))
78-
total_samples = group_headers[0].get("dataset_samples", 0)
79-
samples_by_id: dict[str, dict] = {}
80-
aggregate: dict[str, dict] = {}
81-
82-
criteria_names: set[str] = set()
83-
criteria_votes: dict[str, dict[str, list[bool]]] = {} # model -> criterion -> [passed]
84-
85-
for h in group_headers:
86-
model = h["model"]
87-
try:
88-
loop = asyncio.get_event_loop()
89-
summaries = await loop.run_in_executor(None, partial(read_eval_log_sample_summaries, h["file"]))
90-
completed = [s for s in summaries if s.scores]
91-
scores_sum = 0.0
92-
model_criteria_votes: dict[str, list[bool]] = {}
93-
94-
for s in completed:
95-
score_obj = next(iter(s.scores.values())) if s.scores else None
96-
if not score_obj:
97-
continue
98-
val = score_obj.value
99-
if val == "C":
100-
scores_sum += 1.0
101-
elif isinstance(val, (int, float)):
102-
scores_sum += float(val)
103-
104-
# Extract per-criterion results
105-
if score_obj.metadata and "criteria_results" in score_obj.metadata:
106-
for cr in score_obj.metadata["criteria_results"]:
107-
cname = cr["name"]
108-
criteria_names.add(cname)
109-
if cname not in model_criteria_votes:
110-
model_criteria_votes[cname] = []
111-
model_criteria_votes[cname].append(cr["passed"])
112-
113-
avg = scores_sum / len(completed) if completed else 0
114-
by_criterion = {}
115-
for cname, votes in model_criteria_votes.items():
116-
by_criterion[cname] = sum(votes) / len(votes) if votes else 0
117-
aggregate[model] = {"overall": avg, "byCriterion": by_criterion}
118-
criteria_votes[model] = model_criteria_votes
119-
120-
for s in completed:
121-
sid = str(s.id)
122-
if sid not in samples_by_id:
123-
sample_input = s.input if isinstance(s.input, str) else str(s.input[0].content if s.input else "")
124-
samples_by_id[sid] = {
125-
"id": sid,
126-
"input": sample_input[:300],
127-
"target": s.target[0] if isinstance(s.target, list) else str(s.target or ""),
128-
"results": {},
129-
}
130-
score_obj = next(iter(s.scores.values())) if s.scores else None
131-
passed = score_obj.value == "C" if score_obj else False
132-
score_num = 1.0 if passed else (float(score_obj.value) if score_obj and isinstance(score_obj.value, (int, float)) else 0.0)
133-
criteria_results = []
134-
if score_obj and score_obj.metadata and "criteria_results" in score_obj.metadata:
135-
criteria_results = [
136-
{"name": cr["name"], "passed": cr["passed"], "votes_for": cr.get("votes_for", 0), "total": cr.get("total", 0)}
137-
for cr in score_obj.metadata["criteria_results"]
138-
]
139-
samples_by_id[sid]["results"][model] = {
140-
"passed": passed,
141-
"score": score_num,
142-
"output": "",
143-
"explanation": score_obj.explanation[:200] if score_obj and score_obj.explanation else "",
144-
"criteriaResults": criteria_results,
145-
}
146-
except Exception as e:
147-
logger.warning(f"Failed to read summaries for {model}: {e}")
148-
aggregate[model] = {"overall": 0, "byCriterion": {}}
149-
150-
return {
151-
"models": models,
152-
"samples": list(samples_by_id.values()),
153-
"aggregate": aggregate,
154-
"criteria": sorted(criteria_names),
155-
"stats": {m: {"total_tokens": 0} for m in models},
156-
"status": "running",
157-
"sampleCount": total_samples,
158-
"completedSamples": len(samples_by_id),
159-
}
160-
16144
data = load_eval_detail(user_id, group_id)
16245
if data:
16346
return data
@@ -352,3 +235,56 @@ async def generate_report_pdf(
352235
"Content-Disposition": f'attachment; filename="eval_report_{safe_id}.pdf"',
353236
},
354237
)
238+
239+
240+
@router.get("/report/{group_id}")
241+
async def download_report(group_id: str, user_id: str = Depends(_get_user_id)):
242+
"""Serve a previously generated PDF report for an evaluation group.
243+
244+
Reads from S3 in production, local disk in dev. Returns 404 if the
245+
report hasn't been generated yet (in which case the caller should POST
246+
to /report/pdf or ask the MCP agent to generate one).
247+
"""
248+
import os
249+
from eval_mcp.core.user_storage import (
250+
DATA_BUCKET,
251+
_get_s3_client,
252+
_s3_enabled,
253+
get_user_base_dir,
254+
)
255+
256+
if not user_id or "/" in user_id or "\\" in user_id or user_id in (".", ".."):
257+
raise HTTPException(status_code=400, detail="invalid user_id")
258+
safe_id = group_id.replace("/", "_").replace("\\", "_")
259+
filename = f"report_{safe_id}.pdf"
260+
261+
if _s3_enabled():
262+
key = f"users/{user_id}/store/reports/{filename}"
263+
try:
264+
obj = _get_s3_client().get_object(Bucket=DATA_BUCKET, Key=key)
265+
except Exception as e:
266+
if getattr(e, "response", {}).get("Error", {}).get("Code") in ("NoSuchKey", "404"):
267+
raise HTTPException(
268+
status_code=404,
269+
detail="Report not generated yet.",
270+
)
271+
logger.warning(f"Failed to fetch report s3://{DATA_BUCKET}/{key}: {e}")
272+
raise HTTPException(status_code=500, detail="failed to fetch report")
273+
pdf_bytes = obj["Body"].read()
274+
else:
275+
base_real = os.path.realpath(str(get_user_base_dir()))
276+
pdf_real = os.path.realpath(os.path.join(base_real, user_id, "store", "reports", filename))
277+
if not pdf_real.startswith(base_real + os.sep):
278+
raise HTTPException(status_code=400, detail="invalid path")
279+
if not os.path.isfile(pdf_real):
280+
raise HTTPException(status_code=404, detail="Report not generated yet.")
281+
with open(pdf_real, "rb") as f:
282+
pdf_bytes = f.read()
283+
284+
return Response(
285+
content=pdf_bytes,
286+
media_type="application/pdf",
287+
headers={
288+
"Content-Disposition": f'attachment; filename="eval_report_{safe_id}.pdf"',
289+
},
290+
)

eval_mcp/core/eval_results.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -295,9 +295,14 @@ def _col_sort_key(k: str) -> tuple:
295295
score_data["criteriaResults"] = all_criteria
296296
else:
297297
for scorer_name, score in scorers.items():
298-
score_data["passed"] = score["value"] == "C"
299298
metadata = score.get("metadata", {})
300-
score_data["score"] = metadata.get("jury_score", 1.0 if score["value"] == "C" else 0.0)
299+
raw_value = score.get("value")
300+
if isinstance(raw_value, (int, float)):
301+
sample_score = float(raw_value)
302+
else:
303+
sample_score = metadata.get("jury_score", 1.0 if raw_value == "C" else 0.0)
304+
score_data["score"] = sample_score
305+
score_data["passed"] = sample_score > 0.5
301306
score_data["explanation"] = score.get("explanation", "")
302307
criteria_results = metadata.get("criteria_results", [])
303308
score_data["criteriaResults"] = criteria_results
@@ -335,12 +340,18 @@ def _col_sort_key(k: str) -> tuple:
335340

336341
by_criterion: dict[str, float] = {}
337342
for criterion in criteria_set:
338-
criterion_passed = 0
343+
crit_values: list[float] = []
339344
for s in model_samples:
340345
for cr in s.get("criteriaResults", []):
341-
if cr["name"] == criterion and cr["passed"]:
342-
criterion_passed += 1
343-
by_criterion[criterion] = criterion_passed / max(total, 1)
346+
if cr["name"] != criterion:
347+
continue
348+
if "score" in cr:
349+
crit_values.append(float(cr["score"]))
350+
elif cr.get("total", 0) > 0:
351+
crit_values.append(cr["votes_for"] / cr["total"])
352+
else:
353+
crit_values.append(1.0 if cr.get("passed") else 0.0)
354+
by_criterion[criterion] = sum(crit_values) / len(crit_values) if crit_values else 0.0
344355

345356
by_stage: dict[str, float] = {}
346357
if is_pipeline and pipeline_stages:

eval_mcp/server.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -546,13 +546,17 @@ async def list_evaluations(
546546
"""
547547
List completed evaluations.
548548
549-
Returns a list of previous evaluation runs with IDs, descriptions, and timestamps.
549+
Each entry returns a `score` object with:
550+
- metrics.overall: the same 0.0-1.0 rubric average shown in the UI
551+
(mean of per-criterion scores, no pass/fail threshold)
552+
- byCriterion: per-criterion 0.0-1.0 breakdown (Core Claim, Terminology,
553+
Factual, Coverage, Reasoning — whatever the judge emitted)
550554
551555
Args:
552556
limit: Maximum number of evaluations to return (default: 20)
553557
554558
Returns:
555-
JSON with list of evaluations and their metadata
559+
JSON with list of evaluations and their aggregated scores.
556560
"""
557561
_auto_pull(user_id)
558562
args = {"user_id": _user(user_id), "limit": limit}

eval_mcp/tools/create_agent_eval_config.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
from inspect_ai.agent import Agent, AgentState, agent, sandbox_agent_bridge
3535
from inspect_ai.dataset import json_dataset, FieldSpec
3636
from inspect_ai.model import ChatMessageUser, ChatMessageSystem, get_model
37-
from inspect_ai.scorer import Score, accuracy, scorer, stderr
37+
from inspect_ai.scorer import Score, mean, scorer, stderr
3838
from inspect_ai.util import sandbox
3939
4040
from inspect_ai.tool._tool_info import ToolInfo
@@ -127,12 +127,12 @@ async def execute(state: AgentState) -> AgentState:
127127
return execute
128128
129129
130-
@scorer(metrics=[accuracy(), stderr()])
130+
@scorer(metrics=[mean(), stderr()])
131131
def jury_scorer():
132132
async def score(state, target):
133133
output = state.output.completion if state.output else ""
134134
if not output:
135-
return Score(value="I", answer="", explanation="No output generated")
135+
return Score(value=0.0, answer="", explanation="No output generated")
136136
137137
question = str(state.input)
138138
golden = target.text if target else ""
@@ -173,30 +173,27 @@ async def score(state, target):
173173
for n in criteria_names:
174174
v = votes[n]
175175
if not v:
176-
results.append({{"name": n, "votes_for": 0, "total": 0, "passed": False, "note": "no valid responses"}})
176+
results.append({{"name": n, "votes_for": 0, "total": 0, "score": 0.0, "note": "no valid responses"}})
177177
else:
178178
vf = sum(v)
179-
results.append({{"name": n, "votes_for": vf, "total": len(v), "passed": vf > len(v) / 2}})
179+
results.append({{"name": n, "votes_for": vf, "total": len(v), "score": vf / len(v)}})
180180
181-
n_passed = sum(1 for r in results if r["passed"])
182-
n_total = len(criteria_names)
183-
jury_score = n_passed / max(n_total, 1)
184-
passed = jury_score > 0.5
181+
scored = [r for r in results if "note" not in r]
182+
jury_score = sum(r["score"] for r in scored) / len(scored) if scored else 0.0
185183
186-
lines = [f"Jury: {{'PASS' if passed else 'FAIL'}} ({{n_passed}}/{{n_total}} criteria)", ""]
184+
lines = [f"Jury score: {{jury_score:.2f}} ({{len(scored)}}/{{len(criteria_names)}} criteria graded)", ""]
187185
for r in results:
188-
s = "PASS" if r["passed"] else "FAIL"
189186
extra = f" - {{r['note']}}" if "note" in r else ""
190-
lines.append(f" {{r['name']}}: {{s}} ({{r['votes_for']}}/{{r['total']}}){{extra}}")
187+
lines.append(f" {{r['name']}}: {{r['score']:.2f}} ({{r['votes_for']}}/{{r['total']}} judges){{extra}}")
191188
lines += ["", "Judges:"] + details
192189
if errors:
193190
lines += ["", "Errors:"] + errors
194191
195192
return Score(
196-
value="C" if passed else "I",
193+
value=jury_score,
197194
answer=output[:200],
198195
explanation="\\n".join(lines),
199-
metadata={{"jury_score": jury_score, "criteria_passed": n_passed, "criteria_total": n_total, "criteria_results": results}},
196+
metadata={{"jury_score": jury_score, "criteria_results": results}},
200197
)
201198
202199
return score

eval_mcp/tools/create_config.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def build_config_json(
7070
from inspect_ai import Task, task
7171
from inspect_ai.dataset import json_dataset, FieldSpec
7272
from inspect_ai.model import ChatMessageUser, ChatMessageSystem, get_model
73-
from inspect_ai.scorer import Score, accuracy, scorer, stderr
73+
from inspect_ai.scorer import Score, mean, scorer, stderr
7474
from inspect_ai.solver import generate, prompt_template
7575
7676
from inspect_ai.tool._tool_info import ToolInfo
@@ -130,12 +130,12 @@ def _extract_scores(output, criteria_names):
130130
return scores, args.get("reason", ""), None
131131
132132
133-
@scorer(metrics=[accuracy(), stderr()])
133+
@scorer(metrics=[mean(), stderr()])
134134
def jury_scorer():
135135
async def score(state, target):
136136
output = state.output.completion if state.output else ""
137137
if not output:
138-
return Score(value="I", answer="", explanation="No output generated")
138+
return Score(value=0.0, answer="", explanation="No output generated")
139139
140140
question = str(state.input)
141141
golden = target.text if target else ""
@@ -176,30 +176,28 @@ async def score(state, target):
176176
for n in criteria_names:
177177
v = votes[n]
178178
if not v:
179-
results.append({{"name": n, "votes_for": 0, "total": 0, "passed": False, "note": "no valid responses"}})
179+
results.append({{"name": n, "votes_for": 0, "total": 0, "score": 0.0, "note": "no valid responses"}})
180180
else:
181181
vf = sum(v)
182-
results.append({{"name": n, "votes_for": vf, "total": len(v), "passed": vf > len(v) / 2}})
182+
results.append({{"name": n, "votes_for": vf, "total": len(v), "score": vf / len(v)}})
183183
184-
n_passed = sum(1 for r in results if r["passed"])
185-
n_total = len(criteria_names)
186-
jury_score = n_passed / max(n_total, 1)
187-
passed = jury_score > 0.5
184+
# Sample score = mean of per-criterion judge-fractions. No thresholds.
185+
scored = [r for r in results if "note" not in r]
186+
jury_score = sum(r["score"] for r in scored) / len(scored) if scored else 0.0
188187
189-
lines = [f"Jury: {{'PASS' if passed else 'FAIL'}} ({{n_passed}}/{{n_total}} criteria)", ""]
188+
lines = [f"Jury score: {{jury_score:.2f}} ({{len(scored)}}/{{len(criteria_names)}} criteria graded)", ""]
190189
for r in results:
191-
s = "PASS" if r["passed"] else "FAIL"
192190
extra = f" - {{r['note']}}" if "note" in r else ""
193-
lines.append(f" {{r['name']}}: {{s}} ({{r['votes_for']}}/{{r['total']}}){{extra}}")
191+
lines.append(f" {{r['name']}}: {{r['score']:.2f}} ({{r['votes_for']}}/{{r['total']}} judges){{extra}}")
194192
lines += ["", "Judges:"] + details
195193
if errors:
196194
lines += ["", "Errors:"] + errors
197195
198196
return Score(
199-
value="C" if passed else "I",
197+
value=jury_score,
200198
answer=output[:200],
201199
explanation="\\n".join(lines),
202-
metadata={{"jury_score": jury_score, "criteria_passed": n_passed, "criteria_total": n_total, "criteria_results": results}},
200+
metadata={{"jury_score": jury_score, "criteria_results": results}},
203201
)
204202
205203
return score

0 commit comments

Comments
 (0)