awslabs
diff --git a/backend/api/compare.py b/backend/api/compare.py
Lines changed: 61 additions & 125 deletions
diff --git a/eval_mcp/core/eval_results.py b/eval_mcp/core/eval_results.py
Lines changed: 17 additions & 6 deletions
diff --git a/eval_mcp/server.py b/eval_mcp/server.py
Lines changed: 6 additions & 2 deletions
diff --git a/eval_mcp/tools/create_agent_eval_config.py b/eval_mcp/tools/create_agent_eval_config.py
Lines changed: 11 additions & 14 deletions
diff --git a/eval_mcp/tools/create_config.py b/eval_mcp/tools/create_config.py
Lines changed: 12 additions & 14 deletions
@@ -2,6 +2,9 @@
 
 Reads pre-computed JSON from S3/disk. The JSON is built once when an eval
 completes (see backend.core.eval_results.precompute_eval_results).
+
+Live progress for in-flight evaluations is served by /api/compare/progress,
+not by these endpoints.
 """
 
 import logging
@@ -27,137 +30,17 @@ async def _get_user_id(request: Request) -> str:
 
 @router.get("/groups")
 async def get_comparison_groups(user_id: str = Depends(_get_user_id)):
-    """List evaluation comparison groups for the user.
-
-    Serves from pre-computed cache (fast). Merges in running evals
-    from log headers so they appear without waiting for completion.
-    """
-    from eval_mcp.core.eval_results import _read_log_headers, _build_groups_from_headers
-
-    # Serve cached completed evals (instant)
+    """Return the user's comparison groups from the pre-computed cache, building the cache on a miss."""
     cached = load_eval_groups(user_id)
-    cached_groups = cached.get("groups", []) if cached else []
-    cached_ids = {g["id"] for g in cached_groups}
-
-    # Find running evals not in cache
-    log_dir = get_user_log_dir(user_id)
-    headers = await _read_log_headers(log_dir)
-    started_headers = [h for h in headers if h.get("status") == "started"]
-
-    if not started_headers:
-        if cached_groups:
-            return cached
-        # No cache and no running — build fresh
-        await precompute_eval_results(user_id)
-        return load_eval_groups(user_id) or {"groups": []}
-
-    # Build groups from started headers only, merge with cache
-    all_data = _build_groups_from_headers(started_headers)
-    new_groups = [g for g in all_data.get("groups", []) if g["id"] not in cached_ids]
-
-    merged = new_groups + cached_groups
-    merged.sort(key=lambda g: g.get("created", ""), reverse=True)
-    return {"groups": merged}
+    if not cached:
+        await precompute_eval_results(user_id)
+        cached = load_eval_groups(user_id) or {"groups": []}
+    return cached
 
 
 @router.get("/detail")
 async def get_comparison_detail(group_id: str, user_id: str = Depends(_get_user_id)):
     """Get full comparison data for a specific evaluation group."""
-    from eval_mcp.core.eval_results import _read_log_headers, _build_groups_from_headers
-
-    # For running evals, read partial results directly (skip cache)
-    log_dir = get_user_log_dir(user_id)
-    headers = await _read_log_headers(log_dir)
-    group_headers = [h for h in headers if (h.get("run_id") or h["file"]) == group_id]
-    if group_headers and any(h.get("status") == "started" for h in group_headers):
-        import asyncio
-        from functools import partial
-        from inspect_ai.log import read_eval_log_sample_summaries
-
-        models = list(dict.fromkeys(h["model"] for h in group_headers))
-        total_samples = group_headers[0].get("dataset_samples", 0)
-        samples_by_id: dict[str, dict] = {}
-        aggregate: dict[str, dict] = {}
-
-        criteria_names: set[str] = set()
-        criteria_votes: dict[str, dict[str, list[bool]]] = {}  # model -> criterion -> [passed]
-
-        for h in group_headers:
-            model = h["model"]
-            try:
-                loop = asyncio.get_event_loop()
-                summaries = await loop.run_in_executor(None, partial(read_eval_log_sample_summaries, h["file"]))
-                completed = [s for s in summaries if s.scores]
-                scores_sum = 0.0
-                model_criteria_votes: dict[str, list[bool]] = {}
-
-                for s in completed:
-                    score_obj = next(iter(s.scores.values())) if s.scores else None
-                    if not score_obj:
-                        continue
-                    val = score_obj.value
-                    if val == "C":
-                        scores_sum += 1.0
-                    elif isinstance(val, (int, float)):
-                        scores_sum += float(val)
-
-                    # Extract per-criterion results
-                    if score_obj.metadata and "criteria_results" in score_obj.metadata:
-                        for cr in score_obj.metadata["criteria_results"]:
-                            cname = cr["name"]
-                            criteria_names.add(cname)
-                            if cname not in model_criteria_votes:
-                                model_criteria_votes[cname] = []
-                            model_criteria_votes[cname].append(cr["passed"])
-
-                avg = scores_sum / len(completed) if completed else 0
-                by_criterion = {}
-                for cname, votes in model_criteria_votes.items():
-                    by_criterion[cname] = sum(votes) / len(votes) if votes else 0
-                aggregate[model] = {"overall": avg, "byCriterion": by_criterion}
-                criteria_votes[model] = model_criteria_votes
-
-                for s in completed:
-                    sid = str(s.id)
-                    if sid not in samples_by_id:
-                        sample_input = s.input if isinstance(s.input, str) else str(s.input[0].content if s.input else "")
-                        samples_by_id[sid] = {
-                            "id": sid,
-                            "input": sample_input[:300],
-                            "target": s.target[0] if isinstance(s.target, list) else str(s.target or ""),
-                            "results": {},
-                        }
-                    score_obj = next(iter(s.scores.values())) if s.scores else None
-                    passed = score_obj.value == "C" if score_obj else False
-                    score_num = 1.0 if passed else (float(score_obj.value) if score_obj and isinstance(score_obj.value, (int, float)) else 0.0)
-                    criteria_results = []
-                    if score_obj and score_obj.metadata and "criteria_results" in score_obj.metadata:
-                        criteria_results = [
-                            {"name": cr["name"], "passed": cr["passed"], "votes_for": cr.get("votes_for", 0), "total": cr.get("total", 0)}
-                            for cr in score_obj.metadata["criteria_results"]
-                        ]
-                    samples_by_id[sid]["results"][model] = {
-                        "passed": passed,
-                        "score": score_num,
-                        "output": "",
-                        "explanation": score_obj.explanation[:200] if score_obj and score_obj.explanation else "",
-                        "criteriaResults": criteria_results,
-                    }
-            except Exception as e:
-                logger.warning(f"Failed to read summaries for {model}: {e}")
-                aggregate[model] = {"overall": 0, "byCriterion": {}}
-
-        return {
-            "models": models,
-            "samples": list(samples_by_id.values()),
-            "aggregate": aggregate,
-            "criteria": sorted(criteria_names),
-            "stats": {m: {"total_tokens": 0} for m in models},
-            "status": "running",
-            "sampleCount": total_samples,
-            "completedSamples": len(samples_by_id),
-        }
-
     data = load_eval_detail(user_id, group_id)
     if data:
         return data
@@ -352,3 +235,56 @@ async def generate_report_pdf(
             "Content-Disposition": f'attachment; filename="eval_report_{safe_id}.pdf"',
         },
     )
+
+
+@router.get("/report/{group_id}")
+async def download_report(group_id: str, user_id: str = Depends(_get_user_id)):
+    """Serve a previously generated PDF report for an evaluation group.
+
+    Reads from S3 when `_s3_enabled()` is true, otherwise from local disk
+    under `get_user_base_dir()`.
+
+    Args:
+        group_id: Evaluation group identifier taken from the URL path.
+        user_id: Caller identity resolved by the `_get_user_id` dependency.
+
+    Returns:
+        A `Response` carrying the PDF bytes as an attachment.
+
+    Raises:
+        HTTPException: 400 for an unusable `user_id` or a resolved path that
+            escapes the user base directory; 404 if the report hasn't been
+            generated yet (in which case the caller should POST to
+            /report/pdf or ask the MCP agent to generate one); 500 if the
+            S3 fetch fails for any other reason.
+    """
+    import os
+    import re
+    from eval_mcp.core.user_storage import (
+        DATA_BUCKET,
+        _get_s3_client,
+        _s3_enabled,
+        get_user_base_dir,
+    )
+
+    if not user_id or "/" in user_id or "\\" in user_id or user_id in (".", ".."):
+        raise HTTPException(status_code=400, detail="invalid user_id")
+    # Storage name is left as-is so previously generated reports still resolve.
+    safe_id = group_id.replace("/", "_").replace("\\", "_")
+    filename = f"report_{safe_id}.pdf"
+    # group_id is caller-controlled: a quote or semicolon would corrupt the
+    # quoted filename parameter, and a non-latin-1 character cannot be encoded
+    # into a response header at all. Restrict the header copy to a safe set.
+    header_id = re.sub(r"[^A-Za-z0-9._-]", "_", safe_id)
+
+    # NOTE(review): the S3 fetch and the file read below are synchronous calls
+    # inside an async handler — consider offloading them to a worker thread if
+    # reports can be large.
+    if _s3_enabled():
+        key = f"users/{user_id}/store/reports/{filename}"
+        try:
+            obj = _get_s3_client().get_object(Bucket=DATA_BUCKET, Key=key)
+        except Exception as e:
+            # `response` may be missing, None, or not a dict on errors that do
+            # not originate from the S3 client; only inspect it when it is one.
+            err_response = getattr(e, "response", None)
+            error_code = (
+                err_response.get("Error", {}).get("Code")
+                if isinstance(err_response, dict)
+                else None
+            )
+            if error_code in ("NoSuchKey", "404"):
+                raise HTTPException(
+                    status_code=404,
+                    detail="Report not generated yet.",
+                ) from e
+            logger.warning(f"Failed to fetch report s3://{DATA_BUCKET}/{key}: {e}")
+            raise HTTPException(status_code=500, detail="failed to fetch report") from e
+        pdf_bytes = obj["Body"].read()
+    else:
+        base_real = os.path.realpath(str(get_user_base_dir()))
+        pdf_real = os.path.realpath(os.path.join(base_real, user_id, "store", "reports", filename))
+        # Resolve symlinks first, then require the result to stay under the base dir.
+        if not pdf_real.startswith(base_real + os.sep):
+            raise HTTPException(status_code=400, detail="invalid path")
+        if not os.path.isfile(pdf_real):
+            raise HTTPException(status_code=404, detail="Report not generated yet.")
+        with open(pdf_real, "rb") as f:
+            pdf_bytes = f.read()
+
+    return Response(
+        content=pdf_bytes,
+        media_type="application/pdf",
+        headers={
+            "Content-Disposition": f'attachment; filename="eval_report_{header_id}.pdf"',
+        },
+    )
@@ -295,9 +295,14 @@ def _col_sort_key(k: str) -> tuple:
                     score_data["criteriaResults"] = all_criteria
                 else:
                     for scorer_name, score in scorers.items():
-                        score_data["passed"] = score["value"] == "C"
                         metadata = score.get("metadata", {})
-                        score_data["score"] = metadata.get("jury_score", 1.0 if score["value"] == "C" else 0.0)
+                        raw_value = score.get("value")
+                        if isinstance(raw_value, (int, float)):
+                            sample_score = float(raw_value)
+                        else:
+                            sample_score = metadata.get("jury_score", 1.0 if raw_value == "C" else 0.0)
+                        score_data["score"] = sample_score
+                        score_data["passed"] = sample_score > 0.5
                         score_data["explanation"] = score.get("explanation", "")
                         criteria_results = metadata.get("criteria_results", [])
                         score_data["criteriaResults"] = criteria_results
@@ -335,12 +340,18 @@ def _col_sort_key(k: str) -> tuple:
 
         by_criterion: dict[str, float] = {}
         for criterion in criteria_set:
-            criterion_passed = 0
+            crit_values: list[float] = []
             for s in model_samples:
                 for cr in s.get("criteriaResults", []):
-                    if cr["name"] == criterion and cr["passed"]:
-                        criterion_passed += 1
-            by_criterion[criterion] = criterion_passed / max(total, 1)
+                    if cr["name"] != criterion:
+                        continue
+                    if "score" in cr:
+                        crit_values.append(float(cr["score"]))
+                    elif cr.get("total", 0) > 0:
+                        crit_values.append(cr["votes_for"] / cr["total"])
+                    else:
+                        crit_values.append(1.0 if cr.get("passed") else 0.0)
+            by_criterion[criterion] = sum(crit_values) / len(crit_values) if crit_values else 0.0
 
         by_stage: dict[str, float] = {}
         if is_pipeline and pipeline_stages:
 
@@ -546,13 +546,17 @@ async def list_evaluations(
     """
     List completed evaluations.
 
-    Returns a list of previous evaluation runs with IDs, descriptions, and timestamps.
+    Each entry returns a `score` object with:
+      - metrics.overall: the same 0.0-1.0 rubric average shown in the UI
+        (mean of per-criterion scores, no pass/fail threshold)
+      - byCriterion: per-criterion 0.0-1.0 breakdown (Core Claim, Terminology,
+        Factual, Coverage, Reasoning — whatever the judge emitted)
 
     Args:
         limit: Maximum number of evaluations to return (default: 20)
 
     Returns:
-        JSON with list of evaluations and their metadata
+        JSON with list of evaluations and their aggregated scores.
     """
     _auto_pull(user_id)
     args = {"user_id": _user(user_id), "limit": limit}
 
@@ -34,7 +34,7 @@
 from inspect_ai.agent import Agent, AgentState, agent, sandbox_agent_bridge
 from inspect_ai.dataset import json_dataset, FieldSpec
 from inspect_ai.model import ChatMessageUser, ChatMessageSystem, get_model
-from inspect_ai.scorer import Score, accuracy, scorer, stderr
+from inspect_ai.scorer import Score, mean, scorer, stderr
 from inspect_ai.util import sandbox
 
 from inspect_ai.tool._tool_info import ToolInfo
@@ -127,12 +127,12 @@ async def execute(state: AgentState) -> AgentState:
     return execute
 
 
-@scorer(metrics=[accuracy(), stderr()])
+@scorer(metrics=[mean(), stderr()])
 def jury_scorer():
     async def score(state, target):
         output = state.output.completion if state.output else ""
         if not output:
-            return Score(value="I", answer="", explanation="No output generated")
+            return Score(value=0.0, answer="", explanation="No output generated")
 
         question = str(state.input)
         golden = target.text if target else ""
@@ -173,30 +173,27 @@ async def score(state, target):
         for n in criteria_names:
             v = votes[n]
             if not v:
-                results.append({{"name": n, "votes_for": 0, "total": 0, "passed": False, "note": "no valid responses"}})
+                results.append({{"name": n, "votes_for": 0, "total": 0, "score": 0.0, "note": "no valid responses"}})
             else:
                 vf = sum(v)
-                results.append({{"name": n, "votes_for": vf, "total": len(v), "passed": vf > len(v) / 2}})
+                results.append({{"name": n, "votes_for": vf, "total": len(v), "score": vf / len(v)}})
 
-        n_passed = sum(1 for r in results if r["passed"])
-        n_total = len(criteria_names)
-        jury_score = n_passed / max(n_total, 1)
-        passed = jury_score > 0.5
+        scored = [r for r in results if "note" not in r]
+        jury_score = sum(r["score"] for r in scored) / len(scored) if scored else 0.0
 
-        lines = [f"Jury: {{'PASS' if passed else 'FAIL'}} ({{n_passed}}/{{n_total}} criteria)", ""]
+        lines = [f"Jury score: {{jury_score:.2f}} ({{len(scored)}}/{{len(criteria_names)}} criteria graded)", ""]
         for r in results:
-            s = "PASS" if r["passed"] else "FAIL"
             extra = f" - {{r['note']}}" if "note" in r else ""
-            lines.append(f"  {{r['name']}}: {{s}} ({{r['votes_for']}}/{{r['total']}}){{extra}}")
+            lines.append(f"  {{r['name']}}: {{r['score']:.2f}} ({{r['votes_for']}}/{{r['total']}} judges){{extra}}")
         lines += ["", "Judges:"] + details
         if errors:
             lines += ["", "Errors:"] + errors
 
         return Score(
-            value="C" if passed else "I",
+            value=jury_score,
             answer=output[:200],
             explanation="\\n".join(lines),
-            metadata={{"jury_score": jury_score, "criteria_passed": n_passed, "criteria_total": n_total, "criteria_results": results}},
+            metadata={{"jury_score": jury_score, "criteria_results": results}},
         )
 
     return score
 
@@ -70,7 +70,7 @@ def build_config_json(
 from inspect_ai import Task, task
 from inspect_ai.dataset import json_dataset, FieldSpec
 from inspect_ai.model import ChatMessageUser, ChatMessageSystem, get_model
-from inspect_ai.scorer import Score, accuracy, scorer, stderr
+from inspect_ai.scorer import Score, mean, scorer, stderr
 from inspect_ai.solver import generate, prompt_template
 
 from inspect_ai.tool._tool_info import ToolInfo
@@ -130,12 +130,12 @@ def _extract_scores(output, criteria_names):
     return scores, args.get("reason", ""), None
 
 
-@scorer(metrics=[accuracy(), stderr()])
+@scorer(metrics=[mean(), stderr()])
 def jury_scorer():
     async def score(state, target):
         output = state.output.completion if state.output else ""
         if not output:
-            return Score(value="I", answer="", explanation="No output generated")
+            return Score(value=0.0, answer="", explanation="No output generated")
 
         question = str(state.input)
         golden = target.text if target else ""
@@ -176,30 +176,28 @@ async def score(state, target):
         for n in criteria_names:
             v = votes[n]
             if not v:
-                results.append({{"name": n, "votes_for": 0, "total": 0, "passed": False, "note": "no valid responses"}})
+                results.append({{"name": n, "votes_for": 0, "total": 0, "score": 0.0, "note": "no valid responses"}})
             else:
                 vf = sum(v)
-                results.append({{"name": n, "votes_for": vf, "total": len(v), "passed": vf > len(v) / 2}})
+                results.append({{"name": n, "votes_for": vf, "total": len(v), "score": vf / len(v)}})
 
-        n_passed = sum(1 for r in results if r["passed"])
-        n_total = len(criteria_names)
-        jury_score = n_passed / max(n_total, 1)
-        passed = jury_score > 0.5
+        # Sample score = mean of per-criterion judge-fractions. No thresholds.
+        scored = [r for r in results if "note" not in r]
+        jury_score = sum(r["score"] for r in scored) / len(scored) if scored else 0.0
 
-        lines = [f"Jury: {{'PASS' if passed else 'FAIL'}} ({{n_passed}}/{{n_total}} criteria)", ""]
+        lines = [f"Jury score: {{jury_score:.2f}} ({{len(scored)}}/{{len(criteria_names)}} criteria graded)", ""]
         for r in results:
-            s = "PASS" if r["passed"] else "FAIL"
             extra = f" - {{r['note']}}" if "note" in r else ""
-            lines.append(f"  {{r['name']}}: {{s}} ({{r['votes_for']}}/{{r['total']}}){{extra}}")
+            lines.append(f"  {{r['name']}}: {{r['score']:.2f}} ({{r['votes_for']}}/{{r['total']}} judges){{extra}}")
         lines += ["", "Judges:"] + details
         if errors:
             lines += ["", "Errors:"] + errors
 
         return Score(
-            value="C" if passed else "I",
+            value=jury_score,
             answer=output[:200],
             explanation="\\n".join(lines),
-            metadata={{"jury_score": jury_score, "criteria_passed": n_passed, "criteria_total": n_total, "criteria_results": results}},
+            metadata={{"jury_score": jury_score, "criteria_results": results}},
         )
 
     return score