awslabs · DataGomes · May 19, 2026 · May 19, 2026
diff --git a/backend/api/main.py b/backend/api/main.py
@@ -1560,13 +1560,15 @@ async def delete_judge(
 # ============== Inspect AI Viewer ==============
 
 from backend.api.compare import router as compare_router
+from backend.api.optimizations import router as optimizations_router
 from backend.core.inspect_viewer import create_viewer_app, get_viewer_dist_directory
 from inspect_ai._view.fastapi_server import _InspectStaticFiles
 from inspect_ai._util.file import filesystem
 from inspect_ai._view._dist import resolve_dist_directory
 
 # Mount comparison API before the Inspect viewer (include_router takes priority over mount)
 app.include_router(compare_router, prefix="/api/compare")
+app.include_router(optimizations_router, prefix="/api/optimizations")
 
 _log_dir = os.environ.get("INSPECT_LOG_DIR", os.environ.get("USER_STORAGE_BASE", "backend/users"))
 _fs = filesystem(_log_dir)

diff --git a/backend/api/optimizations.py b/backend/api/optimizations.py
@@ -0,0 +1,61 @@
+"""HTTP API for the Prompts Optimized tab.
+
+Thin wrapper over the persistence layer added in
+``eval_mcp/core/user_storage.py``. The frontend hits these endpoints to
+populate the list rail and detail pane — both pages mirror the eval
+``/api/compare/{groups,detail}`` shape so the page-level state
+management stays consistent.
+"""
+
+import logging
+
+from fastapi import APIRouter, Depends, HTTPException, Request
+from fastapi.concurrency import run_in_threadpool
+
+from eval_mcp.core.user_storage import (
+    get_optimization_from_db,
+    list_optimizations_from_db,
+)
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter()
+
+
+async def _get_user_id(request: Request) -> str:
+    """Resolve the calling user's id from the ``X-Forwarded-User`` header.
+
+    Same auth shim used by /api/compare — the header is expected to be set
+    by the cognito proxy in front of the API.
+
+    Raises:
+        HTTPException: 401 when the header is missing or empty.
+    """
+    forwarded_user = request.headers.get("X-Forwarded-User")
+    if forwarded_user:
+        return forwarded_user
+    raise HTTPException(status_code=401, detail="Not authenticated")
+
+
+@router.get("/list")
+async def list_optimizations(
+    search: str = "",
+    user_id: str = Depends(_get_user_id),
+):
+    """Return the caller's optimization-run summary rows, newest-first.
+
+    Each entry is a compact summary (id, dataset, judge, providers,
+    winner_iter, winner_test_score, iterations_run, status, created_at).
+    Full details come from /detail.
+
+    Args:
+        search: Optional case-insensitive substring filter, forwarded to the
+            storage layer as ``search_term``. Empty means "no filter".
+        user_id: Injected by the ``_get_user_id`` dependency.
+
+    Returns:
+        ``{"optimizations": [...]}`` with one summary dict per run.
+    """
+    # list_optimizations_from_db is synchronous (local disk or S3 I/O).
+    # Calling it directly inside this coroutine would stall the event loop
+    # for every other request, so hand it to the worker threadpool.
+    rows = await run_in_threadpool(
+        list_optimizations_from_db, user_id, search_term=search
+    )
+    return {"optimizations": rows}
+
+
+@router.get("/detail")
+async def get_optimization_detail(
+    id: str,
+    user_id: str = Depends(_get_user_id),
+):
+    """Return the full optimization record by ID.
+
+    Includes per-iteration history (prompt text + train pass rate),
+    test scores per iter, rationales, and metadata. The frontend
+    renders the chart and prompt diff from this single payload.
+
+    Args:
+        id: Optimization id, as returned by /list. The name shadows the
+            builtin on purpose: it is the public query-parameter name.
+        user_id: Injected by the ``_get_user_id`` dependency.
+
+    Raises:
+        HTTPException: 404 when the storage layer returns no record.
+    """
+    # get_optimization_from_db is synchronous (local disk or S3 I/O); run it
+    # in the worker threadpool so this coroutine does not block the loop.
+    record = await run_in_threadpool(get_optimization_from_db, user_id, id)
+    if not record:
+        raise HTTPException(status_code=404, detail="Optimization not found")
+    return record
diff --git a/eval_mcp/core/user_storage.py b/eval_mcp/core/user_storage.py
@@ -610,6 +610,107 @@ def delete_dataset_from_db(user_id: str, dataset_id: str) -> bool:
     return False
 
 
+# ---------------------------------------------------------------------------
+# Optimizations — prompt-optimizer runs persist a single JSON record linking
+# the per-iteration eval config names + the best prompt. Each iteration is
+# still a real eval in its own .eval log, so list_evaluations keeps working;
+# this layer just lets the optimization tab group them.
+# ---------------------------------------------------------------------------
+
+
+def save_optimization_to_db(user_id: str, record: dict[str, Any]) -> str:
+    """Persist an optimization run and return its id.
+
+    ``record`` is the full plan-shaped dict (id, dataset, judge, providers,
+    initial_prompt, winner_prompt, history, test_results, created_at). It is
+    updated in place: ``id`` is filled in when missing or empty, ``type`` is
+    set to ``"optimization"``, ``created_at`` is added only when absent, and
+    ``updated_at`` is always refreshed. Both timestamps are epoch
+    milliseconds.
+
+    The record is written as ``<id>.json`` in the user's ``optimizations``
+    store: S3 when ``_s3_enabled()``, otherwise the local JSON store dir.
+    """
+    # Read the clock once so a brand-new record gets identical created_at
+    # and updated_at values rather than two slightly different readings.
+    now_ms = int(datetime.now().timestamp() * 1000)
+
+    optimization_id = record.get("id") or _generate_id("opt")
+    record["id"] = optimization_id
+    record["type"] = "optimization"
+    record.setdefault("created_at", now_ms)
+    record["updated_at"] = now_ms
+
+    file_name = f"{optimization_id}.json"
+    if _s3_enabled():
+        _s3_save_json(user_id, "optimizations", file_name, record)
+    else:
+        store_dir = _get_json_store_dir(user_id, "optimizations")
+        _save_json_file(store_dir / file_name, record, user_id)
+
+    return optimization_id
+
+
+def get_optimization_from_db(
+    user_id: str, optimization_id: str
+) -> Optional[dict[str, Any]]:
+    """Load one optimization record belonging to ``user_id``.
+
+    ``optimization_id`` is interpolated into the storage file name / S3 key,
+    so it must be a bare name with no directory components.
+
+    Returns:
+        The stored record, or ``None`` when the id is empty or contains a
+        path component, nothing is stored under it, or the stored JSON is
+        not tagged ``type == "optimization"``.
+    """
+    # Apply the same bare-name rule delete_optimization_from_db enforces.
+    # Without it an id such as "../../<other-user>/optimizations/x" would be
+    # joined onto the store path and could read outside this user's store.
+    # Such an id can never name a saved record, so report "not found".
+    if not optimization_id or os.path.basename(optimization_id) != optimization_id:
+        return None
+
+    file_name = f"{optimization_id}.json"
+    if _s3_enabled():
+        data = _s3_load_json(user_id, "optimizations", file_name)
+    else:
+        store_dir = _get_json_store_dir(user_id, "optimizations")
+        data = _load_json_file(store_dir / file_name)
+
+    if data and data.get("type") == "optimization":
+        return data
+    return None
+
+
+def list_optimizations_from_db(
+    user_id: str, search_term: str = ""
+) -> list[dict[str, Any]]:
+    """Return optimization summary rows for ``user_id``, newest-first.
+
+    When ``search_term`` is non-empty (after stripping), only records whose
+    dataset / initial_prompt / winner_prompt contain it as a
+    case-insensitive substring are kept.
+
+    Records that are not tagged ``type == "optimization"`` or that carry no
+    ``id`` are skipped, so a single malformed file cannot break the listing.
+
+    Returns:
+        One dict per run with keys id, dataset, judge, providers,
+        winner_iter, winner_test_score, iterations_run, status, created_at.
+        Rows are sorted by ``created_at`` descending; a missing
+        ``created_at`` sorts as 0.
+    """
+    if _s3_enabled():
+        entries = _s3_list_json(user_id, "optimizations")
+    else:
+        store_dir = _get_json_store_dir(user_id, "optimizations")
+        entries = _list_json_files(store_dir)
+
+    # Tolerate an explicit None so callers forwarding an optional filter
+    # do not crash on .lower().
+    needle = (search_term or "").lower().strip()
+    searched_fields = ("dataset", "initial_prompt", "winner_prompt")
+
+    out: list[dict[str, Any]] = []
+    for entry in entries:
+        if entry.get("type") != "optimization":
+            continue
+        if not entry.get("id"):
+            # Every saved record gets an id; one without it is malformed.
+            continue
+        if needle:
+            haystack = " ".join(
+                str(entry.get(field, "")) for field in searched_fields
+            ).lower()
+            if needle not in haystack:
+                continue
+        out.append({
+            "id": entry["id"],
+            "dataset": entry.get("dataset"),
+            "judge": entry.get("judge"),
+            "providers": entry.get("providers", []),
+            "winner_iter": entry.get("winner_iter"),
+            "winner_test_score": entry.get("winner_test_score"),
+            # "history" may be present but null (e.g. a run with no
+            # iterations recorded); count that as zero instead of raising.
+            "iterations_run": len(entry.get("history") or []),
+            "status": entry.get("status", "complete"),
+            "created_at": entry.get("created_at"),
+        })
+    out.sort(key=lambda r: r.get("created_at") or 0, reverse=True)
+    return out
+
+
+def delete_optimization_from_db(user_id: str, optimization_id: str) -> bool:
+    """Delete one optimization record belonging to ``user_id``.
+
+    Returns:
+        On the local backend, ``True`` when a file was removed and ``False``
+        when none existed. On S3, whatever ``_s3_delete_json`` returns.
+
+    Raises:
+        ValueError: when ``optimization_id`` is empty or contains a path
+            component, or when the resolved local path falls outside the
+            directory returned by ``get_user_base_dir()``.
+    """
+    # Validate before choosing a backend: the id is interpolated into the
+    # S3 key as well as the local file name, so both paths need the guard.
+    safe_id = os.path.basename(optimization_id)
+    if not safe_id or safe_id != optimization_id:
+        raise ValueError(f"invalid optimization_id: {optimization_id!r}")
+
+    if _s3_enabled():
+        return _s3_delete_json(user_id, "optimizations", f"{safe_id}.json")
+
+    store_dir = _get_json_store_dir(user_id, "optimizations")
+    base_real = os.path.realpath(str(get_user_base_dir()))
+    target_real = os.path.realpath(str(store_dir / f"{safe_id}.json"))
+    # Second line of defence: after symlinks are resolved the target must
+    # still sit under the base directory.
+    if not target_real.startswith(base_real + os.sep):
+        raise ValueError(f"path escape attempt: {target_real}")
+
+    # EAFP instead of exists()+unlink(): no window in which a concurrent
+    # delete could turn "already gone" into an unhandled error.
+    try:
+        os.unlink(target_real)
+    except FileNotFoundError:
+        return False
+    return True
+
+
 def update_dataset_in_db(
     user_id: str,
     dataset_id: str,

diff --git a/eval_mcp/server.py b/eval_mcp/server.py
@@ -33,6 +33,9 @@
 from eval_mcp.tools.list_judges import handle_list_judges
 from eval_mcp.tools.list_evaluations import handle_list_evaluations
 from eval_mcp.tools.get_evaluation_details import handle_get_evaluation_details
+from eval_mcp.tools.optimize_prompt import handle_optimize_prompt
+from eval_mcp.tools.list_optimizations import handle_list_optimizations
+from eval_mcp.tools.get_optimization_details import handle_get_optimization_details
 from eval_mcp.tools.run_eval import (
     handle_run_evaluation,
     handle_retry_evaluation,
@@ -830,6 +833,116 @@ async def get_evaluation_details(
     return result[0].text
 
 
+@mcp.tool(annotations=CREATE_REMOTE)
+async def optimize_prompt(
+    dataset: DatasetName,
+    judge: JudgeName,
+    initial_prompt: str = "{question}",
+    providers: ProvidersList = None,
+    max_iterations: int = 3,
+    sample_size: int = 10,
+    test_holdout: float = 0.4,
+    user_id: str = None,
+) -> str:
+    """
+    Iteratively refine a prompt template against a dataset, driven by
+    LLM feedback on its failures. Analog of skill-creator's run_loop.py.
+
+    The dataset is split into train / test, with `test_holdout` of it held
+    out for testing (40% by default, leaving 60% for training). Each
+    iteration then:
+      1. Scores the current prompt on a random train sample.
+      2. Shows the optimizer LLM the failures + per-criterion improvement
+         notes from the judges.
+      3. Lets the optimizer propose a new prompt — edits, not rewrites,
+         when the current prompt is long and structured.
+      4. Repeats up to `max_iterations` or until train converges.
+    Every attempted prompt is finally evaluated on the held-out test set;
+    the winner is the highest test pass rate, ties broken by earlier iter.
+
+    Args:
+        dataset: Dataset name from list_datasets.
+        judge: Judge name from list_judges (provides the criteria).
+        initial_prompt: Starting prompt template. MUST contain `{question}`.
+            Defaults to `{question}` (pass-through, no wrapping).
+        providers: Provider model IDs. Stored for reference; v1 scores
+            in-process via the default Bedrock model singleton.
+        max_iterations: Hard ceiling on refinement passes (default 3).
+        sample_size: Train samples scored per iteration (default 10).
+        test_holdout: Fraction of dataset held out for test scoring (default 0.4).
+
+    Returns:
+        JSON: optimization_id, winner_iter, winner_test_score, winner_prompt,
+        per-iter train pass rates, status.
+    """
+    # Sync the user's storage before the run and again once it has finished.
+    _auto_pull(user_id)
+    content = await handle_optimize_prompt(
+        bedrock,
+        {
+            "user_id": _user(user_id),
+            "dataset": dataset,
+            "judge": judge,
+            "initial_prompt": initial_prompt,
+            # Normalise "no providers given" to an empty list for the handler.
+            "providers": providers or [],
+            "max_iterations": max_iterations,
+            "sample_size": sample_size,
+            "test_holdout": test_holdout,
+        },
+    )
+    _auto_push(user_id)
+    first_block = content[0]
+    return first_block.text
+
+
+@mcp.tool(annotations=READ_LOCAL)
+async def list_optimizations(
+    user_id: str = None,
+    limit: LimitParam = 20,
+    offset: OffsetParam = 0,
+    search: str = "",
+    response_format: ResponseFormat = "json",
+) -> str:
+    """
+    Page through prompt-optimization runs, newest first.
+
+    Args:
+        limit: Page size (default 20).
+        offset: Page start (default 0).
+        search: Optional substring filter on dataset / initial / winner prompt.
+        response_format: "json" (default) or "markdown".
+
+    Returns:
+        JSON or markdown listing with pagination metadata.
+    """
+    # Read-only tool: pull the latest state first, nothing to push afterwards.
+    _auto_pull(user_id)
+    content = await handle_list_optimizations(
+        {
+            "user_id": _user(user_id),
+            "limit": limit,
+            "offset": offset,
+            "search": search,
+            "response_format": response_format,
+        }
+    )
+    first_block = content[0]
+    return first_block.text
+
+
+@mcp.tool(annotations=READ_LOCAL)
+async def get_optimization_details(
+    optimization_id: str,
+    user_id: str = None,
+) -> str:
+    """
+    Fetch the complete record of one optimization run.
+
+    The payload carries initial_prompt, winner_prompt, winner_iter, the
+    per-iteration history (prompt text + train pass rate), per-iteration
+    test scores, the rationale behind each proposal, and metadata
+    (dataset, judge, providers, status).
+
+    Args:
+        optimization_id: ID from list_optimizations.
+    """
+    # Read-only tool: pull the latest state first, nothing to push afterwards.
+    _auto_pull(user_id)
+    content = await handle_get_optimization_details(
+        {"user_id": _user(user_id), "optimization_id": optimization_id}
+    )
+    first_block = content[0]
+    return first_block.text
+
+
 @mcp.tool(annotations=RUN_REMOTE)
 async def run_evaluation(
     configName: ConfigName,