Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions backend/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1560,13 +1560,15 @@ async def delete_judge(
# ============== Inspect AI Viewer ==============

from backend.api.compare import router as compare_router
from backend.api.optimizations import router as optimizations_router
from backend.core.inspect_viewer import create_viewer_app, get_viewer_dist_directory
from inspect_ai._view.fastapi_server import _InspectStaticFiles
from inspect_ai._util.file import filesystem
from inspect_ai._view._dist import resolve_dist_directory

# Mount comparison API before the Inspect viewer (include_router takes priority over mount)
app.include_router(compare_router, prefix="/api/compare")
# Prompt-optimization endpoints are registered the same way, under /api/optimizations.
app.include_router(optimizations_router, prefix="/api/optimizations")

# Log directory resolution order: INSPECT_LOG_DIR, then USER_STORAGE_BASE,
# then the literal default "backend/users".
_log_dir = os.environ.get("INSPECT_LOG_DIR", os.environ.get("USER_STORAGE_BASE", "backend/users"))
# Filesystem handle for the resolved log directory (inspect_ai helper).
_fs = filesystem(_log_dir)
Expand Down
61 changes: 61 additions & 0 deletions backend/api/optimizations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""HTTP API for the Prompts Optimized tab.

Thin wrapper over the persistence layer added in
``eval_mcp/core/user_storage.py``. The frontend hits these endpoints to
populate the list rail and detail pane — both pages mirror the eval
``/api/compare/{groups,detail}`` shape so the page-level state
management stays consistent.
"""

import logging

from fastapi import APIRouter, Depends, HTTPException, Request

from eval_mcp.core.user_storage import (
get_optimization_from_db,
list_optimizations_from_db,
)

logger = logging.getLogger(__name__)

router = APIRouter()


async def _get_user_id(request: Request) -> str:
    """Resolve the calling user from the ``X-Forwarded-User`` request header.

    Same auth shim used by /api/compare — the header is expected to be set
    by the cognito proxy in front of the API.

    Raises:
        HTTPException: 401 when the header is missing or empty.
    """
    forwarded_user = request.headers.get("X-Forwarded-User")
    if forwarded_user:
        return forwarded_user
    raise HTTPException(status_code=401, detail="Not authenticated")


@router.get("/list")
async def list_optimizations(
    search: str = "",
    user_id: str = Depends(_get_user_id),
):
    """Return optimization-run summary rows, newest-first.

    Each entry is a compact summary (id, dataset, judge, providers,
    winner_iter, winner_test_score, iterations_run, status, created_at).
    Full details come from /detail.

    Args:
        search: Optional substring filter forwarded to the storage layer.
        user_id: Resolved from the request by the ``_get_user_id`` dependency.

    Returns:
        ``{"optimizations": [...]}`` with one summary dict per run.
    """
    # Function-scope import keeps this handler self-contained.
    import asyncio

    # The storage call is synchronous (local files or S3). Running it on a
    # worker thread keeps this coroutine from blocking the event loop.
    rows = await asyncio.to_thread(
        list_optimizations_from_db, user_id, search_term=search
    )
    return {"optimizations": rows}


@router.get("/detail")
async def get_optimization_detail(
    id: str,  # public query-parameter name; kept as-is for API compatibility
    user_id: str = Depends(_get_user_id),
):
    """Return the full optimization record by ID.

    Includes per-iteration history (prompt text + train pass rate),
    test scores per iter, rationales, and metadata. The frontend
    renders the chart and prompt diff from this single payload.

    Args:
        id: Optimization id, as returned by /list.
        user_id: Resolved from the request by the ``_get_user_id`` dependency.

    Raises:
        HTTPException: 404 when no optimization record exists for ``id``.
    """
    # Function-scope import keeps this handler self-contained.
    import asyncio

    # The storage call is synchronous (local files or S3). Running it on a
    # worker thread keeps this coroutine from blocking the event loop.
    record = await asyncio.to_thread(get_optimization_from_db, user_id, id)
    if not record:
        raise HTTPException(status_code=404, detail="Optimization not found")
    return record
101 changes: 101 additions & 0 deletions eval_mcp/core/user_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,107 @@ def delete_dataset_from_db(user_id: str, dataset_id: str) -> bool:
return False


# ---------------------------------------------------------------------------
# Optimizations — prompt-optimizer runs persist a single JSON record linking
# the per-iteration eval config names + the best prompt. Each iteration is
# still a real eval in its own .eval log, so list_evaluations keeps working;
# this layer just lets the optimization tab group them.
# ---------------------------------------------------------------------------


def save_optimization_to_db(user_id: str, record: dict[str, Any]) -> str:
    """Persist an optimization run and return its id.

    ``record`` is the full plan-shaped dict (id, dataset, judge, providers,
    initial_prompt, winner_prompt, history, test_results, created_at).

    The dict is updated in place before it is written: ``id`` is filled in
    (generated when missing or empty), ``type`` is set to ``"optimization"``,
    ``created_at`` is added only when absent, and ``updated_at`` is always
    refreshed. Timestamps are epoch milliseconds.

    Args:
        user_id: Owner of the record; selects the per-user store.
        record: Optimization payload; mutated as described above.

    Returns:
        The optimization id the record was stored under.
    """
    optimization_id = record.get("id") or _generate_id("opt")
    # Read the clock once so a brand-new record gets identical created_at
    # and updated_at values instead of two slightly different ones.
    now_ms = int(datetime.now().timestamp() * 1000)

    record["id"] = optimization_id
    record["type"] = "optimization"
    record.setdefault("created_at", now_ms)
    record["updated_at"] = now_ms

    filename = f"{optimization_id}.json"
    if _s3_enabled():
        _s3_save_json(user_id, "optimizations", filename, record)
    else:
        store_dir = _get_json_store_dir(user_id, "optimizations")
        _save_json_file(store_dir / filename, record, user_id)

    return optimization_id


def get_optimization_from_db(
    user_id: str, optimization_id: str
) -> Optional[dict[str, Any]]:
    """Load one optimization record, or ``None`` if it cannot be found.

    Args:
        user_id: Owner of the record; selects the per-user store.
        optimization_id: Id of the run. May come straight from a request
            parameter, so it is treated as untrusted.

    Returns:
        The stored dict when it exists and is tagged
        ``type == "optimization"``; otherwise ``None``. On the local
        filesystem store, an id that is empty or contains a path component
        is treated as "not found".
    """
    if _s3_enabled():
        data = _s3_load_json(user_id, "optimizations", f"{optimization_id}.json")
    else:
        # The id is interpolated into a file path. Reject anything that is
        # not a bare file name so "../" style ids cannot read files outside
        # the store directory. Same basename rule as
        # delete_optimization_from_db, but reported as "not found" so read
        # callers keep their None contract.
        if not optimization_id or os.path.basename(optimization_id) != optimization_id:
            return None
        store_dir = _get_json_store_dir(user_id, "optimizations")
        data = _load_json_file(store_dir / f"{optimization_id}.json")

    # Guard against JSON files whose top-level value is not an object.
    if isinstance(data, dict) and data.get("type") == "optimization":
        return data
    return None


def list_optimizations_from_db(
    user_id: str, search_term: str = ""
) -> list[dict[str, Any]]:
    """Return optimization summary rows newest-first.

    Filters by case-insensitive substring match on dataset /
    initial_prompt / winner_prompt when ``search_term`` is non-empty.

    Malformed entries (not a dict, wrong ``type``, or missing ``id``) are
    skipped so that one bad record cannot break the whole listing.

    Args:
        user_id: Owner whose optimization store is listed.
        search_term: Optional filter; surrounding whitespace is ignored.

    Returns:
        One summary dict per run with keys id, dataset, judge, providers,
        winner_iter, winner_test_score, iterations_run, status, created_at,
        sorted by ``created_at`` descending (missing values sort last).
    """
    if _s3_enabled():
        entries = _s3_list_json(user_id, "optimizations")
    else:
        store_dir = _get_json_store_dir(user_id, "optimizations")
        entries = _list_json_files(store_dir)

    needle = (search_term or "").lower().strip()
    searchable_fields = ("dataset", "initial_prompt", "winner_prompt")

    out: list[dict[str, Any]] = []
    for entry in entries:
        if not isinstance(entry, dict) or entry.get("type") != "optimization":
            continue
        optimization_id = entry.get("id")
        if not optimization_id:
            # A row without an id could never be opened via the detail view.
            continue
        if needle:
            # `or ""` keeps a stored null from becoming the text "None",
            # which would otherwise match searches such as "none".
            haystack = " ".join(
                str(entry.get(field) or "") for field in searchable_fields
            ).lower()
            if needle not in haystack:
                continue
        out.append({
            "id": optimization_id,
            "dataset": entry.get("dataset"),
            "judge": entry.get("judge"),
            "providers": entry.get("providers", []),
            "winner_iter": entry.get("winner_iter"),
            "winner_test_score": entry.get("winner_test_score"),
            # `or []` tolerates a history key that is present but null.
            "iterations_run": len(entry.get("history") or []),
            "status": entry.get("status", "complete"),
            "created_at": entry.get("created_at"),
        })
    out.sort(key=lambda r: r.get("created_at") or 0, reverse=True)
    return out


def delete_optimization_from_db(user_id: str, optimization_id: str) -> bool:
    """Delete one optimization record.

    Args:
        user_id: Owner of the record; selects the per-user store.
        optimization_id: Id of the run to remove.

    Returns:
        For the S3 store, whatever ``_s3_delete_json`` returns. For the
        local store, ``True`` when a file was removed and ``False`` when
        there was nothing to remove.

    Raises:
        ValueError: Local store only — the id is empty, contains a path
            component, or resolves outside the user base directory.
    """
    if _s3_enabled():
        return _s3_delete_json(
            user_id, "optimizations", f"{optimization_id}.json"
        )

    store_dir = _get_json_store_dir(user_id, "optimizations")

    # The id must be a bare file name: no directory parts, not empty.
    safe_id = os.path.basename(optimization_id)
    if not safe_id or safe_id != optimization_id:
        raise ValueError(f"invalid optimization_id: {optimization_id!r}")

    # Second line of defence: after resolving symlinks, the target must
    # still live under the user base directory.
    base_real = os.path.realpath(str(get_user_base_dir()))
    target_real = os.path.realpath(str(store_dir / f"{safe_id}.json"))
    if not target_real.startswith(base_real + os.sep):
        raise ValueError(f"path escape attempt: {target_real}")

    # EAFP instead of exists()-then-unlink(): avoids the race where the file
    # disappears between the check and the delete.
    try:
        os.unlink(target_real)
    except (FileNotFoundError, NotADirectoryError):
        return False
    return True


def update_dataset_in_db(
user_id: str,
dataset_id: str,
Expand Down
113 changes: 113 additions & 0 deletions eval_mcp/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
from eval_mcp.tools.list_judges import handle_list_judges
from eval_mcp.tools.list_evaluations import handle_list_evaluations
from eval_mcp.tools.get_evaluation_details import handle_get_evaluation_details
from eval_mcp.tools.optimize_prompt import handle_optimize_prompt
from eval_mcp.tools.list_optimizations import handle_list_optimizations
from eval_mcp.tools.get_optimization_details import handle_get_optimization_details
from eval_mcp.tools.run_eval import (
handle_run_evaluation,
handle_retry_evaluation,
Expand Down Expand Up @@ -830,6 +833,116 @@ async def get_evaluation_details(
return result[0].text


@mcp.tool(annotations=CREATE_REMOTE)
async def optimize_prompt(
    dataset: DatasetName,
    judge: JudgeName,
    initial_prompt: str = "{question}",
    providers: ProvidersList = None,
    max_iterations: int = 3,
    sample_size: int = 10,
    test_holdout: float = 0.4,
    user_id: str = None,
) -> str:
    """
    Iteratively improve a prompt template against a dataset using
    failure-driven LLM feedback. Analog of skill-creator's run_loop.py.

    Splits the dataset into train / test according to `test_holdout`
    (60% / 40% by default). Each iteration:
      1. Score the current prompt on a random train sample.
      2. Show the optimizer LLM the failures + per-criterion improvement
         notes from the judges.
      3. The optimizer proposes a new prompt — edits, not rewrites, when
         the current prompt is long and structured.
      4. Repeat up to `max_iterations` or until train converges.
    Finally evaluates every attempted prompt on the held-out test set;
    winner is the highest test pass rate. Ties broken by earlier iter.

    Args:
        dataset: Dataset name from list_datasets.
        judge: Judge name from list_judges (provides the criteria).
        initial_prompt: Starting prompt template. MUST contain `{question}`.
            Defaults to `{question}` (pass-through, no wrapping).
        providers: Provider model IDs. Stored for reference; v1 scores
            in-process via the default Bedrock model singleton.
        max_iterations: Hard ceiling on refinement passes (default 3).
        sample_size: Train samples scored per iteration (default 10).
        test_holdout: Fraction of dataset held out for test scoring (default 0.4).

    Returns:
        JSON: optimization_id, winner_iter, winner_test_score, winner_prompt,
        per-iter train pass rates, status.
    """
    # Sync before the run, hand the request to the handler, sync after.
    _auto_pull(user_id)
    payload = dict(
        user_id=_user(user_id),
        dataset=dataset,
        judge=judge,
        initial_prompt=initial_prompt,
        providers=providers or [],  # handler always receives a list
        max_iterations=max_iterations,
        sample_size=sample_size,
        test_holdout=test_holdout,
    )
    contents = await handle_optimize_prompt(bedrock, payload)
    _auto_push(user_id)
    # The handler returns a list of content items; the first carries the text.
    return contents[0].text


@mcp.tool(annotations=READ_LOCAL)
async def list_optimizations(
    user_id: str = None,
    limit: LimitParam = 20,
    offset: OffsetParam = 0,
    search: str = "",
    response_format: ResponseFormat = "json",
) -> str:
    """
    List prompt-optimization runs, newest first.

    Args:
        limit: Page size (default 20).
        offset: Page start (default 0).
        search: Optional substring filter on dataset / initial / winner prompt.
        response_format: "json" (default) or "markdown".

    Returns:
        JSON or markdown listing with pagination metadata.
    """
    _auto_pull(user_id)
    payload = dict(
        user_id=_user(user_id),
        limit=limit,
        offset=offset,
        search=search,
        response_format=response_format,
    )
    contents = await handle_list_optimizations(payload)
    # The handler returns a list of content items; the first carries the text.
    return contents[0].text


@mcp.tool(annotations=READ_LOCAL)
async def get_optimization_details(
    optimization_id: str,
    user_id: str = None,
) -> str:
    """
    Get the full record for a single optimization run.

    Returns: initial_prompt, winner_prompt, winner_iter, per-iteration
    history (prompt text + train pass rate), per-iteration test scores,
    rationales for each proposal, and metadata (dataset, judge,
    providers, status).

    Args:
        optimization_id: ID from list_optimizations.
    """
    _auto_pull(user_id)
    payload = dict(user_id=_user(user_id), optimization_id=optimization_id)
    contents = await handle_get_optimization_details(payload)
    # The handler returns a list of content items; the first carries the text.
    return contents[0].text


@mcp.tool(annotations=RUN_REMOTE)
async def run_evaluation(
configName: ConfigName,
Expand Down
Loading