Skip to content

Commit d7eec33

Browse files
authored
Merge pull request #54 from awslabs/feat/mcp-builder-optimizations
Apply mcp-builder best practices + add phase 4 evaluation
2 parents a32ffbf + 134c788 commit d7eec33

9 files changed

Lines changed: 1281 additions & 190 deletions

File tree

eval_mcp/server.py

Lines changed: 335 additions & 113 deletions
Large diffs are not rendered by default.

eval_mcp/tools/list_datasets.py

Lines changed: 72 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,45 @@
88
from eval_mcp.core.user_storage import list_datasets_from_db
99

1010

11+
def _dataset_preview(dataset: Dict[str, Any]) -> Dict[str, Any]:
12+
tests = dataset.get("tests", [])
13+
num_samples = dataset.get("num_samples", len(tests) if isinstance(tests, list) else 0)
14+
preview = ""
15+
if isinstance(tests, list) and len(tests) > 0:
16+
first_item = tests[0]
17+
if isinstance(first_item, dict):
18+
preview = (
19+
first_item.get("vars", {}).get("question")
20+
or first_item.get("question")
21+
or str(first_item)[:100]
22+
)
23+
preview = preview[:100]
24+
if len(preview) == 100:
25+
preview += "..."
26+
return {
27+
"id": dataset.get("id"),
28+
"name": dataset.get("name"),
29+
"num_samples": num_samples,
30+
"preview": preview or None,
31+
}
32+
33+
1134
async def handle_list_datasets(args: Dict[str, Any]) -> List[TextContent]:
12-
"""Handle list_datasets tool call.
35+
"""List datasets with pagination and optional JSON format.
1336
14-
Lists all datasets from the user's database.
15-
Returns details about each dataset including number of samples and preview.
37+
Args (from `args` dict):
38+
user_id: required
39+
searchTerm: optional case-insensitive name filter
40+
limit: page size, default 20
41+
offset: page start, default 0
42+
response_format: "markdown" (default) or "json"
1643
"""
1744
try:
18-
# Get required user_id and optional search filter
1945
user_id = args.get("user_id")
2046
search_term = (args.get("searchTerm") or "").lower()
47+
limit = max(1, int(args.get("limit", 20) or 20))
48+
offset = max(0, int(args.get("offset", 0) or 0))
49+
response_format = (args.get("response_format") or "markdown").lower()
2150

2251
if not user_id:
2352
return [
@@ -27,38 +56,48 @@ async def handle_list_datasets(args: Dict[str, Any]) -> List[TextContent]:
2756
)
2857
]
2958

30-
# Get datasets from database
31-
datasets = list_datasets_from_db(user_id, search_term)
59+
all_datasets = list_datasets_from_db(user_id, search_term)
60+
total = len(all_datasets)
61+
page = all_datasets[offset : offset + limit]
62+
has_more = offset + len(page) < total
63+
next_offset = offset + len(page) if has_more else None
3264

33-
if not datasets:
34-
msg = f"No datasets found matching '{search_term}'" if search_term else "No datasets found. Create your first dataset with save_dataset."
35-
return [TextContent(type="text", text=msg)]
36-
37-
# Format output
38-
output = f"Found {len(datasets)} dataset(s):\n\n"
39-
40-
for dataset in datasets:
41-
tests = dataset.get("tests", [])
42-
num_samples = dataset.get("num_samples", len(tests) if isinstance(tests, list) else 0)
65+
if response_format == "json":
66+
return [
67+
TextContent(
68+
type="text",
69+
text=json.dumps(
70+
{
71+
"success": True,
72+
"total": total,
73+
"count": len(page),
74+
"offset": offset,
75+
"has_more": has_more,
76+
"next_offset": next_offset,
77+
"items": [_dataset_preview(d) for d in page],
78+
},
79+
indent=2,
80+
),
81+
)
82+
]
4383

44-
# Get first question as preview
45-
preview = ""
46-
if isinstance(tests, list) and len(tests) > 0:
47-
first_item = tests[0]
48-
if isinstance(first_item, dict):
49-
preview = (
50-
first_item.get("vars", {}).get("question")
51-
or first_item.get("question")
52-
or str(first_item)[:100]
53-
)
54-
preview = preview[:100]
55-
if len(preview) == 100:
56-
preview += "..."
84+
if not all_datasets:
85+
msg = (
86+
f"No datasets found matching '{search_term}'"
87+
if search_term
88+
else "No datasets found. Create your first dataset with save_dataset."
89+
)
90+
return [TextContent(type="text", text=msg)]
5791

58-
output += f"📊 **{dataset['name']}**\n"
59-
output += f" ID: {dataset['id'][:16]}...\n"
60-
output += f" Samples: {num_samples}\n"
61-
output += f" Preview: {preview or 'No preview available'}\n\n"
92+
output = f"Found {total} dataset(s) — showing {offset + 1}-{offset + len(page)}:\n\n"
93+
for dataset in page:
94+
p = _dataset_preview(dataset)
95+
output += f"📊 **{p['name']}**\n"
96+
output += f" ID: {(p['id'] or '')[:16]}...\n"
97+
output += f" Samples: {p['num_samples']}\n"
98+
output += f" Preview: {p['preview'] or 'No preview available'}\n\n"
99+
if has_more:
100+
output += f"More available — pass offset={next_offset} to see the next page.\n"
62101

63102
return [TextContent(type="text", text=output)]
64103

eval_mcp/tools/list_evaluations.py

Lines changed: 62 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,19 @@
1313

1414

1515
async def handle_list_evaluations(args: Dict[str, Any]) -> List[TextContent]:
16-
"""List evaluations by reading .eval log files from the user's logs directory."""
16+
"""List evaluations with pagination and optional markdown format.
17+
18+
Args (from `args` dict):
19+
user_id: required
20+
limit: page size, default 20
21+
offset: page start, default 0
22+
response_format: "json" (default — eval payloads are heavy) or "markdown"
23+
"""
1724
try:
1825
user_id = args.get("user_id")
19-
limit = args.get("limit", 20)
26+
limit = max(1, int(args.get("limit", 20) or 20))
27+
offset = max(0, int(args.get("offset", 0) or 0))
28+
response_format = (args.get("response_format") or "json").lower()
2029

2130
if not user_id:
2231
return [
@@ -30,16 +39,26 @@ async def handle_list_evaluations(args: Dict[str, Any]) -> List[TextContent]:
3039
eval_log_infos = await list_eval_logs_async(log_dir)
3140

3241
if not eval_log_infos:
33-
return [
34-
TextContent(
35-
type="text",
36-
text=json.dumps({
37-
"success": True,
38-
"evaluations": [],
39-
"message": "No evaluations found. Run an evaluation first.",
40-
}),
41-
)
42-
]
42+
empty_text = (
43+
json.dumps({
44+
"success": True,
45+
"total": 0,
46+
"count": 0,
47+
"offset": offset,
48+
"has_more": False,
49+
"next_offset": None,
50+
"evaluations": [],
51+
"message": "No evaluations found. Run an evaluation first.",
52+
})
53+
if response_format == "json"
54+
else "No evaluations found. Run an evaluation first."
55+
)
56+
return [TextContent(type="text", text=empty_text)]
57+
58+
total = len(eval_log_infos)
59+
page_infos = eval_log_infos[offset : offset + limit]
60+
has_more = offset + len(page_infos) < total
61+
next_offset = offset + len(page_infos) if has_more else None
4362

4463
# Cache pre-computed details per group so multi-model groups only
4564
# deserialize once (same summary UI/PDF consume).
@@ -54,7 +73,7 @@ def _detail_for(run_id: str) -> Dict[str, Any] | None:
5473
return detail_cache[run_id]
5574

5675
evaluations = []
57-
for info in eval_log_infos[:limit]:
76+
for info in page_infos:
5877
try:
5978
log = await read_eval_log_async(info.name, header_only=True)
6079

@@ -97,11 +116,36 @@ def _detail_for(run_id: str) -> Dict[str, Any] | None:
97116
except Exception:
98117
continue
99118

100-
return [TextContent(type="text", text=json.dumps({
101-
"success": True,
102-
"evaluations": evaluations,
103-
"total": len(evaluations),
104-
}, indent=2))]
119+
if response_format == "markdown":
120+
output = f"Found {total} evaluation(s) — showing {offset + 1}-{offset + len(evaluations)}:\n\n"
121+
for e in evaluations:
122+
overall = e["score"]["metrics"].get("overall")
123+
overall_str = f"{overall:.2f}" if isinstance(overall, (int, float)) else "—"
124+
output += f"🧪 **{e['task']}** ({e['model']})\n"
125+
output += f" ID: {e['id']}\n"
126+
output += f" Status: {e['status']} · Samples: {e['totalSamples']} · Overall: {overall_str}\n"
127+
output += f" Created: {e['createdAt']}\n\n"
128+
if has_more:
129+
output += f"More available — pass offset={next_offset} to see the next page.\n"
130+
return [TextContent(type="text", text=output)]
131+
132+
return [
133+
TextContent(
134+
type="text",
135+
text=json.dumps(
136+
{
137+
"success": True,
138+
"total": total,
139+
"count": len(evaluations),
140+
"offset": offset,
141+
"has_more": has_more,
142+
"next_offset": next_offset,
143+
"evaluations": evaluations,
144+
},
145+
indent=2,
146+
),
147+
)
148+
]
105149

106150
except Exception as e:
107151
return [

eval_mcp/tools/list_judges.py

Lines changed: 65 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,34 @@
88
from eval_mcp.core.user_storage import list_judges_from_db
99

1010

11+
def _judge_summary(judge: Dict[str, Any]) -> Dict[str, Any]:
12+
config = judge.get("config") or {}
13+
criteria = config.get("criteria") or []
14+
return {
15+
"id": judge.get("id"),
16+
"name": judge.get("name"),
17+
"domain": config.get("domain", "unknown"),
18+
"criteria_count": len(criteria),
19+
"criteria_names": [c.get("name", "") for c in criteria],
20+
}
21+
22+
1123
async def handle_list_judges(args: Dict[str, Any]) -> List[TextContent]:
12-
"""Handle list_judges tool call.
24+
"""List LLM judges with pagination and optional JSON format.
1325
14-
Lists all LLM judges from the user's database.
15-
Returns details about each judge including domain and criteria.
26+
Args (from `args` dict):
27+
user_id: required
28+
searchTerm: optional case-insensitive name filter
29+
limit: page size, default 20
30+
offset: page start, default 0
31+
response_format: "markdown" (default) or "json"
1632
"""
1733
try:
18-
# Get required user_id and optional search filter
1934
user_id = args.get("user_id")
2035
search_term = (args.get("searchTerm") or "").lower()
36+
limit = max(1, int(args.get("limit", 20) or 20))
37+
offset = max(0, int(args.get("offset", 0) or 0))
38+
response_format = (args.get("response_format") or "markdown").lower()
2139

2240
if not user_id:
2341
return [
@@ -27,31 +45,52 @@ async def handle_list_judges(args: Dict[str, Any]) -> List[TextContent]:
2745
)
2846
]
2947

30-
# Get judges from database
31-
judges = list_judges_from_db(user_id, search_term)
32-
33-
if not judges:
34-
msg = f"No judges found matching '{search_term}'" if search_term else "No judges found. Create your first judge with generate_judge."
35-
return [TextContent(type="text", text=msg)]
36-
37-
# Format output
38-
output = f"Found {len(judges)} judge(s):\n\n"
48+
all_judges = list_judges_from_db(user_id, search_term)
49+
total = len(all_judges)
50+
page = all_judges[offset : offset + limit]
51+
has_more = offset + len(page) < total
52+
next_offset = offset + len(page) if has_more else None
3953

40-
for judge in judges:
41-
config = judge["config"]
42-
domain = config.get("domain", "unknown")
43-
criteria = config.get("criteria", [])
44-
criteria_count = len(criteria)
54+
if response_format == "json":
55+
return [
56+
TextContent(
57+
type="text",
58+
text=json.dumps(
59+
{
60+
"success": True,
61+
"total": total,
62+
"count": len(page),
63+
"offset": offset,
64+
"has_more": has_more,
65+
"next_offset": next_offset,
66+
"items": [_judge_summary(j) for j in page],
67+
},
68+
indent=2,
69+
),
70+
)
71+
]
4572

46-
# Format criteria preview
47-
criteria_preview = ", ".join([c.get("name", "") for c in criteria[:3]])
48-
if len(criteria) > 3:
49-
criteria_preview += f" (+{len(criteria) - 3} more)"
73+
if not all_judges:
74+
msg = (
75+
f"No judges found matching '{search_term}'"
76+
if search_term
77+
else "No judges found. Create your first judge with generate_judge."
78+
)
79+
return [TextContent(type="text", text=msg)]
5080

51-
output += f"⚖️ **{judge['name']}**\n"
52-
output += f" ID: {judge['id']}\n"
53-
output += f" Domain: {domain}\n"
54-
output += f" Criteria ({criteria_count}): {criteria_preview}\n\n"
81+
output = f"Found {total} judge(s) — showing {offset + 1}-{offset + len(page)}:\n\n"
82+
for judge in page:
83+
s = _judge_summary(judge)
84+
names = s["criteria_names"]
85+
criteria_preview = ", ".join(names[:3])
86+
if len(names) > 3:
87+
criteria_preview += f" (+{len(names) - 3} more)"
88+
output += f"⚖️ **{s['name']}**\n"
89+
output += f" ID: {s['id']}\n"
90+
output += f" Domain: {s['domain']}\n"
91+
output += f" Criteria ({s['criteria_count']}): {criteria_preview}\n\n"
92+
if has_more:
93+
output += f"More available — pass offset={next_offset} to see the next page.\n"
5594

5695
return [TextContent(type="text", text=output)]
5796

tests/mcp_eval/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)