awslabs
diff --git a/‎eval_mcp/server.py‎
Lines changed: 335 additions & 113 deletions b/‎eval_mcp/server.py‎
Lines changed: 335 additions & 113 deletions
diff --git a/‎eval_mcp/tools/list_datasets.py‎
Lines changed: 72 additions & 33 deletions b/‎eval_mcp/tools/list_datasets.py‎
Lines changed: 72 additions & 33 deletions
diff --git a/‎eval_mcp/tools/list_evaluations.py‎
Lines changed: 62 additions & 18 deletions b/‎eval_mcp/tools/list_evaluations.py‎
Lines changed: 62 additions & 18 deletions
diff --git a/‎eval_mcp/tools/list_judges.py‎
Lines changed: 65 additions & 26 deletions b/‎eval_mcp/tools/list_judges.py‎
Lines changed: 65 additions & 26 deletions
diff --git a/‎tests/mcp_eval/__init__.py‎ b/‎tests/mcp_eval/__init__.py‎
@@ -8,16 +8,45 @@
 from eval_mcp.core.user_storage import list_datasets_from_db
 
 
+def _dataset_preview(dataset: Dict[str, Any]) -> Dict[str, Any]:
+    """Build a compact summary of one dataset record for list output.
+
+    Args:
+        dataset: Dataset record. The `id`, `name`, `num_samples` and `tests`
+            keys are read; all of them are optional.
+
+    Returns:
+        Dict with `id`, `name`, `num_samples` (falls back to the number of
+        tests when absent or None) and `preview` (the first test's question,
+        at most 100 characters plus "..." when truncated, or None when no
+        preview can be derived).
+    """
+    tests = dataset.get("tests")
+    if not isinstance(tests, list):
+        tests = []
+
+    num_samples = dataset.get("num_samples")
+    if num_samples is None:
+        num_samples = len(tests)
+
+    preview = ""
+    first_item = tests[0] if tests else None
+    if isinstance(first_item, dict):
+        # `vars` may be missing, None, or a non-dict; treat all as "no vars".
+        variables = first_item.get("vars")
+        if not isinstance(variables, dict):
+            variables = {}
+        # Coerce to str so a non-string question cannot break the slicing.
+        text = str(
+            variables.get("question")
+            or first_item.get("question")
+            or first_item
+        )
+        preview = text[:100]
+        # Mark truncation only when characters were actually dropped.
+        if len(text) > 100:
+            preview += "..."
+
+    return {
+        "id": dataset.get("id"),
+        "name": dataset.get("name"),
+        "num_samples": num_samples,
+        "preview": preview or None,
+    }
+
+
 async def handle_list_datasets(args: Dict[str, Any]) -> List[TextContent]:
-    """Handle list_datasets tool call.
+    """List datasets with pagination and optional JSON format.
 
-    Lists all datasets from the user's database.
-    Returns details about each dataset including number of samples and preview.
+    Args (from `args` dict):
+        user_id: required
+        searchTerm: optional case-insensitive name filter
+        limit: page size, default 20
+        offset: page start, default 0
+        response_format: "markdown" (default) or "json"
     """
     try:
-        # Get required user_id and optional search filter
         user_id = args.get("user_id")
         search_term = (args.get("searchTerm") or "").lower()
+        limit = max(1, int(args.get("limit", 20) or 20))
+        offset = max(0, int(args.get("offset", 0) or 0))
+        response_format = (args.get("response_format") or "markdown").lower()
 
         if not user_id:
             return [
@@ -27,38 +56,48 @@ async def handle_list_datasets(args: Dict[str, Any]) -> List[TextContent]:
                 )
             ]
 
-        # Get datasets from database
-        datasets = list_datasets_from_db(user_id, search_term)
+        all_datasets = list_datasets_from_db(user_id, search_term)
+        total = len(all_datasets)
+        page = all_datasets[offset : offset + limit]
+        has_more = offset + len(page) < total
+        next_offset = offset + len(page) if has_more else None
 
-        if not datasets:
-            msg = f"No datasets found matching '{search_term}'" if search_term else "No datasets found. Create your first dataset with save_dataset."
-            return [TextContent(type="text", text=msg)]
-
-        # Format output
-        output = f"Found {len(datasets)} dataset(s):\n\n"
-
-        for dataset in datasets:
-            tests = dataset.get("tests", [])
-            num_samples = dataset.get("num_samples", len(tests) if isinstance(tests, list) else 0)
+        if response_format == "json":
+            return [
+                TextContent(
+                    type="text",
+                    text=json.dumps(
+                        {
+                            "success": True,
+                            "total": total,
+                            "count": len(page),
+                            "offset": offset,
+                            "has_more": has_more,
+                            "next_offset": next_offset,
+                            "items": [_dataset_preview(d) for d in page],
+                        },
+                        indent=2,
+                    ),
+                )
+            ]
 
-            # Get first question as preview
-            preview = ""
-            if isinstance(tests, list) and len(tests) > 0:
-                first_item = tests[0]
-                if isinstance(first_item, dict):
-                    preview = (
-                        first_item.get("vars", {}).get("question")
-                        or first_item.get("question")
-                        or str(first_item)[:100]
-                    )
-                    preview = preview[:100]
-                    if len(preview) == 100:
-                        preview += "..."
+        if not all_datasets:
+            msg = (
+                f"No datasets found matching '{search_term}'"
+                if search_term
+                else "No datasets found. Create your first dataset with save_dataset."
+            )
+            return [TextContent(type="text", text=msg)]
 
-            output += f"📊 **{dataset['name']}**\n"
-            output += f"   ID: {dataset['id'][:16]}...\n"
-            output += f"   Samples: {num_samples}\n"
-            output += f"   Preview: {preview or 'No preview available'}\n\n"
+        output = f"Found {total} dataset(s) — showing {offset + 1}-{offset + len(page)}:\n\n"
+        for dataset in page:
+            p = _dataset_preview(dataset)
+            output += f"📊 **{p['name']}**\n"
+            output += f"   ID: {(p['id'] or '')[:16]}...\n"
+            output += f"   Samples: {p['num_samples']}\n"
+            output += f"   Preview: {p['preview'] or 'No preview available'}\n\n"
+        if has_more:
+            output += f"More available — pass offset={next_offset} to see the next page.\n"
 
         return [TextContent(type="text", text=output)]
 
 
@@ -13,10 +13,19 @@
 
 
 async def handle_list_evaluations(args: Dict[str, Any]) -> List[TextContent]:
-    """List evaluations by reading .eval log files from the user's logs directory."""
+    """List evaluations with pagination and optional markdown format.
+
+    Args (from `args` dict):
+        user_id: required
+        limit: page size, default 20
+        offset: page start, default 0
+        response_format: "json" (default — eval payloads are heavy) or "markdown"
+    """
     try:
         user_id = args.get("user_id")
-        limit = args.get("limit", 20)
+        limit = max(1, int(args.get("limit", 20) or 20))
+        offset = max(0, int(args.get("offset", 0) or 0))
+        response_format = (args.get("response_format") or "json").lower()
 
         if not user_id:
             return [
@@ -30,16 +39,26 @@ async def handle_list_evaluations(args: Dict[str, Any]) -> List[TextContent]:
         eval_log_infos = await list_eval_logs_async(log_dir)
 
         if not eval_log_infos:
-            return [
-                TextContent(
-                    type="text",
-                    text=json.dumps({
-                        "success": True,
-                        "evaluations": [],
-                        "message": "No evaluations found. Run an evaluation first.",
-                    }),
-                )
-            ]
+            empty_text = (
+                json.dumps({
+                    "success": True,
+                    "total": 0,
+                    "count": 0,
+                    "offset": offset,
+                    "has_more": False,
+                    "next_offset": None,
+                    "evaluations": [],
+                    "message": "No evaluations found. Run an evaluation first.",
+                })
+                if response_format == "json"
+                else "No evaluations found. Run an evaluation first."
+            )
+            return [TextContent(type="text", text=empty_text)]
+
+        total = len(eval_log_infos)
+        page_infos = eval_log_infos[offset : offset + limit]
+        has_more = offset + len(page_infos) < total
+        next_offset = offset + len(page_infos) if has_more else None
 
         # Cache pre-computed details per group so multi-model groups only
         # deserialize once (same summary UI/PDF consume).
@@ -54,7 +73,7 @@ def _detail_for(run_id: str) -> Dict[str, Any] | None:
             return detail_cache[run_id]
 
         evaluations = []
-        for info in eval_log_infos[:limit]:
+        for info in page_infos:
             try:
                 log = await read_eval_log_async(info.name, header_only=True)
 
@@ -97,11 +116,36 @@ def _detail_for(run_id: str) -> Dict[str, Any] | None:
             except Exception:
                 continue
 
-        return [TextContent(type="text", text=json.dumps({
-            "success": True,
-            "evaluations": evaluations,
-            "total": len(evaluations),
-        }, indent=2))]
+        if response_format == "markdown":
+            output = f"Found {total} evaluation(s) — showing {offset + 1}-{offset + len(evaluations)}:\n\n"
+            for e in evaluations:
+                overall = e["score"]["metrics"].get("overall")
+                overall_str = f"{overall:.2f}" if isinstance(overall, (int, float)) else "—"
+                output += f"🧪 **{e['task']}** ({e['model']})\n"
+                output += f"   ID: {e['id']}\n"
+                output += f"   Status: {e['status']} · Samples: {e['totalSamples']} · Overall: {overall_str}\n"
+                output += f"   Created: {e['createdAt']}\n\n"
+            if has_more:
+                output += f"More available — pass offset={next_offset} to see the next page.\n"
+            return [TextContent(type="text", text=output)]
+
+        return [
+            TextContent(
+                type="text",
+                text=json.dumps(
+                    {
+                        "success": True,
+                        "total": total,
+                        "count": len(evaluations),
+                        "offset": offset,
+                        "has_more": has_more,
+                        "next_offset": next_offset,
+                        "evaluations": evaluations,
+                    },
+                    indent=2,
+                ),
+            )
+        ]
 
     except Exception as e:
         return [
 
@@ -8,16 +8,34 @@
 from eval_mcp.core.user_storage import list_judges_from_db
 
 
+def _judge_summary(judge: Dict[str, Any]) -> Dict[str, Any]:
+    """Build a compact summary of one judge record for list output.
+
+    Args:
+        judge: Judge record. The `id`, `name` and `config` keys are read;
+            `config` may carry `domain` and `criteria`.
+
+    Returns:
+        Dict with `id`, `name`, `domain` ("unknown" when missing or empty),
+        `criteria_count` and `criteria_names` (always a list of strings, one
+        entry per criterion).
+    """
+    config = judge.get("config")
+    if not isinstance(config, dict):
+        config = {}
+    criteria = config.get("criteria")
+    if not isinstance(criteria, list):
+        criteria = []
+
+    # Names are always strings so callers can safely ", ".join(...) them,
+    # even when a criterion is malformed or has a null name.
+    criteria_names = []
+    for criterion in criteria:
+        if isinstance(criterion, dict):
+            criterion = criterion.get("name")
+        criteria_names.append("" if criterion is None else str(criterion))
+
+    return {
+        "id": judge.get("id"),
+        "name": judge.get("name"),
+        "domain": config.get("domain") or "unknown",
+        "criteria_count": len(criteria),
+        "criteria_names": criteria_names,
+    }
+
+
 async def handle_list_judges(args: Dict[str, Any]) -> List[TextContent]:
-    """Handle list_judges tool call.
+    """List LLM judges with pagination and optional JSON format.
 
-    Lists all LLM judges from the user's database.
-    Returns details about each judge including domain and criteria.
+    Args (from `args` dict):
+        user_id: required
+        searchTerm: optional case-insensitive name filter
+        limit: page size, default 20
+        offset: page start, default 0
+        response_format: "markdown" (default) or "json"
     """
     try:
-        # Get required user_id and optional search filter
         user_id = args.get("user_id")
         search_term = (args.get("searchTerm") or "").lower()
+        limit = max(1, int(args.get("limit", 20) or 20))
+        offset = max(0, int(args.get("offset", 0) or 0))
+        response_format = (args.get("response_format") or "markdown").lower()
 
         if not user_id:
             return [
@@ -27,31 +45,52 @@ async def handle_list_judges(args: Dict[str, Any]) -> List[TextContent]:
                 )
             ]
 
-        # Get judges from database
-        judges = list_judges_from_db(user_id, search_term)
-
-        if not judges:
-            msg = f"No judges found matching '{search_term}'" if search_term else "No judges found. Create your first judge with generate_judge."
-            return [TextContent(type="text", text=msg)]
-
-        # Format output
-        output = f"Found {len(judges)} judge(s):\n\n"
+        all_judges = list_judges_from_db(user_id, search_term)
+        total = len(all_judges)
+        page = all_judges[offset : offset + limit]
+        has_more = offset + len(page) < total
+        next_offset = offset + len(page) if has_more else None
 
-        for judge in judges:
-            config = judge["config"]
-            domain = config.get("domain", "unknown")
-            criteria = config.get("criteria", [])
-            criteria_count = len(criteria)
+        if response_format == "json":
+            return [
+                TextContent(
+                    type="text",
+                    text=json.dumps(
+                        {
+                            "success": True,
+                            "total": total,
+                            "count": len(page),
+                            "offset": offset,
+                            "has_more": has_more,
+                            "next_offset": next_offset,
+                            "items": [_judge_summary(j) for j in page],
+                        },
+                        indent=2,
+                    ),
+                )
+            ]
 
-            # Format criteria preview
-            criteria_preview = ", ".join([c.get("name", "") for c in criteria[:3]])
-            if len(criteria) > 3:
-                criteria_preview += f" (+{len(criteria) - 3} more)"
+        if not all_judges:
+            msg = (
+                f"No judges found matching '{search_term}'"
+                if search_term
+                else "No judges found. Create your first judge with generate_judge."
+            )
+            return [TextContent(type="text", text=msg)]
 
-            output += f"⚖️  **{judge['name']}**\n"
-            output += f"   ID: {judge['id']}\n"
-            output += f"   Domain: {domain}\n"
-            output += f"   Criteria ({criteria_count}): {criteria_preview}\n\n"
+        output = f"Found {total} judge(s) — showing {offset + 1}-{offset + len(page)}:\n\n"
+        for judge in page:
+            s = _judge_summary(judge)
+            names = s["criteria_names"]
+            criteria_preview = ", ".join(names[:3])
+            if len(names) > 3:
+                criteria_preview += f" (+{len(names) - 3} more)"
+            output += f"⚖️  **{s['name']}**\n"
+            output += f"   ID: {s['id']}\n"
+            output += f"   Domain: {s['domain']}\n"
+            output += f"   Criteria ({s['criteria_count']}): {criteria_preview}\n\n"
+        if has_more:
+            output += f"More available — pass offset={next_offset} to see the next page.\n"
 
         return [TextContent(type="text", text=output)]