22
33Reads pre-computed JSON from S3/disk. The JSON is built once when an eval
44completes (see backend.core.eval_results.precompute_eval_results).
5+
6+ Live progress for in-flight evaluations is served by /api/compare/progress,
7+ not by these endpoints.
58"""
69
710import logging
@@ -27,137 +30,17 @@ async def _get_user_id(request: Request) -> str:
2730
@router.get("/groups")
async def get_comparison_groups(user_id: str = Depends(_get_user_id)):
    """Return the evaluation comparison groups for the user.

    The pre-computed cache is returned as-is when it is present. When it is
    missing (or empty/falsy), the results are computed once via
    ``precompute_eval_results`` and the cache is read again; if that second
    read still yields nothing, an empty group list is returned.
    """
    groups_payload = load_eval_groups(user_id)
    if not groups_payload:
        # Nothing cached yet: build the cache once, then re-read it.
        await precompute_eval_results(user_id)
        groups_payload = load_eval_groups(user_id) or {"groups": []}
    return groups_payload
6139
6240
6341@router .get ("/detail" )
6442async def get_comparison_detail (group_id : str , user_id : str = Depends (_get_user_id )):
6543 """Get full comparison data for a specific evaluation group."""
66- from eval_mcp .core .eval_results import _read_log_headers , _build_groups_from_headers
67-
68- # For running evals, read partial results directly (skip cache)
69- log_dir = get_user_log_dir (user_id )
70- headers = await _read_log_headers (log_dir )
71- group_headers = [h for h in headers if (h .get ("run_id" ) or h ["file" ]) == group_id ]
72- if group_headers and any (h .get ("status" ) == "started" for h in group_headers ):
73- import asyncio
74- from functools import partial
75- from inspect_ai .log import read_eval_log_sample_summaries
76-
77- models = list (dict .fromkeys (h ["model" ] for h in group_headers ))
78- total_samples = group_headers [0 ].get ("dataset_samples" , 0 )
79- samples_by_id : dict [str , dict ] = {}
80- aggregate : dict [str , dict ] = {}
81-
82- criteria_names : set [str ] = set ()
83- criteria_votes : dict [str , dict [str , list [bool ]]] = {} # model -> criterion -> [passed]
84-
85- for h in group_headers :
86- model = h ["model" ]
87- try :
88- loop = asyncio .get_event_loop ()
89- summaries = await loop .run_in_executor (None , partial (read_eval_log_sample_summaries , h ["file" ]))
90- completed = [s for s in summaries if s .scores ]
91- scores_sum = 0.0
92- model_criteria_votes : dict [str , list [bool ]] = {}
93-
94- for s in completed :
95- score_obj = next (iter (s .scores .values ())) if s .scores else None
96- if not score_obj :
97- continue
98- val = score_obj .value
99- if val == "C" :
100- scores_sum += 1.0
101- elif isinstance (val , (int , float )):
102- scores_sum += float (val )
103-
104- # Extract per-criterion results
105- if score_obj .metadata and "criteria_results" in score_obj .metadata :
106- for cr in score_obj .metadata ["criteria_results" ]:
107- cname = cr ["name" ]
108- criteria_names .add (cname )
109- if cname not in model_criteria_votes :
110- model_criteria_votes [cname ] = []
111- model_criteria_votes [cname ].append (cr ["passed" ])
112-
113- avg = scores_sum / len (completed ) if completed else 0
114- by_criterion = {}
115- for cname , votes in model_criteria_votes .items ():
116- by_criterion [cname ] = sum (votes ) / len (votes ) if votes else 0
117- aggregate [model ] = {"overall" : avg , "byCriterion" : by_criterion }
118- criteria_votes [model ] = model_criteria_votes
119-
120- for s in completed :
121- sid = str (s .id )
122- if sid not in samples_by_id :
123- sample_input = s .input if isinstance (s .input , str ) else str (s .input [0 ].content if s .input else "" )
124- samples_by_id [sid ] = {
125- "id" : sid ,
126- "input" : sample_input [:300 ],
127- "target" : s .target [0 ] if isinstance (s .target , list ) else str (s .target or "" ),
128- "results" : {},
129- }
130- score_obj = next (iter (s .scores .values ())) if s .scores else None
131- passed = score_obj .value == "C" if score_obj else False
132- score_num = 1.0 if passed else (float (score_obj .value ) if score_obj and isinstance (score_obj .value , (int , float )) else 0.0 )
133- criteria_results = []
134- if score_obj and score_obj .metadata and "criteria_results" in score_obj .metadata :
135- criteria_results = [
136- {"name" : cr ["name" ], "passed" : cr ["passed" ], "votes_for" : cr .get ("votes_for" , 0 ), "total" : cr .get ("total" , 0 )}
137- for cr in score_obj .metadata ["criteria_results" ]
138- ]
139- samples_by_id [sid ]["results" ][model ] = {
140- "passed" : passed ,
141- "score" : score_num ,
142- "output" : "" ,
143- "explanation" : score_obj .explanation [:200 ] if score_obj and score_obj .explanation else "" ,
144- "criteriaResults" : criteria_results ,
145- }
146- except Exception as e :
147- logger .warning (f"Failed to read summaries for { model } : { e } " )
148- aggregate [model ] = {"overall" : 0 , "byCriterion" : {}}
149-
150- return {
151- "models" : models ,
152- "samples" : list (samples_by_id .values ()),
153- "aggregate" : aggregate ,
154- "criteria" : sorted (criteria_names ),
155- "stats" : {m : {"total_tokens" : 0 } for m in models },
156- "status" : "running" ,
157- "sampleCount" : total_samples ,
158- "completedSamples" : len (samples_by_id ),
159- }
160-
16144 data = load_eval_detail (user_id , group_id )
16245 if data :
16346 return data
@@ -352,3 +235,56 @@ async def generate_report_pdf(
352235 "Content-Disposition" : f'attachment; filename="eval_report_{ safe_id } .pdf"' ,
353236 },
354237 )
238+
239+
@router.get("/report/{group_id}")
async def download_report(group_id: str, user_id: str = Depends(_get_user_id)):
    """Serve a previously generated PDF report for an evaluation group.

    Reads the report from S3 when ``_s3_enabled()`` is true, otherwise from
    the local user base directory. Blocking storage reads are run in the
    default executor so they do not stall the event loop.

    Args:
        group_id: Identifier of the evaluation group; slashes and backslashes
            are replaced with underscores to form the stored file name.
        user_id: Resolved by the ``_get_user_id`` dependency.

    Returns:
        A ``Response`` carrying the PDF bytes as an attachment.

    Raises:
        HTTPException: 400 for an invalid ``user_id`` or a path escaping the
            user base directory; 404 if the report hasn't been generated yet
            (in which case the caller should POST to /report/pdf or ask the
            MCP agent to generate one); 500 if the S3 fetch fails for any
            other reason.
    """
    import asyncio
    import os
    import re

    from eval_mcp.core.user_storage import (
        DATA_BUCKET,
        _get_s3_client,
        _s3_enabled,
        get_user_base_dir,
    )

    if not user_id or "/" in user_id or "\\" in user_id or user_id in (".", ".."):
        raise HTTPException(status_code=400, detail="invalid user_id")
    safe_id = group_id.replace("/", "_").replace("\\", "_")
    filename = f"report_{safe_id}.pdf"
    loop = asyncio.get_running_loop()

    if _s3_enabled():
        key = f"users/{user_id}/store/reports/{filename}"
        try:
            client = _get_s3_client()

            def _fetch_from_s3() -> bytes:
                # Both the request and the body read are blocking network I/O.
                obj = client.get_object(Bucket=DATA_BUCKET, Key=key)
                return obj["Body"].read()

            pdf_bytes = await loop.run_in_executor(None, _fetch_from_s3)
        except Exception as e:
            # Only dict-shaped ``response`` attributes carry an error code;
            # anything else is treated as an unknown failure.
            response = getattr(e, "response", None)
            error_code = (
                response.get("Error", {}).get("Code")
                if isinstance(response, dict)
                else None
            )
            if error_code in ("NoSuchKey", "404"):
                raise HTTPException(
                    status_code=404,
                    detail="Report not generated yet.",
                ) from e
            logger.warning(
                "Failed to fetch report s3://%s/%s: %s", DATA_BUCKET, key, e
            )
            raise HTTPException(
                status_code=500, detail="failed to fetch report"
            ) from e
    else:
        base_real = os.path.realpath(str(get_user_base_dir()))
        pdf_real = os.path.realpath(
            os.path.join(base_real, user_id, "store", "reports", filename)
        )
        # Refuse anything that resolves outside the user base directory.
        if not pdf_real.startswith(base_real + os.sep):
            raise HTTPException(status_code=400, detail="invalid path")
        if not os.path.isfile(pdf_real):
            raise HTTPException(status_code=404, detail="Report not generated yet.")

        def _read_from_disk() -> bytes:
            with open(pdf_real, "rb") as f:
                return f.read()

        pdf_bytes = await loop.run_in_executor(None, _read_from_disk)

    # The storage name keeps the original sanitisation so existing reports are
    # still found; the header name is restricted further so quotes, control
    # characters or non-latin-1 text in group_id cannot corrupt the header.
    header_id = re.sub(r"[^A-Za-z0-9._-]", "_", safe_id)
    return Response(
        content=pdf_bytes,
        media_type="application/pdf",
        headers={
            "Content-Disposition": f'attachment; filename="eval_report_{header_id}.pdf"',
        },
    )
0 commit comments