@@ -12,20 +12,49 @@ class EvaluationReport(BaseModel):
1212 A report of the evaluation of a task.
1313
1414 Attributes:
15+ evaluator_name: The name of the evaluator that produced this report.
1516 overall_score: The overall score of the task.
1617 scores: A list of the score for each test case in order.
1718 cases: A list of records for each test case.
1819 test_passes: A list of booleans indicating whether each test case passed or failed.
1920 reasons: A list of reasons, one for each test case.
2021 """
2122
23+ evaluator_name : str = ""
2224 overall_score : float
2325 scores : list [float ]
2426 cases : list [dict ]
2527 test_passes : list [bool ]
2628 reasons : list [str ] = []
2729 detailed_results : list [list [EvaluationOutput ]] = []
2830
@classmethod
def flatten(cls, reports: list["EvaluationReport"]) -> "EvaluationReport":
    """Merge several evaluation reports into one combined report.

    Every case of every input report becomes one entry of the result, in
    the order the reports (and their cases) are given. Each case dict is
    copied and gets an ``"evaluator"`` key holding the originating
    report's ``evaluator_name`` (``"Unknown"`` when that name is empty).

    The per-case lists of a report (``scores``, ``test_passes``,
    ``reasons``, ``detailed_results``) may be shorter than its ``cases``;
    missing entries are filled with ``0.0``, ``False``, ``""`` and ``[]``
    respectively.

    Args:
        reports: The reports to merge.

    Returns:
        A report named ``"Combined"`` whose ``overall_score`` is the mean
        of all collected scores (``0.0`` when there are none). When
        ``reports`` is empty, a report with ``overall_score`` of ``0.0``
        and empty lists is returned, leaving the remaining fields at
        their defaults.
    """
    if not reports:
        return cls(overall_score=0.0, scores=[], cases=[], test_passes=[])

    def entry_or(values, position, fallback):
        # Per-case lists can be shorter than ``cases``; pad rather than raise.
        return values[position] if position < len(values) else fallback

    merged_cases = []
    merged_scores = []
    merged_passes = []
    merged_reasons = []
    merged_details = []

    for source in reports:
        source_name = source.evaluator_name or "Unknown"
        for position, original_case in enumerate(source.cases):
            # Copy the case so the input report's dict is left untouched.
            tagged_case = {**original_case, "evaluator": source_name}
            merged_cases.append(tagged_case)
            merged_scores.append(entry_or(source.scores, position, 0.0))
            merged_passes.append(entry_or(source.test_passes, position, False))
            merged_reasons.append(entry_or(source.reasons, position, ""))
            merged_details.append(entry_or(source.detailed_results, position, []))

    # Non-empty input whose reports contain no cases still yields no scores.
    if merged_scores:
        mean_score = sum(merged_scores) / len(merged_scores)
    else:
        mean_score = 0.0

    return cls(
        evaluator_name="Combined",
        overall_score=mean_score,
        scores=merged_scores,
        cases=merged_cases,
        test_passes=merged_passes,
        reasons=merged_reasons,
        detailed_results=merged_details,
    )
57+
2958 def _display (
3059 self ,
3160 static : bool = True ,
@@ -60,12 +89,13 @@ def _display(
6089 for i in range (len (self .scores )):
6190 name = self .cases [i ].get ("name" , f"Test { i + 1 } " )
6291 reason = self .reasons [i ] if i < len (self .reasons ) else "N/A"
63- details_dict = {
64- "name" : name ,
65- "score" : f"{ self .scores [i ]:.2f} " ,
66- "test_pass" : self .test_passes [i ],
67- "reason" : reason ,
68- }
92+ details_dict = {"name" : name }
93+ # Include evaluator column for flattened reports (right after name)
94+ if "evaluator" in self .cases [i ]:
95+ details_dict ["evaluator" ] = self .cases [i ]["evaluator" ]
96+ details_dict ["score" ] = f"{ self .scores [i ]:.2f} "
97+ details_dict ["test_pass" ] = self .test_passes [i ]
98+ details_dict ["reason" ] = reason
6999 if include_input :
70100 details_dict ["input" ] = str (self .cases [i ].get ("input" ))
71101 if include_actual_output :
0 commit comments