Skip to content

Commit b7ae989

Browse files
authored
feat(report): allow flattened report (#157)
1 parent e5a4c61 commit b7ae989

File tree

4 files changed

+396
-21
lines changed

4 files changed

+396
-21
lines changed

src/strands_evals/display/display_console.py

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -79,21 +79,20 @@ def display_items(self):
7979
symbol = "▼" if item["expanded"] else "▶"
8080
case = item["details"]
8181
pass_status = "✅" if case["test_pass"] else "❌"
82-
other_fields = list(case.values())[3:]
83-
if item["expanded"]: # We always to render at least the index, name, score, test_pass, and reason
84-
renderables = [
85-
f"{symbol} {key}",
86-
case.get("name", f"Test {key}"),
87-
case.get("score"),
88-
pass_status,
89-
] + other_fields
90-
else:
91-
renderables = [
92-
f"{symbol} {key}",
93-
case.get("name", f"Test {key}"),
94-
case.get("score"),
95-
pass_status,
96-
] + len(other_fields) * ["..."]
82+
83+
# Build renderables dynamically based on headers order
84+
renderables = [f"{symbol} {key}"] # index column
85+
for header in list(case.keys()):
86+
if header == "test_pass":
87+
renderables.append(pass_status)
88+
elif item["expanded"]:
89+
renderables.append(case.get(header, ""))
90+
else:
91+
# Show actual values for core fields, "..." for rest when collapsed
92+
if header in ("name", "evaluator", "score"):
93+
renderables.append(case.get(header, ""))
94+
else:
95+
renderables.append("...")
9796
table.add_row(*renderables)
9897

9998
console.print(table)

src/strands_evals/experiment.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -654,6 +654,7 @@ def _evaluate_with_retry(evaluator=evaluator, evaluation_context=evaluation_cont
654654
eval_name = evaluator.get_type_name()
655655
data = evaluator_data[eval_name]
656656
report = EvaluationReport(
657+
evaluator_name=eval_name,
657658
overall_score=sum(data["scores"]) / len(data["scores"]) if len(data["scores"]) else 0,
658659
scores=data["scores"],
659660
test_passes=data["test_passes"],
@@ -721,6 +722,7 @@ async def run_evaluations_async(self, task: Callable, max_workers: int = 10) ->
721722
data = evaluator_data[eval_name]
722723
scores = data["scores"]
723724
report = EvaluationReport(
725+
evaluator_name=eval_name,
724726
overall_score=sum(scores) / len(scores) if scores else 0,
725727
scores=scores,
726728
test_passes=data["test_passes"],

src/strands_evals/types/evaluation_report.py

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,49 @@ class EvaluationReport(BaseModel):
1212
A report of the evaluation of a task.
1313
1414
Attributes:
15+
evaluator_name: The name of the evaluator that produced this report.
1516
overall_score: The overall score of the task.
1617
scores: A list of the score for each test case in order.
1718
cases: A list of records for each test case.
1819
test_passes: A list of booleans indicating whether the test pass or fail.
1920
reasons: A list of reason for each test case.
2021
"""
2122

23+
evaluator_name: str = ""
2224
overall_score: float
2325
scores: list[float]
2426
cases: list[dict]
2527
test_passes: list[bool]
2628
reasons: list[str] = []
2729
detailed_results: list[list[EvaluationOutput]] = []
2830

31+
@classmethod
def flatten(cls, reports: list["EvaluationReport"]) -> "EvaluationReport":
    """Flatten multiple evaluation reports into a single combined report.

    Every case of every input report becomes one row of the result, in input
    order. Each row's case record is shallow-copied and tagged with an
    ``"evaluator"`` key so the originating evaluator stays visible once the
    rows are merged. The input reports and their case dicts are not modified.

    Args:
        reports: The reports to merge. May be empty.

    Returns:
        A new report. When ``reports`` is empty it has no rows, an overall
        score of 0.0 and no evaluator name passed to the constructor.
        Otherwise its ``evaluator_name`` is ``"Combined"`` and its
        ``overall_score`` is the mean of all row scores (0.0 if there are
        no rows).
    """
    if not reports:
        return cls(overall_score=0.0, scores=[], cases=[], test_passes=[])

    def _at(values, index, default):
        # The per-case lists may be shorter than ``cases``; pad with a
        # neutral default instead of raising IndexError.
        return values[index] if index < len(values) else default

    scores, cases, passes, reasons, detailed = [], [], [], [], []

    for report in reports:
        fallback_label = report.evaluator_name or "Unknown"
        for i, case in enumerate(report.cases):
            # Keep a label that is already present: it means the case came
            # from an earlier flatten, and overwriting it with that report's
            # name ("Combined") would lose the real evaluator attribution.
            label = case.get("evaluator") or fallback_label
            cases.append({**case, "evaluator": label})
            scores.append(_at(report.scores, i, 0.0))
            passes.append(_at(report.test_passes, i, False))
            reasons.append(_at(report.reasons, i, ""))
            detailed.append(_at(report.detailed_results, i, []))

    return cls(
        evaluator_name="Combined",
        overall_score=sum(scores) / len(scores) if scores else 0.0,
        scores=scores,
        cases=cases,
        test_passes=passes,
        reasons=reasons,
        detailed_results=detailed,
    )
57+
2958
def _display(
3059
self,
3160
static: bool = True,
@@ -60,12 +89,13 @@ def _display(
6089
for i in range(len(self.scores)):
6190
name = self.cases[i].get("name", f"Test {i + 1}")
6291
reason = self.reasons[i] if i < len(self.reasons) else "N/A"
63-
details_dict = {
64-
"name": name,
65-
"score": f"{self.scores[i]:.2f}",
66-
"test_pass": self.test_passes[i],
67-
"reason": reason,
68-
}
92+
details_dict = {"name": name}
93+
# Include evaluator column for flattened reports (right after name)
94+
if "evaluator" in self.cases[i]:
95+
details_dict["evaluator"] = self.cases[i]["evaluator"]
96+
details_dict["score"] = f"{self.scores[i]:.2f}"
97+
details_dict["test_pass"] = self.test_passes[i]
98+
details_dict["reason"] = reason
6999
if include_input:
70100
details_dict["input"] = str(self.cases[i].get("input"))
71101
if include_actual_output:

0 commit comments

Comments
 (0)