Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 14 additions & 15 deletions src/strands_evals/display/display_console.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,21 +79,20 @@ def display_items(self):
symbol = "▼" if item["expanded"] else "▶"
case = item["details"]
pass_status = "✅" if case["test_pass"] else "❌"
other_fields = list(case.values())[3:]
if item["expanded"]: # We always render at least the index, name, score, test_pass, and reason
renderables = [
f"{symbol} {key}",
case.get("name", f"Test {key}"),
case.get("score"),
pass_status,
] + other_fields
else:
renderables = [
f"{symbol} {key}",
case.get("name", f"Test {key}"),
case.get("score"),
pass_status,
] + len(other_fields) * ["..."]

# Build renderables dynamically based on headers order
renderables = [f"{symbol} {key}"] # index column
for header in list(case.keys()):
if header == "test_pass":
renderables.append(pass_status)
elif item["expanded"]:
renderables.append(case.get(header, ""))
else:
# Show actual values for core fields, "..." for rest when collapsed
if header in ("name", "evaluator", "score"):
renderables.append(case.get(header, ""))
else:
renderables.append("...")
table.add_row(*renderables)

console.print(table)
Expand Down
2 changes: 2 additions & 0 deletions src/strands_evals/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,7 @@ def _evaluate_with_retry(evaluator=evaluator, evaluation_context=evaluation_cont
eval_name = evaluator.get_type_name()
data = evaluator_data[eval_name]
report = EvaluationReport(
evaluator_name=eval_name,
overall_score=sum(data["scores"]) / len(data["scores"]) if len(data["scores"]) else 0,
scores=data["scores"],
test_passes=data["test_passes"],
Expand Down Expand Up @@ -721,6 +722,7 @@ async def run_evaluations_async(self, task: Callable, max_workers: int = 10) ->
data = evaluator_data[eval_name]
scores = data["scores"]
report = EvaluationReport(
evaluator_name=eval_name,
overall_score=sum(scores) / len(scores) if scores else 0,
scores=scores,
test_passes=data["test_passes"],
Expand Down
42 changes: 36 additions & 6 deletions src/strands_evals/types/evaluation_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,49 @@ class EvaluationReport(BaseModel):
A report of the evaluation of a task.

Attributes:
evaluator_name: The name of the evaluator that produced this report.
overall_score: The overall score of the task.
scores: A list of the score for each test case in order.
cases: A list of records for each test case.
test_passes: A list of booleans indicating whether each test case passed or failed.
reasons: A list of reasons, one for each test case.
"""

evaluator_name: str = ""
overall_score: float
scores: list[float]
cases: list[dict]
test_passes: list[bool]
reasons: list[str] = []
detailed_results: list[list[EvaluationOutput]] = []

@classmethod
def flatten(cls, reports: list["EvaluationReport"]) -> "EvaluationReport":
    """Merge several evaluation reports into one combined report.

    Every case from every input report is copied into the result with an
    extra ``"evaluator"`` key holding the originating report's
    ``evaluator_name`` (``"Unknown"`` when that name is empty). Per-case
    lists shorter than ``cases`` are padded: ``0.0`` for scores, ``False``
    for passes, ``""`` for reasons and ``[]`` for detailed results.

    Args:
        reports: The reports to merge, in the order their cases should appear.

    Returns:
        A report named ``"Combined"`` whose ``overall_score`` is the mean of
        all merged scores (``0.0`` when there are no cases). When ``reports``
        is empty, an empty report is returned without an explicit
        ``evaluator_name``.
    """
    if not reports:
        return cls(overall_score=0.0, scores=[], cases=[], test_passes=[])

    def value_at(values, position, fallback):
        # Per-case lists may be shorter than `cases`; pad with the fallback.
        return values[position] if position < len(values) else fallback

    merged_cases = []
    merged_scores = []
    merged_passes = []
    merged_reasons = []
    merged_details = []

    for source in reports:
        label = source.evaluator_name or "Unknown"
        for position, record in enumerate(source.cases):
            # Copy the case so the source report's dict is never mutated.
            merged_cases.append({**record, "evaluator": label})
            merged_scores.append(value_at(source.scores, position, 0.0))
            merged_passes.append(value_at(source.test_passes, position, False))
            merged_reasons.append(value_at(source.reasons, position, ""))
            merged_details.append(value_at(source.detailed_results, position, []))

    if merged_scores:
        mean_score = sum(merged_scores) / len(merged_scores)
    else:
        mean_score = 0.0

    return cls(
        evaluator_name="Combined",
        overall_score=mean_score,
        scores=merged_scores,
        cases=merged_cases,
        test_passes=merged_passes,
        reasons=merged_reasons,
        detailed_results=merged_details,
    )

def _display(
self,
static: bool = True,
Expand Down Expand Up @@ -60,12 +89,13 @@ def _display(
for i in range(len(self.scores)):
name = self.cases[i].get("name", f"Test {i + 1}")
reason = self.reasons[i] if i < len(self.reasons) else "N/A"
details_dict = {
"name": name,
"score": f"{self.scores[i]:.2f}",
"test_pass": self.test_passes[i],
"reason": reason,
}
details_dict = {"name": name}
# Include evaluator column for flattened reports (right after name)
if "evaluator" in self.cases[i]:
details_dict["evaluator"] = self.cases[i]["evaluator"]
details_dict["score"] = f"{self.scores[i]:.2f}"
details_dict["test_pass"] = self.test_passes[i]
details_dict["reason"] = reason
if include_input:
details_dict["input"] = str(self.cases[i].get("input"))
if include_actual_output:
Expand Down
Loading