Skip to content

Commit 8db4c70

Browse files
author
Jonathan Buck
committed
feat: change Evaluator.evaluate() to return list[EvaluationOutput]
BREAKING CHANGE: Evaluator.evaluate() and evaluate_async() now return list[EvaluationOutput] instead of single EvaluationOutput to support multi-metric evaluation scenarios. - Add aggregator property to Evaluator base class with default mean aggregation - Update all evaluator implementations to return lists - InteractionsEvaluator now returns all intermediate evaluations instead of only the last - Add detailed_results field to EvaluationReport for drill-down into individual metrics - Update display to show detailed metrics tree when cases are expanded - Dataset aggregates multiple outputs per case using evaluator's aggregator function
1 parent 9d032f9 commit 8db4c70

15 files changed

+265
-105
lines changed
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
"""
2+
Simple example demonstrating multi-metric evaluation.
3+
4+
This toy evaluator checks multiple aspects of a response:
5+
1. Length check (is response long enough?)
6+
2. Keyword check (does it contain expected keywords?)
7+
3. Sentiment check (is it positive?)
8+
9+
Each check produces its own metric, and they're aggregated into a final score.
10+
"""
11+
12+
from strands_evals import Case, Dataset
13+
from strands_evals.evaluators import Evaluator
14+
from strands_evals.types import EvaluationData, EvaluationOutput
15+
16+
17+
class MultiAspectEvaluator(Evaluator[str, str]):
    """Checks several independent aspects of a text response.

    Each aspect (length, keywords, sentiment) produces its own
    EvaluationOutput; the evaluator's aggregator combines them into a
    single case-level score downstream.
    """

    def __init__(self, min_length: int = 10, required_keywords: list[str] | None = None):
        super().__init__()
        self.min_length = min_length
        self.required_keywords = required_keywords or []

    def evaluate(self, evaluation_case: EvaluationData[str, str]) -> list[EvaluationOutput]:
        """Returns one EvaluationOutput per aspect checked."""
        text = evaluation_case.actual_output or ""
        return [
            self._length_metric(text),
            self._keyword_metric(text),
            self._sentiment_metric(text),
        ]

    def _length_metric(self, text: str) -> EvaluationOutput:
        # Aspect 1: is the response at least min_length characters long?
        meets_minimum = len(text) >= self.min_length
        return EvaluationOutput(
            score=1.0 if meets_minimum else 0.0,
            test_pass=meets_minimum,
            reason=f"Length: {len(text)} chars ({'✓' if meets_minimum else '✗'} min {self.min_length})",
        )

    def _keyword_metric(self, text: str) -> EvaluationOutput:
        # Aspect 2: fraction of required keywords present (case-insensitive).
        # With no required keywords the check is vacuously perfect (1.0).
        lowered = text.lower()
        hits = [kw for kw in self.required_keywords if kw.lower() in lowered]
        ratio = len(hits) / len(self.required_keywords) if self.required_keywords else 1.0
        return EvaluationOutput(
            score=ratio,
            test_pass=ratio >= 0.5,
            reason=f"Keywords: {len(hits)}/{len(self.required_keywords)} found {hits}",
        )

    def _sentiment_metric(self, text: str) -> EvaluationOutput:
        # Aspect 3: crude positivity heuristic; informational only, so it
        # always passes and never drags test_pass down.
        lowered = text.lower()
        upbeat = any(word in lowered for word in ("good", "great", "excellent", "happy", "wonderful"))
        return EvaluationOutput(
            score=1.0 if upbeat else 0.5,
            test_pass=True,
            reason=f"Sentiment: {'Positive' if upbeat else 'Neutral'}",
        )

    async def evaluate_async(self, evaluation_case: EvaluationData[str, str]) -> list[EvaluationOutput]:
        """Async variant; this example has no real async work, so delegate to evaluate()."""
        return self.evaluate(evaluation_case)
67+
68+
69+
# Task function under evaluation: wraps the query in a canned upbeat reply
# (deliberately long and keyword-rich so the checks have something to find).
def simple_task(query: str) -> str:
    """Return a fixed positive response embedding *query*."""
    template = "This is a great response to: {}. It provides excellent information!"
    return template.format(query)
72+
73+
74+
if __name__ == "__main__":
    # Two cases: one reply expected to fail the length check, one to pass.
    cases = [
        Case[str, str](
            name="short-response",
            input="Hi",
            expected_output="Short reply",
        ),
        Case[str, str](
            name="long-response",
            input="Tell me about Python",
            expected_output="Python is great",
        ),
    ]

    # Checks applied per case: length >= 20 and presence of ["response", "information"].
    checker = MultiAspectEvaluator(min_length=20, required_keywords=["response", "information"])

    # Bundle cases with the evaluator and run the task over the dataset.
    dataset = Dataset[str, str](cases=cases, evaluator=checker)
    report = dataset.run_evaluations(simple_task)

    # Drill into the per-case metric lists programmatically.
    print("\n=== Programmatic Access to Detailed Results ===")
    for idx, metrics in enumerate(report.detailed_results):
        print(f"\nCase {idx}: {report.cases[idx]['name']}")
        print(f"  Aggregate Score: {report.scores[idx]:.2f}")
        print(f"  Individual Metrics ({len(metrics)}):")
        for position, metric in enumerate(metrics, start=1):
            print(f"    {position}. Score={metric.score:.2f}, Pass={metric.test_pass}, Reason={metric.reason}")

    # Rich-based interactive view of the same report.
    print("\n=== Interactive Display ===")
    report.run_display()

src/strands_evals/dataset.py

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -240,22 +240,26 @@ async def _worker(self, queue: asyncio.Queue, task: Callable, results: list):
240240
"gen_ai.eval.case.name": case_name,
241241
},
242242
) as eval_span:
243-
evaluation_output = await self.evaluator.evaluate_async(evaluation_context)
243+
evaluation_outputs = await self.evaluator.evaluate_async(evaluation_context)
244+
(aggregate_score, aggregate_pass, aggregate_reason) = self.evaluator.aggregator(
245+
evaluation_outputs
246+
)
244247
eval_span.set_attributes(
245248
{
246-
"gen_ai.eval.output.score": evaluation_output.score,
247-
"gen_ai.eval.output.test_pass": evaluation_output.test_pass,
248-
"gen_ai.eval.output.reason": evaluation_output.reason or "",
249+
"gen_ai.eval.output.score": aggregate_score,
250+
"gen_ai.eval.output.test_pass": aggregate_pass,
251+
"gen_ai.eval.output.reason": aggregate_reason or "",
249252
}
250253
)
251254

252255
# Store results
253256
results.append(
254257
{
255258
"case": evaluation_context.model_dump(),
256-
"test_pass": evaluation_output.test_pass,
257-
"score": evaluation_output.score,
258-
"reason": evaluation_output.reason or "",
259+
"test_pass": aggregate_pass,
260+
"score": aggregate_score,
261+
"reason": aggregate_reason or "",
262+
"detailed_results": evaluation_outputs,
259263
}
260264
)
261265

@@ -267,6 +271,7 @@ async def _worker(self, queue: asyncio.Queue, task: Callable, results: list):
267271
"test_pass": False,
268272
"score": 0,
269273
"reason": f"An error occurred: {str(e)}",
274+
"detailed_results": [],
270275
}
271276
)
272277
finally:
@@ -288,6 +293,7 @@ def run_evaluations(self, task: Callable[[InputT], OutputT | dict[str, Any]]) ->
288293
test_passes = []
289294
cases: list = []
290295
reasons = []
296+
detailed_results = []
291297

292298
for case in self._cases:
293299
case_name = case.name or f"case_{len(cases)}"
@@ -330,33 +336,39 @@ def run_evaluations(self, task: Callable[[InputT], OutputT | dict[str, Any]]) ->
330336
"gen_ai.eval.case.name": case_name,
331337
},
332338
) as eval_span:
333-
evaluation_output = self.evaluator.evaluate(evaluation_context)
339+
evaluation_outputs = self.evaluator.evaluate(evaluation_context)
340+
(aggregate_score, aggregate_pass, aggregate_reason) = self.evaluator.aggregator(
341+
evaluation_outputs
342+
)
334343
eval_span.set_attributes(
335344
{
336-
"gen_ai.eval.output.score": evaluation_output.score,
337-
"gen_ai.eval.output.test_pass": evaluation_output.test_pass,
338-
"gen_ai.eval.output.reason": evaluation_output.reason or "",
345+
"gen_ai.eval.output.score": aggregate_score,
346+
"gen_ai.eval.output.test_pass": aggregate_pass,
347+
"gen_ai.eval.output.reason": aggregate_reason or "",
339348
}
340349
)
341350

342351
cases.append(evaluation_context.model_dump())
343-
test_passes.append(evaluation_output.test_pass)
344-
scores.append(evaluation_output.score)
345-
reasons.append(evaluation_output.reason or "")
352+
test_passes.append(aggregate_pass)
353+
scores.append(aggregate_score)
354+
reasons.append(aggregate_reason or "")
355+
detailed_results.append(evaluation_outputs)
346356

347357
except Exception as e:
348358
case_span.record_exception(e)
349359
cases.append(case.model_dump())
350360
test_passes.append(False)
351361
scores.append(0)
352362
reasons.append(f"An error occured : {str(e)}")
363+
detailed_results.append([])
353364

354365
report = EvaluationReport(
355366
overall_score=sum(scores) / len(scores) if len(scores) else 0,
356367
scores=scores,
357368
test_passes=test_passes,
358369
cases=cases,
359370
reasons=reasons,
371+
detailed_results=detailed_results,
360372
)
361373

362374
return report
@@ -395,6 +407,7 @@ async def run_evaluations_async(self, task: Callable, max_workers: int = 10) ->
395407
test_passes = [r["test_pass"] for r in results]
396408
cases = [r["case"] for r in results]
397409
reasons = [r["reason"] for r in results]
410+
detailed_results = [r["detailed_results"] for r in results]
398411

399412
# Create and return report
400413
return EvaluationReport(
@@ -403,6 +416,7 @@ async def run_evaluations_async(self, task: Callable, max_workers: int = 10) ->
403416
test_passes=test_passes,
404417
cases=cases,
405418
reasons=reasons,
419+
detailed_results=detailed_results,
406420
)
407421

408422
def to_dict(self) -> dict:

src/strands_evals/display/display_console.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from rich.panel import Panel
33
from rich.prompt import Prompt
44
from rich.table import Table
5+
from rich.tree import Tree
56

67
console = Console()
78

@@ -28,7 +29,8 @@ class CollapsibleTableReportDisplay:
2829
"test_pass: bool,
2930
"reason": str,
3031
... # will display everything that's given like actual_output etc.
31-
}
32+
},
33+
"detailed_results": list[EvaluationOutput],
3234
},
3335
"expanded": bool
3436
}
@@ -93,8 +95,22 @@ def display_items(self):
9395
pass_status,
9496
] + len(other_fields) * ["..."]
9597
table.add_row(*renderables)
98+
9699
console.print(table)
97100

101+
for key, item in self.items.items():
102+
if item["expanded"] and item.get("detailed_results"):
103+
detailed_results = item["detailed_results"]
104+
if len(detailed_results) > 1: # Only show if multiple metrics
105+
tree = Tree(f"[bold cyan]📋 Detailed Metrics for Case {key}[/bold cyan]")
106+
for i, result in enumerate(detailed_results):
107+
status = "✅" if result.test_pass else "❌"
108+
metric_node = tree.add(f"[yellow]Metric {i + 1}[/yellow]: Score={result.score:.2f} {status}")
109+
if result.reason:
110+
metric_node.add(f"[dim]{result.reason}[/dim]")
111+
console.print(tree)
112+
console.print()
113+
98114
def run(self, static: bool = False):
99115
"""
100116
Run the interactive display loop. If static, then the terminal will only display the report.

src/strands_evals/evaluators/evaluator.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,17 @@ class Evaluator(Generic[InputT, OutputT]):
3535
# Optional: subclasses can set this to enable trace parsing
3636
evaluation_level: EvaluationLevel | None = None
3737

38-
def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> EvaluationOutput:
38+
def __init__(self):
39+
self.aggregator = self._default_aggregator
40+
41+
@staticmethod
42+
def _default_aggregator(outputs: list[EvaluationOutput]) -> tuple[float, bool, str]:
43+
avg_score = sum(o.score for o in outputs) / len(outputs)
44+
all_pass = all(o.test_pass for o in outputs)
45+
combined_reason = " | ".join(o.reason for o in outputs if o.reason)
46+
return avg_score, all_pass, combined_reason
47+
48+
def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
3949
"""
4050
Evaluate the performance of the task on the given test cases.
4151
@@ -47,7 +57,7 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> Evaluati
4757
"""
4858
raise NotImplementedError("This method should be implemented in subclasses.")
4959

50-
async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> EvaluationOutput:
60+
async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
5161
"""
5262
Evaluate the performance of the task on the given test cases asynchronously.
5363
@@ -193,7 +203,8 @@ def to_dict(self) -> dict:
193203
# Get default values from __init__ signature
194204
sig = inspect.signature(self.__class__.__init__)
195205
defaults = {k: v.default for k, v in sig.parameters.items() if v.default != inspect.Parameter.empty}
206+
exclude_attrs = {"aggregator"}
196207
for k, v in self.__dict__.items():
197-
if not k.startswith("_") and (k not in defaults or v != defaults[k]):
208+
if not k.startswith("_") and k not in exclude_attrs and (k not in defaults or v != defaults[k]):
198209
_dict[k] = v
199210
return _dict

src/strands_evals/evaluators/helpfulness_evaluator.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,21 +60,23 @@ def __init__(
6060
self.model = model
6161
self.include_inputs = include_inputs
6262

63-
def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> EvaluationOutput:
63+
def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
6464
parsed_input = self._get_last_turn(evaluation_case)
6565
prompt = self._format_prompt(parsed_input)
6666
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
6767
rating = evaluator_agent.structured_output(HelpfulnessRating, prompt)
6868
normalized_score = self._score_mapping[rating.score]
69-
return EvaluationOutput(score=normalized_score, test_pass=normalized_score >= 0.5, reason=rating.reasoning)
69+
result = EvaluationOutput(score=normalized_score, test_pass=normalized_score >= 0.5, reason=rating.reasoning)
70+
return [result]
7071

71-
async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> EvaluationOutput:
72+
async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
7273
parsed_input = self._get_last_turn(evaluation_case)
7374
prompt = self._format_prompt(parsed_input)
7475
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
7576
rating = await evaluator_agent.structured_output_async(HelpfulnessRating, prompt)
7677
normalized_score = self._score_mapping[rating.score]
77-
return EvaluationOutput(score=normalized_score, test_pass=normalized_score >= 0.5, reason=rating.reasoning)
78+
result = EvaluationOutput(score=normalized_score, test_pass=normalized_score >= 0.5, reason=rating.reasoning)
79+
return [result]
7880

7981
def _get_last_turn(self, evaluation_case: EvaluationData[InputT, OutputT]) -> TurnLevelInput:
8082
"""Extract the most recent turn from the conversation for evaluation."""

0 commit comments

Comments
 (0)