@@ -40,11 +40,15 @@ hatch run list

 ``` python
 from strands import Agent
-from strands_evaluation.dataset import Dataset
-from strands_evaluation.case import Case
-from strands_evaluation.evaluators.output_evaluator import OutputEvaluator
+from strands_evals import Case, Dataset
+from strands_evals.evaluators import OutputEvaluator

-# 1. Create test cases
+# 1. Define a task function
+def get_response(query: str) -> str:
+    agent = Agent(callback_handler=None)
+    return str(agent(query))
+
+# 2. Create test cases
 test_cases = [
     Case[str, str](
         name="knowledge-1",
@@ -59,25 +63,20 @@ test_cases = [
     )
 ]

-# 2. Create an evaluator
+# 3. Create an evaluator
 evaluator = OutputEvaluator(
     rubric="The output should represent a reasonable answer to the input."
 )

-# 3. Create a dataset
+# 4. Create a dataset
 dataset = Dataset[str, str](
     cases=test_cases,
     evaluator=evaluator
 )

-# 4. Define a task function
-def get_response(query: str) -> str:
-    agent = Agent(callback_handler=None)
-    return str(agent(query))
-
 # 5. Run evaluations
 report = dataset.run_evaluations(get_response)
-report.display()
+report.run_display()
 ```

## Saving and Loading Datasets
@@ -93,8 +92,8 @@ loaded_dataset = Dataset.from_file("./dataset_files/my_dataset.json", "json")
## Custom Evaluators

 ``` python
-from strands_evaluation.evaluators.evaluator import Evaluator
-from strands_evaluation.types.evaluation import EvaluationData, EvaluationOutput
+from strands_evals.evaluators import Evaluator
+from strands_evals.types import EvaluationData, EvaluationOutput

 class CustomEvaluator(Evaluator[str, str]):
     def evaluate(self, evaluation_case: EvaluationData[str, str]) -> EvaluationOutput:
@@ -122,10 +121,21 @@ dataset = Dataset[str, str](
## Evaluating Tool Usage

 ``` python
+from strands_evals import Case, Dataset
+from strands_evals.evaluators import TrajectoryEvaluator
 from strands_tools import calculator
-from strands_evaluation.evaluators.trajectory_evaluator import TrajectoryEvaluator

-# Create test cases with expected tool trajectories
+# 1. Define task that returns tool usage
+def get_response_with_tools(query: str) -> dict:
+    agent = Agent(tools=[calculator])
+    response = agent(query)
+
+    return {
+        "output": str(response),
+        "trajectory": list(response.metrics.tool_metrics.keys())
+    }
+
+# 2. Create test cases with expected tool trajectories
 test_case = Case[str, str](
     name="calculator-1",
     input="What is the square root of 9?",
@@ -134,23 +144,13 @@ test_case = Case[str, str](
     metadata={"category": "math"}
 )

-# Create trajectory evaluator
+# 3. Create trajectory evaluator
 trajectory_evaluator = TrajectoryEvaluator(
-    rubric="The trajectory should represent a reasonable use of tools based on the input.",
+    rubric="Scoring should measure how well the agent uses appropriate tools for the given task.",
     include_inputs=True
 )

-# Define task that returns tool usage
-def get_response_with_tools(query: str) -> dict:
-    agent = Agent(tools=[calculator])
-    response = agent(query)
-
-    return {
-        "output": str(response),
-        "trajectory": list(response.metrics.tool_metrics.keys())
-    }
-
-# Create dataset and run evaluations
+# 4. Create dataset and run evaluations
 dataset = Dataset[str, str](
     cases=[test_case],
     evaluator=trajectory_evaluator
@@ -159,33 +159,39 @@ dataset = Dataset[str, str](
 report = dataset.run_evaluations(get_response_with_tools)
 ```

-## Async Evaluation
-
-For improved performance with many test cases, use async evaluation:
+## Dataset Generation

 ``` python
-import asyncio
-from strands_evaluation.dataset import Dataset
-from strands_evaluation.evaluators.output_evaluator import OutputEvaluator
-
-# Create dataset with cases and evaluator
-dataset = Dataset(cases=test_cases, evaluator=OutputEvaluator(rubric="Test rubric"))
+from strands_evals.generators import DatasetGenerator
+from strands_evals.evaluators import TrajectoryEvaluator
+
+# 1. Define tool context
+tool_context = """
+Available tools:
+- calculator(expression: str) -> float: Evaluate mathematical expressions
+- web_search(query: str) -> str: Search the web for information
+"""
+
+# 2. Generate dataset from context
+generator = DatasetGenerator[str, str](str, str)
+
+dataset = await generator.generate_dataset(
+    context=tool_context,
+    num_cases=10,
+    evaluator_type=TrajectoryEvaluator,
+    task_description="Math and research assistant with tool usage"
+)

-# Define async task function (optional)
-async def async_task(query):
-    agent = Agent(callback_handler=None)
-    response = await agent.invoke_async(query)
-    return str(response)
+# 3. Save generated dataset
+dataset.to_file("generated_math_research_dataset")
+```

-# Run evaluations asynchronously (works with both sync and async task functions)
-async def main():
-    report = await dataset.run_evaluations_async(async_task, max_workers=5)
-    report.display()
-    return report
+## Available Evaluators

-# Run the async function
-report = asyncio.run(main())
-```
+- **OutputEvaluator**: Evaluates the quality and correctness of agent outputs
+- **TrajectoryEvaluator**: Evaluates the sequence of tools/actions used by agents
+- **InteractionsEvaluator**: Evaluates multi-agent interactions and handoffs
+- **Custom Evaluators**: Create your own evaluation logic by extending the base Evaluator class

## More Examples

0 commit comments