Skip to content

Commit 8812c38

Browse files
authored
Merge pull request #5 from strands-agents/squashed_refactoring
Refactor code and documentation
2 parents 9d7e9ae + ba236cd commit 8812c38

File tree

63 files changed

+3802
-3568
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

63 files changed

+3802
-3568
lines changed

README.md

Lines changed: 57 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -40,11 +40,15 @@ hatch run list
4040

4141
```python
4242
from strands import Agent
43-
from strands_evaluation.dataset import Dataset
44-
from strands_evaluation.case import Case
45-
from strands_evaluation.evaluators.output_evaluator import OutputEvaluator
43+
from strands_evals import Case, Dataset
44+
from strands_evals.evaluators import OutputEvaluator
4645

47-
# 1. Create test cases
46+
# 1. Define a task function
47+
def get_response(query: str) -> str:
48+
agent = Agent(callback_handler=None)
49+
return str(agent(query))
50+
51+
# 2. Create test cases
4852
test_cases = [
4953
Case[str, str](
5054
name="knowledge-1",
@@ -59,25 +63,20 @@ test_cases = [
5963
)
6064
]
6165

62-
# 2. Create an evaluator
66+
# 3. Create an evaluator
6367
evaluator = OutputEvaluator(
6468
rubric="The output should represent a reasonable answer to the input."
6569
)
6670

67-
# 3. Create a dataset
71+
# 4. Create a dataset
6872
dataset = Dataset[str, str](
6973
cases=test_cases,
7074
evaluator=evaluator
7175
)
7276

73-
# 4. Define a task function
74-
def get_response(query: str) -> str:
75-
agent = Agent(callback_handler=None)
76-
return str(agent(query))
77-
7877
# 5. Run evaluations
7978
report = dataset.run_evaluations(get_response)
80-
report.display()
79+
report.run_display()
8180
```
8281

8382
## Saving and Loading Datasets
@@ -93,8 +92,8 @@ loaded_dataset = Dataset.from_file("./dataset_files/my_dataset.json", "json")
9392
## Custom Evaluators
9493

9594
```python
96-
from strands_evaluation.evaluators.evaluator import Evaluator
97-
from strands_evaluation.types.evaluation import EvaluationData, EvaluationOutput
95+
from strands_evals.evaluators import Evaluator
96+
from strands_evals.types import EvaluationData, EvaluationOutput
9897

9998
class CustomEvaluator(Evaluator[str, str]):
10099
def evaluate(self, evaluation_case: EvaluationData[str, str]) -> EvaluationOutput:
@@ -122,10 +121,21 @@ dataset = Dataset[str, str](
122121
## Evaluating Tool Usage
123122

124123
```python
124+
from strands_evals import Case, Dataset
125+
from strands_evals.evaluators import TrajectoryEvaluator
125126
from strands_tools import calculator
126-
from strands_evaluation.evaluators.trajectory_evaluator import TrajectoryEvaluator
127127

128-
# Create test cases with expected tool trajectories
128+
# 1. Define task that returns tool usage
129+
def get_response_with_tools(query: str) -> dict:
130+
agent = Agent(tools=[calculator])
131+
response = agent(query)
132+
133+
return {
134+
"output": str(response),
135+
"trajectory": list(response.metrics.tool_metrics.keys())
136+
}
137+
138+
# 2. Create test cases with expected tool trajectories
129139
test_case = Case[str, str](
130140
name="calculator-1",
131141
input="What is the square root of 9?",
@@ -134,23 +144,13 @@ test_case = Case[str, str](
134144
metadata={"category": "math"}
135145
)
136146

137-
# Create trajectory evaluator
147+
# 3. Create trajectory evaluator
138148
trajectory_evaluator = TrajectoryEvaluator(
139-
rubric="The trajectory should represent a reasonable use of tools based on the input.",
149+
rubric="Scoring should measure how well the agent uses appropriate tools for the given task.",
140150
include_inputs=True
141151
)
142152

143-
# Define task that returns tool usage
144-
def get_response_with_tools(query: str) -> dict:
145-
agent = Agent(tools=[calculator])
146-
response = agent(query)
147-
148-
return {
149-
"output": str(response),
150-
"trajectory": list(response.metrics.tool_metrics.keys())
151-
}
152-
153-
# Create dataset and run evaluations
153+
# 4. Create dataset and run evaluations
154154
dataset = Dataset[str, str](
155155
cases=[test_case],
156156
evaluator=trajectory_evaluator
@@ -159,33 +159,39 @@ dataset = Dataset[str, str](
159159
report = dataset.run_evaluations(get_response_with_tools)
160160
```
161161

162-
## Async Evaluation
163-
164-
For improved performance with many test cases, use async evaluation:
162+
## Dataset Generation
165163

166164
```python
167-
import asyncio
168-
from strands_evaluation.dataset import Dataset
169-
from strands_evaluation.evaluators.output_evaluator import OutputEvaluator
170-
171-
# Create dataset with cases and evaluator
172-
dataset = Dataset(cases=test_cases, evaluator=OutputEvaluator(rubric="Test rubric"))
165+
from strands_evals.generators import DatasetGenerator
166+
from strands_evals.evaluators import TrajectoryEvaluator
167+
168+
# 1. Define tool context
169+
tool_context = """
170+
Available tools:
171+
- calculator(expression: str) -> float: Evaluate mathematical expressions
172+
- web_search(query: str) -> str: Search the web for information
173+
"""
174+
175+
# 2. Generate dataset from context
176+
generator = DatasetGenerator[str, str](str, str)
177+
178+
dataset = await generator.generate_dataset(
179+
context=tool_context,
180+
num_cases=10,
181+
evaluator_type=TrajectoryEvaluator,
182+
task_description="Math and research assistant with tool usage"
183+
)
173184

174-
# Define async task function (optional)
175-
async def async_task(query):
176-
agent = Agent(callback_handler=None)
177-
response = await agent.invoke_async(query)
178-
return str(response)
185+
# 3. Save generated dataset
186+
dataset.to_file("generated_math_research_dataset")
187+
```
179188

180-
# Run evaluations asynchronously (works with both sync and async task functions)
181-
async def main():
182-
report = await dataset.run_evaluations_async(async_task, max_workers=5)
183-
report.display()
184-
return report
189+
## Available Evaluators
185190

186-
# Run the async function
187-
report = asyncio.run(main())
188-
```
191+
- **OutputEvaluator**: Evaluates the quality and correctness of agent outputs
192+
- **TrajectoryEvaluator**: Evaluates the sequence of tools/actions used by agents
193+
- **InteractionsEvaluator**: Evaluates multi-agent interactions and handoffs
194+
- **Custom Evaluators**: Create your own evaluation logic by extending the base Evaluator class
189195

190196
## More Examples
191197

pyproject.toml

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@ requires = ["hatchling", "hatch-vcs"]
33
build-backend = "hatchling.build"
44

55
[project]
6-
name = "strands-evaluation"
6+
name = "strands-agents-evals"
77
dynamic = ["version"]
88
description = "Evaluation framework for Strands"
99
readme = "README.md"
@@ -21,6 +21,9 @@ dependencies = [
2121
"typing-extensions>=4.0",
2222
]
2323

24+
[tool.hatch.build.targets.wheel]
25+
packages = ["src/strands_evals"]
26+
2427
[project.optional-dependencies]
2528
test = [
2629
"pytest>=7.0",
@@ -87,7 +90,7 @@ select = [
8790
]
8891

8992
[tool.hatch.version]
90-
path = "src/strands_evaluation/__init__.py"
93+
path = "src/strands_evals/__init__.py"
9194
[tool.pytest.ini_options]
9295
asyncio_mode = "auto"
9396
testpaths = ["tests"]

0 commit comments

Comments
 (0)