Skip to content

Commit 6124ce2

Browse files
committed
feat: add environment state evaluation support
Add EnvironmentState type and evaluators for assessing agent side effects on external environments (e.g., file systems, databases, APIs). - Add EnvironmentState model with name/state fields - Add expected_environment_state field to Case - Add actual/expected environment state fields to EvaluationData - Add StateEquals deterministic evaluator for exact state matching - Add EnvironmentStateEvaluator (LLM-based) for semantic state evaluation - Include comprehensive tests for all new functionality
1 parent e5a4c61 commit 6124ce2

File tree

14 files changed

+795
-6
lines changed

14 files changed

+795
-6
lines changed

src/strands_evals/case.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from pydantic import BaseModel, Field
44
from typing_extensions import Any, Generic
55

6-
from .types.evaluation import InputT, Interaction, OutputT
6+
from .types.evaluation import EnvironmentState, InputT, Interaction, OutputT
77

88

99
class Case(BaseModel, Generic[InputT, OutputT]):
@@ -47,4 +47,5 @@ class Case(BaseModel, Generic[InputT, OutputT]):
4747
expected_output: OutputT | None = None
4848
expected_trajectory: list[Any] | None = None
4949
expected_interactions: list[Interaction] | None = None
50+
expected_environment_state: list[EnvironmentState] | None = None
5051
metadata: dict[str, Any] | None = None

src/strands_evals/evaluators/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from .coherence_evaluator import CoherenceEvaluator
22
from .conciseness_evaluator import ConcisenessEvaluator
3-
from .deterministic import Contains, Equals, StartsWith, ToolCalled
3+
from .deterministic import Contains, Equals, StartsWith, StateEquals, ToolCalled
4+
from .environment_state_evaluator import EnvironmentStateEvaluator
45
from .evaluator import Evaluator
56
from .faithfulness_evaluator import FaithfulnessEvaluator
67
from .goal_success_rate_evaluator import GoalSuccessRateEvaluator
@@ -27,8 +28,10 @@
2728
"ToolParameterAccuracyEvaluator",
2829
"ConcisenessEvaluator",
2930
"CoherenceEvaluator",
31+
"EnvironmentStateEvaluator",
3032
"Contains",
3133
"Equals",
3234
"StartsWith",
35+
"StateEquals",
3336
"ToolCalled",
3437
]
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
from .environment_state import StateEquals
12
from .output import Contains, Equals, StartsWith
23
from .trajectory import ToolCalled
34

45
__all__ = [
56
"Contains",
67
"Equals",
78
"StartsWith",
9+
"StateEquals",
810
"ToolCalled",
911
]
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from typing_extensions import Any
2+
3+
from ...types.evaluation import EnvironmentState, EvaluationData, EvaluationOutput, InputT, OutputT
4+
from ..evaluator import Evaluator
5+
6+
7+
def _find_state_by_name(states: list[EnvironmentState], name: str) -> EnvironmentState | None:
8+
"""Find an EnvironmentState by name in a list of states."""
9+
for state in states:
10+
if state.name == name:
11+
return state
12+
return None
13+
14+
15+
class StateEquals(Evaluator[InputT, OutputT]):
    """Checks if a named environment state matches an expected value.

    The expected value is taken from the ``value`` constructor argument when
    provided; otherwise it is looked up by ``name`` in the case's
    ``expected_environment_state``.
    """

    def __init__(self, name: str, value: Any | None = None):
        super().__init__()
        # Name of the environment-state entry this evaluator inspects.
        self.name = name
        # Optional explicit expected value; when None, the case's
        # expected_environment_state is consulted instead.
        self.value = value

    @staticmethod
    def _failed(reason: str) -> list[EvaluationOutput]:
        # Every failure result shares the same score/test_pass shape; build it in one place.
        return [EvaluationOutput(score=0.0, test_pass=False, reason=reason)]

    def _expected(self, evaluation_case: EvaluationData[InputT, OutputT]) -> Any:
        """Resolve the expected value, raising ValueError when none is configured."""
        if self.value is not None:
            return self.value
        if not evaluation_case.expected_environment_state:
            raise ValueError(
                f"no expected value for state '{self.name}': provide value param or expected_environment_state"
            )
        reference = _find_state_by_name(evaluation_case.expected_environment_state, self.name)
        if reference is None:
            raise ValueError(
                f"state '{self.name}' not found in expected_environment_state and no explicit value provided"
            )
        return reference.state

    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        observed_states = evaluation_case.actual_environment_state
        if not observed_states:
            return self._failed(f"state '{self.name}' not found: actual_environment_state is empty or None")

        observed = _find_state_by_name(observed_states, self.name)
        if observed is None:
            return self._failed(f"state '{self.name}' not found in actual_environment_state")

        is_equal = observed.state == self._expected(evaluation_case)
        verdict = "matches" if is_equal else "does not match"
        return [
            EvaluationOutput(
                score=1.0 if is_equal else 0.0,
                test_pass=is_equal,
                reason=f"state '{self.name}' {verdict} expected value",
            )
        ]

    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        # The comparison is pure and synchronous, so the async path just delegates.
        return self.evaluate(evaluation_case)
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
from typing import cast
2+
3+
from strands import Agent
4+
from strands.models.model import Model
5+
from typing_extensions import Union
6+
7+
from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
8+
from .evaluator import Evaluator
9+
from .prompt_templates.case_prompt_template import compose_test_prompt
10+
from .prompt_templates.prompt_templates import judge_environment_state_template as SYSTEM_PROMPT
11+
12+
13+
class EnvironmentStateEvaluator(Evaluator[InputT, OutputT]):
    """Evaluates environment state produced by a task using an LLM judge.

    Attributes:
        rubric: The user-specified criteria for evaluating environment state.
        model: A string representing the model-id for Bedrock to use, or a Model instance.
            Defaults to strands.models.BedrockModel if None.
        system_prompt: System prompt to guide model behavior.
            If None, the evaluator will use the default environment state template.
        include_inputs: Whether to include inputs to the task in the evaluation or not.
    """

    def __init__(
        self,
        rubric: str,
        model: Union[Model, str, None] = None,
        system_prompt: str | None = None,
        include_inputs: bool = True,
    ):
        super().__init__()
        self.rubric = rubric
        self.model = model
        self.include_inputs = include_inputs
        # Fall back to the default template so that system_prompt=None behaves
        # as documented instead of handing None straight to the Agent.
        self.system_prompt = SYSTEM_PROMPT if system_prompt is None else system_prompt

    def _compose_prompt(self, evaluation_case: EvaluationData[InputT, OutputT]) -> str:
        """Build the judge prompt; shared by the sync and async paths."""
        return compose_test_prompt(
            evaluation_case=evaluation_case,
            rubric=self.rubric,
            include_inputs=self.include_inputs,
            uses_environment_state=True,
        )

    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        """Synchronously judge the case's environment state.

        Returns a single-element list with the judge's structured EvaluationOutput.
        """
        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        result = evaluator_agent(self._compose_prompt(evaluation_case), structured_output_model=EvaluationOutput)
        return [cast(EvaluationOutput, result.structured_output)]

    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        """Async variant of :meth:`evaluate`, using the agent's async invocation."""
        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        result = await evaluator_agent.invoke_async(
            self._compose_prompt(evaluation_case), structured_output_model=EvaluationOutput
        )
        return [cast(EvaluationOutput, result.structured_output)]

src/strands_evals/evaluators/prompt_templates/case_prompt_template.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ def compose_test_prompt(
77
include_inputs: bool,
88
uses_trajectory: bool = False,
99
trajectory_description: dict | None = None,
10+
uses_environment_state: bool = False,
1011
) -> str:
1112
"""
1213
Compose the prompt for a test case evaluation.
@@ -17,19 +18,21 @@ def compose_test_prompt(
1718
include_inputs: Whether to include the input in the prompt
1819
uses_trajectory: Whether this is a trajectory-based evaluation
1920
trajectory_description: A dictionary describing the type of trajectory expected for this evaluation.
21+
uses_environment_state: Whether this is an environment-state-based evaluation
2022
2123
Returns:
2224
str: The formatted evaluation prompt
2325
2426
Raises:
25-
Exception: If actual_output is missing for non-trajectory evaluations
27+
Exception: If actual_output is missing for output-only evaluations
2628
Exception: If actual_trajectory is missing for trajectory evaluations
29+
Exception: If actual_environment_state is missing for environment state evaluations
2730
"""
2831
evaluation_prompt = "Evaluate this singular test case. THE FINAL SCORE MUST BE A DECIMAL BETWEEN 0.0 AND 1.0 (NOT 0 to 10 OR 0 to 100). \n"
2932
if include_inputs:
3033
evaluation_prompt += f"<Input>{evaluation_case.input}</Input>\n"
3134

32-
if uses_trajectory: # trajectory evaluations don't require actual_output
35+
if uses_trajectory or uses_environment_state: # these evaluations don't require actual_output
3336
if evaluation_case.actual_output:
3437
evaluation_prompt += f"<Output>{evaluation_case.actual_output}</Output>\n"
3538
else:
@@ -53,6 +56,18 @@ def compose_test_prompt(
5356
if trajectory_description:
5457
evaluation_prompt += f"<TrajectoryDescription>{trajectory_description}</TrajectoryDescription>\n"
5558

59+
if uses_environment_state:
60+
if evaluation_case.actual_environment_state is None:
61+
raise Exception("Please make sure the task function return a dictionary with the key 'environment_state'.")
62+
evaluation_prompt += (
63+
f"<ActualEnvironmentState>{evaluation_case.actual_environment_state}</ActualEnvironmentState>\n"
64+
)
65+
66+
if evaluation_case.expected_environment_state:
67+
evaluation_prompt += (
68+
f"<ExpectedEnvironmentState>{evaluation_case.expected_environment_state}</ExpectedEnvironmentState>\n"
69+
)
70+
5671
evaluation_prompt += f"<Rubric>{rubric}</Rubric>"
5772

5873
return evaluation_prompt

src/strands_evals/evaluators/prompt_templates/prompt_templates.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,30 @@
124124
"""
125125

126126

127+
judge_environment_state_template = """You are an expert evaluator that assesses the environment state produced by a task according to a user-specified rubric. You'll receive some combination of:
128+
- <Input>: Optional original input that initiated the task
129+
- <Output>: Optional output response from the task
130+
- <ActualEnvironmentState>: The actual state of the environment after task execution
131+
- <ExpectedEnvironmentState>: Optional reference for what the environment state should be
132+
- <Rubric>: Evaluation criteria
133+
134+
Evaluate the actual environment state against the expected state and rubric. Focus on whether the task produced the correct side effects in the environment (e.g., files created, database records modified, tests passing, system state changes). Ignore minor formatting differences and focus on semantic correctness of the state.
135+
Keep the reason as concise as possible.
136+
137+
Examples:
138+
<Input>Fix the failing test in test_auth.py</Input>
139+
<ActualEnvironmentState>[{"name": "test_results", "state": {"exit_code": 0, "passed": 5, "failed": 0}}]</ActualEnvironmentState>
140+
<ExpectedEnvironmentState>[{"name": "test_results", "state": {"exit_code": 0}}]</ExpectedEnvironmentState>
141+
<Rubric>Pass if all tests pass after the fix. Score 0-1 based on test success.</Rubric>
142+
{"reason": "All 5 tests pass with exit code 0, indicating the fix was successful.", "test_pass": true, "score": 1.0}
143+
144+
<Input>Create a user record in the database</Input>
145+
<ActualEnvironmentState>[{"name": "database", "state": {"users_table": [{"id": 1, "name": "John", "email": "john@example.com"}]}}]</ActualEnvironmentState>
146+
<ExpectedEnvironmentState>[{"name": "database", "state": {"users_table": [{"id": 1, "name": "Jane", "email": "jane@example.com"}]}}]</ExpectedEnvironmentState>
147+
<Rubric>Pass if the correct user record was created. Score 0-1 based on record accuracy.</Rubric>
148+
{"reason": "A user record was created but with incorrect data: name is 'John' instead of 'Jane' and email is 'john@example.com' instead of 'jane@example.com'.", "test_pass": false, "score": 0.2}
149+
"""
150+
127151
judge_interactions_template = """You are an expert evaluator that assesses multi-agent interactions according to a user-specified rubric. You'll receive:
128152
- <Input>: Optional original input that initiated the interaction sequence
129153
- <Interaction>: Current interaction with node name, dependencies, and message

src/strands_evals/experiment.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
from typing_extensions import Any, Generic
1818

1919
from .case import Case
20-
from .evaluators.deterministic import Contains, Equals, StartsWith, ToolCalled
20+
from .evaluators.deterministic import Contains, Equals, StartsWith, StateEquals, ToolCalled
21+
from .evaluators.environment_state_evaluator import EnvironmentStateEvaluator
2122
from .evaluators.evaluator import Evaluator
2223
from .evaluators.interactions_evaluator import InteractionsEvaluator
2324
from .evaluators.output_evaluator import OutputEvaluator
@@ -202,13 +203,15 @@ def _run_task(
202203
expected_output=case.expected_output,
203204
expected_trajectory=case.expected_trajectory,
204205
expected_interactions=case.expected_interactions,
206+
expected_environment_state=case.expected_environment_state,
205207
metadata=case.metadata,
206208
)
207209
task_output = task(case)
208210
if isinstance(task_output, dict): # could be evaluating the trajectory as well
209211
evaluation_context.actual_output = task_output.get("output")
210212
evaluation_context.actual_trajectory = task_output.get("trajectory")
211213
evaluation_context.actual_interactions = task_output.get("interactions")
214+
evaluation_context.actual_environment_state = task_output.get("environment_state")
212215
new_input = task_output.get("input", None) # allows the user to update the input in the task function
213216
if new_input is not None:
214217
evaluation_context.input = new_input
@@ -238,6 +241,7 @@ async def _run_task_async(
238241
expected_output=case.expected_output,
239242
expected_trajectory=case.expected_trajectory,
240243
expected_interactions=case.expected_interactions,
244+
expected_environment_state=case.expected_environment_state,
241245
metadata=case.metadata,
242246
)
243247

@@ -252,6 +256,7 @@ async def _run_task_async(
252256
evaluation_context.actual_output = task_output.get("output")
253257
evaluation_context.actual_trajectory = task_output.get("trajectory")
254258
evaluation_context.actual_interactions = task_output.get("interactions")
259+
evaluation_context.actual_environment_state = task_output.get("environment_state")
255260
# allows the user to update the input in the task function
256261
new_input = task_output.get("input", None)
257262
if new_input is not None:
@@ -798,7 +803,9 @@ def from_dict(cls, data: dict, custom_evaluators: list[type[Evaluator]] | None =
798803
"Equals": Equals,
799804
"Contains": Contains,
800805
"StartsWith": StartsWith,
806+
"StateEquals": StateEquals,
801807
"ToolCalled": ToolCalled,
808+
"EnvironmentStateEvaluator": EnvironmentStateEvaluator,
802809
}
803810
all_evaluators: dict[str, type[Evaluator]] = {
804811
**default_evaluators,

src/strands_evals/types/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
from .evaluation import EvaluationData, EvaluationOutput, InputT, Interaction, OutputT, TaskOutput
1+
from .evaluation import EnvironmentState, EvaluationData, EvaluationOutput, InputT, Interaction, OutputT, TaskOutput
22
from .simulation import ActorProfile, ActorResponse
33

44
__all__ = [
5+
"EnvironmentState",
56
"Interaction",
67
"TaskOutput",
78
"EvaluationData",

src/strands_evals/types/evaluation.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,18 @@ class Interaction(TypedDict, total=False):
3333
messages: list
3434

3535

36+
class EnvironmentState(BaseModel):
    """A named piece of environment state captured after task execution.

    Attributes:
        name: Identifier for this state (e.g., "test_results", "file_system")
        state: The captured state data
    """

    # Identifier used to pair an actual state with its expected counterpart of the same name.
    name: str
    # Arbitrary captured state payload; deterministic evaluators compare it with ==,
    # so it should be a plain, equality-comparable value (dict, list, scalar, ...).
    state: Any
46+
47+
3648
class TaskOutput(TypedDict, total=False):
3749
"""
3850
Structured output format for task functions that return complex results.
@@ -59,6 +71,7 @@ class TaskOutput(TypedDict, total=False):
5971
trajectory: Union[list[Any], Session, None]
6072
interactions: list[Interaction]
6173
input: Any
74+
environment_state: list[EnvironmentState]
6275

6376

6477
class EvaluationData(BaseModel, Generic[InputT, OutputT]):
@@ -86,6 +99,8 @@ class EvaluationData(BaseModel, Generic[InputT, OutputT]):
8699
metadata: dict[str, Any] | None = None
87100
actual_interactions: list[Interaction] | None = None
88101
expected_interactions: list[Interaction] | None = None
102+
actual_environment_state: list[EnvironmentState] | None = None
103+
expected_environment_state: list[EnvironmentState] | None = None
89104

90105

91106
class EvaluationOutput(BaseModel):

0 commit comments

Comments
 (0)