Skip to content

Commit 6124ce2

Browse files
committed
feat: add environment state evaluation support
Add EnvironmentState type and evaluators for assessing agent side effects on external environments (e.g., file systems, databases, APIs). - Add EnvironmentState model with name/state fields - Add expected_environment_state field to Case - Add actual/expected environment state fields to EvaluationData - Add StateEquals deterministic evaluator for exact state matching - Add EnvironmentStateEvaluator (LLM-based) for semantic state evaluation - Include comprehensive tests for all new functionality
1 parent e5a4c61 commit 6124ce2

File tree

14 files changed

+795
-6
lines changed

14 files changed

+795
-6
lines changed

src/strands_evals/case.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from pydantic import BaseModel, Field
44
from typing_extensions import Any, Generic
55

6-
from .types.evaluation import InputT, Interaction, OutputT
6+
from .types.evaluation import EnvironmentState, InputT, Interaction, OutputT
77

88

99
class Case(BaseModel, Generic[InputT, OutputT]):
@@ -47,4 +47,5 @@ class Case(BaseModel, Generic[InputT, OutputT]):
4747
expected_output: OutputT | None = None
4848
expected_trajectory: list[Any] | None = None
4949
expected_interactions: list[Interaction] | None = None
50+
expected_environment_state: list[EnvironmentState] | None = None
5051
metadata: dict[str, Any] | None = None

src/strands_evals/evaluators/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from .coherence_evaluator import CoherenceEvaluator
22
from .conciseness_evaluator import ConcisenessEvaluator
3-
from .deterministic import Contains, Equals, StartsWith, ToolCalled
3+
from .deterministic import Contains, Equals, StartsWith, StateEquals, ToolCalled
4+
from .environment_state_evaluator import EnvironmentStateEvaluator
45
from .evaluator import Evaluator
56
from .faithfulness_evaluator import FaithfulnessEvaluator
67
from .goal_success_rate_evaluator import GoalSuccessRateEvaluator
@@ -27,8 +28,10 @@
2728
"ToolParameterAccuracyEvaluator",
2829
"ConcisenessEvaluator",
2930
"CoherenceEvaluator",
31+
"EnvironmentStateEvaluator",
3032
"Contains",
3133
"Equals",
3234
"StartsWith",
35+
"StateEquals",
3336
"ToolCalled",
3437
]
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
from .environment_state import StateEquals
12
from .output import Contains, Equals, StartsWith
23
from .trajectory import ToolCalled
34

45
__all__ = [
56
"Contains",
67
"Equals",
78
"StartsWith",
9+
"StateEquals",
810
"ToolCalled",
911
]
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from typing_extensions import Any
2+
3+
from ...types.evaluation import EnvironmentState, EvaluationData, EvaluationOutput, InputT, OutputT
4+
from ..evaluator import Evaluator
5+
6+
7+
def _find_state_by_name(states: list[EnvironmentState], name: str) -> EnvironmentState | None:
8+
"""Find an EnvironmentState by name in a list of states."""
9+
for state in states:
10+
if state.name == name:
11+
return state
12+
return None
13+
14+
15+
class StateEquals(Evaluator[InputT, OutputT]):
    """Checks if a named environment state matches an expected value.

    The expected value is taken from the ``value`` constructor argument when
    provided; otherwise it is looked up by ``name`` in the case's
    ``expected_environment_state``.
    """

    def __init__(self, name: str, value: Any | None = None):
        super().__init__()
        # Name of the environment-state entry this evaluator inspects.
        self.name = name
        # Optional explicit expected value; when None, the case's
        # expected_environment_state is consulted instead.
        self.value = value

    @staticmethod
    def _failed(reason: str) -> list[EvaluationOutput]:
        # Every failure result shares the same score/test_pass shape; build it in one place.
        return [EvaluationOutput(score=0.0, test_pass=False, reason=reason)]

    def _expected(self, evaluation_case: EvaluationData[InputT, OutputT]) -> Any:
        """Resolve the expected value, raising ValueError when none is configured."""
        if self.value is not None:
            return self.value
        if not evaluation_case.expected_environment_state:
            raise ValueError(
                f"no expected value for state '{self.name}': provide value param or expected_environment_state"
            )
        reference = _find_state_by_name(evaluation_case.expected_environment_state, self.name)
        if reference is None:
            raise ValueError(
                f"state '{self.name}' not found in expected_environment_state and no explicit value provided"
            )
        return reference.state

    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        observed_states = evaluation_case.actual_environment_state
        if not observed_states:
            return self._failed(f"state '{self.name}' not found: actual_environment_state is empty or None")

        observed = _find_state_by_name(observed_states, self.name)
        if observed is None:
            return self._failed(f"state '{self.name}' not found in actual_environment_state")

        is_equal = observed.state == self._expected(evaluation_case)
        verdict = "matches" if is_equal else "does not match"
        return [
            EvaluationOutput(
                score=1.0 if is_equal else 0.0,
                test_pass=is_equal,
                reason=f"state '{self.name}' {verdict} expected value",
            )
        ]

    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        # The comparison is pure and synchronous, so the async path just delegates.
        return self.evaluate(evaluation_case)
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
from typing import cast
2+
3+
from strands import Agent
4+
from strands.models.model import Model
5+
from typing_extensions import Union
6+
7+
from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
8+
from .evaluator import Evaluator
9+
from .prompt_templates.case_prompt_template import compose_test_prompt
10+
from .prompt_templates.prompt_templates import judge_environment_state_template as SYSTEM_PROMPT
11+
12+
13+
class EnvironmentStateEvaluator(Evaluator[InputT, OutputT]):
    """Evaluates environment state produced by a task using an LLM judge.

    Attributes:
        rubric: The user-specified criteria for evaluating environment state.
        model: A string representing the model-id for Bedrock to use, or a Model instance.
            Defaults to strands.models.BedrockModel if None.
        system_prompt: System prompt to guide model behavior.
            If None, the evaluator will use the default environment state template.
        include_inputs: Whether to include inputs to the task in the evaluation or not.
    """

    def __init__(
        self,
        rubric: str,
        model: Union[Model, str, None] = None,
        system_prompt: str | None = None,
        include_inputs: bool = True,
    ):
        super().__init__()
        self.rubric = rubric
        self.model = model
        self.include_inputs = include_inputs
        # Fall back to the default template so that system_prompt=None behaves
        # as documented instead of handing None straight to the Agent.
        self.system_prompt = SYSTEM_PROMPT if system_prompt is None else system_prompt

    def _compose_prompt(self, evaluation_case: EvaluationData[InputT, OutputT]) -> str:
        """Build the judge prompt; shared by the sync and async paths."""
        return compose_test_prompt(
            evaluation_case=evaluation_case,
            rubric=self.rubric,
            include_inputs=self.include_inputs,
            uses_environment_state=True,
        )

    def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        """Synchronously judge the case's environment state.

        Returns a single-element list with the judge's structured EvaluationOutput.
        """
        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        result = evaluator_agent(self._compose_prompt(evaluation_case), structured_output_model=EvaluationOutput)
        return [cast(EvaluationOutput, result.structured_output)]

    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
        """Async variant of :meth:`evaluate`, using the agent's async invocation."""
        evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
        result = await evaluator_agent.invoke_async(
            self._compose_prompt(evaluation_case), structured_output_model=EvaluationOutput
        )
        return [cast(EvaluationOutput, result.structured_output)]

src/strands_evals/evaluators/prompt_templates/case_prompt_template.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ def compose_test_prompt(
77
include_inputs: bool,
88
uses_trajectory: bool = False,
99
trajectory_description: dict | None = None,
10+
uses_environment_state: bool = False,
1011
) -> str:
1112
"""
1213
Compose the prompt for a test case evaluation.
@@ -17,19 +18,21 @@ def compose_test_prompt(
1718
include_inputs: Whether to include the input in the prompt
1819
uses_trajectory: Whether this is a trajectory-based evaluation
1920
trajectory_description: A dictionary describing the type of trajectory expected for this evaluation.
21+
uses_environment_state: Whether this is an environment-state-based evaluation
2022
2123
Returns:
2224
str: The formatted evaluation prompt
2325
2426
Raises:
25-
Exception: If actual_output is missing for non-trajectory evaluations
27+
Exception: If actual_output is missing for output-only evaluations
2628
Exception: If actual_trajectory is missing for trajectory evaluations
29+
Exception: If actual_environment_state is missing for environment state evaluations
2730
"""
2831
evaluation_prompt = "Evaluate this singular test case. THE FINAL SCORE MUST BE A DECIMAL BETWEEN 0.0 AND 1.0 (NOT 0 to 10 OR 0 to 100). \n"
2932
if include_inputs:
3033
evaluation_prompt += f"<Input>{evaluation_case.input}</Input>\n"
3134

32-
if uses_trajectory: # trajectory evaluations don't require actual_output
35+
if uses_trajectory or uses_environment_state: # these evaluations don't require actual_output
3336
if evaluation_case.actual_output:
3437
evaluation_prompt += f"<Output>{evaluation_case.actual_output}</Output>\n"
3538
else:
@@ -53,6 +56,18 @@ def compose_test_prompt(
5356
if trajectory_description:
5457
evaluation_prompt += f"<TrajectoryDescription>{trajectory_description}</TrajectoryDescription>\n"
5558

59+
if uses_environment_state:
60+
if evaluation_case.actual_environment_state is None:
61+
raise Exception("Please make sure the task function return a dictionary with the key 'environment_state'.")
62+
evaluation_prompt += (
63+
f"<ActualEnvironmentState>{evaluation_case.actual_environment_state}</ActualEnvironmentState>\n"
64+
)
65+
66+
if evaluation_case.expected_environment_state:
67+
evaluation_prompt += (
68+
f"<ExpectedEnvironmentState>{evaluation_case.expected_environment_state}</ExpectedEnvironmentState>\n"
69+
)
70+
5671
evaluation_prompt += f"<Rubric>{rubric}</Rubric>"
5772

5873
return evaluation_prompt

src/strands_evals/evaluators/prompt_templates/prompt_templates.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,30 @@
124124
"""
125125

126126

127+
judge_environment_state_template = """You are an expert evaluator that assesses the environment state produced by a task according to a user-specified rubric. You'll receive some combination of:
128+
- <Input>: Optional original input that initiated the task
129+
- <Output>: Optional output response from the task
130+
- <ActualEnvironmentState>: The actual state of the environment after task execution
131+
- <ExpectedEnvironmentState>: Optional reference for what the environment state should be
132+
- <Rubric>: Evaluation criteria
133+
134+
Evaluate the actual environment state against the expected state and rubric. Focus on whether the task produced the correct side effects in the environment (e.g., files created, database records modified, tests passing, system state changes). Ignore minor formatting differences and focus on semantic correctness of the state.
135+
Keep the reason as concise as possible.
136+
137+
Examples:
138+
<Input>Fix the failing test in test_auth.py</Input>
139+
<ActualEnvironmentState>[{"name": "test_results", "state": {"exit_code": 0, "passed": 5, "failed": 0}}]</ActualEnvironmentState>
140+
<ExpectedEnvironmentState>[{"name": "test_results", "state": {"exit_code": 0}}]</ExpectedEnvironmentState>
141+
<Rubric>Pass if all tests pass after the fix. Score 0-1 based on test success.</Rubric>
142+
{"reason": "All 5 tests pass with exit code 0, indicating the fix was successful.", "test_pass": true, "score": 1.0}
143+
144+
<Input>Create a user record in the database</Input>
145+
<ActualEnvironmentState>[{"name": "database", "state": {"users_table": [{"id": 1, "name": "John", "email": "john@example.com"}]}}]</ActualEnvironmentState>
146+
<ExpectedEnvironmentState>[{"name": "database", "state": {"users_table": [{"id": 1, "name": "Jane", "email": "jane@example.com"}]}}]</ExpectedEnvironmentState>
147+
<Rubric>Pass if the correct user record was created. Score 0-1 based on record accuracy.</Rubric>
148+
{"reason": "A user record was created but with incorrect data: name is 'John' instead of 'Jane' and email is 'john@example.com' instead of 'jane@example.com'.", "test_pass": false, "score": 0.2}
149+
"""
150+
127151
judge_interactions_template = """You are an expert evaluator that assesses multi-agent interactions according to a user-specified rubric. You'll receive:
128152
- <Input>: Optional original input that initiated the interaction sequence
129153
- <Interaction>: Current interaction with node name, dependencies, and message

src/strands_evals/experiment.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
from typing_extensions import Any, Generic
1818

1919
from .case import Case
20-
from .evaluators.deterministic import Contains, Equals, StartsWith, ToolCalled
20+
from .evaluators.deterministic import Contains, Equals, StartsWith, StateEquals, ToolCalled
21+
from .evaluators.environment_state_evaluator import EnvironmentStateEvaluator
2122
from .evaluators.evaluator import Evaluator
2223
from .evaluators.interactions_evaluator import InteractionsEvaluator
2324
from .evaluators.output_evaluator import OutputEvaluator
@@ -202,13 +203,15 @@ def _run_task(
202203
expected_output=case.expected_output,
203204
expected_trajectory=case.expected_trajectory,
204205
expected_interactions=case.expected_interactions,
206+
expected_environment_state=case.expected_environment_state,
205207
metadata=case.metadata,
206208
)
207209
task_output = task(case)
208210
if isinstance(task_output, dict): # could be evaluating the trajectory as well
209211
evaluation_context.actual_output = task_output.get("output")
210212
evaluation_context.actual_trajectory = task_output.get("trajectory")
211213
evaluation_context.actual_interactions = task_output.get("interactions")
214+
evaluation_context.actual_environment_state = task_output.get("environment_state")
212215
new_input = task_output.get("input", None) # allows the user to update the input in the task function
213216
if new_input is not None:
214217
evaluation_context.input = new_input
@@ -238,6 +241,7 @@ async def _run_task_async(
238241
expected_output=case.expected_output,
239242
expected_trajectory=case.expected_trajectory,
240243
expected_interactions=case.expected_interactions,
244+
expected_environment_state=case.expected_environment_state,
241245
metadata=case.metadata,
242246
)
243247

@@ -252,6 +256,7 @@ async def _run_task_async(
252256
evaluation_context.actual_output = task_output.get("output")
253257
evaluation_context.actual_trajectory = task_output.get("trajectory")
254258
evaluation_context.actual_interactions = task_output.get("interactions")
259+
evaluation_context.actual_environment_state = task_output.get("environment_state")
255260
# allows the user to update the input in the task function
256261
new_input = task_output.get("input", None)
257262
if new_input is not None:
@@ -798,7 +803,9 @@ def from_dict(cls, data: dict, custom_evaluators: list[type[Evaluator]] | None =
798803
"Equals": Equals,
799804
"Contains": Contains,
800805
"StartsWith": StartsWith,
806+
"StateEquals": StateEquals,
801807
"ToolCalled": ToolCalled,
808+
"EnvironmentStateEvaluator": EnvironmentStateEvaluator,
802809
}
803810
all_evaluators: dict[str, type[Evaluator]] = {
804811
**default_evaluators,

src/strands_evals/types/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
from .evaluation import EvaluationData, EvaluationOutput, InputT, Interaction, OutputT, TaskOutput
1+
from .evaluation import EnvironmentState, EvaluationData, EvaluationOutput, InputT, Interaction, OutputT, TaskOutput
22
from .simulation import ActorProfile, ActorResponse
33

44
__all__ = [
5+
"EnvironmentState",
56
"Interaction",
67
"TaskOutput",
78
"EvaluationData",

src/strands_evals/types/evaluation.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,18 @@ class Interaction(TypedDict, total=False):
3333
messages: list
3434

3535

36+
class EnvironmentState(BaseModel):
    """A named piece of environment state captured after task execution.

    Attributes:
        name: Identifier for this state (e.g., "test_results", "file_system")
        state: The captured state data
    """

    # Identifier used to pair an actual state with its expected counterpart of the same name.
    name: str
    # Arbitrary captured state payload; deterministic evaluators compare it with ==,
    # so it should be a plain, equality-comparable value (dict, list, scalar, ...).
    state: Any
46+
47+
3648
class TaskOutput(TypedDict, total=False):
3749
"""
3850
Structured output format for task functions that return complex results.
@@ -59,6 +71,7 @@ class TaskOutput(TypedDict, total=False):
5971
trajectory: Union[list[Any], Session, None]
6072
interactions: list[Interaction]
6173
input: Any
74+
environment_state: list[EnvironmentState]
6275

6376

6477
class EvaluationData(BaseModel, Generic[InputT, OutputT]):
@@ -86,6 +99,8 @@ class EvaluationData(BaseModel, Generic[InputT, OutputT]):
8699
metadata: dict[str, Any] | None = None
87100
actual_interactions: list[Interaction] | None = None
88101
expected_interactions: list[Interaction] | None = None
102+
actual_environment_state: list[EnvironmentState] | None = None
103+
expected_environment_state: list[EnvironmentState] | None = None
89104

90105

91106
class EvaluationOutput(BaseModel):

0 commit comments

Comments
 (0)