|
124 | 124 | """ |
125 | 125 |
|
126 | 126 |
|
127 | | -judge_environment_state_template = """You are an expert evaluator that assesses the environment state produced by a task according to a user-specified rubric. You'll receive some combination of: |
128 | | -- <Input>: Optional original input that initiated the task |
129 | | -- <Output>: Optional output response from the task |
130 | | -- <ActualEnvironmentState>: The actual state of the environment after task execution |
131 | | -- <ExpectedEnvironmentState>: Optional reference for what the environment state should be |
132 | | -- <Rubric>: Evaluation criteria |
133 | | -
|
134 | | -Evaluate the actual environment state against the expected state and rubric. Focus on whether the task produced the correct side effects in the environment (e.g., files created, database records modified, tests passing, system state changes). Ignore minor formatting differences and focus on semantic correctness of the state. |
135 | | -Keep the reason as concise as possible. |
136 | | -
|
137 | | -Examples: |
138 | | -<Input>Fix the failing test in test_auth.py</Input> |
139 | | -<ActualEnvironmentState>[{"name": "test_results", "state": {"exit_code": 0, "passed": 5, "failed": 0}}]</ActualEnvironmentState> |
140 | | -<ExpectedEnvironmentState>[{"name": "test_results", "state": {"exit_code": 0}}]</ExpectedEnvironmentState> |
141 | | -<Rubric>Pass if all tests pass after the fix. Score 0-1 based on test success.</Rubric> |
142 | | -{"reason": "All 5 tests pass with exit code 0, indicating the fix was successful.", "test_pass": true, "score": 1.0} |
143 | | -
|
144 | | -<Input>Create a user record in the database</Input> |
145 | | -<ActualEnvironmentState>[{"name": "database", "state": {"users_table": [{"id": 1, "name": "John", "email": "john@example.com"}]}}]</ActualEnvironmentState> |
146 | | -<ExpectedEnvironmentState>[{"name": "database", "state": {"users_table": [{"id": 1, "name": "Jane", "email": "jane@example.com"}]}}]</ExpectedEnvironmentState> |
147 | | -<Rubric>Pass if the correct user record was created. Score 0-1 based on record accuracy.</Rubric> |
148 | | -{"reason": "A user record was created but with incorrect data: name is 'John' instead of 'Jane' and email is 'john@example.com' instead of 'jane@example.com'.", "test_pass": false, "score": 0.2} |
149 | | -""" |
150 | | - |
151 | 127 | judge_interactions_template = """You are an expert evaluator that assesses multi-agent interactions according to a user-specified rubric. You'll receive: |
152 | 128 | - <Input>: Optional original input that initiated the interaction sequence |
153 | 129 | - <Interaction>: Current interaction with node name, dependencies, and message |
|
0 commit comments