Skip to content

Commit 36f4903

Browse files
author
Daniel Schlör
committed
tweak to avoid unnecessary eval calls
1 parent dd17a02 commit 36f4903

File tree

2 files changed

+25
-2
lines changed

2 files changed

+25
-2
lines changed

src/agent.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,7 @@ async def run(self, message: Message, updater: TaskUpdater) -> None:
104104
challenges = request.config.get("challenges", [])
105105
agent_config = request.config.get("agent_config", {})
106106
evaluator_config = request.config.get("evaluator_config", {})
107-
max_iterations = request.config.get("max_iterations", 10)
107+
max_iterations = request.config.get("max_iterations", 5)
108108
enable_phoenix = request.config.get("enable_phoenix", False)
109109
include_goal = request.config.get("include_goal", "first")
110110
include_tactic = request.config.get("include_tactic", "first")

src/evaluator/step_evaluator.py

Lines changed: 24 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -121,14 +121,28 @@ def evaluate_prediction(
121121
- matched_command: The predicted command that matched
122122
- confidence: Confidence score (0.0-1.0)
123123
- explanation: Reason for the evaluation
124+
- agent_unknown: True if agent explicitly said "I don't know"
124125
"""
125126
if not agent_response or not agent_response.strip():
126127
return {
127128
"completed": False,
128129
"matched_alternative_index": -1,
129130
"matched_command": None,
130131
"confidence": 0.0,
131-
"explanation": "Empty agent response"
132+
"explanation": "Empty agent response",
133+
"is_fine_grained": False
134+
}
135+
136+
# Check for exact "I don't know" response - no need to evaluate or reiterate
137+
if agent_response.strip() == "I don't know":
138+
return {
139+
"completed": False,
140+
"matched_alternative_index": -1,
141+
"matched_command": None,
142+
"confidence": 0.0,
143+
"explanation": "Agent explicitly indicated lack of knowledge",
144+
"is_fine_grained": False,
145+
"agent_unknown": True
132146
}
133147

134148
# Protocol determines which alternatives to check
@@ -932,6 +946,15 @@ def check_goal_reached(
932946
- reason: Explanation of the decision
933947
- is_fine_grained: Whether the command/result is too fine-grained
934948
"""
949+
# If agent explicitly said "I don't know", stop without reiterating
950+
if evaluation_result.get("agent_unknown", False):
951+
return {
952+
"goal_reached": True, # Consider it "ruled out"
953+
"needs_more_predictions": False,
954+
"reason": "Agent indicated lack of knowledge - goal ruled out",
955+
"is_fine_grained": False
956+
}
957+
935958
# If we found a match, goal is reached
936959
if evaluation_result["completed"]:
937960
# Adapt message based on protocol and task mode

0 commit comments

Comments
 (0)