@@ -121,14 +121,28 @@ def evaluate_prediction(
121121 - matched_command: The predicted command that matched
122122 - confidence: Confidence score (0.0-1.0)
123123 - explanation: Reason for the evaluation
124+ - agent_unknown: True if agent explicitly said "I don't know"
124125 """
125126 if not agent_response or not agent_response .strip ():
126127 return {
127128 "completed" : False ,
128129 "matched_alternative_index" : - 1 ,
129130 "matched_command" : None ,
130131 "confidence" : 0.0 ,
131- "explanation" : "Empty agent response"
132+ "explanation" : "Empty agent response" ,
133+ "is_fine_grained" : False
134+ }
135+
136+ # Check for exact "I don't know" response - no need to evaluate or reiterate
137+ if agent_response .strip () == "I don't know" :
138+ return {
139+ "completed" : False ,
140+ "matched_alternative_index" : - 1 ,
141+ "matched_command" : None ,
142+ "confidence" : 0.0 ,
143+ "explanation" : "Agent explicitly indicated lack of knowledge" ,
144+ "is_fine_grained" : False ,
145+ "agent_unknown" : True
132146 }
133147
134148 # Protocol determines which alternatives to check
@@ -932,6 +946,15 @@ def check_goal_reached(
932946 - reason: Explanation of the decision
933947 - is_fine_grained: Whether the command/result is too fine-grained
934948 """
949+ # If agent explicitly said "I don't know", stop without reiterating
950+ if evaluation_result .get ("agent_unknown" , False ):
951+ return {
952+ "goal_reached" : True , # Consider it "ruled out"
953+ "needs_more_predictions" : False ,
954+ "reason" : "Agent indicated lack of knowledge - goal ruled out" ,
955+ "is_fine_grained" : False
956+ }
957+
935958 # If we found a match, goal is reached
936959 if evaluation_result ["completed" ]:
937960 # Adapt message based on protocol and task mode
0 commit comments