diff --git a/balrog/config/config.yaml b/balrog/config/config.yaml
index 7124af07..31a26ce3 100644
--- a/balrog/config/config.yaml
+++ b/balrog/config/config.yaml
@@ -23,6 +23,7 @@ eval:
   save_images: False # Whether to save images from the environment
   icl_episodes: 1
   icl_dataset: records
+  feedback_on_invalid_action: True # Whether to provide feedback on invalid actions

 client:
   client_name: openai # LLM client to use (e.g., 'openai', 'gemini', 'claude')
diff --git a/balrog/evaluator.py b/balrog/evaluator.py
index a5687821..a9a937a9 100644
--- a/balrog/evaluator.py
+++ b/balrog/evaluator.py
@@ -321,9 +321,9 @@ def run_episode(self, task, agent, process_num=None, position=0, episode_idx=0):

             # Give feedback on the action (if not valid)
             obs["text"]["long_term_context"] = (
-                f"\n\nYour previous output action: '{response.completion}' is not a valid action. Defaulted to action: {action}\n"
+                f"\n\nYour previous output did not contain a valid action. Defaulted to action: {action}\n\nObservation:\n"
                 + obs["text"]["long_term_context"]
-                if action != response.completion
+                if (action != response.completion) and (self.config.eval.feedback_on_invalid_action)
                 else obs["text"]["long_term_context"]
             )
             action = response.completion
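
For reference, a minimal standalone sketch of the behaviour this patch introduces: when the parsed action differs from the model's raw completion (i.e. the output had to be defaulted) and the new `feedback_on_invalid_action` flag is enabled, a notice is prepended to the observation text. The function name, `build`-style signature, and example values below are hypothetical, not part of the repository.

```python
def apply_invalid_action_feedback(obs_text, raw_completion, parsed_action, feedback_on_invalid_action=True):
    """Prepend a notice to the observation when the model's raw output was not a valid action."""
    # The parsed action differs from the raw completion only when the output was defaulted.
    if parsed_action != raw_completion and feedback_on_invalid_action:
        return (
            f"\n\nYour previous output did not contain a valid action. "
            f"Defaulted to action: {parsed_action}\n\nObservation:\n" + obs_text
        )
    return obs_text


# Hypothetical usage: the model replied with free-form text, so the agent fell back to "wait".
print(apply_invalid_action_feedback("You see a corridor.", "I think I should rest", "wait"))
```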