@@ -237,7 +237,7 @@ def _prepare_step_node(self, state: EvaluationState) -> Dict[str, Any]:
237237 print (f" Actual evaluation starts from step 2\n " )
238238
239239 # Build result for this step (NOT marking as completed)
240- step_result = self ._build_step_result (current_step , None )
240+ step_result = self ._build_step_result (current_step , None , task_mode = task_mode )
241241 # Mark this as an example step that doesn't count toward score
242242 step_result ["_example_step" ] = True
243243 # Mark the gold alternative as NOT completed (it's just an example)
@@ -357,7 +357,8 @@ def _record_result_node(self, state: EvaluationState) -> Dict[str, Any]:
357357 eval_result ["matched_command" ] = state ["_accumulated_commands" ]
358358
359359 # Build result structure for this step based on its format
360- step_result = self ._build_step_result (current_step , eval_result )
360+ task_mode = state .get ("task_mode" , "command" )
361+ step_result = self ._build_step_result (current_step , eval_result , task_mode = task_mode )
361362
362363 # Add to completed results
363364 new_completed_results = state ["completed_results" ] + [step_result ]
@@ -763,16 +764,30 @@ def _evaluate_response_node(self, state: EvaluationState) -> Dict[str, Any]:
763764 print (f" Error during evaluation: { e } " )
764765 return {"current_step_goal_reached" : True }
765766
766- def _build_step_result (self , step_data : Any , eval_result : Optional [Dict [str , Any ]]) -> Dict [str , Any ]:
767+ def _build_step_result (self , step_data : Any , eval_result : Optional [Dict [str , Any ]], task_mode : str = "command" ) -> Dict [str , Any ]:
767768 """Build the result structure for a completed step.
768769
769770 Args:
770771 step_data: The step data from steps_enriched.json
771772 eval_result: Evaluation result (None if no match found)
773+ task_mode: The task mode (command, goal, or anticipated_result)
772774
773775 Returns:
774776 Step result structure matching the output format
775777 """
778+ # Determine field names based on task mode
779+ if task_mode == "goal" :
780+ original_field = "original_goal"
781+ matched_field = "matched_goal"
782+ source_field = "goal"
783+ elif task_mode == "anticipated_result" :
784+ original_field = "original_anticipated_result"
785+ matched_field = "matched_anticipated_result"
786+ source_field = "results" # Use "results" field from steps_enriched.json
787+ else : # command
788+ original_field = "original_command"
789+ matched_field = "matched_command"
790+ source_field = "command"
776791 if "or" in step_data :
777792 # Step with alternatives
778793 or_results = []
@@ -783,37 +798,37 @@ def _build_step_result(self, step_data: Any, eval_result: Optional[Dict[str, Any
783798 for sub_step in alternative :
784799 sub_result = {
785800 "completed" : False ,
786- "original_command" : sub_step .get ("command" , "" ),
801+ original_field : sub_step .get (source_field , "" ),
787802 "gold" : sub_step .get ("gold" , False )
788803 }
789804 # Mark as completed if this alternative matched
790805 if eval_result and eval_result ["matched_alternative_index" ] == i :
791806 sub_result ["completed" ] = True
792- sub_result ["matched_command" ] = eval_result ["matched_command" ]
807+ sub_result [matched_field ] = eval_result ["matched_command" ]
793808 sub_results .append (sub_result )
794809 or_results .append (sub_results )
795810 else :
796811 # Atomic alternative
797812 alt_result = {
798813 "completed" : False ,
799- "original_command" : alternative .get ("command" , "" ),
814+ original_field : alternative .get (source_field , "" ),
800815 "gold" : alternative .get ("gold" , False )
801816 }
802817 # Mark as completed if this alternative matched
803818 if eval_result and eval_result ["matched_alternative_index" ] == i :
804819 alt_result ["completed" ] = True
805- alt_result ["matched_command" ] = eval_result ["matched_command" ]
820+ alt_result [matched_field ] = eval_result ["matched_command" ]
806821 or_results .append (alt_result )
807822 return {"or" : or_results }
808823 else :
809824 # Single step without alternatives
810825 result = {
811826 "completed" : eval_result is not None ,
812- "original_command" : step_data .get ("command" , "" ),
827+ original_field : step_data .get (source_field , "" ),
813828 "gold" : step_data .get ("gold" , False )
814829 }
815830 if eval_result :
816- result ["matched_command" ] = eval_result ["matched_command" ]
831+ result [matched_field ] = eval_result ["matched_command" ]
817832 return result
818833
819834 # Conditional edge functions
0 commit comments