Skip to content

Commit cfb3fb2

Browse files
author
Daniel Schlör
committed
result format matches tasks
1 parent 4214c00 commit cfb3fb2

File tree

1 file changed

+24
-9
lines changed

1 file changed

+24
-9
lines changed

src/evaluator/workflow.py

Lines changed: 24 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -237,7 +237,7 @@ def _prepare_step_node(self, state: EvaluationState) -> Dict[str, Any]:
237237
print(f" Actual evaluation starts from step 2\n")
238238

239239
# Build result for this step (NOT marking as completed)
240-
step_result = self._build_step_result(current_step, None)
240+
step_result = self._build_step_result(current_step, None, task_mode=task_mode)
241241
# Mark this as an example step that doesn't count toward score
242242
step_result["_example_step"] = True
243243
# Mark the gold alternative as NOT completed (it's just an example)
@@ -357,7 +357,8 @@ def _record_result_node(self, state: EvaluationState) -> Dict[str, Any]:
357357
eval_result["matched_command"] = state["_accumulated_commands"]
358358

359359
# Build result structure for this step based on its format
360-
step_result = self._build_step_result(current_step, eval_result)
360+
task_mode = state.get("task_mode", "command")
361+
step_result = self._build_step_result(current_step, eval_result, task_mode=task_mode)
361362

362363
# Add to completed results
363364
new_completed_results = state["completed_results"] + [step_result]
@@ -763,16 +764,30 @@ def _evaluate_response_node(self, state: EvaluationState) -> Dict[str, Any]:
763764
print(f" Error during evaluation: {e}")
764765
return {"current_step_goal_reached": True}
765766

766-
def _build_step_result(self, step_data: Any, eval_result: Optional[Dict[str, Any]]) -> Dict[str, Any]:
767+
def _build_step_result(self, step_data: Any, eval_result: Optional[Dict[str, Any]], task_mode: str = "command") -> Dict[str, Any]:
767768
"""Build the result structure for a completed step.
768769
769770
Args:
770771
step_data: The step data from steps_enriched.json
771772
eval_result: Evaluation result (None if no match found)
773+
task_mode: The task mode (command, goal, or anticipated_result)
772774
773775
Returns:
774776
Step result structure matching the output format
775777
"""
778+
# Determine field names based on task mode
779+
if task_mode == "goal":
780+
original_field = "original_goal"
781+
matched_field = "matched_goal"
782+
source_field = "goal"
783+
elif task_mode == "anticipated_result":
784+
original_field = "original_anticipated_result"
785+
matched_field = "matched_anticipated_result"
786+
source_field = "results" # Use "results" field from steps_enriched.json
787+
else: # command
788+
original_field = "original_command"
789+
matched_field = "matched_command"
790+
source_field = "command"
776791
if "or" in step_data:
777792
# Step with alternatives
778793
or_results = []
@@ -783,37 +798,37 @@ def _build_step_result(self, step_data: Any, eval_result: Optional[Dict[str, Any
783798
for sub_step in alternative:
784799
sub_result = {
785800
"completed": False,
786-
"original_command": sub_step.get("command", ""),
801+
original_field: sub_step.get(source_field, ""),
787802
"gold": sub_step.get("gold", False)
788803
}
789804
# Mark as completed if this alternative matched
790805
if eval_result and eval_result["matched_alternative_index"] == i:
791806
sub_result["completed"] = True
792-
sub_result["matched_command"] = eval_result["matched_command"]
807+
sub_result[matched_field] = eval_result["matched_command"]
793808
sub_results.append(sub_result)
794809
or_results.append(sub_results)
795810
else:
796811
# Atomic alternative
797812
alt_result = {
798813
"completed": False,
799-
"original_command": alternative.get("command", ""),
814+
original_field: alternative.get(source_field, ""),
800815
"gold": alternative.get("gold", False)
801816
}
802817
# Mark as completed if this alternative matched
803818
if eval_result and eval_result["matched_alternative_index"] == i:
804819
alt_result["completed"] = True
805-
alt_result["matched_command"] = eval_result["matched_command"]
820+
alt_result[matched_field] = eval_result["matched_command"]
806821
or_results.append(alt_result)
807822
return {"or": or_results}
808823
else:
809824
# Single step without alternatives
810825
result = {
811826
"completed": eval_result is not None,
812-
"original_command": step_data.get("command", ""),
827+
original_field: step_data.get(source_field, ""),
813828
"gold": step_data.get("gold", False)
814829
}
815830
if eval_result:
816-
result["matched_command"] = eval_result["matched_command"]
831+
result[matched_field] = eval_result["matched_command"]
817832
return result
818833

819834
# Conditional edge functions

0 commit comments

Comments
 (0)