OdysseyBench/run_all.py at main · microsoft/OdysseyBench · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Run every task (and each of its sub-tasks) found in the tasks folder.
import argparse
import shutil
import os
from agent_interact import main
from pathlib import Path
from generate_run_config import main as generate_run_config
from ruamel.yaml import YAML
from utils.constants import *
import debugpy

import logging
# Shared ruamel.yaml instance used to read the YAML config files below.
yaml = YAML()
yaml.preserve_quotes = True  # Preserve the original quoting style
# NOTE(review): presumably a very large line width so long values are not
# wrapped when YAML is written back out — confirm against ruamel.yaml docs.
yaml.width = 4096


def setup_logging():
    """Configure root logging at INFO and quiet the ``httpx`` logger.

    Records are formatted as ``time - logger name - level - message``.
    The ``httpx`` logger is raised to WARNING so its INFO records are
    dropped.
    """
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    logging.getLogger("httpx").setLevel(logging.WARNING)


# Absolute path of the directory containing this script; the exp/ and
# configs/ paths below are built from it.
_this_file = os.path.abspath(__file__)
repo_root_dir = os.path.dirname(_this_file)


# Command-line interface. Options are registered from a table, in the order
# they should appear in --help, so each flag sits next to its settings.
parser = argparse.ArgumentParser()

_cli_options = [
    ("--generate-config", {"action": "store_true", "help": "Generate configuration file"}),
    ("--no-debug", {"action": "store_true", "help": "Run in non-debug mode"}),
    ("--task", {"type": str, "help": "Task to run", "default": None}),
    ("--force-new", {"action": "store_true", "help": "Force new run to remove existing output directory"}),
    ("--neo", {"action": "store_true", "help": "Use OdysseyBench-neo tasks"}),
    ("--neo_clean", {"action": "store_true", "help": "Use generated OdysseyBench-neo tasks for cleaning"}),
    ("--debug", {"action": "store_true", "help": "Debug mode (attach debugger)"}),
    ("--tag", {"default": "synthesis", "help": "Experiment tag"}),
]
for _flag, _settings in _cli_options:
    parser.add_argument(_flag, **_settings)

# Parse the command line once; the rest of the script reads from `args`.
args = parser.parse_args()

setup_logging()

# Experiment identity: "<tag>_<short model name>".
tag = args.tag
model_name_short = 'az-gpt-o3'
exp_id = "_".join((tag, model_name_short))

# Decide whether the experiment config has to be (re)generated: either the
# experiment directory does not exist yet / --generate-config was passed, or
# --force-new was passed for an existing experiment directory.
output_exp_dir = Path(f"{repo_root_dir}/exp/{exp_id}")
_regen_message = None
if not os.path.exists(output_exp_dir) or args.generate_config:
    _regen_message = 'Config not found, generating new ... '
elif args.force_new:
    _regen_message = 'Generating new config due to force_new flag ... '

if _regen_message is not None:
    print (_regen_message)
    generate_run_config(tag, model_name_short, prompt_file='configs/prompts_v2.json', use_thinking_tokens=False)

# Start from a clean slate: drop any error.log already present in the
# current working directory.
_error_log = Path("error.log")
if _error_log.exists():
    _error_log.unlink()

# Load the experiment config for this run and read the iteration limit.
# CONFIG, ENV and MAX_ITER are not defined in this file (presumably they come
# from the `utils.constants` star import).
exp_config_path = Path(f"{repo_root_dir}/exp/{exp_id}/{CONFIG}/{exp_id}.yaml")
exp_config = yaml.load(exp_config_path)

max_iter = exp_config[ENV][MAX_ITER]

# Build a lookup from each short model name in configs/models.yaml to the
# value stored under its "name" key, then resolve the model used by this run.
models_config_filepath = Path(f"{repo_root_dir}/configs/models.yaml")

with models_config_filepath.open("r") as file:
    models_config = yaml.load(file)
MODEL_NAMES = {
    model_short: model_entry["name"]
    for model_short, model_entry in models_config["model"].items()
}

model_name = MODEL_NAMES[model_name_short]
# Candidate task folders, in sorted order. 'tasks' is resolved relative to
# the current working directory.
task_list = sorted(os.listdir('tasks'))
if args.task is not None:
    # Validate --task through argparse instead of `assert`: assertions are
    # stripped under `python -O`, which would let an unknown task through.
    # parser.error prints the usage message and exits with status 2.
    if args.task not in task_list:
        parser.error(f"Task {args.task} not found in tasks folder")
    # Restrict the run to the single requested task.
    task_list = [args.task]


# Name of the per-task directory that lists the sub-tasks. It depends only on
# the CLI flags, so it is chosen once instead of on every iteration.
subtasks_flag = 'subtasks_neo' if (args.neo or args.neo_clean) else 'subtasks'

for i, task in enumerate(task_list):
    # Entries without a '-' in their name are not treated as task folders.
    if '-' not in task:
        continue

    subtasks_dir = f'tasks/{task}/{subtasks_flag}'
    # NOTE(review): the missing-directory skip applies to --neo only; with
    # --neo_clean or the default mode a missing directory still raises from
    # os.listdir below, exactly as before.
    if args.neo and not os.path.exists(subtasks_dir):
        continue

    for subtask in sorted(os.listdir(subtasks_dir)):
        print ('running task folder', task, i, 'out of', len(task_list))
        print ('running subtask', subtask)
        config_file = f'{subtasks_dir}/{subtask}'
        if args.neo_clean:
            # In --neo_clean mode the spec is read from subtasks_gen instead.
            config_file = f'tasks/{task}/subtasks_gen/{subtask}/task_specs.json'
        # Sub-task name is the directory entry up to its first '.'.
        subtask_name = subtask.split('.')[0]

        output_folder = f'tasks/{task}/outputs/{subtask_name}/{model_name}_{tag}'
        print(f"Output folder: {output_folder}")
        if os.path.exists(output_folder):
            if not args.force_new:
                print ('output folder exists, skipping')
                continue
            # Bug fix: the old code removed the folder and then still fell
            # through to the "skipping" branch, so --force-new deleted the
            # results without ever re-running the sub-task.
            print(f"Force new mode: removing existing output directory: {output_folder}")
            shutil.rmtree(output_folder)

        main(docker_name='officebench',
            container_name=f'officebench-debug-{tag}-{model_name_short}',
            dockerfile_path='./docker/Dockerfile',
            model_name=model_name,
            task_dir=f'tasks/{task}',
            config_file=config_file,
            task=None,
            tag=tag,
            max_iter=max_iter,
            mode='default' if not args.force_new else 'force_new',
            exp_config=exp_config,
            debug_mode=not args.no_debug,
            run_neo_tasks=args.neo)