-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: run.py
More file actions
110 lines (95 loc) · 3.73 KB
/
run.py
File metadata and controls
110 lines (95 loc) · 3.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import asyncio
import os
import time
from time import sleep
import pandas as pd
from tqdm.auto import tqdm
from biocontext_benchmark.agent import get_agent, get_model, run_agent
if __name__ == "__main__":
    # Benchmark driver: runs every prompt in `prompt_file` against every
    # (model, tooling) configuration and records outputs + timing/token stats.
    prompt_file = "prompts.csv"
    results_dir = "results"
    cooldown_seconds = 5  # pause between prompts to stay under rate limits

    # (provider, model) pairs to benchmark.
    models = [
        ("google", "gemini-2.5-flash"),
        ("openai", "gpt-4.1-mini-2025-04-14"),
        ("openai", "gpt-4.1-2025-04-14"),
    ]

    # Build the run matrix. Every model runs three tooling variants —
    # (no tools), (web search only), (MCP only) — except Google models,
    # which skip the web-search variant. OpenAI web-search runs must be
    # routed through the "openai_responses" provider.
    matrix = []
    for provider, model in models:
        tool_variants = [(False, False), (False, True)]  # (web_search, mcp)
        if provider != "google":
            tool_variants.insert(1, (True, False))
        for ws, mcp in tool_variants:
            matrix.append(
                {
                    "provider": (
                        "openai_responses"
                        if provider == "openai" and ws
                        else provider
                    ),
                    "model": model,
                    "web_search": ws,
                    "mcp": mcp,
                }
            )

    # Expected columns (from usage below): "prompt", "tool_tested", "id".
    prompts = pd.read_csv(prompt_file, header=0)

    timing_data = []
    for _, row in tqdm(prompts.iterrows(), total=len(prompts), desc="Iterating over prompts"):
        prompt = row["prompt"]
        if pd.isna(prompt):
            # Skip blank rows in the prompt sheet.
            continue
        tool_tested = row["tool_tested"]
        print(f"Running agent for tool: {tool_tested} and prompt: {prompt}")

        for config in tqdm(matrix, total=len(matrix), desc="Iterating over configs"):
            results_folder = os.path.join(results_dir, tool_tested, row["id"], config["model"])
            os.makedirs(results_folder, exist_ok=True)
            config_str = (
                f"{row['id']}_model={config['model']}"
                f"_web_search={config['web_search']}_mcp={config['mcp']}"
            )

            # Time the full agent run (setup + inference).
            start_time = time.time()
            model = get_model(provider=config["provider"], model_name=config["model"])  # type: ignore[arg-type]
            agent = get_agent(
                model=model,
                mcp=bool(config["mcp"]),
                web_search=bool(config["web_search"]),
            )
            try:
                result, request_tokens, result_tokens = asyncio.run(
                    run_agent(
                        prompt=prompt,
                        agent=agent,
                    )
                )
            except Exception as e:
                # Record the failure and keep benchmarking the remaining
                # configs instead of aborting the whole run.
                print(f"Error running agent: {e}")
                result = f"Error: {e}"
                request_tokens = 0
                result_tokens = 0
            end_time = time.time()
            duration = end_time - start_time

            # Persist the agent's answer for this (prompt, config) pair.
            with open(os.path.join(results_folder, config_str + ".txt"), "w") as f:
                f.write(result)

            timing_data.append(
                {
                    "id": row["id"],
                    "tool_tested": tool_tested,
                    "provider": config["provider"],
                    "web_search": config["web_search"],
                    "mcp": config["mcp"],
                    "config_str": config_str,
                    "start_time": start_time,
                    "end_time": end_time,
                    "duration_seconds": duration,
                    "request_tokens": request_tokens,
                    "result_tokens": result_tokens,
                    "total_tokens": request_tokens + result_tokens,
                }
            )
            print(f"Agent run completed for model {config['model']} in {duration:.2f} seconds")

        # Checkpoint the accumulated timings after each prompt so a crash
        # mid-run loses at most one prompt's worth of data.
        # BUG FIX: this was previously written to `results_folder` — the
        # per-tool/per-id/per-model leaf folder of whichever config happened
        # to run last — scattering partial copies of timing.csv across the
        # results tree. Write one canonical file at the top level instead.
        timing_df = pd.DataFrame(timing_data)
        timing_df.to_csv(os.path.join(results_dir, "timing.csv"), index=False)

        # Cooldown between prompts (tqdm just visualizes the wait).
        for _ in tqdm(range(cooldown_seconds), desc="Rate limit cooldown"):
            sleep(1)