-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: run.py
More file actions
110 lines (95 loc) · 3.73 KB
/
run.py
File metadata and controls
110 lines (95 loc) · 3.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import asyncio
import os
import time
from time import sleep
import pandas as pd
from tqdm.auto import tqdm
from biocontext_benchmark.agent import get_agent, get_model, run_agent
if __name__ == "__main__":
    # Benchmark driver: runs every prompt in `prompt_file` against every
    # (model, tooling) configuration and records outputs + timing/token stats.
    prompt_file = "prompts.csv"
    results_dir = "results"
    cooldown_seconds = 5  # pause between prompts to stay under rate limits

    # (provider, model) pairs to benchmark.
    models = [
        ("google", "gemini-2.5-flash"),
        ("openai", "gpt-4.1-mini-2025-04-14"),
        ("openai", "gpt-4.1-2025-04-14"),
    ]

    # Build the run matrix. Every model runs three tooling variants —
    # (no tools), (web search only), (MCP only) — except Google models,
    # which skip the web-search variant. OpenAI web-search runs must be
    # routed through the "openai_responses" provider.
    matrix = []
    for provider, model in models:
        tool_variants = [(False, False), (False, True)]  # (web_search, mcp)
        if provider != "google":
            tool_variants.insert(1, (True, False))
        for ws, mcp in tool_variants:
            matrix.append(
                {
                    "provider": (
                        "openai_responses"
                        if provider == "openai" and ws
                        else provider
                    ),
                    "model": model,
                    "web_search": ws,
                    "mcp": mcp,
                }
            )

    # Expected columns (from usage below): "prompt", "tool_tested", "id".
    prompts = pd.read_csv(prompt_file, header=0)

    timing_data = []
    for _, row in tqdm(prompts.iterrows(), total=len(prompts), desc="Iterating over prompts"):
        prompt = row["prompt"]
        if pd.isna(prompt):
            # Skip blank rows in the prompt sheet.
            continue
        tool_tested = row["tool_tested"]
        print(f"Running agent for tool: {tool_tested} and prompt: {prompt}")

        for config in tqdm(matrix, total=len(matrix), desc="Iterating over configs"):
            results_folder = os.path.join(results_dir, tool_tested, row["id"], config["model"])
            os.makedirs(results_folder, exist_ok=True)
            config_str = (
                f"{row['id']}_model={config['model']}"
                f"_web_search={config['web_search']}_mcp={config['mcp']}"
            )

            # Time the full agent run (setup + inference).
            start_time = time.time()
            model = get_model(provider=config["provider"], model_name=config["model"])  # type: ignore[arg-type]
            agent = get_agent(
                model=model,
                mcp=bool(config["mcp"]),
                web_search=bool(config["web_search"]),
            )
            try:
                result, request_tokens, result_tokens = asyncio.run(
                    run_agent(
                        prompt=prompt,
                        agent=agent,
                    )
                )
            except Exception as e:
                # Record the failure and keep benchmarking the remaining
                # configs instead of aborting the whole run.
                print(f"Error running agent: {e}")
                result = f"Error: {e}"
                request_tokens = 0
                result_tokens = 0
            end_time = time.time()
            duration = end_time - start_time

            # Persist the agent's answer for this (prompt, config) pair.
            with open(os.path.join(results_folder, config_str + ".txt"), "w") as f:
                f.write(result)

            timing_data.append(
                {
                    "id": row["id"],
                    "tool_tested": tool_tested,
                    "provider": config["provider"],
                    "web_search": config["web_search"],
                    "mcp": config["mcp"],
                    "config_str": config_str,
                    "start_time": start_time,
                    "end_time": end_time,
                    "duration_seconds": duration,
                    "request_tokens": request_tokens,
                    "result_tokens": result_tokens,
                    "total_tokens": request_tokens + result_tokens,
                }
            )
            print(f"Agent run completed for model {config['model']} in {duration:.2f} seconds")

        # Checkpoint the accumulated timings after each prompt so a crash
        # mid-run loses at most one prompt's worth of data.
        # BUG FIX: this was previously written to `results_folder` — the
        # per-tool/per-id/per-model leaf folder of whichever config happened
        # to run last — scattering partial copies of timing.csv across the
        # results tree. Write one canonical file at the top level instead.
        timing_df = pd.DataFrame(timing_data)
        timing_df.to_csv(os.path.join(results_dir, "timing.csv"), index=False)

        # Cooldown between prompts (tqdm just visualizes the wait).
        for _ in tqdm(range(cooldown_seconds), desc="Rate limit cooldown"):
            sleep(1)