# quick_start.py
# Forked from NolanKoblischke/GravityBench.
import json
from io import StringIO
import pandas as pd, numpy as np
import openai, datasets
from dotenv import load_dotenv
from agents.tools.observe_tool import observe_tool
from agents.tools.python_repl_tool import python_repl_tool, execute_python_repl
from agents.tools.submit_answer_tool import submit_answer_tool, execute_submit_answer
# Runs at import time, before any client object is constructed further down.
# NOTE(review): presumably this reads a local .env file so that the OpenAI
# client can find its API key in the environment — confirm against the
# project's setup instructions.
load_dotenv()
class BinarySim:
    """Simulation table that can be sampled under an observation budget.

    Attributes:
        df: The simulation CSV parsed into a DataFrame; must have a ``time``
            column.
        truth: Reference answer, converted to ``float``.
        req: Running count of observations granted so far.
        max_obs: Total number of observations that may be granted.
        prompt: System prompt describing the task, the time span of the
            data, the observation limits, and the expected answer units.
    """

    def __init__(self, csv, task, units, truth, max_obs=100):
        """Parse the CSV text and build the task prompt.

        Args:
            csv: CSV file content as a string.
            task: Natural-language task description inserted in the prompt.
            units: Units the answer is expected in (inserted in the prompt).
            truth: Reference answer; anything ``float()`` accepts.
            max_obs: Total observation budget enforced by ``observe``.
        """
        self.df = pd.read_csv(StringIO(csv))
        self.truth = float(truth)
        self.req = 0  # total observations used
        self.max_obs = max_obs
        t0, t1, n = self.df.time.min(), self.df.time.max(), len(self.df)
        self.prompt = (
            f"You are a physics discovery agent. Your task is: {task}\n\n"
            f"Dataset spans {t0:.2f}–{t1:.2f} with {n} rows.\n"
            f"Use Observe(times_requested) to sample <={max_obs} rows (<=10 per call).\n"
            "Analyse with PythonREPL where 'row_wise_results' stores your samples.\n"
            f"Expected answer units: {units}. Submit with submit_answer(answer)."
        )

    def observe(self, times, limit):
        """Return the rows whose ``time`` is closest to each requested time.

        Args:
            times: Sequence of requested times (numbers or numeric strings).
            limit: Maximum number of times allowed in this single call.

        Returns:
            A DataFrame with one row per requested time (same columns as
            ``df``), or an error string starting with ``"Error"`` when the
            per-call limit or the remaining total budget would be exceeded.
            Rejected requests do not consume budget.
        """
        if len(times) > limit:
            return f"Error: ask <={limit} rows"
        remaining = self.max_obs - self.req
        if len(times) > remaining:
            return f"Error: only {remaining} of {self.max_obs} observations remain"
        self.req += len(times)
        # argmin() yields a position, which is what iloc expects; idxmin()
        # yields a label and only matches iloc for a default RangeIndex.
        positions = [
            int((self.df.time - float(t)).abs().argmin()) for t in times
        ]
        # Indexing the frame (rather than rebuilding it from row Series)
        # keeps column dtypes and still has columns for an empty request.
        return self.df.iloc[positions]
class Agent:
    """Chat-completion loop that drives the Observe / PythonREPL / submit tools.

    Attributes:
        sim: Object providing ``prompt`` and ``observe(times, limit)``.
        per_req: Per-call observation limit passed to ``sim.observe``.
        model: Model name sent with every chat-completion request.
        row_wise_results: De-duplicated table of all rows observed so far.
        pkgs: Dict passed to the REPL tool as both of its namespace
            arguments; holds the analysis libraries and ``row_wise_results``.
        tools: Tool specifications sent with every request.
        client: OpenAI client used for the requests.
        msg: Conversation history (list of message dicts).
    """

    def __init__(self, sim, model="gpt-4.1", per_req=10):
        """Set up tools, REPL namespace, client and the initial messages.

        Args:
            sim: Simulation to observe; its ``prompt`` is the system message.
            model: Chat model name used by ``run``.
            per_req: Maximum number of times per Observe call.
        """
        self.sim, self.per_req = sim, per_req
        self.model = model
        self.row_wise_results = pd.DataFrame()
        self.pkgs = dict(
            np=np, pd=pd, scipy=__import__('scipy'), sklearn=__import__('sklearn'),
            sm=__import__('statsmodels.api', fromlist=['api']),
            row_wise_results=self.row_wise_results,
        )
        self.tools = [
            observe_tool(per_req, metadata={"env": sim}),
            python_repl_tool(self.pkgs, self.pkgs, "numpy pandas scipy sklearn statsmodels"),
            submit_answer_tool(),
        ]
        self.client = openai.OpenAI()
        self.msg = [
            {"role": "system", "content": sim.prompt},
            {"role": "user", "content": "Begin your analysis."},
        ]

    # keep observations deduplicated and expose to REPL
    def add_obs(self, df):
        """Merge newly observed rows into ``row_wise_results``.

        Rows are de-duplicated on the ``time`` column and re-indexed from 0.
        The entry in ``pkgs`` is rebound so REPL code sees the new table.

        Args:
            df: DataFrame of observed rows (must have a ``time`` column
                unless it is empty).
        """
        # Skip empty frames: a column-less empty frame would make
        # drop_duplicates("time") raise KeyError.
        frames = [f for f in (self.row_wise_results, df) if not f.empty]
        if not frames:
            return
        self.row_wise_results = (
            pd.concat(frames).drop_duplicates("time").reset_index(drop=True)
        )
        self.pkgs["row_wise_results"] = self.row_wise_results  # refresh pointer

    def run(self, max_steps=20):
        """Alternate model turns and tool executions until an answer is submitted.

        Args:
            max_steps: Maximum number of model turns before giving up.

        Returns:
            Whatever ``execute_submit_answer`` returns for the submitted
            answer, or ``None`` if no answer was submitted in ``max_steps``.
        """
        for _ in range(max_steps):
            rsp = self.client.chat.completions.create(
                model=self.model, messages=self.msg, tools=self.tools
            ).choices[0].message
            if rsp.content:
                print(f"Content: {rsp.content}")
            reply = {"role": "assistant", "content": rsp.content}
            if rsp.tool_calls:
                # Only include the key when there are calls; a null
                # "tool_calls" entry is not a valid history message.
                reply["tool_calls"] = rsp.tool_calls
            self.msg.append(reply)
            if not rsp.tool_calls:
                print("No tool calls, continuing...")
                continue
            for call in rsp.tool_calls:
                name = call.function.name
                try:
                    args = json.loads(call.function.arguments)
                except json.JSONDecodeError as exc:
                    # Report the problem to the model instead of crashing.
                    res = f"Error: could not parse tool arguments: {exc}"
                    print(f"{name} bad arguments: {exc}\n\n")
                else:
                    if name == "Observe":
                        print(f"Observe args: {args['times_requested']}")
                        res = self.sim.observe(args["times_requested"], self.per_req)
                        print(f"Observe result: {res}\n\n")
                        # observe() returns an error string when a limit is
                        # exceeded; only real rows are stored.
                        if isinstance(res, pd.DataFrame):
                            self.add_obs(res)
                    elif name == "PythonREPL":
                        print(f"PythonREPL args: {args['input_code']}")
                        res = execute_python_repl(args["input_code"], self.pkgs, self.pkgs)
                        print(f"PythonREPL result: {res}\n\n")
                    else:
                        print(f"submit_answer args: {args['answer']}")
                        return execute_submit_answer(args["answer"])
                self.msg.append({"role": "tool", "content": str(res), "tool_call_id": call.id})
        return None
def main():
    """Run one agent on the first GravityBench test task and print its score.

    Loads the first row of the ``test`` split, lets an ``Agent`` work on it,
    then prints the answer, the true value, the number of observations used
    and the percent error next to the dataset's threshold.
    """
    data = datasets.load_dataset("GravityBench/GravityBench")["test"][0]
    sim = BinarySim(data['simulation_csv_content'], data['task_prompt'], data['expected_units'], data['true_answer'])
    agent = Agent(sim)
    ans = agent.run()
    print("Ans:", ans, "Truth:", sim.truth, f"Obs {sim.req}/100")
    if ans is None:
        # run() returns None when the step limit is hit without a submission.
        print("No answer submitted; skipping error computation.")
        return
    diff = abs(float(ans) - sim.truth)
    if sim.truth == 0:
        # A relative error is undefined for a zero reference value.
        print(f"Absolute error {diff:.6g} (truth is 0; percent error undefined)")
        return
    # Divide by the magnitude so a negative truth does not flip the sign.
    err = diff / abs(sim.truth) * 100
    print(f"Error {err:.2f}% (threshold {data['budget_obs_threshold_percent']}%)")


if __name__ == "__main__":
    main()