-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathles_evals.py
106 lines (83 loc) · 3.19 KB
/
les_evals.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import time
from pathlib import Path
import modal
# Multiplier used to express the timeouts in this file as `<n> * MINUTES`.
MINUTES = 60

# Modal application that the functions in this file are registered on.
app = modal.App("humaneval-sandbox")

# Named volume mounted at /humaneval by the functions below; it must already
# exist because create_if_missing is False.
volume = modal.Volume.from_name("mistral-humaneval", create_if_missing=False)

# Image for the evaluation sandbox: Debian slim with git, plus an editable
# install of the human-eval package cloned from GitHub.
sandbox_image = (
    modal.Image.debian_slim()
    .apt_install("git")
    .run_commands(
        "git clone https://github.com/modal-labs/human-eval.git",
        "pip install -e human-eval",
    )
)
@app.function(volumes={"/humaneval": volume}, timeout=10 * MINUTES)
def run_humaneval(sample_file_path: str, problem_file_path: str):
    """Evaluate one samples file in a sandbox and save the results beside it.

    The samples and problem files are uploaded to an ephemeral volume, a
    sandbox runs ``evaluate_functional_correctness`` on them, and the contents
    of ``samples.jsonl_results.jsonl`` on that volume are written to
    ``<sample_file_path>_results.jsonl``.

    Args:
        sample_file_path: Path of the samples ``.jsonl`` file to evaluate.
        problem_file_path: Path of the problems ``.jsonl`` file.

    Note:
        The results file is always written, even when the sandbox timed out
        or its output could not be read (it is then empty or partial).
        ``compute_results`` treats the existence of that file as "already
        evaluated", so skipping the write would make it re-spawn the sample.
    """
    volume.reload()

    with modal.Volume.ephemeral() as sandboxed_volume:
        with sandboxed_volume.batch_upload() as batch:
            batch.put_file(sample_file_path, "samples.jsonl")
            batch.put_file(problem_file_path, "problems.jsonl")

        print(f"Starting sandbox for {sample_file_path}")
        sandbox = modal.Sandbox.create(
            "bash",
            "-c",
            "evaluate_functional_correctness vol/samples.jsonl --problem_file=vol/problems.jsonl --n_workers=32",
            image=sandbox_image,
            volumes={"/vol": sandboxed_volume},
            timeout=5 * MINUTES,
            cpu=32,
        )

        # Poll once per second for up to five minutes; the while/else branch
        # runs only when the loop ends without a break, i.e. on timeout.
        deadline = time.monotonic() + 5 * MINUTES
        while time.monotonic() < deadline:
            if sandbox.poll() is not None:
                print(f"Finished sandbox for {sample_file_path}")
                break
            time.sleep(1)
        else:
            print("Sandbox timed out")

        # Best effort: keep whatever chunks were read before a failure, but
        # report the failure instead of swallowing it silently.
        chunks = []
        try:
            for chunk in sandboxed_volume.read_file("samples.jsonl_results.jsonl"):
                chunks.append(chunk)
        except Exception as exc:
            print(f"Could not read results for {sample_file_path}: {exc!r}")

    Path(f"{sample_file_path}_results.jsonl").write_bytes(b"".join(chunks))
    # Commit explicitly so a caller that reloads the volume right after this
    # call returns sees the results file without waiting for a background
    # commit.
    volume.commit()
@app.function(volumes={"/humaneval": volume}, timeout=30 * MINUTES)
def compute_results():
    """Run ``run_humaneval`` for every samples file that has no results yet.

    Each non-hidden directory directly under ``/humaneval`` is scanned for
    ``*/*.jsonl`` sample files; ``data.jsonl`` in that directory is passed as
    the problem file. A sample is considered evaluated when
    ``<sample>_results.jsonl`` exists. Missing evaluations are spawned in
    parallel and awaited, then the directory is rescanned until nothing is
    missing.
    """
    # Refresh the mounted volume before listing it so the directory scan is
    # based on the latest committed state.
    volume.reload()
    envs = [
        entry
        for entry in Path("/humaneval").iterdir()
        if entry.is_dir() and not entry.name.startswith(".")
    ]

    for env in envs:
        problem_file = env / "data.jsonl"
        while True:
            print(f"looking in {env}")
            handles = []
            for file_path in env.glob("*/*.jsonl"):
                # Results files match the glob too; they are outputs, not inputs.
                if file_path.name.endswith("_results.jsonl"):
                    continue
                if Path(f"{file_path}_results.jsonl").exists():
                    continue
                print(f"Checking {file_path}")
                # run_humaneval declares str parameters, so pass plain strings.
                handles.append(
                    run_humaneval.spawn(str(file_path), str(problem_file))
                )

            if not handles:
                print(f"no samples without results in {env}")
                break

            print(f"executing on {len(handles)} samples without results in {env}")
            modal.FunctionCall.gather(*handles)
            # Pick up the results files written by the spawned calls before
            # rescanning this directory.
            volume.reload()
@app.local_entrypoint()
def main():
    """Local entrypoint: invoke ``compute_results`` remotely."""
    compute_results.remote()