Commit aafe9da

Merge pull request #41 from bastoica/main
Adding EgWalker (EuroSys'25) to ArtEvalBench
2 parents: 4950df8 + cb5daa8

File tree: 313 files changed, +97592 −2 lines

Lines changed: 3 additions & 2 deletions
This hunk adds the EgWalker entry to the benchmark config (one JSON object per line, one artifact per line). It also corrects the osdi24_anvil artifact_readme path and the evaluator paths of both the osdi24_anvil and sosp23_acto entries:

@@ -1,3 +1,4 @@
 {"artifact_id": "sosp24_wasabi", "artifact_dir": "sosp24_wasabi", "artifact_readme": "sosp24_wasabi/wasabi/README.md", "artifact_url": "https://github.com/bastoica/wasabi/tree/sosp24-ae", "evaluator": "sosp24_wasabi/wasabi/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
-{"artifact_id": "osdi24_anvil", "artifact_dir": "osdi24_anvil", "artifact_readme": "sosp23_acto/acto/README.md", "artifact_url": "https://github.com/anvil-verifier/anvil", "evaluator": "osdi24_anvil/anvil/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
-{"artifact_id": "sosp23_acto", "artifact_dir": "sosp23_acto", "artifact_readme": "sosp23_acto/acto/README.md", "artifact_url": "https://github.com/xlab-uiuc/acto", "evaluator": "sosp23_acto/acto/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
+{"artifact_id": "osdi24_anvil", "artifact_dir": "osdi24_anvil", "artifact_readme": "osdi24_anvil/anvil/README.md", "artifact_url": "https://github.com/anvil-verifier/anvil", "evaluator": "osdi24_anvil/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
+{"artifact_id": "sosp23_acto", "artifact_dir": "sosp23_acto", "artifact_readme": "sosp23_acto/acto/README.md", "artifact_url": "https://github.com/xlab-uiuc/acto", "evaluator": "sosp23_acto/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
+{"artifact_id": "eurosys25_egwalker", "artifact_dir": "eurosys25_egwalker", "artifact_readme": "eurosys25_egwalker/egwalker/README.md", "artifact_url": "https://github.com/josephg/egwalker-paper", "evaluator": "eurosys25_egwalker/_agent_eval/main.py", "expected_score": 4, "docer_env": "bastoica/ae-agent-ubuntu24.04:latest"}
Lines changed: 32 additions & 0 deletions
New file: the evaluator entry point, listed in the config above as eurosys25_egwalker/_agent_eval/main.py.

#!/usr/bin/env python3
from typing import Dict

from oracle_env_setup import OracleEnvSetup
from oracle_artifact_build import OracleArtifactBuild
from oracle_benchmark_prep import OracleBenchmarkPrep
from oracle_experiment_runs import OracleExperimentRuns

from utils import logger


def main():
    # One point per passing oracle: environment setup, artifact build,
    # benchmark preparation, and experiment runs.
    results: Dict[str, int] = {}

    score = 0
    for cls in (OracleEnvSetup, OracleArtifactBuild, OracleBenchmarkPrep, OracleExperimentRuns):
        checker = cls()
        ok = checker.run()
        name = cls.__name__
        logger.info(f"{name}: {'PASS' if ok else 'FAIL'}")
        if ok:
            results[name] = 1
            score += 1
        else:
            results[name] = 0

    logger.info(f"Agent scores: {results}")
    return score


if __name__ == "__main__":
    main()
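The config entry above points at this script as the EgWalker evaluator, so presumably the harness invokes it directly from the benchmark root, for example:

python3 eurosys25_egwalker/_agent_eval/main.py

With all four oracles passing, main() returns 4, which matches the expected_score recorded in the config entry.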
Lines changed: 93 additions & 0 deletions
New file: the build oracle (imported by main.py as oracle_artifact_build); it compiles the artifact's tool and benchmark binaries via make.

#!/usr/bin/env python3
import os
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, List, Optional, Tuple

from utils import REPO_DIR
from utils import logger


@dataclass(frozen=True)
class BuildTarget:
    name: str
    repo_key: str
    cmd: List[str]


BUILD_TARGETS: List[BuildTarget] = [
    BuildTarget(
        name="artifact-core",
        repo_key="artifact-core",
        cmd=[
            "make",
            "-j8",
            "tools/diamond-types/target/release/dt",
            "tools/crdt-converter/target/release/crdt-converter",
            "tools/diamond-types/target/release/paper-stats",
            "tools/paper-benchmarks/target/memusage/paper-benchmarks",
            "tools/paper-benchmarks/target/release/paper-benchmarks",
            "tools/ot-bench/target/memusage/ot-bench",
            "tools/ot-bench/target/release/ot-bench",
        ],
    ),
]


class OracleArtifactBuild:

    def __init__(self) -> None:
        self.repo_dir = REPO_DIR

    def run_shell_command(
        self,
        cmd: Iterable[str],
        cwd: Optional[Path] = None,
    ) -> Tuple[int, str, str]:
        """
        Run a command and return an (rc, stdout, stderr) tuple.
        """
        try:
            cp = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                cwd=str(cwd) if cwd is not None else None,
            )
            return cp.returncode, cp.stdout or "", cp.stderr or ""
        except FileNotFoundError:
            # Command binary not found; mirror the shell's 127 exit code.
            return 127, "", ""

    def build_target(self, target: BuildTarget) -> Optional[str]:
        """
        Build a single target using its configured repository and command.
        Returns an error message on failure, or None on success.
        """
        repo_path = Path(os.path.expanduser(self.repo_dir))
        if not repo_path.exists():
            return f"{target.name} repo directory missing"

        rc, out, err = self.run_shell_command(target.cmd, cwd=repo_path)
        if rc != 0:
            return f"{target.name} build failed (error code: {rc}; error message: {err})"

        return None

    def build_check(self) -> Tuple[bool, str]:
        """
        Run builds for all configured targets and collect failures.
        """
        problems: List[str] = []
        for target in BUILD_TARGETS:
            msg = self.build_target(target)
            if msg:
                problems.append(msg)
        if problems:
            return False, "; ".join(problems)
        return True, ""

    def run(self) -> bool:
        ok, why = self.build_check()
        logger.info(f"Build: {'PASS' if ok else 'FAIL' + (' - ' + why if why else '')}")
        return ok
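Both oracles import REPO_DIR, HOME, REFERENCE_BENCHMARK_FILE, and logger from a utils module that is not included in this excerpt. A minimal sketch of what such a module might contain; only the names are known from the imports, and every concrete value below is an assumption:

# utils.py: hypothetical sketch; the real module is not shown in this diff.
import logging

HOME = "~"                                                 # assumption: evaluator home dir
REPO_DIR = "~/egwalker-paper"                              # assumption: artifact checkout path
REFERENCE_BENCHMARK_FILE = "~/reference_benchmarks.json"   # assumption: manifest location

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger("agent_eval")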
Lines changed: 125 additions & 0 deletions
New file: the benchmark-preparation oracle (imported by main.py as oracle_benchmark_prep); it checks the artifact's dataset files against a reference manifest.

#!/usr/bin/env python3
import json
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, List, Optional, Tuple

from utils import HOME
from utils import REPO_DIR
from utils import REFERENCE_BENCHMARK_FILE
from utils import logger


@dataclass(frozen=True)
class DatasetRef:
    filepath: str
    sizeinbytes: int


class OracleBenchmarkPrep:

    def __init__(self) -> None:
        self.home = Path(os.path.expanduser(str(HOME)))
        self.repo_path = Path(os.path.expanduser(str(REPO_DIR)))
        self.ref_json = Path(os.path.expanduser(str(REFERENCE_BENCHMARK_FILE)))

    def load_json(self, path: Path) -> Tuple[Optional[Any], str]:
        """
        Load JSON from disk and return an (obj, err) tuple.
        """
        if not path.exists():
            return None, f"ref json missing: {path}"
        try:
            with path.open("r", encoding="utf-8") as f:
                return json.load(f), ""
        except Exception as e:
            return None, f"ref json unreadable: {e}"

    def iter_ref_entries(self, obj: Any) -> List[dict]:
        """
        Extract benchmark entries from a reference JSON: either a top-level
        list of dicts, or the first list-of-dicts value inside a dict.
        """
        if isinstance(obj, list):
            return [x for x in obj if isinstance(x, dict)]
        if isinstance(obj, dict):
            for v in obj.values():
                if isinstance(v, list) and v and all(isinstance(x, dict) for x in v):
                    return v
        return []

    def parse_entry(self, d: dict) -> Tuple[Optional[DatasetRef], str]:
        """
        Parse a single JSON entry into a DatasetRef.
        """
        if "filepath" not in d:
            return None, "missing filepath"
        if "sizeinbytes" not in d:
            return None, "missing sizeinbytes"

        fp = d.get("filepath", "")
        sz = d.get("sizeinbytes", None)

        if not isinstance(fp, str) or not fp:
            return None, "invalid filepath"
        if not isinstance(sz, int) or sz < 0:
            return None, "invalid sizeinbytes"

        return DatasetRef(filepath=fp, sizeinbytes=sz), ""

    def check_entry(self, ref: DatasetRef) -> Optional[str]:
        """
        Validate that a dataset file exists and matches the expected size (in bytes).
        """
        rel = Path(ref.filepath)

        if rel.is_absolute():
            return f"{ref.filepath}: absolute paths not allowed"

        p = self.repo_path / rel
        if not p.exists():
            return f"{ref.filepath}: missing"
        if not p.is_file():
            return f"{ref.filepath}: not a file"

        try:
            actual = p.stat().st_size
        except OSError as e:
            return f"{ref.filepath}: stat failed ({e})"

        if actual != ref.sizeinbytes:
            return f"{ref.filepath}: size mismatch (expected {ref.sizeinbytes}, got {actual})"

        return None

    def datasets_check(self) -> Tuple[bool, str]:
        """
        Check that all referenced dataset files are present and match expected sizes.
        """
        obj, err = self.load_json(self.ref_json)
        if err:
            return False, err

        entries = self.iter_ref_entries(obj)
        if not entries:
            return False, "no entries found in ref json"

        problems: List[str] = []
        for d in entries:
            ref, perr = self.parse_entry(d)
            if perr or ref is None:
                problems.append(perr or "invalid entry")
                continue

            msg = self.check_entry(ref)
            if msg:
                problems.append(msg)

        if problems:
            return False, "; ".join(problems)
        return True, ""

    def run(self) -> bool:
        ok, why = self.datasets_check()
        logger.info(f"Datasets: {'PASS' if ok else 'FAIL' + (' - ' + why if why else '')}")
        return ok
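For reference, parse_entry implies that the manifest behind REFERENCE_BENCHMARK_FILE is a JSON list (or a dict wrapping such a list) of objects with a repo-relative filepath and an integer sizeinbytes. An illustrative manifest this checker would accept; the paths and sizes here are made up, since the real reference file is not part of this diff:

[
  {"filepath": "datasets/example-trace.json", "sizeinbytes": 1048576},
  {"filepath": "datasets/example-trace.json.gz", "sizeinbytes": 262144}
]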
