Skip to content

Commit 7c53d84

Browse files
committed
bugfix: Fixing paths, environment variables, and dependency checks inconsistencies for ANVIL
1 parent a1780ed commit 7c53d84

File tree

5 files changed

+413
-457
lines changed

5 files changed

+413
-457
lines changed

benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/main.py

Lines changed: 114 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,83 +1,150 @@
11
#!/usr/bin/env python3
2-
"""Runs environment setup checks for ANVIL."""
2+
"""Runs the environment-setup, artifact-build, benchmark-prep, and experiment-run checks for ANVIL (OSDI'24)."""
33

44
from __future__ import annotations
55

6-
import os
7-
import sys
86
from pathlib import Path
97
from typing import Dict
8+
import os
9+
import sys
1010

1111
_AGENT_EVAL_DIR = Path(__file__).resolve().parent
1212
_AGENT_SRC_DIR = _AGENT_EVAL_DIR.parents[3] / "src"
1313
sys.path.append(str(_AGENT_SRC_DIR))
1414

15-
from oracle_env_setup import OracleEnvSetup
16-
from oracle_artifact_build import OracleArtifactBuild
17-
from oracle_benchmark_prep import OracleBenchmarkPrep
18-
from oracle_experiment_runs import OracleExperimentRuns
1915
from evaluator.utils import (
2016
EntryConfig,
2117
LoggerConfig,
2218
get_logger,
2319
record_result,
2420
)
21+
from oracle_env_setup import OracleEnvSetup
22+
from oracle_artifact_build import OracleArtifactBuild
23+
from oracle_benchmark_prep import OracleBenchmarkPrep
24+
from oracle_experiment_runs import OracleExperimentRuns
2525

26-
# Reuse the same constants the legacy oracle used.
27-
from utils import RESULTS_PATH, SIMILARITY_RATIO # pylint: disable=wrong-import-position
28-
29-
30-
ANVIL_CONFIG = EntryConfig(
31-
name="osdi24-anvil",
32-
home_dir=Path.home() / "osdi24_anvil",
33-
repository_paths={
34-
"osdi24-anvil": Path.home() / "osdi24_anvil" / "anvil",
35-
"osdi24-acto-dependency": Path.home() / "osdi24_anvil" / "acto",
36-
},
37-
results_paths={
38-
"table3": Path(RESULTS_PATH),
39-
},
40-
ground_truth_paths={
41-
"table3": (
42-
Path.home()
43-
/ "osdi24_anvil"
44-
/ "_agent_eval"
45-
/ "refs"
46-
/ "anvil-table-3.ref.json"
47-
),
48-
},
49-
similarity_ratio=SIMILARITY_RATIO,
50-
)
26+
27+
def _resolve_workspace_paths() -> tuple[Path, Path]:
    """Resolve and validate the _agent_eval/ directory and the ANVIL workspace root.

    Each location is read from an environment variable when that variable is
    set to a non-empty value, and is otherwise derived from this file's path:

    * ``_AGENT_EVAL_DIR``: the _agent_eval/ directory; defaults to the
      directory containing this file.
    * ``_ANVIL_HOME``: the workspace root that must contain anvil/ and acto/;
      defaults to the parent of the _agent_eval/ directory.

    Returns:
        A ``(agent_eval_dir, workspace_root)`` pair of resolved paths.

    Raises:
        RuntimeError: If _agent_eval/, anvil/ or acto/ is not an existing
            directory, or if an OSError occurs while resolving or probing
            any of the paths.
    """
    try:
        env_agent_eval = os.environ.get("_AGENT_EVAL_DIR")
        env_anvil_home = os.environ.get("_ANVIL_HOME")

        if env_agent_eval:
            agent_eval_dir = Path(env_agent_eval).expanduser().resolve()
        else:
            agent_eval_dir = Path(__file__).resolve().parent

        if env_anvil_home:
            workspace_root = Path(env_anvil_home).expanduser().resolve()
        else:
            workspace_root = agent_eval_dir.parent.resolve()

        # is_dir() is already False for a missing path, so a separate
        # exists() probe is unnecessary.
        if not agent_eval_dir.is_dir():
            raise RuntimeError(
                f"Invalid _agent_eval dir: {agent_eval_dir}\n"
                f"This runner expects _agent_eval/ to exist.\n"
                f"Set _AGENT_EVAL_DIR to the directory containing main.py if needed.")

        # Both repositories must sit directly under the workspace root; they
        # are checked in a fixed order so the first missing one is reported.
        for repo_name in ("anvil", "acto"):
            repo_root = workspace_root / repo_name
            if not repo_root.is_dir():
                raise RuntimeError(
                    f"Invalid ANVIL workspace: {workspace_root}\n"
                    f"Expected to find an '{repo_name}/' directory at: {repo_root}\n"
                    f"This runner expects _agent_eval/ and {repo_name}/ to be located in the same root directory.\n"
                    f"Set _ANVIL_HOME to the workspace root if needed.")

        return agent_eval_dir, workspace_root

    except OSError as exc:
        raise RuntimeError(f"Failed to resolve workspace paths: {exc}") from exc
74+
75+
76+
def _build_anvil_config(*, agent_eval_dir: Path,
                        workspace_root: Path) -> EntryConfig:
    """Construct the EntryConfig for the ANVIL evaluation bundle.

    Repository paths are ``workspace_root / "anvil"`` and
    ``workspace_root / "acto"``; ground-truth files are looked up under
    ``agent_eval_dir / "refs"``. Two environment variables override defaults
    when set to a non-empty value:

    * ``_ANVIL_TABLE3_RESULTS``: path of the table 3 results file; defaults
      to ``results/table3.md`` inside the anvil repository.
    * ``_ANVIL_SIMILARITY_RATIO``: similarity ratio parsed as a float;
      defaults to ``0.75``.

    Args:
        agent_eval_dir: The _agent_eval/ directory.
        workspace_root: The directory containing anvil/ and acto/.

    Returns:
        The populated EntryConfig.

    Raises:
        RuntimeError: If ``_ANVIL_SIMILARITY_RATIO`` is not a valid float.
    """
    anvil_repo = (workspace_root / "anvil").resolve()
    acto_repo = (workspace_root / "acto").resolve()

    agent_eval_dir = agent_eval_dir.resolve()
    refs_dir = (agent_eval_dir / "refs").resolve()

    # An empty override is treated as unset (the same convention the workspace
    # resolver uses); Path("") would otherwise resolve to the current directory.
    table3_override = os.environ.get("_ANVIL_TABLE3_RESULTS")
    if table3_override:
        table3_results = Path(table3_override).expanduser().resolve()
    else:
        table3_results = (anvil_repo / "results" / "table3.md").resolve()

    raw_ratio = os.environ.get("_ANVIL_SIMILARITY_RATIO") or "0.75"
    try:
        similarity_ratio = float(raw_ratio)
    except ValueError as exc:
        # Surface as RuntimeError so main() reports it like any other
        # configuration problem instead of dumping a traceback.
        raise RuntimeError(
            f"Invalid _ANVIL_SIMILARITY_RATIO: {raw_ratio!r}\n"
            f"Expected a floating-point number such as 0.75.") from exc

    return EntryConfig(
        name="osdi24-anvil",
        home_dir=workspace_root,
        repository_paths={
            "osdi24-anvil": anvil_repo,
            "osdi24-acto-dependency": acto_repo,
        },
        results_paths={
            "table3": table3_results,
        },
        ground_truth_paths={
            "table3": (refs_dir / "anvil-table-3.ref.json").resolve(),
            "osdi24-acto-dependency.expected_branch":
                (refs_dir / "acto.expected_branch.txt").resolve(),
            "osdi24-acto-dependency.expected_head":
                (refs_dir / "acto.expected_head.txt").resolve(),
        },
        similarity_ratio=similarity_ratio,
    )
51111

52112

53113
def main(argv: list[str]) -> int:
    """Run the four ANVIL oracles in order and return the accumulated score.

    The oracles run in this order: environment setup, artifact build,
    benchmark prep, experiment runs. Each outcome is passed to
    ``record_result`` together with the oracle's class name, and the values
    ``record_result`` returns are summed.

    Args:
        argv: Command-line arguments; the presence of ``--verbose`` is
            forwarded to every oracle's ``run``.

    Returns:
        The sum of the values returned by ``record_result``.

    Raises:
        SystemExit: If the workspace paths or the configuration cannot be
            resolved; the exit message is the text of the RuntimeError.
    """
    verbose = "--verbose" in argv

    # The logger's root name can be overridden through the environment.
    root_name = os.environ.get("EVAL_LOGGER_NAME", "ANVIL-AGENT-EVALUATOR")
    logger = get_logger(LoggerConfig(root_name=root_name))

    try:
        eval_dir, root_dir = _resolve_workspace_paths()
        config = _build_anvil_config(agent_eval_dir=eval_dir,
                                     workspace_root=root_dir)
    except RuntimeError as exc:
        raise SystemExit(str(exc)) from exc

    oracle_classes = (
        OracleEnvSetup,
        OracleArtifactBuild,
        OracleBenchmarkPrep,
        OracleExperimentRuns,
    )

    results: Dict[str, int] = {}
    score = 0
    # Each oracle is constructed immediately before it runs, so construction
    # and execution stay interleaved in the listed order.
    for oracle_cls in oracle_classes:
        checker = oracle_cls(config=config, logger=logger)
        score += record_result(results,
                               type(checker).__name__,
                               checker.run(verbose=verbose))

    logger.info("Agent scores: %s", results)
    return score
Lines changed: 21 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,23 @@
1-
#!/usr/bin/env python3
2-
"""Artifact build oracle for the OSDI '24 ANVIL artifact.
1+
"""Artifact build oracle for ANVIL (OSDI'24).
32
43
Validates:
5-
- The ACTO dependency repository can build its required library target.
4+
- Required repository working directories exist
5+
- Build commands execute successfully
66
"""
77

88
from __future__ import annotations
99

10-
from collections.abc import Mapping, Sequence
11-
from dataclasses import dataclass, field
1210
import logging
13-
from pathlib import Path
11+
from collections.abc import Sequence
1412

13+
from evaluator import utils
1514
from evaluator.oracle_artifact_build_primitives import (
1615
BuildCommandRequirement,
17-
BuildRequirement,
1816
OracleArtifactBuildBase,
1917
)
2018
from evaluator.utils import EntryConfig
2119

2220

23-
@dataclass(frozen = True, slots = True, kw_only = True)
24-
class BuildTarget:
25-
"""Declarative description of one build command to run."""
26-
27-
name: str
28-
cwd: Path
29-
command: Sequence[str]
30-
cwd_relative: Path | None = None
31-
optional: bool = False
32-
timeout_seconds: float = 60.0
33-
env_overrides: Mapping[str, str] = field(default_factory = dict)
34-
35-
def __post_init__(self) -> None:
36-
if not self.name:
37-
raise ValueError("BuildTarget.name must be non-empty")
38-
if not self.command:
39-
raise ValueError(f"{self.name}: command must be non-empty")
40-
if self.timeout_seconds <= 0:
41-
raise ValueError(f"{self.name}: timeout_seconds must be > 0")
42-
43-
object.__setattr__(self, "command", tuple(self.command))
44-
45-
4621
class OracleArtifactBuild(OracleArtifactBuildBase):
4722
"""Artifact build oracle for ANVIL."""
4823

@@ -51,40 +26,26 @@ def __init__(
5126
*,
5227
config: EntryConfig,
5328
logger: logging.Logger,
54-
targets: Sequence[BuildTarget] | None = None,
29+
targets: Sequence[BuildCommandRequirement] | None = None,
5530
) -> None:
56-
super().__init__(logger = logger)
31+
super().__init__(logger=logger)
5732
self._config = config
5833

59-
if targets is None:
60-
targets = self._default_targets()
61-
self._targets = tuple(targets)
34+
self._requirements = tuple(
35+
targets) if targets is not None else self._default_requirements()
6236

63-
names = [t.name for t in self._targets]
37+
names = [r.name for r in self._requirements]
6438
if len(names) != len(set(names)):
65-
raise ValueError(f"Duplicate build target names: {names!r}")
39+
raise ValueError(f"Duplicate build requirement names: {names!r}")
6640

67-
def _default_targets(self) -> tuple[BuildTarget, ...]:
41+
def _default_requirements(self) -> tuple[BuildCommandRequirement, ...]:
6842
acto_repo = self._config.repository_paths["osdi24-acto-dependency"]
69-
return (
70-
BuildTarget(
71-
name = "acto: make lib",
72-
cwd = acto_repo,
73-
command = ("make", "lib"),
74-
timeout_seconds = 60.0,
75-
),
76-
)
77-
78-
def requirements(self) -> Sequence[BuildRequirement]:
79-
return tuple(
80-
BuildCommandRequirement(
81-
name = t.name,
82-
optional = t.optional,
83-
cwd = t.cwd,
84-
command = t.command,
85-
cwd_relative = t.cwd_relative,
86-
timeout_seconds = t.timeout_seconds,
87-
env_overrides = t.env_overrides,
88-
)
89-
for t in self._targets
90-
)
43+
return (BuildCommandRequirement(
44+
name="acto: make lib",
45+
cwd=acto_repo,
46+
command=("make", "lib"),
47+
timeout_seconds=60.0,
48+
),)
49+
50+
def requirements(self) -> Sequence[utils.BaseRequirement]:
51+
return self._requirements

0 commit comments

Comments
 (0)