Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 114 additions & 47 deletions benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/main.py
Original file line number Diff line number Diff line change
@@ -1,83 +1,150 @@
#!/usr/bin/env python3
"""Runs environment setup checks for ANVIL."""
"""Runs environment setup, build, benchmark prep, and experiment runs checks for ANVIL (OSDI'24)."""

from __future__ import annotations

import os
import sys
from pathlib import Path
from typing import Dict
import os
import sys

_AGENT_EVAL_DIR = Path(__file__).resolve().parent
_AGENT_SRC_DIR = _AGENT_EVAL_DIR.parents[3] / "src"
sys.path.append(str(_AGENT_SRC_DIR))

from oracle_env_setup import OracleEnvSetup
from oracle_artifact_build import OracleArtifactBuild
from oracle_benchmark_prep import OracleBenchmarkPrep
from oracle_experiment_runs import OracleExperimentRuns
from evaluator.utils import (
EntryConfig,
LoggerConfig,
get_logger,
record_result,
)
from oracle_env_setup import OracleEnvSetup
from oracle_artifact_build import OracleArtifactBuild
from oracle_benchmark_prep import OracleBenchmarkPrep
from oracle_experiment_runs import OracleExperimentRuns

# Reuse the same constants the legacy oracle used.
from utils import RESULTS_PATH, SIMILARITY_RATIO # pylint: disable=wrong-import-position


# Module-level evaluation config handed to every oracle. All filesystem
# locations except the table3 results path are anchored under
# ~/osdi24_anvil; the results path and the similarity ratio come from the
# constants imported from `utils`.
ANVIL_CONFIG = EntryConfig(
name="osdi24-anvil",
home_dir=Path.home() / "osdi24_anvil",
# Checkouts the oracles look at: the ANVIL repo and its ACTO dependency.
repository_paths={
"osdi24-anvil": Path.home() / "osdi24_anvil" / "anvil",
"osdi24-acto-dependency": Path.home() / "osdi24_anvil" / "acto",
},
# Results location for "table3", taken from utils.RESULTS_PATH.
results_paths={
"table3": Path(RESULTS_PATH),
},
# Reference JSON for "table3", stored under _agent_eval/refs/.
ground_truth_paths={
"table3": (
Path.home()
/ "osdi24_anvil"
/ "_agent_eval"
/ "refs"
/ "anvil-table-3.ref.json"
),
},
# NOTE(review): presumably the threshold for results-vs-reference
# comparison; confirm against the oracle that consumes it.
similarity_ratio=SIMILARITY_RATIO,
)

def _resolve_workspace_paths() -> tuple[Path, Path]:
    """Resolve and validate _agent_eval/ and the ANVIL workspace root.

    Expects either:
      (1) _agent_eval/ and (anvil/, acto/) are located in the same root
          directory; or
      (2) _AGENT_EVAL_DIR and _ANVIL_HOME are set by the user.

    An unset or empty environment variable falls back to the layout-based
    default: the directory containing this file for _agent_eval/, and its
    parent for the workspace root.

    Returns:
      A ``(agent_eval_dir, workspace_root)`` pair of resolved paths.

    Raises:
      RuntimeError: if _agent_eval/, anvil/ or acto/ is not an existing
        directory, or if an OSError occurs while inspecting the paths.
    """
    env_agent_eval = os.environ.get("_AGENT_EVAL_DIR")
    env_anvil_home = os.environ.get("_ANVIL_HOME")

    try:
        if env_agent_eval:
            agent_eval_dir = Path(env_agent_eval).expanduser().resolve()
        else:
            agent_eval_dir = Path(__file__).resolve().parent

        if env_anvil_home:
            workspace_root = Path(env_anvil_home).expanduser().resolve()
        else:
            workspace_root = agent_eval_dir.parent.resolve()

        # is_dir() is already False for a missing path, so a separate
        # exists() probe would only repeat the same filesystem lookup.
        if not agent_eval_dir.is_dir():
            raise RuntimeError(
                f"Invalid _agent_eval dir: {agent_eval_dir}\n"
                f"This runner expects _agent_eval/ to exist.\n"
                f"Set _AGENT_EVAL_DIR to the directory containing main.py if needed.")

        # Both checkouts must sit directly under the workspace root; anvil/
        # is checked first so a completely wrong root reports anvil/.
        for repo_name in ("anvil", "acto"):
            repo_root = workspace_root / repo_name
            if not repo_root.is_dir():
                raise RuntimeError(
                    f"Invalid ANVIL workspace: {workspace_root}\n"
                    f"Expected to find an '{repo_name}/' directory at: {repo_root}\n"
                    f"This runner expects _agent_eval/ and {repo_name}/ to be located in the same root directory.\n"
                    f"Set _ANVIL_HOME to the workspace root if needed.")
    except OSError as exc:
        # Surface filesystem failures (e.g. permission errors) with the same
        # exception type as the validation failures above.
        raise RuntimeError(f"Failed to resolve workspace paths: {exc}") from exc

    return agent_eval_dir, workspace_root


def _build_anvil_config(*, agent_eval_dir: Path,
                        workspace_root: Path) -> EntryConfig:
    """Construct EntryConfig for the ANVIL evaluation bundle from resolved paths.

    Environment overrides (an unset or empty value selects the default):
      _ANVIL_TABLE3_RESULTS: path of the table3 results file; defaults to
        ``<workspace_root>/anvil/results/table3.md``.
      _ANVIL_SIMILARITY_RATIO: value passed as ``similarity_ratio``; parsed
        with ``float`` and defaulting to 0.75.

    Args:
      agent_eval_dir: directory holding this runner and its ``refs/`` folder.
      workspace_root: directory containing the ``anvil/`` and ``acto/``
        checkouts.

    Returns:
      The EntryConfig shared by all oracles.

    Raises:
      RuntimeError: if _ANVIL_SIMILARITY_RATIO is not a valid float.
    """
    anvil_repo = (workspace_root / "anvil").resolve()
    acto_repo = (workspace_root / "acto").resolve()
    refs_dir = (agent_eval_dir.resolve() / "refs").resolve()

    # Truthiness test instead of a .get() default: an empty override would
    # otherwise become Path("") and silently resolve to the current directory.
    table3_override = os.environ.get("_ANVIL_TABLE3_RESULTS")
    if table3_override:
        table3_results = Path(table3_override).expanduser().resolve()
    else:
        table3_results = (anvil_repo / "results" / "table3.md").resolve()

    raw_ratio = os.environ.get("_ANVIL_SIMILARITY_RATIO") or "0.75"
    try:
        similarity_ratio = float(raw_ratio)
    except ValueError as exc:
        # RuntimeError matches the failure type of workspace resolution, so
        # a caller handling one configuration error handles this one too.
        raise RuntimeError(
            f"Invalid _ANVIL_SIMILARITY_RATIO: {raw_ratio!r} "
            f"(expected a number, e.g. 0.75)") from exc

    return EntryConfig(
        name="osdi24-anvil",
        home_dir=workspace_root,
        repository_paths={
            "osdi24-anvil": anvil_repo,
            "osdi24-acto-dependency": acto_repo,
        },
        results_paths={
            "table3": table3_results,
        },
        ground_truth_paths={
            "table3": (refs_dir / "anvil-table-3.ref.json").resolve(),
            "osdi24-acto-dependency.expected_branch":
                (refs_dir / "acto.expected_branch.txt").resolve(),
            "osdi24-acto-dependency.expected_head":
                (refs_dir / "acto.expected_head.txt").resolve(),
        },
        similarity_ratio=similarity_ratio,
    )


def main(argv: list[str]) -> int:
    """Run each ANVIL oracle exactly once and return the accumulated score.

    The oracles run in a fixed order: environment setup, artifact build,
    benchmark prep, experiment runs. Each outcome is passed to
    ``record_result`` and the values it returns are summed.

    Args:
      argv: command-line arguments; the presence of ``--verbose`` turns on
        verbose oracle runs.

    Returns:
      The sum of the values returned by ``record_result`` for the oracles.

    Raises:
      SystemExit: with the error text if the workspace cannot be resolved
        or the configuration cannot be built.
    """
    verbose = "--verbose" in argv

    results: dict[str, int] = {}
    score = 0

    logger_name = os.environ.get("EVAL_LOGGER_NAME", "ANVIL-AGENT-EVALUATOR")
    logger = get_logger(LoggerConfig(root_name=logger_name))

    try:
        agent_eval_dir, workspace_root = _resolve_workspace_paths()
        config = _build_anvil_config(agent_eval_dir=agent_eval_dir,
                                     workspace_root=workspace_root)
    except RuntimeError as exc:
        # Exit with the plain message rather than a traceback.
        raise SystemExit(str(exc)) from exc

    # One pass per oracle: running a checker twice would repeat its work and
    # count its result twice in the score.
    oracle_classes = (
        OracleEnvSetup,
        OracleArtifactBuild,
        OracleBenchmarkPrep,
        OracleExperimentRuns,
    )
    for oracle_cls in oracle_classes:
        checker = oracle_cls(config=config, logger=logger)
        score += record_result(results,
                               type(checker).__name__,
                               checker.run(verbose=verbose))

    logger.info("Agent scores: %s", results)
    return score
Expand Down
Original file line number Diff line number Diff line change
@@ -1,48 +1,23 @@
#!/usr/bin/env python3
"""Artifact build oracle for the OSDI '24 ANVIL artifact.
"""Artifact build oracle for ANVIL (OSDI'24).

Validates:
- The ACTO dependency repository can build its required library target.
- Required repository working directories exist
- Build commands execute successfully
"""

from __future__ import annotations

from collections.abc import Mapping, Sequence
from dataclasses import dataclass, field
import logging
from pathlib import Path
from collections.abc import Sequence

from evaluator import utils
from evaluator.oracle_artifact_build_primitives import (
BuildCommandRequirement,
BuildRequirement,
OracleArtifactBuildBase,
)
from evaluator.utils import EntryConfig


@dataclass(frozen = True, slots = True, kw_only = True)
class BuildTarget:
"""Declarative description of one build command to run."""

name: str
cwd: Path
command: Sequence[str]
cwd_relative: Path | None = None
optional: bool = False
timeout_seconds: float = 60.0
env_overrides: Mapping[str, str] = field(default_factory = dict)

def __post_init__(self) -> None:
if not self.name:
raise ValueError("BuildTarget.name must be non-empty")
if not self.command:
raise ValueError(f"{self.name}: command must be non-empty")
if self.timeout_seconds <= 0:
raise ValueError(f"{self.name}: timeout_seconds must be > 0")

object.__setattr__(self, "command", tuple(self.command))


class OracleArtifactBuild(OracleArtifactBuildBase):
"""Artifact build oracle for ANVIL."""

Expand All @@ -51,40 +26,26 @@ def __init__(
*,
config: EntryConfig,
logger: logging.Logger,
targets: Sequence[BuildTarget] | None = None,
targets: Sequence[BuildCommandRequirement] | None = None,
) -> None:
super().__init__(logger = logger)
super().__init__(logger=logger)
self._config = config

if targets is None:
targets = self._default_targets()
self._targets = tuple(targets)
self._requirements = tuple(
targets) if targets is not None else self._default_requirements()

names = [t.name for t in self._targets]
names = [r.name for r in self._requirements]
if len(names) != len(set(names)):
raise ValueError(f"Duplicate build target names: {names!r}")
raise ValueError(f"Duplicate build requirement names: {names!r}")

def _default_targets(self) -> tuple[BuildTarget, ...]:
def _default_requirements(self) -> tuple[BuildCommandRequirement, ...]:
acto_repo = self._config.repository_paths["osdi24-acto-dependency"]
return (
BuildTarget(
name = "acto: make lib",
cwd = acto_repo,
command = ("make", "lib"),
timeout_seconds = 60.0,
),
)

def requirements(self) -> Sequence[BuildRequirement]:
return tuple(
BuildCommandRequirement(
name = t.name,
optional = t.optional,
cwd = t.cwd,
command = t.command,
cwd_relative = t.cwd_relative,
timeout_seconds = t.timeout_seconds,
env_overrides = t.env_overrides,
)
for t in self._targets
)
return (BuildCommandRequirement(
name="acto: make lib",
cwd=acto_repo,
command=("make", "lib"),
timeout_seconds=60.0,
),)

def requirements(self) -> Sequence[utils.BaseRequirement]:
return self._requirements
Loading
Loading