diff --git a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/main.py b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/main.py index b9910621..b4f0c74a 100644 --- a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/main.py +++ b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/main.py @@ -1,83 +1,150 @@ #!/usr/bin/env python3 -"""Runs environment setup checks for ANVIL.""" +"""Runs environment setup, build, benchmark prep, and experiment runs checks for ANVIL (OSDI'24).""" from __future__ import annotations -import os -import sys from pathlib import Path from typing import Dict +import os +import sys _AGENT_EVAL_DIR = Path(__file__).resolve().parent _AGENT_SRC_DIR = _AGENT_EVAL_DIR.parents[3] / "src" sys.path.append(str(_AGENT_SRC_DIR)) -from oracle_env_setup import OracleEnvSetup -from oracle_artifact_build import OracleArtifactBuild -from oracle_benchmark_prep import OracleBenchmarkPrep -from oracle_experiment_runs import OracleExperimentRuns from evaluator.utils import ( EntryConfig, LoggerConfig, get_logger, record_result, ) +from oracle_env_setup import OracleEnvSetup +from oracle_artifact_build import OracleArtifactBuild +from oracle_benchmark_prep import OracleBenchmarkPrep +from oracle_experiment_runs import OracleExperimentRuns -# Reuse the same constants the legacy oracle used. 
-from utils import RESULTS_PATH, SIMILARITY_RATIO # pylint: disable=wrong-import-position - - -ANVIL_CONFIG = EntryConfig( - name="osdi24-anvil", - home_dir=Path.home() / "osdi24_anvil", - repository_paths={ - "osdi24-anvil": Path.home() / "osdi24_anvil" / "anvil", - "osdi24-acto-dependency": Path.home() / "osdi24_anvil" / "acto", - }, - results_paths={ - "table3": Path(RESULTS_PATH), - }, - ground_truth_paths={ - "table3": ( - Path.home() - / "osdi24_anvil" - / "_agent_eval" - / "refs" - / "anvil-table-3.ref.json" - ), - }, - similarity_ratio=SIMILARITY_RATIO, -) + +def _resolve_workspace_paths() -> tuple[Path, Path]: + """Resolve and validate _agent_eval/ and the ANVIL workspace root. + + Expects either: + (1) _agent_eval/ and (anvil/, acto/) are located in the same root directory; or + (2) _AGENT_EVAL_DIR and _ANVIL_HOME are set by the user. + """ + try: + env_agent_eval = os.environ.get("_AGENT_EVAL_DIR") + env_anvil_home = os.environ.get("_ANVIL_HOME") + + if env_agent_eval: + agent_eval_dir = Path(env_agent_eval).expanduser().resolve() + else: + agent_eval_dir = Path(__file__).resolve().parent + + if env_anvil_home: + workspace_root = Path(env_anvil_home).expanduser().resolve() + else: + workspace_root = agent_eval_dir.parent.resolve() + + if not agent_eval_dir.exists() or not agent_eval_dir.is_dir(): + raise RuntimeError( + f"Invalid _agent_eval dir: {agent_eval_dir}\n" + f"This runner expects _agent_eval/ to exist.\n" + f"Set _AGENT_EVAL_DIR to the directory containing main.py if needed.") + + anvil_repo_root = workspace_root / "anvil" + if not anvil_repo_root.exists() or not anvil_repo_root.is_dir(): + raise RuntimeError( + f"Invalid ANVIL workspace: {workspace_root}\n" + f"Expected to find an 'anvil/' directory at: {anvil_repo_root}\n" + f"This runner expects _agent_eval/ and anvil/ to be located in the same root directory.\n" + f"Set _ANVIL_HOME to the workspace root if needed.") + + acto_repo_root = workspace_root / "acto" + if not 
acto_repo_root.exists() or not acto_repo_root.is_dir(): + raise RuntimeError( + f"Invalid ANVIL workspace: {workspace_root}\n" + f"Expected to find an 'acto/' directory at: {acto_repo_root}\n" + f"This runner expects _agent_eval/ and acto/ to be located in the same root directory.\n" + f"Set _ANVIL_HOME to the workspace root if needed.") + + return agent_eval_dir, workspace_root + + except OSError as exc: + raise RuntimeError(f"Failed to resolve workspace paths: {exc}") from exc + + +def _build_anvil_config(*, agent_eval_dir: Path, + workspace_root: Path) -> EntryConfig: + """Construct EntryConfig for the ANVIL evaluation bundle from resolved paths.""" + anvil_repo = (workspace_root / "anvil").resolve() + acto_repo = (workspace_root / "acto").resolve() + + agent_eval_dir = agent_eval_dir.resolve() + refs_dir = (agent_eval_dir / "refs").resolve() + + default_table3_results = (anvil_repo / "results" / "table3.md").resolve() + table3_results = Path( + os.environ.get("_ANVIL_TABLE3_RESULTS", + str(default_table3_results))).expanduser().resolve() + + similarity_ratio = float(os.environ.get("_ANVIL_SIMILARITY_RATIO", "0.75")) + + return EntryConfig( + name="osdi24-anvil", + home_dir=workspace_root, + repository_paths={ + "osdi24-anvil": anvil_repo, + "osdi24-acto-dependency": acto_repo, + }, + results_paths={ + "table3": table3_results, + }, + ground_truth_paths={ + "table3": (refs_dir / "anvil-table-3.ref.json").resolve(), + "osdi24-acto-dependency.expected_branch": + (refs_dir / "acto.expected_branch.txt").resolve(), + "osdi24-acto-dependency.expected_head": + (refs_dir / "acto.expected_head.txt").resolve(), + }, + similarity_ratio=similarity_ratio, + ) def main(argv: list[str]) -> int: + verbose = "--verbose" in argv + results: Dict[str, int] = {} score = 0 - verbose = "--verbose" in argv - - logger_name = os.environ.get("EVAL_LOGGER_NAME", "ANVIL-EVAL") + logger_name = os.environ.get("EVAL_LOGGER_NAME", "ANVIL-AGENT-EVALUATOR") logger = 
get_logger(LoggerConfig(root_name=logger_name)) + try: + agent_eval_dir, workspace_root = _resolve_workspace_paths() + ANVIL_CONFIG = _build_anvil_config(agent_eval_dir=agent_eval_dir, + workspace_root=workspace_root) + except RuntimeError as exc: + raise SystemExit(str(exc)) from exc + env_checker = OracleEnvSetup(config=ANVIL_CONFIG, logger=logger) - score += record_result( - results, type(env_checker).__name__, env_checker.run(verbose=verbose) - ) + score += record_result(results, + type(env_checker).__name__, + env_checker.run(verbose=verbose)) build_checker = OracleArtifactBuild(config=ANVIL_CONFIG, logger=logger) - score += record_result( - results, type(build_checker).__name__, build_checker.run(verbose=verbose) - ) + score += record_result(results, + type(build_checker).__name__, + build_checker.run(verbose=verbose)) prep_checker = OracleBenchmarkPrep(config=ANVIL_CONFIG, logger=logger) - score += record_result( - results, type(prep_checker).__name__, prep_checker.run(verbose=verbose) - ) + score += record_result(results, + type(prep_checker).__name__, + prep_checker.run(verbose=verbose)) runs_checker = OracleExperimentRuns(config=ANVIL_CONFIG, logger=logger) - score += record_result( - results, type(runs_checker).__name__, runs_checker.run(verbose=verbose) - ) + score += record_result(results, + type(runs_checker).__name__, + runs_checker.run(verbose=verbose)) logger.info("Agent scores: %s", results) return score diff --git a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_artifact_build.py b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_artifact_build.py index 3554c528..84d176d1 100644 --- a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_artifact_build.py +++ b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_artifact_build.py @@ -1,48 +1,23 @@ -#!/usr/bin/env python3 -"""Artifact build oracle for the OSDI '24 ANVIL artifact. 
+"""Artifact build oracle for ANVIL (OSDI'24). Validates: - - The ACTO dependency repository can build its required library target. + - Required repository working directories exist + - Build commands execute successfully """ from __future__ import annotations -from collections.abc import Mapping, Sequence -from dataclasses import dataclass, field import logging -from pathlib import Path +from collections.abc import Sequence +from evaluator import utils from evaluator.oracle_artifact_build_primitives import ( BuildCommandRequirement, - BuildRequirement, OracleArtifactBuildBase, ) from evaluator.utils import EntryConfig -@dataclass(frozen = True, slots = True, kw_only = True) -class BuildTarget: - """Declarative description of one build command to run.""" - - name: str - cwd: Path - command: Sequence[str] - cwd_relative: Path | None = None - optional: bool = False - timeout_seconds: float = 60.0 - env_overrides: Mapping[str, str] = field(default_factory = dict) - - def __post_init__(self) -> None: - if not self.name: - raise ValueError("BuildTarget.name must be non-empty") - if not self.command: - raise ValueError(f"{self.name}: command must be non-empty") - if self.timeout_seconds <= 0: - raise ValueError(f"{self.name}: timeout_seconds must be > 0") - - object.__setattr__(self, "command", tuple(self.command)) - - class OracleArtifactBuild(OracleArtifactBuildBase): """Artifact build oracle for ANVIL.""" @@ -51,40 +26,26 @@ def __init__( *, config: EntryConfig, logger: logging.Logger, - targets: Sequence[BuildTarget] | None = None, + targets: Sequence[BuildCommandRequirement] | None = None, ) -> None: - super().__init__(logger = logger) + super().__init__(logger=logger) self._config = config - if targets is None: - targets = self._default_targets() - self._targets = tuple(targets) + self._requirements = tuple( + targets) if targets is not None else self._default_requirements() - names = [t.name for t in self._targets] + names = [r.name for r in self._requirements] if 
len(names) != len(set(names)): - raise ValueError(f"Duplicate build target names: {names!r}") + raise ValueError(f"Duplicate build requirement names: {names!r}") - def _default_targets(self) -> tuple[BuildTarget, ...]: + def _default_requirements(self) -> tuple[BuildCommandRequirement, ...]: acto_repo = self._config.repository_paths["osdi24-acto-dependency"] - return ( - BuildTarget( - name = "acto: make lib", - cwd = acto_repo, - command = ("make", "lib"), - timeout_seconds = 60.0, - ), - ) - - def requirements(self) -> Sequence[BuildRequirement]: - return tuple( - BuildCommandRequirement( - name = t.name, - optional = t.optional, - cwd = t.cwd, - command = t.command, - cwd_relative = t.cwd_relative, - timeout_seconds = t.timeout_seconds, - env_overrides = t.env_overrides, - ) - for t in self._targets - ) \ No newline at end of file + return (BuildCommandRequirement( + name="acto: make lib", + cwd=acto_repo, + command=("make", "lib"), + timeout_seconds=60.0, + ),) + + def requirements(self) -> Sequence[utils.BaseRequirement]: + return self._requirements diff --git a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_benchmark_prep.py b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_benchmark_prep.py index 0e274242..be1bf606 100644 --- a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_benchmark_prep.py +++ b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_benchmark_prep.py @@ -1,140 +1,105 @@ -#!/usr/bin/env python3 -import sys -import subprocess -from pathlib import Path - -from utils import REPO_DIRS, logger - - -class OracleBenchmarkPrep: - - def __init__(self): - self.repo_root = Path(REPO_DIRS["acto"]) - self.expected_remote = "https://github.com/xlab-uiuc/acto.git" - self.expected_branch = "anvil-dev" - - def run_shell_command(self, cmd): - """ - Run a command and return (rc, stdout, stderr) tuple. 
- """ - try: - cp = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, text = True) - return cp.returncode, (cp.stdout or "").strip(), (cp.stderr or "").strip() - except FileNotFoundError as e: - return 127, "", str(e) - - def check_repo_exists(self): - """ - Check that repository root exists and is a git working tree. - """ - if not self.repo_root.is_dir(): - return False, f"acto: FAIL (repo) - directory not found: {self.repo_root}" - - rc, out, err = self.run_shell_command( - ["git", "-C", str(self.repo_root), "rev-parse", "--is-inside-work-tree"] - ) - if rc != 0 or out != "true": - return False, f"acto: FAIL (repo) - not a git working tree: {err or out}" - - return True, "acto: PASS (repo) - git working tree present" - - def check_remote_origin(self): - """ - Check that remote matches the expected repository URL. - """ - rc, out, err = self.run_shell_command( - ["git", "-C", str(self.repo_root), "remote", "get-url", "origin"] - ) - if rc != 0: - return False, f"acto: FAIL (remote) - cannot read origin remote: {err or out}" - - origin_url = (out or "").strip() - def normalize(url: str) -> str: - return url[:-4] if url.endswith(".git") else url - - if normalize(origin_url) != normalize(self.expected_remote): - return False, ( - "acto: FAIL (remote) - origin URL " - f"{origin_url!r} does not match expected {self.expected_remote!r}" - ) - - return True, f"acto: PASS (remote) - origin URL matches {self.expected_remote}" - - def check_branch_and_head(self): - """ - Check that the current branch is the expected one and that the current - commit resolves to a valid hash. - """ - rc, out, err = self.run_shell_command( - ["git", "-C", str(self.repo_root), "rev-parse", "--abbrev-ref", "HEAD"] - ) - if rc != 0: - return False, f"acto: FAIL (branch) - cannot read current branch: {err or out}" +"""Benchmark preparation oracle for ANVIL (OSDI'24). 
- branch = (out or "").strip() - if branch != self.expected_branch: - return False, f"acto: FAIL (branch) - {branch!r} != expected {self.expected_branch!r}" +Validates: + - Target repository working directory exists + - Repository was cloned and is a valid git working tree + - Current branch matches the expected branch + - Current HEAD commit matches the expected revision +""" - rc, out, err = self.run_shell_command( - ["git", "-C", str(self.repo_root), "rev-parse", "HEAD"] - ) - if rc != 0: - return False, f"acto: FAIL (commit) - cannot read HEAD: {err or out}" - - head = (out or "").strip() - if not head: - return False, "acto: FAIL (commit) - empty HEAD hash" - - return True, f"acto: PASS (branch/commit) - {branch}@{head[:12]}" - - def check_submodules_recursive(self): - """ - Check that submodules (if any) are initialized, approximating a --recursive clone. - """ - gitmodules = self.repo_root / ".gitmodules" - if not gitmodules.exists(): - # No submodules configured; nothing to check - return True, "acto: PASS (submodules) - no submodules configured" - - rc, out, err = self.run_shell_command( - ["git", "-C", str(self.repo_root), "submodule", "status", "--recursive"] - ) - if rc != 0: - return False, f"acto: FAIL (submodules) - git submodule status failed: {err or out}" - - # Heuristic: lines starting with '-' indicate uninitialized submodules - uninitialized = [line for line in out.splitlines() if line.startswith("-")] - if uninitialized: - return False, ( - "acto: FAIL (submodules) - uninitialized submodules present " - "(clone may have been done without --recursive)" - ) - - return True, "acto: PASS (submodules) - all submodules initialized" - - def run(self): - """ - Run all repository checks and return True on overall success. 
- """ - results: list[bool] = [] - - ok, msg = self.check_repo_exists() - logger.info(msg) - results.append(ok) - - ok, msg = self.check_remote_origin() - logger.info(msg) - results.append(ok) - - ok, msg = self.check_branch_and_head() - logger.info(msg) - results.append(ok) - - ok, msg = self.check_submodules_recursive() - logger.info(msg) - results.append(ok) - - if all(results): - return True - - return False \ No newline at end of file +from __future__ import annotations +from pathlib import Path +from typing import Sequence + +from evaluator import utils +from evaluator.utils import EntryConfig +from evaluator.benchmark_prep_primitives import OracleBenchmarkPrepBase, BenchmarkRequirement, FailRequirement + + +class OracleBenchmarkPrep(OracleBenchmarkPrepBase): + + def __init__(self, *, config: EntryConfig, logger) -> None: + super().__init__(logger=logger) + self._config = config + self._ORACLE_NAME = f"BenchmarkPrep/{config.name}" + + repo = None + for _k, p in config.repository_paths.items(): + if p.name.lower() == "acto": + repo = p + break + if repo is None and config.repository_paths: + repo = next(iter(config.repository_paths.values())) + self._repo_root = repo + + self._expected_branch = None + self._expected_head = None + if repo is not None: + repo_id = next(k for k, v in config.repository_paths.items() if v == repo) + bpath = config.ground_truth_paths.get(f"{repo_id}.expected_branch") + hpath = config.ground_truth_paths.get(f"{repo_id}.expected_head") + if bpath: + self._expected_branch = Path(bpath).read_text(encoding="utf-8").strip() + if hpath: + self._expected_head = Path(hpath).read_text(encoding="utf-8").strip() + + def requirements(self) -> Sequence[utils.BaseRequirement]: + if self._repo_root is None: + return (FailRequirement(name="select repo", + message="No repository_paths configured"),) + + reqs: list[utils.BaseRequirement] = [] + + # Check that ACTO directory exists + reqs.append( + BenchmarkRequirement( + name="repo directory exists", 
+ filepath=self._repo_root, + )) + + # Check that ACTO repository has been cloned correctly + reqs.append( + BenchmarkRequirement( + name="git working tree", + filepath=self._repo_root, + cmd=("git", "rev-parse", "--is-inside-work-tree"), + signature="true", + timeout_seconds=10.0, + )) + + # Check that ACTO branch matches + if not self._expected_branch: + reqs.append( + FailRequirement( + name="expected branch configured", + message= + "Missing expected branch in EntryConfig.ground_truth_paths", + )) + else: + reqs.append( + BenchmarkRequirement( + name="on expected branch", + filepath=self._repo_root, + cmd=("git", "rev-parse", "--abbrev-ref", "HEAD"), + signature=self._expected_branch, + timeout_seconds=10.0, + )) + + # Check that ACTO commit SHA matches + if not self._expected_head: + reqs.append( + FailRequirement( + name="expected head configured", + message="Missing expected head in EntryConfig.ground_truth_paths", + )) + else: + reqs.append( + BenchmarkRequirement( + name="HEAD matches expected", + filepath=self._repo_root, + cmd=("git", "rev-parse", "HEAD"), + signature=self._expected_head, + timeout_seconds=10.0, + )) + + return tuple(reqs) diff --git a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_env_setup.py b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_env_setup.py index 8bef40db..21c9897a 100644 --- a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_env_setup.py +++ b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_env_setup.py @@ -1,138 +1,95 @@ -#!/usr/bin/env python3 -"""Environment setup oracle for the ANVIL bundle. +"""Environment setup oracle for ANVIL (OSDI'24). -This implementation uses evaluator.oracle_env_setup_primitives for consistent -reporting and verbose failure logging. 
+Validates: + - Required workspace and repository directories exist + - Required reference (ground-truth) files exist + - Required external tooling is available and satisfies minimum version constraints """ from __future__ import annotations -import dataclasses -import logging -import shutil -from collections.abc import Sequence +from collections.abc import Mapping, Sequence from pathlib import Path +import logging -from evaluator.utils import CheckResult, EntryConfig +from evaluator.utils import EntryConfig from evaluator.oracle_env_setup_primitives import ( DependencyVersionRequirement, - EnvironmentVariableRequirement, - EnvQuantifier, FilesystemPathRequirement, OracleEnvSetupBase, PathType, - Requirement, VersionCompare, ) -@dataclasses.dataclass(frozen = True, slots = True, kw_only = True) -class ExecutableOnPathRequirement(Requirement): - """Checks that an executable is present on PATH (no version constraint).""" - - executable: str - - def __post_init__(self) -> None: - if not self.executable: - raise ValueError(f"{self.name}: executable must be non-empty") - - def check(self) -> CheckResult: - if shutil.which(self.executable) is None: - return CheckResult.failure(f"not found on PATH: {self.executable!r}") - return CheckResult.success() +def _required_path(paths: Mapping[str, Path], key: str, *, label: str) -> Path: + """Fetches a required path from an EntryConfig mapping with a clear error.""" + try: + return paths[key] + except KeyError as exc: + raise ValueError(f"Missing {label}[{key!r}] in EntryConfig") from exc class OracleEnvSetup(OracleEnvSetupBase): - """Validates environment prerequisites for the ANVIL bundle.""" + """Validates that the ANVIL workspace and dependencies are present.""" + + _ORACLE_NAME = "EnvironmentSetup" def __init__(self, *, config: EntryConfig, logger: logging.Logger) -> None: - super().__init__(logger = logger) + super().__init__(logger=logger) self._config = config - def requirements(self) -> Sequence[Requirement]: - home_dir 
= self._config.home_dir - venv_dir = home_dir / ".venv" - go_root = Path.home() / "go" - go_bin = go_root / "bin" - - reqs: list[Requirement] = [ - # Check dependencies - DependencyVersionRequirement( - name = "docker", - command = ("docker", "--version"), - required_version = (24, 0, 0), - compare = VersionCompare.GEQ, - ), - DependencyVersionRequirement( - name = "go", - command = ("go", "version"), - required_version = (1, 22, 0), - compare = VersionCompare.GEQ, - version_regex = r"go(\d+\.\d+(?:\.\d+)?)", - ), - DependencyVersionRequirement( - name = "python3", - command = ("python3", "--version"), - required_version = (3, 10, 0), - compare = VersionCompare.GEQ, - version_regex = r"Python\s+([0-9.]+)", - ), - DependencyVersionRequirement( - name = "pip3", - command = ("pip3", "--version"), - required_version = (24, 0, 0), - compare = VersionCompare.GEQ, - ), - DependencyVersionRequirement( - name = "kind", - command = ("kind", "version"), - required_version = (0, 20, 0), - compare = VersionCompare.GEQ, - version_regex = r"v([0-9.]+)", - ), - DependencyVersionRequirement( - name = "kubectl", - command = ("kubectl", "version", "--client", "--short"), - required_version = (1, 22, 9), - compare = VersionCompare.GEQ, - version_regex = r"Client Version:\s+v?([0-9.]+)", + def requirements( + self + ) -> Sequence[FilesystemPathRequirement | DependencyVersionRequirement]: + cfg = self._config + + if not cfg.repository_paths: + raise ValueError("EntryConfig.repository_paths must be non-empty") + if not cfg.ground_truth_paths: + raise ValueError("EntryConfig.ground_truth_paths must be non-empty") + + anvil_repo = _required_path(cfg.repository_paths, + "osdi24-anvil", + label="repository_paths") + acto_repo = _required_path(cfg.repository_paths, + "osdi24-acto-dependency", + label="repository_paths") + + table3_ref = _required_path(cfg.ground_truth_paths, + "table3", + label="ground_truth_paths") + + return ( + # Workspace and repository directory layout + 
FilesystemPathRequirement( + name="home_dir", + path=cfg.home_dir, + path_type=PathType.DIRECTORY, ), - - # Check directory structure FilesystemPathRequirement( - name = "venv_exists", - path = venv_dir, - path_type = PathType.DIRECTORY, + name="repo_osdi24_anvil", + path=anvil_repo, + path_type=PathType.DIRECTORY, ), FilesystemPathRequirement( - name = "go_root_exists", - path = go_root, - path_type = PathType.DIRECTORY, + name="repo_osdi24_acto_dependency", + path=acto_repo, + path_type=PathType.DIRECTORY, ), - # Check PATH contents - EnvironmentVariableRequirement( - name = "PATH_contains_go_root", - env_var = "PATH", - expected = str(go_root), - quantifier = EnvQuantifier.CONTAINS, + # Reference artifacts used for evaluation + FilesystemPathRequirement( + name="ref_table3", + path=table3_ref, + path_type=PathType.FILE, ), - EnvironmentVariableRequirement( - name = "PATH_contains_go_bin", - env_var = "PATH", - expected = str(go_bin), - quantifier = EnvQuantifier.CONTAINS, + + # Tooling dependencies + DependencyVersionRequirement( + name="python3_version", + cmd=("python3", "--version"), + required_version=(3, 10, 0), + compare=VersionCompare.GEQ, ), - ] - - # Check that the repo root directory is present - for key, repo_root in sorted(self._config.repository_paths.items()): - reqs.append( - FilesystemPathRequirement( - name = f"repo_exists:{key}", - path = repo_root, - path_type = PathType.DIRECTORY, - ) - ) - - return reqs + ) diff --git a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_experiment_runs.py b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_experiment_runs.py index a9f5f1c6..93ecc0c6 100644 --- a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_experiment_runs.py +++ b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_experiment_runs.py @@ -1,24 +1,27 @@ -#!/usr/bin/env python3 -"""Experiment runs oracle for the OSDI'24 ANVIL artifact. 
+"""Experiment runs oracle for ANVIL (OSDI'24). -Validates results (tsble 3) against reference measurements by comparing -per-controller calues: - - mean ratio: verified_anvil_mean / reference_unverified_mean - - max ratio: verified_anvil_max / reference_unverified_max +Validates: + - Table 3 results file exists and is parseable + - Table 3 reference (ground-truth) JSON exists and is parseable + - Per-controller mean and max ratios (verified/reference) meet similarity thresholds """ from __future__ import annotations +import hashlib import json +import logging +import math from collections.abc import Mapping, Sequence from dataclasses import dataclass from pathlib import Path -import logging +from evaluator import utils from evaluator.oracle_experiment_runs_primitives import ( - ExperimentRunsRequirement, - LabeledSequenceSimilarityThresholdRequirement, - OracleExperimentRunsBase, + ElementwiseSimilarityThresholdRequirement, + ListSimilarityRequirement, + SimilarityMetric, + OracleExperimentRunsBase, ) from evaluator.utils import EntryConfig @@ -33,11 +36,11 @@ class TableRow: _EXPECTED_HEADERS: tuple[str, ...] 
= ( - "Controller", - "Verified (Anvil) Mean", - "Verified (Anvil) Max", - "Reference (unverified) Mean", - "Reference (unverified) Max", + "Controller", + "Verified (Anvil) Mean", + "Verified (Anvil) Max", + "Reference (unverified) Mean", + "Reference (unverified) Max", ) @@ -49,6 +52,12 @@ def _required_path(paths: Mapping[str, Path], key: str, *, label: str) -> Path: raise ValueError(f"Missing {label}[{key!r}] in EntryConfig") from exc +def _read_lines(path: Path, *, label: str) -> list[str]: + if not path.exists(): + raise ValueError(f"{label}: {path} not found") + return path.read_text(encoding="utf-8").splitlines() + + def _is_separator_line(line: str) -> bool: """Returns True if this looks like the Markdown header separator line.""" stripped = line.strip() @@ -93,7 +102,6 @@ def _parse_results_table_rows(lines: Sequence[str]) -> list[TableRow]: for line in lines: if "|" not in line: - # Not a table row. continue if header_line is None: @@ -117,32 +125,27 @@ def _parse_results_table_rows(lines: Sequence[str]) -> list[TableRow]: cells = _split_markdown_row(line) if len(cells) != len(_EXPECTED_HEADERS): raise ValueError( - f"Row has {len(cells)} cells, expected {len(_EXPECTED_HEADERS)}: {line!r}" + f"Row has {len(cells)} cells, expected {len(_EXPECTED_HEADERS)}: {line!r}" ) controller = cells[0] - verified_anvil_mean = _parse_float_token( - cells[1], label="Verified (Anvil) Mean" - ) - verified_anvil_max = _parse_float_token( - cells[2], label="Verified (Anvil) Max" - ) + verified_anvil_mean = _parse_float_token(cells[1], + label="Verified (Anvil) Mean") + verified_anvil_max = _parse_float_token(cells[2], + label="Verified (Anvil) Max") reference_unverified_mean = _parse_float_token( - cells[3], label="Reference (unverified) Mean" - ) + cells[3], label="Reference (unverified) Mean") reference_unverified_max = _parse_float_token( - cells[4], label="Reference (unverified) Max" - ) + cells[4], label="Reference (unverified) Max") rows.append( - TableRow( - 
controller=controller, - verified_anvil_mean=verified_anvil_mean, - verified_anvil_max=verified_anvil_max, - reference_unverified_mean=reference_unverified_mean, - reference_unverified_max=reference_unverified_max, - ) - ) + TableRow( + controller=controller, + verified_anvil_mean=verified_anvil_mean, + verified_anvil_max=verified_anvil_max, + reference_unverified_mean=reference_unverified_mean, + reference_unverified_max=reference_unverified_max, + )) return rows @@ -167,101 +170,104 @@ def _load_reference_rows(path: Path) -> list[TableRow]: try: rows.append( - TableRow( - controller=str(obj["controller"]), - verified_anvil_mean=float(obj["verified_anvil_mean"]), - verified_anvil_max=float(obj["verified_anvil_max"]), - reference_unverified_mean=float(obj["reference_unverified_mean"]), - reference_unverified_max=float(obj["reference_unverified_max"]), - ) - ) + TableRow( + controller=str(obj["controller"]), + verified_anvil_mean=float(obj["verified_anvil_mean"]), + verified_anvil_max=float(obj["verified_anvil_max"]), + reference_unverified_mean=float(obj["reference_unverified_mean"]), + reference_unverified_max=float(obj["reference_unverified_max"]), + )) except (KeyError, TypeError, ValueError) as exc: raise ValueError(f"{path} malformed entry #{idx}: {exc}") from exc return rows -def _results_mean_ratio_pairs(lines: Sequence[str]) -> list[tuple[str, float]]: - """Returns (controller, mean_ratio) from results table.""" - rows = _parse_results_table_rows(lines) - out: list[tuple[str, float]] = [] +def _ratios_by_controller( + rows: Sequence[TableRow]) -> dict[str, tuple[float, float]]: + """Returns controller -> (mean_ratio, max_ratio).""" + out: dict[str, tuple[float, float]] = {} for r in rows: - mean_ratio, _ = _compute_ratios(r) - out.append((r.controller, mean_ratio)) + if r.controller in out: + raise ValueError(f"Duplicate controller row: {r.controller!r}") + out[r.controller] = _compute_ratios(r) return out -def _results_max_ratio_pairs(lines: Sequence[str]) 
-> list[tuple[str, float]]: - """Returns (controller, max_ratio) from results table.""" - rows = _parse_results_table_rows(lines) - out: list[tuple[str, float]] = [] - for r in rows: - _, max_ratio = _compute_ratios(r) - out.append((r.controller, max_ratio)) - return out - - -def _reference_mean_ratio_pairs(path: Path) -> list[tuple[str, float]]: - """Returns (controller, mean_ratio) from reference JSON rows.""" - rows = _load_reference_rows(path) - out: list[tuple[str, float]] = [] - for r in rows: - mean_ratio, _ = _compute_ratios(r) - out.append((r.controller, mean_ratio)) - return out - - -def _reference_max_ratio_pairs(path: Path) -> list[tuple[str, float]]: - """Returns (controller, max_ratio) from reference JSON rows.""" - rows = _load_reference_rows(path) - out: list[tuple[str, float]] = [] - for r in rows: - _, max_ratio = _compute_ratios(r) - out.append((r.controller, max_ratio)) - return out +def _controller_id_as_float(controller: str) -> float: + """Deterministic controller ID encoding as a float.""" + digest = hashlib.sha256(controller.encode("utf-8")).digest() + raw64 = int.from_bytes(digest[:8], "big", signed=False) + return float(raw64 % (2**53)) class OracleExperimentRuns(OracleExperimentRunsBase): - """Validates ANVIL experiment run outputs (TABLE-3).""" - - _NAME = "ExperimentRuns" + """Validates ANVIL experiment run outputs (Table 3).""" def __init__(self, *, config: EntryConfig, logger: logging.Logger) -> None: super().__init__(logger=logger) self._config = config - def requirements(self) -> Sequence[ExperimentRunsRequirement]: + def requirements(self) -> Sequence[utils.BaseRequirement]: if not self._config.results_paths: raise ValueError("EntryConfig.results_paths must be non-empty") if not self._config.ground_truth_paths: raise ValueError("EntryConfig.ground_truth_paths must be non-empty") - results_path = _required_path( - self._config.results_paths, "table3", label="results_paths" - ) - reference_path = _required_path( -
self._config.ground_truth_paths, "table3", label="ground_truth_paths" - ) + results_path = _required_path(self._config.results_paths, + "table3", + label="results_paths") + reference_path = _required_path(self._config.ground_truth_paths, + "table3", + label="ground_truth_paths") threshold = self._config.similarity_ratio + results_rows = _parse_results_table_rows( + _read_lines(results_path, label="results_path")) + reference_rows = _load_reference_rows(reference_path) + + results_map = _ratios_by_controller(results_rows) + reference_map = _ratios_by_controller(reference_rows) + + controllers_union = sorted(set(results_map) | set(reference_map)) + results_mean = [] + ref_mean = [] + results_max = [] + ref_max = [] + for c in controllers_union: + r = results_map.get(c) + g = reference_map.get(c) + results_mean.append(r[0] if r is not None else float("nan")) + results_max.append(r[1] if r is not None else float("nan")) + ref_mean.append(g[0] if g is not None else float("nan")) + ref_max.append(g[1] if g is not None else float("nan")) + + results_controller_ids = [ + _controller_id_as_float(c) for c in results_map.keys() + ] + reference_controller_ids = [ + _controller_id_as_float(c) for c in reference_map.keys() + ] + return ( - LabeledSequenceSimilarityThresholdRequirement( - name="table3_mean_ratio", - label="TABLE-3 mean_ratio", - results_path=results_path, - reference_path=reference_path, - threshold=threshold, - parse_results_fn=_results_mean_ratio_pairs, - parse_reference_fn=_reference_mean_ratio_pairs, - ), - LabeledSequenceSimilarityThresholdRequirement( - name="table3_max_ratio", - label="TABLE-3 max_ratio", - results_path=results_path, - reference_path=reference_path, - threshold=threshold, - parse_results_fn=_results_max_ratio_pairs, - parse_reference_fn=_reference_max_ratio_pairs, - ), + ListSimilarityRequirement( + name="table3_controllers", + observed=results_controller_ids, + reference=reference_controller_ids, + metric=SimilarityMetric.JACCARD_SET, + 
min_similarity=1.0, + ), + ElementwiseSimilarityThresholdRequirement( + name="table3_mean_ratio", + observed=results_mean, + reference=ref_mean, + threshold=threshold, + ), + ElementwiseSimilarityThresholdRequirement( + name="table3_max_ratio", + observed=results_max, + reference=ref_max, + threshold=threshold, + ), )