|
@@ -1,83 +1,150 @@
 #!/usr/bin/env python3
-"""Runs environment setup checks for ANVIL."""
+"""Runs environment setup, build, benchmark prep, and experiment runs checks for ANVIL (OSDI'24)."""
 
 from __future__ import annotations
 
-import os
-import sys
 from pathlib import Path
 from typing import Dict
+import os
+import sys
 
 _AGENT_EVAL_DIR = Path(__file__).resolve().parent
 _AGENT_SRC_DIR = _AGENT_EVAL_DIR.parents[3] / "src"
 sys.path.append(str(_AGENT_SRC_DIR))
 
-from oracle_env_setup import OracleEnvSetup
-from oracle_artifact_build import OracleArtifactBuild
-from oracle_benchmark_prep import OracleBenchmarkPrep
-from oracle_experiment_runs import OracleExperimentRuns
 from evaluator.utils import (
     EntryConfig,
     LoggerConfig,
     get_logger,
     record_result,
 )
+from oracle_env_setup import OracleEnvSetup
+from oracle_artifact_build import OracleArtifactBuild
+from oracle_benchmark_prep import OracleBenchmarkPrep
+from oracle_experiment_runs import OracleExperimentRuns
 
-# Reuse the same constants the legacy oracle used.
-from utils import RESULTS_PATH, SIMILARITY_RATIO  # pylint: disable=wrong-import-position
-
-
-ANVIL_CONFIG = EntryConfig(
-    name="osdi24-anvil",
-    home_dir=Path.home() / "osdi24_anvil",
-    repository_paths={
-        "osdi24-anvil": Path.home() / "osdi24_anvil" / "anvil",
-        "osdi24-acto-dependency": Path.home() / "osdi24_anvil" / "acto",
-    },
-    results_paths={
-        "table3": Path(RESULTS_PATH),
-    },
-    ground_truth_paths={
-        "table3": (
-            Path.home()
-            / "osdi24_anvil"
-            / "_agent_eval"
-            / "refs"
-            / "anvil-table-3.ref.json"
-        ),
-    },
-    similarity_ratio=SIMILARITY_RATIO,
-)
+
+def _resolve_workspace_paths() -> tuple[Path, Path]:
+    """Resolve and validate _agent_eval/ and the ANVIL workspace root.
+
+    Expects either:
+    (1) _agent_eval/ and (anvil/, acto/) are located in the same root directory; or
+    (2) _AGENT_EVAL_DIR and _ANVIL_HOME are set by the user.
+    """
+    try:
+        env_agent_eval = os.environ.get("_AGENT_EVAL_DIR")
+        env_anvil_home = os.environ.get("_ANVIL_HOME")
+
+        if env_agent_eval:
+            agent_eval_dir = Path(env_agent_eval).expanduser().resolve()
+        else:
+            agent_eval_dir = Path(__file__).resolve().parent
+
+        if env_anvil_home:
+            workspace_root = Path(env_anvil_home).expanduser().resolve()
+        else:
+            workspace_root = agent_eval_dir.parent.resolve()
+
+        if not agent_eval_dir.exists() or not agent_eval_dir.is_dir():
+            raise RuntimeError(
+                f"Invalid _agent_eval dir: {agent_eval_dir}\n"
+                f"This runner expects _agent_eval/ to exist.\n"
+                f"Set _AGENT_EVAL_DIR to the directory containing main.py if needed.")
+
+        anvil_repo_root = workspace_root / "anvil"
+        if not anvil_repo_root.exists() or not anvil_repo_root.is_dir():
+            raise RuntimeError(
+                f"Invalid ANVIL workspace: {workspace_root}\n"
+                f"Expected to find an 'anvil/' directory at: {anvil_repo_root}\n"
+                f"This runner expects _agent_eval/ and anvil/ to be located in the same root directory.\n"
+                f"Set _ANVIL_HOME to the workspace root if needed.")
+
+        acto_repo_root = workspace_root / "acto"
+        if not acto_repo_root.exists() or not acto_repo_root.is_dir():
+            raise RuntimeError(
+                f"Invalid ANVIL workspace: {workspace_root}\n"
+                f"Expected to find an 'acto/' directory at: {acto_repo_root}\n"
+                f"This runner expects _agent_eval/ and acto/ to be located in the same root directory.\n"
+                f"Set _ANVIL_HOME to the workspace root if needed.")
+
+        return agent_eval_dir, workspace_root
+
+    except OSError as exc:
+        raise RuntimeError(f"Failed to resolve workspace paths: {exc}") from exc
+
+
+def _build_anvil_config(*, agent_eval_dir: Path,
+                        workspace_root: Path) -> EntryConfig:
+    """Construct EntryConfig for the ANVIL evaluation bundle from resolved paths."""
+    anvil_repo = (workspace_root / "anvil").resolve()
+    acto_repo = (workspace_root / "acto").resolve()
+
+    agent_eval_dir = agent_eval_dir.resolve()
+    refs_dir = (agent_eval_dir / "refs").resolve()
+
+    default_table3_results = (anvil_repo / "results" / "table3.md").resolve()
+    table3_results = Path(
+        os.environ.get("_ANVIL_TABLE3_RESULTS",
+                       str(default_table3_results))).expanduser().resolve()
+
+    similarity_ratio = float(os.environ.get("_ANVIL_SIMILARITY_RATIO", "0.75"))
+
+    return EntryConfig(
+        name="osdi24-anvil",
+        home_dir=workspace_root,
+        repository_paths={
+            "osdi24-anvil": anvil_repo,
+            "osdi24-acto-dependency": acto_repo,
+        },
+        results_paths={
+            "table3": table3_results,
+        },
+        ground_truth_paths={
+            "table3": (refs_dir / "anvil-table-3.ref.json").resolve(),
+            "osdi24-acto-dependency.expected_branch":
+                (refs_dir / "acto.expected_branch.txt").resolve(),
+            "osdi24-acto-dependency.expected_head":
+                (refs_dir / "acto.expected_head.txt").resolve(),
+        },
+        similarity_ratio=similarity_ratio,
+    )
 
 
 def main(argv: list[str]) -> int:
+    verbose = "--verbose" in argv
+
     results: Dict[str, int] = {}
     score = 0
 
-    verbose = "--verbose" in argv
-
-    logger_name = os.environ.get("EVAL_LOGGER_NAME", "ANVIL-EVAL")
+    logger_name = os.environ.get("EVAL_LOGGER_NAME", "ANVIL-AGENT-EVALUATOR")
     logger = get_logger(LoggerConfig(root_name=logger_name))
 
+    try:
+        agent_eval_dir, workspace_root = _resolve_workspace_paths()
+        ANVIL_CONFIG = _build_anvil_config(agent_eval_dir=agent_eval_dir,
+                                           workspace_root=workspace_root)
+    except RuntimeError as exc:
+        raise SystemExit(str(exc)) from exc
+
     env_checker = OracleEnvSetup(config=ANVIL_CONFIG, logger=logger)
-    score += record_result(
-        results, type(env_checker).__name__, env_checker.run(verbose=verbose)
-    )
+    score += record_result(results,
+                           type(env_checker).__name__,
+                           env_checker.run(verbose=verbose))
 
     build_checker = OracleArtifactBuild(config=ANVIL_CONFIG, logger=logger)
-    score += record_result(
-        results, type(build_checker).__name__, build_checker.run(verbose=verbose)
-    )
+    score += record_result(results,
+                           type(build_checker).__name__,
+                           build_checker.run(verbose=verbose))
 
     prep_checker = OracleBenchmarkPrep(config=ANVIL_CONFIG, logger=logger)
-    score += record_result(
-        results, type(prep_checker).__name__, prep_checker.run(verbose=verbose)
-    )
+    score += record_result(results,
+                           type(prep_checker).__name__,
+                           prep_checker.run(verbose=verbose))
 
     runs_checker = OracleExperimentRuns(config=ANVIL_CONFIG, logger=logger)
-    score += record_result(
-        results, type(runs_checker).__name__, runs_checker.run(verbose=verbose)
-    )
+    score += record_result(results,
+                           type(runs_checker).__name__,
+                           runs_checker.run(verbose=verbose))
 
     logger.info("Agent scores: %s", results)
     return score
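A quick way to exercise this runner locally is sketched below. It is only a sketch, assuming the co-located layout that _resolve_workspace_paths() expects (_agent_eval/, anvil/, and acto/ under one workspace root); the ~/osdi24_anvil root, the environment-variable values, and the subprocess call are illustrative assumptions, not part of the patch.

# Illustrative invocation sketch (assumed layout: ~/osdi24_anvil/{_agent_eval, anvil, acto}).
import os
import subprocess
from pathlib import Path

workspace = Path.home() / "osdi24_anvil"                 # hypothetical workspace root
env = dict(os.environ)
env["_ANVIL_HOME"] = str(workspace)                      # workspace root override
env["_AGENT_EVAL_DIR"] = str(workspace / "_agent_eval")  # directory containing main.py
env["_ANVIL_SIMILARITY_RATIO"] = "0.75"                  # optional: same as the default

subprocess.run(
    ["python3", str(workspace / "_agent_eval" / "main.py"), "--verbose"],
    env=env,
    check=False,
)

If neither variable is set, the runner falls back to resolving the workspace from main.py itself: _AGENT_EVAL_DIR defaults to the script's own directory and _ANVIL_HOME to its parent.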