diff --git a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/main.py b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/main.py
index b9910621..b4f0c74a 100644
--- a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/main.py
+++ b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/main.py
@@ -1,83 +1,150 @@
 #!/usr/bin/env python3
-"""Runs environment setup checks for ANVIL."""
+"""Runs the environment setup, artifact build, benchmark prep, and experiment-run checks for ANVIL (OSDI'24)."""
 
 from __future__ import annotations
 
-import os
-import sys
 from pathlib import Path
 from typing import Dict
+import os
+import sys
 
 _AGENT_EVAL_DIR = Path(__file__).resolve().parent
 _AGENT_SRC_DIR = _AGENT_EVAL_DIR.parents[3] / "src"
 sys.path.append(str(_AGENT_SRC_DIR))
 
-from oracle_env_setup import OracleEnvSetup
-from oracle_artifact_build import OracleArtifactBuild
-from oracle_benchmark_prep import OracleBenchmarkPrep
-from oracle_experiment_runs import OracleExperimentRuns
 from evaluator.utils import (
     EntryConfig,
     LoggerConfig,
     get_logger,
     record_result,
 )
+from oracle_env_setup import OracleEnvSetup
+from oracle_artifact_build import OracleArtifactBuild
+from oracle_benchmark_prep import OracleBenchmarkPrep
+from oracle_experiment_runs import OracleExperimentRuns
 
-# Reuse the same constants the legacy oracle used.
-from utils import RESULTS_PATH, SIMILARITY_RATIO  # pylint: disable=wrong-import-position
-
-
-ANVIL_CONFIG = EntryConfig(
-    name="osdi24-anvil",
-    home_dir=Path.home() / "osdi24_anvil",
-    repository_paths={
-        "osdi24-anvil": Path.home() / "osdi24_anvil" / "anvil",
-        "osdi24-acto-dependency": Path.home() / "osdi24_anvil" / "acto",
-    },
-    results_paths={
-        "table3": Path(RESULTS_PATH),
-    },
-    ground_truth_paths={
-        "table3": (
-            Path.home()
-            / "osdi24_anvil"
-            / "_agent_eval"
-            / "refs"
-            / "anvil-table-3.ref.json"
-        ),
-    },
-    similarity_ratio=SIMILARITY_RATIO,
-)
+
+def _resolve_workspace_paths() -> tuple[Path, Path]:
+  """Locate and validate _agent_eval/ and the ANVIL workspace root.
+
+  Each location can be overridden through the environment:
+    _AGENT_EVAL_DIR: directory containing main.py (default: this file's dir).
+    _ANVIL_HOME: workspace root (default: the parent of _agent_eval/).
+  The workspace root must contain both anvil/ and acto/.
+
+  Returns:
+    (agent_eval_dir, workspace_root), both resolved to absolute paths.
+
+  Raises:
+    RuntimeError: a required directory is missing, or an OSError occurred
+      while resolving or probing the paths.
+  """
+  # An unset or empty override means "use the default location".
+  eval_override = os.environ.get("_AGENT_EVAL_DIR")
+  home_override = os.environ.get("_ANVIL_HOME")
+
+  try:
+    # Overrides may start with ~; both results end up absolute.
+    agent_eval_dir = (Path(eval_override).expanduser().resolve()
+                      if eval_override else Path(__file__).resolve().parent)
+    workspace_root = (Path(home_override).expanduser().resolve()
+                      if home_override else agent_eval_dir.parent.resolve())
+
+    # is_dir() is False for a missing path, so it covers the existence check.
+    if not agent_eval_dir.is_dir():
+      raise RuntimeError(
+          f"Invalid _agent_eval dir: {agent_eval_dir}\n"
+          f"This runner expects _agent_eval/ to exist.\n"
+          f"Set _AGENT_EVAL_DIR to the directory containing main.py if needed.")
+
+    # Both repositories must sit directly under the workspace root; anvil/ is
+    # checked first, so it is the one reported when both are missing.
+    for repo_name in ("anvil", "acto"):
+      repo_root = workspace_root / repo_name
+      if not repo_root.is_dir():
+        raise RuntimeError(
+            f"Invalid ANVIL workspace: {workspace_root}\n"
+            f"Expected to find an '{repo_name}/' directory at: {repo_root}\n"
+            f"This runner expects _agent_eval/ and {repo_name}/ to be located in the same root directory.\n"
+            f"Set _ANVIL_HOME to the workspace root if needed.")
+  except OSError as exc:
+    raise RuntimeError(f"Failed to resolve workspace paths: {exc}") from exc
+
+  return agent_eval_dir, workspace_root
+
+
+def _build_anvil_config(*, agent_eval_dir: Path,
+                        workspace_root: Path) -> EntryConfig:
+  """Construct the ANVIL EntryConfig from resolved workspace paths.
+
+  Honors _ANVIL_TABLE3_RESULTS (default: anvil/results/table3.md) and
+  _ANVIL_SIMILARITY_RATIO (default: 0.75; RuntimeError if not a number).
+  """
+  anvil_repo = (workspace_root / "anvil").resolve()
+  acto_repo = (workspace_root / "acto").resolve()
+  refs_dir = (agent_eval_dir / "refs").resolve()
+  table3_results = Path(
+      os.environ.get("_ANVIL_TABLE3_RESULTS",
+                     anvil_repo / "results" / "table3.md")).expanduser().resolve()
+
+  raw_ratio = os.environ.get("_ANVIL_SIMILARITY_RATIO", "0.75")
+  try:
+    similarity_ratio = float(raw_ratio)
+  except ValueError as exc:
+    raise RuntimeError(f"Invalid _ANVIL_SIMILARITY_RATIO: {raw_ratio!r}") from exc
+
+  return EntryConfig(
+      name="osdi24-anvil",
+      home_dir=workspace_root,
+      repository_paths={"osdi24-anvil": anvil_repo,
+                        "osdi24-acto-dependency": acto_repo},
+      results_paths={"table3": table3_results},
+      ground_truth_paths={
+          "table3": (refs_dir / "anvil-table-3.ref.json").resolve(),
+          "osdi24-acto-dependency.expected_branch":
+              (refs_dir / "acto.expected_branch.txt").resolve(),
+          "osdi24-acto-dependency.expected_head":
+              (refs_dir / "acto.expected_head.txt").resolve(),
+      },
+      similarity_ratio=similarity_ratio,
+  )
 
 
 def main(argv: list[str]) -> int:
+  verbose = "--verbose" in argv
+
   results: Dict[str, int] = {}
   score = 0
 
-  verbose = "--verbose" in argv
-
-  logger_name = os.environ.get("EVAL_LOGGER_NAME", "ANVIL-EVAL")
+  logger_name = os.environ.get("EVAL_LOGGER_NAME", "ANVIL-AGENT-EVALUATOR")
   logger = get_logger(LoggerConfig(root_name=logger_name))
 
+  try:
+    agent_eval_dir, workspace_root = _resolve_workspace_paths()
+    ANVIL_CONFIG = _build_anvil_config(agent_eval_dir=agent_eval_dir,
+                                       workspace_root=workspace_root)
+  except RuntimeError as exc:
+    raise SystemExit(str(exc)) from exc
+
   env_checker = OracleEnvSetup(config=ANVIL_CONFIG, logger=logger)
-  score += record_result(
-      results, type(env_checker).__name__, env_checker.run(verbose=verbose)
-  )
+  score += record_result(results,
+                         type(env_checker).__name__,
+                         env_checker.run(verbose=verbose))
 
   build_checker = OracleArtifactBuild(config=ANVIL_CONFIG, logger=logger)
-  score += record_result(
-      results, type(build_checker).__name__, build_checker.run(verbose=verbose)
-  )
+  score += record_result(results,
+                         type(build_checker).__name__,
+                         build_checker.run(verbose=verbose))
 
   prep_checker = OracleBenchmarkPrep(config=ANVIL_CONFIG, logger=logger)
-  score += record_result(
-      results, type(prep_checker).__name__, prep_checker.run(verbose=verbose)
-  )
+  score += record_result(results,
+                         type(prep_checker).__name__,
+                         prep_checker.run(verbose=verbose))
 
   runs_checker = OracleExperimentRuns(config=ANVIL_CONFIG, logger=logger)
-  score += record_result(
-      results, type(runs_checker).__name__, runs_checker.run(verbose=verbose)
-  )
+  score += record_result(results,
+                         type(runs_checker).__name__,
+                         runs_checker.run(verbose=verbose))
 
   logger.info("Agent scores: %s", results)
   return score
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_artifact_build.py b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_artifact_build.py
index 3554c528..84d176d1 100644
--- a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_artifact_build.py
+++ b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_artifact_build.py
@@ -1,48 +1,23 @@
-#!/usr/bin/env python3
-"""Artifact build oracle for the OSDI '24 ANVIL artifact.
+"""Artifact build oracle for ANVIL (OSDI'24).
 
 Validates:
-  - The ACTO dependency repository can build its required library target.
+  - Required repository working directories exist
+  - Build commands execute successfully
 """
 
 from __future__ import annotations
 
-from collections.abc import Mapping, Sequence
-from dataclasses import dataclass, field
 import logging
-from pathlib import Path
+from collections.abc import Sequence
 
+from evaluator import utils
 from evaluator.oracle_artifact_build_primitives import (
     BuildCommandRequirement,
-    BuildRequirement,
     OracleArtifactBuildBase,
 )
 from evaluator.utils import EntryConfig
 
 
-@dataclass(frozen = True, slots = True, kw_only = True)
-class BuildTarget:
-  """Declarative description of one build command to run."""
-
-  name: str
-  cwd: Path
-  command: Sequence[str]
-  cwd_relative: Path | None = None
-  optional: bool = False
-  timeout_seconds: float = 60.0
-  env_overrides: Mapping[str, str] = field(default_factory = dict)
-
-  def __post_init__(self) -> None:
-    if not self.name:
-      raise ValueError("BuildTarget.name must be non-empty")
-    if not self.command:
-      raise ValueError(f"{self.name}: command must be non-empty")
-    if self.timeout_seconds <= 0:
-      raise ValueError(f"{self.name}: timeout_seconds must be > 0")
-
-    object.__setattr__(self, "command", tuple(self.command))
-
-
 class OracleArtifactBuild(OracleArtifactBuildBase):
   """Artifact build oracle for ANVIL."""
 
@@ -51,40 +26,26 @@ def __init__(
       *,
       config: EntryConfig,
       logger: logging.Logger,
-      targets: Sequence[BuildTarget] | None = None,
+      targets: Sequence[BuildCommandRequirement] | None = None,
   ) -> None:
-    super().__init__(logger = logger)
+    super().__init__(logger=logger)
     self._config = config
 
-    if targets is None:
-      targets = self._default_targets()
-    self._targets = tuple(targets)
+    self._requirements = tuple(
+        targets) if targets is not None else self._default_requirements()
 
-    names = [t.name for t in self._targets]
+    names = [r.name for r in self._requirements]
     if len(names) != len(set(names)):
-      raise ValueError(f"Duplicate build target names: {names!r}")
+      raise ValueError(f"Duplicate build requirement names: {names!r}")
 
-  def _default_targets(self) -> tuple[BuildTarget, ...]:
+  def _default_requirements(self) -> tuple[BuildCommandRequirement, ...]:
     acto_repo = self._config.repository_paths["osdi24-acto-dependency"]
-    return (
-        BuildTarget(
-            name = "acto: make lib",
-            cwd = acto_repo,
-            command = ("make", "lib"),
-            timeout_seconds = 60.0,
-        ),
-    )
-
-  def requirements(self) -> Sequence[BuildRequirement]:
-    return tuple(
-        BuildCommandRequirement(
-            name = t.name,
-            optional = t.optional,
-            cwd = t.cwd,
-            command = t.command,
-            cwd_relative = t.cwd_relative,
-            timeout_seconds = t.timeout_seconds,
-            env_overrides = t.env_overrides,
-        )
-        for t in self._targets
-    )
\ No newline at end of file
+    return (BuildCommandRequirement(
+        name="acto: make lib",
+        cwd=acto_repo,
+        command=("make", "lib"),
+        timeout_seconds=60.0,
+    ),)
+
+  def requirements(self) -> Sequence[utils.BaseRequirement]:
+    return self._requirements
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_benchmark_prep.py b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_benchmark_prep.py
index 0e274242..be1bf606 100644
--- a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_benchmark_prep.py
+++ b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_benchmark_prep.py
@@ -1,140 +1,105 @@
-#!/usr/bin/env python3
-import sys
-import subprocess
-from pathlib import Path
-
-from utils import REPO_DIRS, logger
-
-
-class OracleBenchmarkPrep:
-
-  def __init__(self):
-    self.repo_root = Path(REPO_DIRS["acto"])
-    self.expected_remote = "https://github.com/xlab-uiuc/acto.git"
-    self.expected_branch = "anvil-dev"
-
-  def run_shell_command(self, cmd):
-    """
-    Run a command and return (rc, stdout, stderr) tuple.
-    """
-    try:
-      cp = subprocess.run(cmd, stdout = subprocess.PIPE, stderr = subprocess.PIPE, text = True)
-      return cp.returncode, (cp.stdout or "").strip(), (cp.stderr or "").strip()
-    except FileNotFoundError as e:
-      return 127, "", str(e)
-
-  def check_repo_exists(self):
-    """
-    Check that repository root exists and is a git working tree.
-    """
-    if not self.repo_root.is_dir():
-      return False, f"acto: FAIL (repo) - directory not found: {self.repo_root}"
-
-    rc, out, err = self.run_shell_command(
-      ["git", "-C", str(self.repo_root), "rev-parse", "--is-inside-work-tree"]
-    )
-    if rc != 0 or out != "true":
-      return False, f"acto: FAIL (repo) - not a git working tree: {err or out}"
-
-    return True, "acto: PASS (repo) - git working tree present"
-
-  def check_remote_origin(self):
-    """
-    Check that <origin> remote matches the expected repository URL.
-    """
-    rc, out, err = self.run_shell_command(
-      ["git", "-C", str(self.repo_root), "remote", "get-url", "origin"]
-    )
-    if rc != 0:
-      return False, f"acto: FAIL (remote) - cannot read origin remote: {err or out}"
-
-    origin_url = (out or "").strip()
-    def normalize(url: str) -> str:
-      return url[:-4] if url.endswith(".git") else url
-
-    if normalize(origin_url) != normalize(self.expected_remote):
-      return False, (
-        "acto: FAIL (remote) - origin URL "
-        f"{origin_url!r} does not match expected {self.expected_remote!r}"
-      )
-
-    return True, f"acto: PASS (remote) - origin URL matches {self.expected_remote}"
-
-  def check_branch_and_head(self):
-    """
-    Check that the current branch is the expected one and that the current 
-    commit resolves to a valid hash.
-    """
-    rc, out, err = self.run_shell_command(
-      ["git", "-C", str(self.repo_root), "rev-parse", "--abbrev-ref", "HEAD"]
-    )
-    if rc != 0:
-      return False, f"acto: FAIL (branch) - cannot read current branch: {err or out}"
+"""Benchmark preparation oracle for ANVIL (OSDI'24).
 
-    branch = (out or "").strip()
-    if branch != self.expected_branch:
-      return False, f"acto: FAIL (branch) - {branch!r} != expected {self.expected_branch!r}"
+Validates:
+  - Target repository working directory exists
+  - Repository was cloned and is a valid git working tree
+  - Current branch matches the expected branch
+  - Current HEAD commit matches the expected revision
+"""
 
-    rc, out, err = self.run_shell_command(
-      ["git", "-C", str(self.repo_root), "rev-parse", "HEAD"]
-    )
-    if rc != 0:
-      return False, f"acto: FAIL (commit) - cannot read HEAD: {err or out}"
-
-    head = (out or "").strip()
-    if not head:
-      return False, "acto: FAIL (commit) - empty HEAD hash"
-
-    return True, f"acto: PASS (branch/commit) - {branch}@{head[:12]}"
-
-  def check_submodules_recursive(self):
-    """
-    Check that submodules (if any) are initialized, approximating a --recursive clone.
-    """
-    gitmodules = self.repo_root / ".gitmodules"
-    if not gitmodules.exists():
-      # No submodules configured; nothing to check
-      return True, "acto: PASS (submodules) - no submodules configured"
-
-    rc, out, err = self.run_shell_command(
-      ["git", "-C", str(self.repo_root), "submodule", "status", "--recursive"]
-    )
-    if rc != 0:
-      return False, f"acto: FAIL (submodules) - git submodule status failed: {err or out}"
-
-    # Heuristic: lines starting with '-' indicate uninitialized submodules
-    uninitialized = [line for line in out.splitlines() if line.startswith("-")]
-    if uninitialized:
-      return False, (
-        "acto: FAIL (submodules) - uninitialized submodules present "
-        "(clone may have been done without --recursive)"
-      )
-
-    return True, "acto: PASS (submodules) - all submodules initialized"
-
-  def run(self):
-    """
-    Run all repository checks and return True on overall success.
-    """
-    results: list[bool] = []
-
-    ok, msg = self.check_repo_exists()
-    logger.info(msg)
-    results.append(ok)
-
-    ok, msg = self.check_remote_origin()
-    logger.info(msg)
-    results.append(ok)
-
-    ok, msg = self.check_branch_and_head()
-    logger.info(msg)
-    results.append(ok)
-
-    ok, msg = self.check_submodules_recursive()
-    logger.info(msg)
-    results.append(ok)
-
-    if all(results):
-      return True
-
-    return False
\ No newline at end of file
+from __future__ import annotations
+from pathlib import Path
+from typing import Sequence
+
+from evaluator import utils
+from evaluator.utils import EntryConfig
+from evaluator.benchmark_prep_primitives import OracleBenchmarkPrepBase, BenchmarkRequirement, FailRequirement
+
+
+class OracleBenchmarkPrep(OracleBenchmarkPrepBase):
+  """Benchmark preparation oracle that checks the ACTO repository checkout.
+
+  Expected branch/HEAD values are read from the files that
+  EntryConfig.ground_truth_paths lists under "<repo id>.expected_branch" and
+  "<repo id>.expected_head".
+  """
+
+  def __init__(self, *, config: EntryConfig, logger) -> None:
+    """Selects the repository to check and loads its expected branch/HEAD."""
+    super().__init__(logger=logger)
+    self._config = config
+    self._ORACLE_NAME = f"BenchmarkPrep/{config.name}"
+
+    # Prefer the repository whose directory is named "acto" (any case); fall
+    # back to the first configured repository when no such entry exists.
+    repos = config.repository_paths
+    repo_id = next(
+        (key for key, path in repos.items() if path.name.lower() == "acto"),
+        None)
+    if repo_id is None and repos:
+      repo_id = next(iter(repos))
+    self._repo_root = None if repo_id is None else repos[repo_id]
+
+    def read_reference(key: str) -> str | None:
+      # Stripped text of the file at ground_truth_paths[key]; None if unset.
+      ref_path = config.ground_truth_paths.get(key)
+      if not ref_path:
+        return None
+      return Path(ref_path).read_text(encoding="utf-8").strip()
+
+    # These stay None when nothing is configured; requirements() then emits a
+    # failing requirement in place of the corresponding git check.
+    self._expected_branch = None
+    self._expected_head = None
+    if repo_id is not None:
+      self._expected_branch = read_reference(f"{repo_id}.expected_branch")
+      self._expected_head = read_reference(f"{repo_id}.expected_head")
+
+  def requirements(self) -> Sequence[utils.BaseRequirement]:
+    """Returns the ordered requirements for the selected repository."""
+    repo_root = self._repo_root
+    if repo_root is None:
+      return (FailRequirement(name="select repo",
+                              message="No repository_paths configured"),)
+
+    def rev_parse(name: str, *args: str, expect: str) -> BenchmarkRequirement:
+      # One `git rev-parse` requirement; `expect` is passed as the signature.
+      return BenchmarkRequirement(
+          name=name,
+          filepath=repo_root,
+          cmd=("git", "rev-parse", *args),
+          signature=expect,
+          timeout_seconds=10.0,
+      )
+
+    # The repository directory must exist and be a git working tree.
+    checks: list[utils.BaseRequirement] = [
+        BenchmarkRequirement(name="repo directory exists", filepath=repo_root),
+        rev_parse("git working tree", "--is-inside-work-tree", expect="true"),
+    ]
+
+    # The current branch must match the expected one.
+    if self._expected_branch:
+      checks.append(
+          rev_parse("on expected branch", "--abbrev-ref", "HEAD",
+                    expect=self._expected_branch))
+    else:
+      checks.append(
+          FailRequirement(
+              name="expected branch configured",
+              message="Missing expected branch in EntryConfig.ground_truth_paths",
+          ))
+
+    # The current commit SHA must match the expected one.
+    if self._expected_head:
+      checks.append(
+          rev_parse("HEAD matches expected", "HEAD",
+                    expect=self._expected_head))
+    else:
+      checks.append(
+          FailRequirement(
+              name="expected head configured",
+              message="Missing expected head in EntryConfig.ground_truth_paths",
+          ))
+
+    return tuple(checks)
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_env_setup.py b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_env_setup.py
index 8bef40db..21c9897a 100644
--- a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_env_setup.py
+++ b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_env_setup.py
@@ -1,138 +1,95 @@
-#!/usr/bin/env python3
-"""Environment setup oracle for the ANVIL bundle.
+"""Environment setup oracle for ANVIL (OSDI'24).
 
-This implementation uses evaluator.oracle_env_setup_primitives for consistent
-reporting and verbose failure logging.
+Validates:
+  - Required workspace and repository directories exist
+  - Required reference (ground-truth) files exist
+  - Required external tooling is available and satisfies minimum version constraints
 """
 
 from __future__ import annotations
 
-import dataclasses
-import logging
-import shutil
-from collections.abc import Sequence
+from collections.abc import Mapping, Sequence
 from pathlib import Path
+import logging
 
-from evaluator.utils import CheckResult, EntryConfig
+from evaluator.utils import EntryConfig
 from evaluator.oracle_env_setup_primitives import (
     DependencyVersionRequirement,
-    EnvironmentVariableRequirement,
-    EnvQuantifier,
     FilesystemPathRequirement,
     OracleEnvSetupBase,
     PathType,
-    Requirement,
     VersionCompare,
 )
 
 
-@dataclasses.dataclass(frozen = True, slots = True, kw_only = True)
-class ExecutableOnPathRequirement(Requirement):
-  """Checks that an executable is present on PATH (no version constraint)."""
-
-  executable: str
-
-  def __post_init__(self) -> None:
-    if not self.executable:
-      raise ValueError(f"{self.name}: executable must be non-empty")
-
-  def check(self) -> CheckResult:
-    if shutil.which(self.executable) is None:
-      return CheckResult.failure(f"not found on PATH: {self.executable!r}")
-    return CheckResult.success()
+def _required_path(paths: Mapping[str, Path], key: str, *, label: str) -> Path:
+  """Returns paths[key]; raises ValueError naming label and key if absent."""
+  try:
+    return paths[key]
+  except KeyError as exc:
+    raise ValueError(f"Missing {label}[{key!r}] in EntryConfig") from exc
 
 
 class OracleEnvSetup(OracleEnvSetupBase):
-  """Validates environment prerequisites for the ANVIL bundle."""
+  """Validates that the ANVIL workspace and dependencies are present."""
+
+  _ORACLE_NAME = "EnvironmentSetup"
 
   def __init__(self, *, config: EntryConfig, logger: logging.Logger) -> None:
-    super().__init__(logger = logger)
+    super().__init__(logger=logger)
     self._config = config
 
-  def requirements(self) -> Sequence[Requirement]:
-    home_dir = self._config.home_dir
-    venv_dir = home_dir / ".venv"
-    go_root = Path.home() / "go"
-    go_bin = go_root / "bin"
-
-    reqs: list[Requirement] = [
-        # Check dependencies
-        DependencyVersionRequirement(
-          name = "docker",
-          command = ("docker", "--version"),
-          required_version = (24, 0, 0),
-          compare = VersionCompare.GEQ,
-        ),
-        DependencyVersionRequirement(
-          name = "go",
-          command = ("go", "version"),
-          required_version = (1, 22, 0),
-          compare = VersionCompare.GEQ,
-          version_regex = r"go(\d+\.\d+(?:\.\d+)?)",
-        ),
-        DependencyVersionRequirement(
-            name = "python3",
-            command = ("python3", "--version"),
-            required_version = (3, 10, 0),
-            compare = VersionCompare.GEQ,
-            version_regex = r"Python\s+([0-9.]+)",
-        ),
-        DependencyVersionRequirement(
-          name = "pip3",
-          command = ("pip3", "--version"),
-          required_version = (24, 0, 0),
-          compare = VersionCompare.GEQ,
-        ),
-        DependencyVersionRequirement(
-            name = "kind",
-            command = ("kind", "version"),
-            required_version = (0, 20, 0),
-            compare = VersionCompare.GEQ,
-            version_regex = r"v([0-9.]+)",
-        ),
-        DependencyVersionRequirement(
-            name = "kubectl",
-            command = ("kubectl", "version", "--client", "--short"),
-            required_version = (1, 22, 9),
-            compare = VersionCompare.GEQ,
-            version_regex = r"Client Version:\s+v?([0-9.]+)",
+  def requirements(
+      self
+  ) -> Sequence[FilesystemPathRequirement | DependencyVersionRequirement]:
+    cfg = self._config
+
+    if not cfg.repository_paths:
+      raise ValueError("EntryConfig.repository_paths must be non-empty")
+    if not cfg.ground_truth_paths:
+      raise ValueError("EntryConfig.ground_truth_paths must be non-empty")
+
+    anvil_repo = _required_path(cfg.repository_paths,
+                                "osdi24-anvil",
+                                label="repository_paths")
+    acto_repo = _required_path(cfg.repository_paths,
+                               "osdi24-acto-dependency",
+                               label="repository_paths")
+
+    table3_ref = _required_path(cfg.ground_truth_paths,
+                                "table3",
+                                label="ground_truth_paths")
+
+    return (
+        # Workspace and repository directory layout
+        FilesystemPathRequirement(
+            name="home_dir",
+            path=cfg.home_dir,
+            path_type=PathType.DIRECTORY,
         ),
-
-        # Check directory structure
         FilesystemPathRequirement(
-            name = "venv_exists",
-            path = venv_dir,
-            path_type = PathType.DIRECTORY,
+            name="repo_osdi24_anvil",
+            path=anvil_repo,
+            path_type=PathType.DIRECTORY,
         ),
         FilesystemPathRequirement(
-            name = "go_root_exists",
-            path = go_root,
-            path_type = PathType.DIRECTORY,
+            name="repo_osdi24_acto_dependency",
+            path=acto_repo,
+            path_type=PathType.DIRECTORY,
         ),
 
-        # Check PATH contents
-        EnvironmentVariableRequirement(
-            name = "PATH_contains_go_root",
-            env_var = "PATH",
-            expected = str(go_root),
-            quantifier = EnvQuantifier.CONTAINS,
+        # Reference artifacts used for evaluation
+        FilesystemPathRequirement(
+            name="ref_table3",
+            path=table3_ref,
+            path_type=PathType.FILE,
         ),
-        EnvironmentVariableRequirement(
-            name = "PATH_contains_go_bin",
-            env_var = "PATH",
-            expected = str(go_bin),
-            quantifier = EnvQuantifier.CONTAINS,
+
+        # Tooling dependencies
+        DependencyVersionRequirement(
+            name="python3_version",
+            cmd=("python3", "--version"),
+            required_version=(3, 10, 0),
+            compare=VersionCompare.GEQ,
         ),
-    ]
-
-    # Check that the repo root directory is present
-    for key, repo_root in sorted(self._config.repository_paths.items()):
-      reqs.append(
-          FilesystemPathRequirement(
-              name = f"repo_exists:{key}",
-              path = repo_root,
-              path_type = PathType.DIRECTORY,
-          )
-      )
-
-    return reqs
+    )
diff --git a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_experiment_runs.py b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_experiment_runs.py
index a9f5f1c6..93ecc0c6 100644
--- a/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_experiment_runs.py
+++ b/benchmarks/arteval_bench/data/benchmark/osdi24_anvil/_agent_eval/oracle_experiment_runs.py
@@ -1,24 +1,27 @@
-#!/usr/bin/env python3
-"""Experiment runs oracle for the OSDI'24 ANVIL artifact.
+"""Experiment runs oracle for ANVIL (OSDI'24).
 
-Validates results (tsble 3) against reference measurements by comparing 
-per-controller calues:
-  - mean ratio: verified_anvil_mean / reference_unverified_mean
-  - max ratio:  verified_anvil_max  / reference_unverified_max
+Validates:
+  - Table 3 results file exists and is parseable
+  - Table 3 reference (ground-truth) JSON exists and is parseable
+  - Per-controller mean and max ratios (verified/reference) meet similarity thresholds
 """
 
 from __future__ import annotations
 
+import hashlib
 import json
+import logging
+import math
 from collections.abc import Mapping, Sequence
 from dataclasses import dataclass
 from pathlib import Path
-import logging
 
+from evaluator import utils
 from evaluator.oracle_experiment_runs_primitives import (
-  ExperimentRunsRequirement,
-  LabeledSequenceSimilarityThresholdRequirement,
-  OracleExperimentRunsBase,
+    ElementwiseSimilarityThresholdRequirement,
+    ListSimilarityRequirement,
+    SimilarityMetric,
+    OracleExperimentRunsBase,
 )
 from evaluator.utils import EntryConfig
 
@@ -33,11 +36,11 @@ class TableRow:
 
 
 _EXPECTED_HEADERS: tuple[str, ...] = (
-  "Controller",
-  "Verified (Anvil) Mean",
-  "Verified (Anvil) Max",
-  "Reference (unverified) Mean",
-  "Reference (unverified) Max",
+    "Controller",
+    "Verified (Anvil) Mean",
+    "Verified (Anvil) Max",
+    "Reference (unverified) Mean",
+    "Reference (unverified) Max",
 )
 
 
@@ -49,6 +52,12 @@ def _required_path(paths: Mapping[str, Path], key: str, *, label: str) -> Path:
     raise ValueError(f"Missing {label}[{key!r}] in EntryConfig") from exc
 
 
+def _read_lines(path: Path, *, label: str) -> list[str]:
+  if not path.exists():
+    raise ValueError(f"{label}: {path} not found")
+  return path.read_text(encoding="utf-8").splitlines()
+
+
 def _is_separator_line(line: str) -> bool:
   """Returns True if this looks like the Markdown header separator line."""
   stripped = line.strip()
@@ -93,7 +102,6 @@ def _parse_results_table_rows(lines: Sequence[str]) -> list[TableRow]:
 
   for line in lines:
     if "|" not in line:
-      # Not a table row.
       continue
 
     if header_line is None:
@@ -117,32 +125,27 @@ def _parse_results_table_rows(lines: Sequence[str]) -> list[TableRow]:
     cells = _split_markdown_row(line)
     if len(cells) != len(_EXPECTED_HEADERS):
       raise ValueError(
-        f"Row has {len(cells)} cells, expected {len(_EXPECTED_HEADERS)}: {line!r}"
+          f"Row has {len(cells)} cells, expected {len(_EXPECTED_HEADERS)}: {line!r}"
       )
 
     controller = cells[0]
-    verified_anvil_mean = _parse_float_token(
-      cells[1], label="Verified (Anvil) Mean"
-    )
-    verified_anvil_max = _parse_float_token(
-      cells[2], label="Verified (Anvil) Max"
-    )
+    verified_anvil_mean = _parse_float_token(cells[1],
+                                             label="Verified (Anvil) Mean")
+    verified_anvil_max = _parse_float_token(cells[2],
+                                            label="Verified (Anvil) Max")
     reference_unverified_mean = _parse_float_token(
-      cells[3], label="Reference (unverified) Mean"
-    )
+        cells[3], label="Reference (unverified) Mean")
     reference_unverified_max = _parse_float_token(
-      cells[4], label="Reference (unverified) Max"
-    )
+        cells[4], label="Reference (unverified) Max")
 
     rows.append(
-      TableRow(
-        controller=controller,
-        verified_anvil_mean=verified_anvil_mean,
-        verified_anvil_max=verified_anvil_max,
-        reference_unverified_mean=reference_unverified_mean,
-        reference_unverified_max=reference_unverified_max,
-      )
-    )
+        TableRow(
+            controller=controller,
+            verified_anvil_mean=verified_anvil_mean,
+            verified_anvil_max=verified_anvil_max,
+            reference_unverified_mean=reference_unverified_mean,
+            reference_unverified_max=reference_unverified_max,
+        ))
 
   return rows
 
@@ -167,101 +170,104 @@ def _load_reference_rows(path: Path) -> list[TableRow]:
 
     try:
       rows.append(
-        TableRow(
-          controller=str(obj["controller"]),
-          verified_anvil_mean=float(obj["verified_anvil_mean"]),
-          verified_anvil_max=float(obj["verified_anvil_max"]),
-          reference_unverified_mean=float(obj["reference_unverified_mean"]),
-          reference_unverified_max=float(obj["reference_unverified_max"]),
-        )
-      )
+          TableRow(
+              controller=str(obj["controller"]),
+              verified_anvil_mean=float(obj["verified_anvil_mean"]),
+              verified_anvil_max=float(obj["verified_anvil_max"]),
+              reference_unverified_mean=float(obj["reference_unverified_mean"]),
+              reference_unverified_max=float(obj["reference_unverified_max"]),
+          ))
     except (KeyError, TypeError, ValueError) as exc:
       raise ValueError(f"{path} malformed entry #{idx}: {exc}") from exc
 
   return rows
 
 
-def _results_mean_ratio_pairs(lines: Sequence[str]) -> list[tuple[str, float]]:
-  """Returns (controller, mean_ratio) from results table."""
-  rows = _parse_results_table_rows(lines)
-  out: list[tuple[str, float]] = []
+def _ratios_by_controller(
+    rows: Sequence[TableRow]) -> dict[str, tuple[float, float]]:
+  """Returns controller -> (mean_ratio, max_ratio)."""
+  out: dict[str, tuple[float, float]] = {}
   for r in rows:
-    mean_ratio, _ = _compute_ratios(r)
-    out.append((r.controller, mean_ratio))
+    if r.controller in out:
+      raise ValueError(f"Duplicate controller row: {r.controller!r}")
+    out[r.controller] = _compute_ratios(r)
   return out
 
 
-def _results_max_ratio_pairs(lines: Sequence[str]) -> list[tuple[str, float]]:
-  """Returns (controller, max_ratio) from results table."""
-  rows = _parse_results_table_rows(lines)
-  out: list[tuple[str, float]] = []
-  for r in rows:
-    _, max_ratio = _compute_ratios(r)
-    out.append((r.controller, max_ratio))
-  return out
-
-
-def _reference_mean_ratio_pairs(path: Path) -> list[tuple[str, float]]:
-  """Returns (controller, mean_ratio) from reference JSON rows."""
-  rows = _load_reference_rows(path)
-  out: list[tuple[str, float]] = []
-  for r in rows:
-    mean_ratio, _ = _compute_ratios(r)
-    out.append((r.controller, mean_ratio))
-  return out
-
-
-def _reference_max_ratio_pairs(path: Path) -> list[tuple[str, float]]:
-  """Returns (controller, max_ratio) from reference JSON rows."""
-  rows = _load_reference_rows(path)
-  out: list[tuple[str, float]] = []
-  for r in rows:
-    _, max_ratio = _compute_ratios(r)
-    out.append((r.controller, max_ratio))
-  return out
+def _controller_id_as_float(controller: str) -> float:
+  """Deterministic controller ID encoding as a float."""
+  digest = hashlib.sha256(controller.encode("utf-8")).digest()
+  raw64 = int.from_bytes(digest[:8], "big", signed=False)
+  return float(raw64 % (2**53))
 
 
 class OracleExperimentRuns(OracleExperimentRunsBase):
-  """Validates ANVIL experiment run outputs (TABLE-3)."""
-
-  _NAME = "ExperimentRuns"
+  """Validates ANVIL experiment run outputs (Table 3)."""
 
   def __init__(self, *, config: EntryConfig, logger: logging.Logger) -> None:
     super().__init__(logger=logger)
     self._config = config
 
-  def requirements(self) -> Sequence[ExperimentRunsRequirement]:
+  def requirements(self) -> Sequence[utils.BaseRequirement]:
     if not self._config.results_paths:
       raise ValueError("EntryConfig.results_paths must be non-empty")
     if not self._config.ground_truth_paths:
       raise ValueError("EntryConfig.ground_truth_paths must be non-empty")
 
-    results_path = _required_path(
-      self._config.results_paths, "table3", label="results_paths"
-    )
-    reference_path = _required_path(
-      self._config.ground_truth_paths, "table3", label="ground_truth_paths"
-    )
+    results_path = _required_path(self._config.results_paths,
+                                  "table3",
+                                  label="results_paths")
+    reference_path = _required_path(self._config.ground_truth_paths,
+                                    "table3",
+                                    label="ground_truth_paths")
 
     threshold = self._config.similarity_ratio
 
+    results_rows = _parse_results_table_rows(
+        _read_lines(results_path, label="results_path"))
+    reference_rows = _load_reference_rows(reference_path)
+
+    results_map = _ratios_by_controller(results_rows)
+    reference_map = _ratios_by_controller(reference_rows)
+
+    controllers_union = sorted(set(results_map) | set(reference_map))
+    results_mean = []
+    ref_mean = []
+    results_max = []
+    ref_max = []
+    for c in controllers_union:
+      r = results_map.get(c)
+      g = reference_map.get(c)
+      results_mean.append(r[0] if r is not None else float("nan"))
+      results_max.append(r[1] if r is not None else float("nan"))
+      ref_mean.append(g[0] if g is not None else float("nan"))
+      ref_max.append(g[1] if g is not None else float("nan"))
+
+    results_controller_ids = [
+        _controller_id_as_float(c) for c in results_map.keys()
+    ]
+    reference_controller_ids = [
+        _controller_id_as_float(c) for c in reference_map.keys()
+    ]
+
     return (
-      LabeledSequenceSimilarityThresholdRequirement(
-        name="table3_mean_ratio",
-        label="TABLE-3 mean_ratio",
-        results_path=results_path,
-        reference_path=reference_path,
-        threshold=threshold,
-        parse_results_fn=_results_mean_ratio_pairs,
-        parse_reference_fn=_reference_mean_ratio_pairs,
-      ),
-      LabeledSequenceSimilarityThresholdRequirement(
-        name="table3_max_ratio",
-        label="TABLE-3 max_ratio",
-        results_path=results_path,
-        reference_path=reference_path,
-        threshold=threshold,
-        parse_results_fn=_results_max_ratio_pairs,
-        parse_reference_fn=_reference_max_ratio_pairs,
-      ),
+        ListSimilarityRequirement(
+            name="table3_controllers",
+            observed=results_controller_ids,
+            reference=reference_controller_ids,
+            metric=SimilarityMetric.JACCARD_SET,
+            min_similarity=1.0,
+        ),
+        ElementwiseSimilarityThresholdRequirement(
+            name="table3_mean_ratio",
+            observed=results_mean,
+            reference=ref_mean,
+            threshold=threshold,
+        ),
+        ElementwiseSimilarityThresholdRequirement(
+            name="table3_max_ratio",
+            observed=results_max,
+            reference=ref_max,
+            threshold=threshold,
+        ),
     )