diff --git a/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/main.py b/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/main.py index 98fe5535..d41fbf8d 100644 --- a/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/main.py +++ b/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/main.py @@ -1,86 +1,124 @@ #!/usr/bin/env python3 -"""Runs environment setup, build, benchmark prep, and experiment runs checks for EGWALKER.""" +"""Runs environment setup, build, benchmark prep, and experiment runs checks for EGWALKER (EuroSys'25).""" from __future__ import annotations -import os -import sys from pathlib import Path from typing import Dict +import os +import sys + _AGENT_EVAL_DIR = Path(__file__).resolve().parent _AGENT_SRC_DIR = _AGENT_EVAL_DIR.parents[3] / "src" sys.path.append(str(_AGENT_SRC_DIR)) + +from evaluator.utils import ( + EntryConfig, + LoggerConfig, + get_logger, + record_result, ) from oracle_artifact_build import OracleArtifactBuild -from oracle_benchmark_prep import OracleBenchmarkPrep from oracle_env_setup import OracleEnvSetup +from oracle_benchmark_prep import OracleBenchmarkPrep from oracle_experiment_runs import OracleExperimentRuns -from evaluator.utils import EntryConfig, LoggerConfig, get_logger, record_result -EGWALKER_CONFIG = EntryConfig( - name="eurosys25-egwalker", - home_dir=Path.home() / "eurosys25_egwalker", - repository_paths={ - "eurosys25-egwalker": Path.home() / "eurosys25_egwalker" / "egwalker", +def _resolve_workspace_paths() -> tuple[Path, Path]: + """Resolve and validate _agent_eval/ and egwalker/ locations. + This expects that either: + (1) _agent_eval/ and egwalker/ are located in the same root directory; or + (2) _AGENT_EVAL_DIR and _EGWALKER_HOME are set by the user. + """ + try: + env_agent_eval = os.environ.get("_AGENT_EVAL_DIR") + env_egwalker_home = os.environ.get("_EGWALKER_HOME") + + if env_agent_eval: + agent_eval_dir = Path(env_agent_eval).expanduser().resolve() + else: + agent_eval_dir = Path(__file__).resolve().parent + + if env_egwalker_home: + egwalker_home = Path(env_egwalker_home).expanduser().resolve() + else: + egwalker_home = agent_eval_dir.parent.resolve() + + if not agent_eval_dir.exists() or not agent_eval_dir.is_dir(): + raise RuntimeError( + f"Invalid _agent_eval dir: {agent_eval_dir}\n" + f"This runner expects _agent_eval/ and egwalker/ to be located in the same root directory.\n" + f"Set _AGENT_EVAL_DIR to the directory containing main.py if needed." + ) + + egwalker_repo_root = egwalker_home / "egwalker" + if not egwalker_repo_root.exists() or not egwalker_repo_root.is_dir(): + raise RuntimeError( + f"Invalid EGWALKER workspace: {egwalker_home}\n" + f"Expected to find an 'egwalker/' directory at: {egwalker_repo_root}\n" + f"This runner expects _agent_eval/ and egwalker/ to be located in the same root directory.\n" + f"Set _EGWALKER_HOME to the workspace root if needed."
+ ) + + workspace_root = egwalker_home + return agent_eval_dir, workspace_root + + except OSError as exc: + raise RuntimeError(f"Failed to resolve workspace paths: {exc}") from exc + + +def _build_egwalker_config(*, agent_eval_dir: Path, workspace_root: Path) -> EntryConfig: + """Constructs EntryConfig for the EGWALKER evaluation bundle from resolved paths.""" + egwalker_repo = (workspace_root / "egwalker").resolve() + egwalker_agent_eval = agent_eval_dir.resolve() + egwalker_refs = (egwalker_agent_eval / "refs").resolve() + egwalker_results = (egwalker_repo / "results").resolve() + + return EntryConfig( + name = "eurosys25-egwalker", + home_dir = workspace_root, + repository_paths = { + "eurosys25-egwalker": egwalker_repo, }, - results_paths={ - # Matches legacy: /results/timings.json - "timings": Path.home() - / "eurosys25_egwalker" - / "egwalker" - / "results" - / "timings.json", + results_paths = { + "timings": egwalker_results / "timings.json", }, - ground_truth_paths={ - "datasets": ( - Path.home() - / "eurosys25_egwalker" - / "_agent_eval" - / "refs" - / "datasets.ref.json" - ), - "timings": ( - Path.home() - / "eurosys25_egwalker" - / "_agent_eval" - / "refs" - / "timings.ref.json" - ), + ground_truth_paths = { + "datasets": egwalker_refs / "datasets.ref.json", + "timings": egwalker_refs / "timings.ref.json", }, - similarity_ratio=0.75, -) + similarity_ratio = 0.75, + ) def main(argv: list[str]) -> int: + verbose = "--verbose" in argv + results: Dict[str, int] = {} score = 0 - verbose = "--verbose" in argv + logger_name = os.environ.get("EVAL_LOGGER_NAME", "EGWALKER-AGENT-EVALUATOR") + logger = get_logger(LoggerConfig(root_name = logger_name)) - logger_name = os.environ.get("EVAL_LOGGER_NAME", "EGWALKER-EVAL") - logger = get_logger(LoggerConfig(root_name=logger_name)) + try: + agent_eval_dir, workspace_root = _resolve_workspace_paths() + EGWALKER_CONFIG = _build_egwalker_config(agent_eval_dir = agent_eval_dir, workspace_root = workspace_root) + except RuntimeError as exc: + raise SystemExit(str(exc)) from exc - env_checker = OracleEnvSetup(config=EGWALKER_CONFIG, logger=logger) - score += record_result( - logger, results, type(env_checker).__name__, env_checker.run(verbose=verbose) - ) + env_checker = OracleEnvSetup(config = EGWALKER_CONFIG, logger = logger) + score += record_result(results, type(env_checker).__name__, env_checker.run(verbose = verbose)) - build_checker = OracleArtifactBuild(config=EGWALKER_CONFIG, logger=logger) - score += record_result( - logger, results, type(build_checker).__name__, build_checker.run(verbose=verbose) - ) + build_checker = OracleArtifactBuild(config = EGWALKER_CONFIG, logger = logger) + score += record_result(results, type(build_checker).__name__, build_checker.run(verbose = verbose)) - prep_checker = OracleBenchmarkPrep(config=EGWALKER_CONFIG, logger=logger) - score += record_result( - logger, results, type(prep_checker).__name__, prep_checker.run(verbose=verbose) - ) + prep_checker = OracleBenchmarkPrep(config = EGWALKER_CONFIG, logger = logger) + score += record_result(results, type(prep_checker).__name__, prep_checker.run(verbose = verbose)) - runs_checker = OracleExperimentRuns(config=EGWALKER_CONFIG, logger=logger) - score += record_result( - logger, results, type(runs_checker).__name__, runs_checker.run(verbose=verbose) - ) + runs_checker = OracleExperimentRuns(config = EGWALKER_CONFIG, logger = logger) + score += record_result(results, type(runs_checker).__name__, runs_checker.run(verbose = verbose)) logger.info("Agent scores: %s", 
results) return score diff --git a/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_artifact_build.py b/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_artifact_build.py index c71db0d5..858657b8 100644 --- a/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_artifact_build.py +++ b/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_artifact_build.py @@ -1,7 +1,7 @@ -"""Artifact build oracle for the Eurosys'25 EGWALKER artifact. +"""Artifact build oracle for EGWALKER (EuroSys'25). Validates: - - Required repository working directories exist. + - Repository working directory exists. - Build commands execute successfully (captures stdout/stderr/return code). """ @@ -13,102 +13,113 @@ from pathlib import Path from evaluator.oracle_artifact_build_primitives import ( - BuildCommandRequirement, - BuildRequirement, - OracleArtifactBuildBase, + BuildCommandRequirement, + OracleArtifactBuildBase, ) -from evaluator.utils import EntryConfig +from evaluator.utils import EntryConfig, BaseRequirement -@dataclass(frozen = True, slots = True, kw_only = True) +@dataclass(frozen=True, slots=True, kw_only=True) class BuildTarget: - """Declarative description of one build command to run.""" + """Declarative description of one build command to run. - name: str - command: Sequence[str] - cwd_relative: Path | None = None - optional: bool = False - timeout_seconds: float = 60.0 - env_overrides: Mapping[str, str] = field(default_factory = dict) + Kept intentionally thin: the base primitive (BuildCommandRequirement) performs + the authoritative validation and normalization. + """ - def __post_init__(self) -> None: - if not self.name: - raise ValueError("BuildTarget.name must be non-empty") - if not self.command: - raise ValueError(f"{self.name}: command must be non-empty") - if self.timeout_seconds <= 0: - raise ValueError(f"{self.name}: timeout_seconds must be > 0") + name: str + cmd: Sequence[str] + relative_workdir: Path | None = None + optional: bool = False + timeout_seconds: float = 60.0 + env_overrides: Mapping[str, str] = field(default_factory=dict) - # Normalize for downstream requirements. - if self.cwd_relative is not None and not isinstance(self.cwd_relative, Path): - object.__setattr__(self, "cwd_relative", Path(self.cwd_relative)) + def __post_init__(self) -> None: + if not self.name: + raise ValueError("BuildTarget.name must be non-empty") - # Freeze command to avoid accidental mutation. - object.__setattr__(self, "command", tuple(self.command)) + object.__setattr__(self, "cmd", tuple(self.cmd)) + + if self.relative_workdir is not None and not isinstance( + self.relative_workdir, Path + ): + object.__setattr__(self, "relative_workdir", Path(self.relative_workdir)) class OracleArtifactBuild(OracleArtifactBuildBase): - """The artifact build oracle for artifact-core. - - Defaults: - * Runs build commands in the repo keyed by config.name. - * EntryConfig.repository_paths must contain an entry for config.name. - """ - - _DEFAULT_TARGET_SPECS: tuple[tuple[str, tuple[str, ...], float], ...] 
= ( - ( - "artifact-core: make tools", - ( - "make", - "-j8", - "tools/diamond-types/target/release/dt", - "tools/crdt-converter/target/release/crdt-converter", - "tools/diamond-types/target/release/paper-stats", - "tools/paper-benchmarks/target/memusage/paper-benchmarks", - "tools/paper-benchmarks/target/release/paper-benchmarks", - "tools/ot-bench/target/memusage/ot-bench", - "tools/ot-bench/target/release/ot-bench", - ), - 60.0, - ), - ) - - def __init__( - self, - *, - config: EntryConfig, - logger: logging.Logger, - targets: Sequence[BuildTarget] | None = None, - ) -> None: - super().__init__(logger = logger) - self._config = config - - if targets is None: - targets = self._make_default_targets() - self._targets = tuple(targets) - - names = [t.name for t in self._targets] - if len(names) != len(set(names)): - raise ValueError(f"Duplicate build target names: {names!r}") - - def _make_default_targets(self) -> tuple[BuildTarget, ...]: - """Creates default targets (stored in the EntryConfig object).""" - return tuple( - BuildTarget(name = name, command = command, timeout_seconds = timeout_seconds) - for (name, command, timeout_seconds) in self._DEFAULT_TARGET_SPECS + """The artifact build oracle for artifact-core. + + Defaults: + * Runs build commands in the repo keyed by config.name. + * EntryConfig.repository_paths is expected to contain an entry for config.name. + """ + + _DEFAULT_TARGET_SPECS: tuple[tuple[str, tuple[str, ...], float], ...] = ( + ( + "artifact-core: make tools", + ( + "make", + "-j8", + "tools/diamond-types/target/release/dt", + "tools/crdt-converter/target/release/crdt-converter", + "tools/diamond-types/target/release/paper-stats", + "tools/paper-benchmarks/target/memusage/paper-benchmarks", + "tools/paper-benchmarks/target/release/paper-benchmarks", + "tools/ot-bench/target/memusage/ot-bench", + "tools/ot-bench/target/release/ot-bench", + ), + 300.0, + ), ) - def requirements(self) -> Sequence[BuildRequirement]: - """Returns an ordered list of build requirements to validate.""" - return tuple( - BuildCommandRequirement( - name = target.name, - optional = target.optional, - cwd = self._config.repository_paths[self._config.name], - command = target.command, - cwd_relative = target.cwd_relative, - timeout_seconds = target.timeout_seconds, - env_overrides = target.env_overrides, - ) - for target in self._targets - ) \ No newline at end of file + def __init__( + self, + *, + config: EntryConfig, + logger: logging.Logger, + targets: Sequence[BuildTarget] | None = None, + ) -> None: + super().__init__(logger=logger) + self._config = config + + if targets is None: + targets = self._make_default_targets() + self._targets = tuple(targets) + + names = [t.name for t in self._targets] + if len(names) != len(set(names)): + raise ValueError(f"Duplicate build target names: {names!r}") + + def _make_default_targets(self) -> tuple[BuildTarget, ...]: + return tuple( + BuildTarget(name=name, cmd=cmd, timeout_seconds=timeout_seconds) + for (name, cmd, timeout_seconds) in self._DEFAULT_TARGET_SPECS + ) + + def requirements(self) -> Sequence[BaseRequirement]: + """Returns an ordered list of build requirements to validate.""" + repo_root = self._config.repository_paths.get(self._config.name) + + if repo_root is None: + return ( + BuildCommandRequirement( + name=f"config: missing repository_paths entry for {self._config.name!r}", + optional=False, + cwd=Path(self._config.home_dir) / "__MISSING_REPOSITORY_PATH__", + cmd=("true",), + timeout_seconds=1.0, + ), + ) + + return tuple( + 
BuildCommandRequirement( + name=target.name, + optional=target.optional, + cwd=repo_root, + cmd=target.cmd, + relative_workdir=target.relative_workdir, + timeout_seconds=target.timeout_seconds, + env_overrides=target.env_overrides, + ) + for target in self._targets + ) diff --git a/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_benchmark_prep.py b/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_benchmark_prep.py index 28f891e9..f279f3b1 100644 --- a/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_benchmark_prep.py +++ b/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_benchmark_prep.py @@ -1,5 +1,4 @@ -#!/usr/bin/env python3 -"""Benchmark preparation oracle for _agent_eval bundles. +"""Benchmark preparation oracle for EGWALKER (EuroSys'25). Validates: - Dataset manifest JSON is readable and well-formed. @@ -13,35 +12,30 @@ import logging import sys from pathlib import Path -from typing import Mapping, Sequence +from typing import Mapping, Sequence, Any +from evaluator import utils from evaluator.utils import EntryConfig from evaluator.oracle_benchmark_prep_primitives import ( BenchmarkRequirement, FailRequirement, OracleBenchmarkPrepBase, - Requirement, ) -def _required_path(paths: Mapping[str, Path], key: str, *, label: str) -> Path: - """Returns a required path from a mapping with a clear error.""" - try: - return paths[key] - except KeyError as e: - raise ValueError(f"Missing {label}[{key!r}] in EntryConfig") from e - - -def _resolve_nonstrict(path: Path) -> Path: - """Resolves a path without requiring it to exist.""" - return path.resolve(strict = False) - - def _is_within(root: Path, candidate: Path) -> bool: - """Returns True iff candidate is within root after resolution.""" - root_resolved = _resolve_nonstrict(root) - cand_resolved = _resolve_nonstrict(candidate) - return cand_resolved == root_resolved or root_resolved in cand_resolved.parents + """Returns True iff candidate is within root after (non-strict) resolution. + + Uses resolution to collapse '..' and resolve symlinks in existing parents + (as much as possible without requiring the final path to exist). 
+ """ + root_resolved = root.resolve(strict=False) + cand_resolved = candidate.resolve(strict=False) + try: + cand_resolved.relative_to(root_resolved) + return True + except ValueError: + return False class OracleBenchmarkPrep(OracleBenchmarkPrepBase): @@ -54,124 +48,129 @@ def __init__( logger: logging.Logger, manifest_key: str = "datasets", ) -> None: - super().__init__(logger = logger) + super().__init__(logger=logger) self._config = config self._manifest_key = manifest_key - def requirements(self) -> Sequence[Requirement]: - repo_root = _required_path( - self._config.repository_paths, - self._config.name, - label = "repository_paths", - ) - manifest_path = _required_path( - self._config.ground_truth_paths, - self._manifest_key, - label = "ground_truth_paths", - ) + def requirements(self) -> Sequence[utils.BaseRequirement]: + reqs: list[utils.BaseRequirement] = [] + + repo_root = self._config.repository_paths.get(self._config.name) + if repo_root is None: + return [ + FailRequirement( + name="config:repo_root", + message=( + f"Missing repository_paths[{self._config.name!r}] in EntryConfig" + ), + ) + ] - reqs: list[Requirement] = [ - BenchmarkRequirement( - name = "repo_root_exists", - filepath = repo_root, - ), - BenchmarkRequirement( - name = "dataset_manifest_exists", - filepath = manifest_path, - ), - ] + # Always report repo root existence as a normal requirement + reqs.append(BenchmarkRequirement(name="repo_root_exists", filepath=repo_root)) + + manifest_path = self._config.ground_truth_paths.get(self._manifest_key) + if manifest_path is None: + reqs.append( + FailRequirement( + name="config:dataset_manifest", + message=( + f"Missing ground_truth_paths[{self._manifest_key!r}] in EntryConfig" + ), + ) + ) + return reqs + + reqs.append( + BenchmarkRequirement(name="dataset_manifest_exists", filepath=manifest_path) + ) if not manifest_path.exists(): return reqs try: - obj = json.loads(manifest_path.read_text(encoding = "utf-8")) + obj: Any = json.loads(manifest_path.read_text(encoding="utf-8")) except (OSError, json.JSONDecodeError) as exc: reqs.append( - FailRequirement( - name = "dataset_manifest_readable", - message = f"manifest unreadable: {exc}", - ) + FailRequirement( + name="dataset_manifest_readable", + message=f"manifest unreadable: {exc}", + ) ) return reqs if not isinstance(obj, list): reqs.append( - FailRequirement( - name = "dataset_manifest_format", - message = "manifest JSON must be a list of objects", - ) + FailRequirement( + name="dataset_manifest_format", + message="manifest JSON must be a list of objects", + ) ) return reqs - # Print a stable marker so signature matching is robust - # and portable across different platforms + # Portable size check -- prints a stable marker for signature matching size_script = ( - "import os, sys\n" - "p = sys.argv[1]\n" - "print(f'OK size = {os.path.getsize(p)}')\n" + "import os, sys\n" + "p = sys.argv[1]\n" + "print(f'OK size = {os.path.getsize(p)}')\n" ) for i, entry in enumerate(obj): entry_name = f"entry[{i}]" if not isinstance(entry, dict): - reqs.append( - FailRequirement( - name = entry_name, - message = "entry must be an object", - ) - ) + reqs.append(FailRequirement(name=entry_name, message="entry must be an object")) continue filepath = entry.get("filepath") size = entry.get("sizeinbytes") if not isinstance(filepath, str) or not filepath.strip(): - reqs.append( - FailRequirement( - name = entry_name, - message = "missing/invalid filepath", - ) - ) + reqs.append(FailRequirement(name=entry_name, message="missing/invalid 
filepath")) continue if not isinstance(size, int) or size < 0: reqs.append( - FailRequirement( - name = entry_name, - message = f"{filepath!r}: missing/invalid sizeinbytes", - ) + FailRequirement( + name=entry_name, + message=f"{filepath!r}: missing/invalid sizeinbytes", + ) ) continue rel = Path(filepath) + + # Disallow absolute paths up-front if rel.is_absolute(): reqs.append( - FailRequirement( - name = f"dataset:{filepath}", - message = "absolute paths not allowed", - ) + FailRequirement( + name=f"dataset:{filepath}", + message="absolute paths not allowed", + ) ) continue full_path = repo_root / rel + + # Enforce containment (prevents '..' traversal / symlink escapes where resolvable) if not _is_within(repo_root, full_path): reqs.append( - FailRequirement( - name = f"dataset:{filepath}", - message = "path escapes repo root (.. traversal not allowed)", - ) + FailRequirement( + name=f"dataset:{filepath}", + message="path escapes repo root (.. traversal not allowed)", + ) ) continue + # NOTE: Existence is handled by BenchmarkRequirement(filepath=...), but + # size matching is handled by cmd+signature reqs.append( - BenchmarkRequirement( - name = f"dataset:{filepath}", - filepath = full_path, - cmd = (sys.executable, "-c", size_script, str(full_path)), - signature = f"OK size = {size}", - timeout_seconds = 30.0, - ) + BenchmarkRequirement( + name=f"dataset:{filepath}", + filepath=full_path, + cmd=(sys.executable, "-c", size_script, str(full_path)), + signature=f"OK size = {size}", + timeout_seconds=30.0, + ) ) return reqs diff --git a/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_env_setup.py b/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_env_setup.py index 191b104c..e150dd59 100644 --- a/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_env_setup.py +++ b/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_env_setup.py @@ -1,4 +1,4 @@ -"""Environment setup oracle for the Eurosys'25 EGWALKER bundle. +"""Environment setup oracle for EGWALKER (EuroSys'25). Validates: - Required tools and minimum versions where applicable. @@ -8,21 +8,20 @@ from __future__ import annotations +import logging from pathlib import Path from typing import Mapping, Sequence -from evaluator.utils import EntryConfig, logger +from evaluator import utils +from evaluator.utils import EntryConfig from evaluator.oracle_env_setup_primitives import ( DependencyVersionRequirement, FilesystemPathRequirement, OracleEnvSetupBase, PathType, - Requirement, VersionCompare, ) -_REPO_KEY = "egwalker" def _required_path(paths: Mapping[str, Path], key: str, *, label: str) -> Path: """Returns a required path from a mapping with a clear error.""" @@ -35,58 +34,55 @@ def _required_path(paths: Mapping[str, Path], key: str, *, label: str) -> Path: class OracleEnvSetup(OracleEnvSetupBase): """Validates environment prerequisites for EGWALKER.""" - def __init__(self, *, config: EntryConfig, logger: logger) -> None: + def __init__(self, *, config: EntryConfig, logger: logging.Logger) -> None: super().__init__(logger) self._config = config - def requirements(self) -> Sequence[Requirement]: + def requirements(self) -> Sequence[utils.BaseRequirement]: repo_root = _required_path( - self._config.repository_paths, self._config.name, label="repository_paths") + self._config.repository_paths, self._config.name, label="repository_paths" + ) - reqs: list[Requirement] = [ - # Tooling.
+ reqs: list[utils.BaseRequirement] = [ DependencyVersionRequirement( - name="rustc", - command=("rustc", "--version"), - required_version=(1, 78, 0), - compare=VersionCompare.GEQ, + name = "rustc", + cmd = ("rustc", "--version"), + required_version = (1, 83, 0), + compare = VersionCompare.GEQ, ), DependencyVersionRequirement( - name="cargo", - command=("cargo", "--version"), - required_version=(1, 0, 0), - compare=VersionCompare.GEQ, + name = "cargo", + cmd = ("cargo", "--version"), + required_version = (1, 0, 0), + compare = VersionCompare.GEQ, ), DependencyVersionRequirement( - name="node", - command=("node", "--version"), - required_version=(0, 0, 0), - compare=VersionCompare.GEQ, + name = "node", + cmd = ("node", "--version"), + required_version = (0, 0, 0), + compare = VersionCompare.GEQ, ), DependencyVersionRequirement( - name="make", - command=("make", "--version"), - required_version=(0, 0, 0), - compare=VersionCompare.GEQ, - optional=True, + name = "make", + cmd = ("make", "--version"), + required_version = (0, 0, 0), + compare = VersionCompare.GEQ, + optional = True, ), - - # Repo directory. FilesystemPathRequirement( - name="repo_root_exists", - path=repo_root, - path_type=PathType.DIRECTORY, + name = "repo_root_exists", + path = repo_root, + path_type = PathType.DIRECTORY, ), ] - # Reference files (required). for key, ref_path in sorted(self._config.ground_truth_paths.items()): reqs.append( FilesystemPathRequirement( - name=f"reference_{key}_exists", - path=ref_path, - path_type=PathType.FILE, + name = f"reference_{key}_exists", + path = ref_path, + path_type = PathType.FILE, ) ) - return reqs \ No newline at end of file + return reqs diff --git a/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_experiment_runs.py b/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_experiment_runs.py index 473d1aeb..7c27d25b 100644 --- a/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_experiment_runs.py +++ b/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/_agent_eval/oracle_experiment_runs.py @@ -1,199 +1,239 @@ -#!/usr/bin/env python3 -"""Experiment runs oracle for the EuroSys'25 EGWALKER artifact. +"""Experiment runs oracle for EGWALKER (EuroSys'25). -This oracle compares experiment-produced timings against reference timings. +Validates: + - Timing results file can be read and parsed. + - Ground-truth reference timings file exists and can be read. + - Observed timings meet the configured similarity threshold against reference timings. 
""" from __future__ import annotations +import dataclasses import json +import logging from collections.abc import Iterable, Mapping, Sequence -from functools import partial from pathlib import Path -import logging +from typing import Any +from evaluator import utils from evaluator.oracle_experiment_runs_primitives import ( - ExperimentRunsRequirement, - LabeledSequenceSimilarityThresholdRequirement, - OracleExperimentRunsBase, + ElementwiseSimilarityThresholdRequirement, + ExperimentRunsContext, + OracleExperimentRunsBase, ) from evaluator.utils import EntryConfig def _required_path(paths: Mapping[str, Path], key: str, *, label: str) -> Path: - """Returns a required path from a mapping with a clear error message.""" - try: - return paths[key] - except KeyError as exc: - raise ValueError(f"Missing {label}[{key!r}] in EntryConfig") from exc - - -def _loads_json_from_lines(lines: Sequence[str], *, label: str) -> object: - """Parses JSON content from already-read file lines.""" - text = "\n".join(lines).strip() - if not text: - raise ValueError(f"{label}: empty JSON content") - try: - return json.loads(text) - except json.JSONDecodeError as exc: - raise ValueError(f"{label}: invalid JSON: {exc}") from exc + """Returns a required path from a mapping with a clear error message.""" + try: + p = paths[key] + except KeyError as exc: + raise ValueError(f"Missing {label}[{key!r}] in EntryConfig") from exc + return p def _load_json_file(path: Path, *, label: str) -> object: - """Loads JSON from a file path.""" - try: - text = path.read_text(encoding="utf-8") - except OSError as exc: - raise ValueError(f"{label}: failed to read {path}: {exc}") from exc - try: - return json.loads(text) - except json.JSONDecodeError as exc: - raise ValueError(f"{label}: invalid JSON: {exc}") from exc + """Loads JSON from a file path with consistent error messages.""" + try: + text = path.read_text(encoding="utf-8") + except OSError as exc: + raise ValueError(f"{label}: failed to read {path}: {exc}") from exc + text = text.strip() + if not text: + raise ValueError(f"{label}: empty JSON content at {path}") + try: + return json.loads(text) + except json.JSONDecodeError as exc: + raise ValueError(f"{label}: invalid JSON in {path}: {exc}") from exc def _as_float(v: object, *, label: str) -> float: - """Converts numeric values to float; raises on non-numeric.""" - if isinstance(v, (int, float)): - return float(v) - raise ValueError(f"{label}: non-numeric value {v!r}") + """Converts numeric values to float; raises on non-numeric.""" + if isinstance(v, (int, float)): + return float(v) + raise ValueError(f"{label}: non-numeric value {v!r}") -def _iter_metric_tag_rows(obj: object, *, label: str) -> Iterable[tuple[str, Mapping[str, object]]]: - """Yields (row_key, stats_dict) where row_key is '.'.""" - if not isinstance(obj, dict): - raise ValueError(f"{label}: timings JSON must be an object at top-level") +def _iter_metric_tag_rows( + obj: object, *, label: str +) -> Iterable[tuple[str, Mapping[str, object]]]: + """Yields (row_key, stats_dict) where row_key is '.'.""" + if not isinstance(obj, dict): + raise ValueError(f"{label}: timings JSON must be an object at top-level") - for metric_name, metric in obj.items(): - if not isinstance(metric, dict): - raise ValueError(f"{label}: {metric_name!r} must map to an object") - for tag, stats in metric.items(): - if not isinstance(stats, dict): - raise ValueError(f"{label}: {metric_name}.{tag} must map to an object") - row_key = f"{metric_name}.{tag}" - yield row_key, stats + for 
metric_name, metric in obj.items(): + if not isinstance(metric_name, str): + raise ValueError(f"{label}: non-string metric name {metric_name!r}") + if not isinstance(metric, dict): + raise ValueError(f"{label}: {metric_name!r} must map to an object") + + for tag, stats in metric.items(): + if not isinstance(tag, str): + raise ValueError(f"{label}: non-string tag name {tag!r}") + if not isinstance(stats, dict): + raise ValueError(f"{label}: {metric_name}.{tag} must map to an object") + + row_key = f"{metric_name}.{tag}" + yield row_key, stats def _discover_reference_fields(reference_obj: object, *, label: str) -> tuple[str, ...]: - """Returns unique stats fields in order of first appearance in the reference.""" - seen: set[str] = set() - ordered: list[str] = [] - for _row_key, stats in _iter_metric_tag_rows(reference_obj, label=label): - for field in stats.keys(): - if not isinstance(field, str): - raise ValueError(f"{label}: non-string field name {field!r}") - if field not in seen: - seen.add(field) - ordered.append(field) - return tuple(ordered) - - -def _pairs_for_field_from_obj( + """Returns unique stats fields in order of first appearance in the reference.""" + seen: set[str] = set() + ordered: list[str] = [] + for _row_key, stats in _iter_metric_tag_rows(reference_obj, label=label): + for field in stats.keys(): + if not isinstance(field, str): + raise ValueError(f"{label}: non-string field name {field!r}") + if field not in seen: + seen.add(field) + ordered.append(field) + return tuple(ordered) + + +def _values_by_label_for_field( obj: object, *, - field: str, + field: str | None, label: str, -) -> list[tuple[str, float]]: - """Builds (row_key, value) pairs for a given stats field.""" - out: list[tuple[str, float]] = [] - for row_key, stats in _iter_metric_tag_rows(obj, label=label): - if field not in stats: - # Skip: the primitives will treat this as "missing label" if reference - # expected it for this field (i.e., if reference includes row_key here). - continue - out.append((row_key, _as_float(stats[field], label=f"{label}: {row_key}.{field}"))) - return out - - -def _pairs_flatten_all_fields(obj: object, *, label: str) -> list[tuple[str, float]]: - """Fallback: flattens all fields into '..' 
labels.""" - out: list[tuple[str, float]] = [] - for row_key, stats in _iter_metric_tag_rows(obj, label=label): - for field, raw in stats.items(): - if not isinstance(field, str): - raise ValueError(f"{label}: non-string field name {field!r}") - full = f"{row_key}.{field}" - out.append((full, _as_float(raw, label=f"{label}: {full}"))) - return out - - -def _parse_results_pairs_for_field(lines: Sequence[str], *, field: str) -> list[tuple[str, float]]: - obj = _loads_json_from_lines(lines, label="timings results") - return _pairs_for_field_from_obj(obj, field=field, label="timings results") - - -def _parse_reference_pairs_for_field(path: Path, *, field: str) -> list[tuple[str, float]]: - obj = _load_json_file(path, label="timings reference") - return _pairs_for_field_from_obj(obj, field=field, label="timings reference") - - -def _parse_results_pairs_flat(lines: Sequence[str]) -> list[tuple[str, float]]: - obj = _loads_json_from_lines(lines, label="timings results") - return _pairs_flatten_all_fields(obj, label="timings results") - - -def _parse_reference_pairs_flat(path: Path) -> list[tuple[str, float]]: - obj = _load_json_file(path, label="timings reference") - return _pairs_flatten_all_fields(obj, label="timings reference") +) -> dict[str, float]: + """Extracts timing values keyed by stable labels. + + If field is not None: + - label is '<metric>.<tag>' + - value is stats[field] + - rows missing the field are skipped (so the *reference* defines expectation) + + If field is None (flatten): + - label is '<metric>.<tag>.<field>' + - value is stats[f] for each field f + """ + out: dict[str, float] = {} + for row_key, stats in _iter_metric_tag_rows(obj, label=label): + if field is None: + for f, raw in stats.items(): + if not isinstance(f, str): + raise ValueError(f"{label}: non-string field name {f!r}") + k = f"{row_key}.{f}" + if k in out: + raise ValueError(f"{label}: duplicate label {k!r}") + out[k] = _as_float(raw, label=f"{label}: {k}") + else: + if field not in stats: + continue + if row_key in out: + raise ValueError(f"{label}: duplicate label {row_key!r}") + out[row_key] = _as_float(stats[field], label=f"{label}: {row_key}.{field}") + return out + + +def _format_missing_labels(missing: Sequence[str], *, max_items: int = 10) -> str: + if not missing: + return "" + head = list(missing[:max_items]) + more = len(missing) - len(head) + suffix = f"\n...
({more} more)" if more > 0 else "" + return "missing labels:\n" + "\n".join(f"- {k}" for k in head) + suffix + + +@dataclasses.dataclass(frozen=True, slots=True, kw_only=True) +class TimingsJSONSimilarityRequirement(utils.BaseRequirement): + """Artifact-specific wrapper that delegates numeric checks to base primitives.""" + + results_path: Path + reference_path: Path + threshold: float + field: str | None = None + abs_epsilon: float = 1e-12 + max_mismatches_to_report: int = 10 + + def check(self, ctx: ExperimentRunsContext) -> utils.CheckResult: + try: + results_obj = _load_json_file(self.results_path, label="timings results") + reference_obj = _load_json_file(self.reference_path, label="timings reference") + + ref_map = _values_by_label_for_field( + reference_obj, field=self.field, label="timings reference" + ) + res_map = _values_by_label_for_field( + results_obj, field=self.field, label="timings results" + ) + + expected_labels = sorted(ref_map.keys()) + missing = [k for k in expected_labels if k not in res_map] + if missing: + detail = _format_missing_labels(missing, max_items=self.max_mismatches_to_report) + msg = f"{self.name}: results missing required reference entries" + if detail: + msg = f"{msg}\n{detail}" + return utils.CheckResult.failure(msg) + + observed = [res_map[k] for k in expected_labels] + reference = [ref_map[k] for k in expected_labels] + except ValueError as exc: + return utils.CheckResult.failure(f"{self.name}: {exc}") + + delegated = ElementwiseSimilarityThresholdRequirement( + name=self.name, + optional=self.optional, + observed=observed, + reference=reference, + threshold=self.threshold, + abs_epsilon=self.abs_epsilon, + max_mismatches_to_report=self.max_mismatches_to_report, + ) + return delegated.check(ctx) class OracleExperimentRuns(OracleExperimentRunsBase): - """Validates experiment run timings for EGWALKER.""" - - _NAME = "ExperimentRuns" - - def __init__(self, *, config: EntryConfig, logger: logging.Logger) -> None: - super().__init__(logger=logger) - self._config = config + """Validates experiment run timings.""" - def requirements(self) -> Sequence[ExperimentRunsRequirement]: - if not self._config.results_paths: - raise ValueError("EntryConfig.results_paths must be non-empty") - if not self._config.ground_truth_paths: - raise ValueError("EntryConfig.ground_truth_paths must be non-empty") + def __init__(self, *, config: EntryConfig, logger: logging.Logger) -> None: + super().__init__(logger=logger) + self._config = config - results_path = _required_path( - self._config.results_paths, "timings", label="results_paths" - ) - reference_path = _required_path( - self._config.ground_truth_paths, "timings", label="ground_truth_paths" - ) + def requirements(self) -> Sequence[utils.BaseRequirement]: + if not self._config.results_paths: + raise ValueError("EntryConfig.results_paths must be non-empty") + if not self._config.ground_truth_paths: + raise ValueError("EntryConfig.ground_truth_paths must be non-empty") - threshold = self._config.similarity_ratio - - # Discover which "types" (fields) to check from the reference. - # If discovery fails (missing/invalid JSON), fall back to a single requirement - # that will report the real failure via the primitives. - try: - ref_obj = _load_json_file(reference_path, label="timings reference") - fields = _discover_reference_fields(ref_obj, label="timings reference") - except ValueError: - fields = () - - if not fields: - # Fallback or "no fields": compare all qualified fields as one sequence. 
- return ( - LabeledSequenceSimilarityThresholdRequirement( - name="timings", - label="Timings", - results_path=results_path, - reference_path=reference_path, - threshold=threshold, - parse_results_fn=_parse_results_pairs_flat, - parse_reference_fn=_parse_reference_pairs_flat, - ), - ) - - reqs: list[ExperimentRunsRequirement] = [] - for field in fields: - reqs.append( - LabeledSequenceSimilarityThresholdRequirement( - name=f"timings_{field}", - label=f"Timings {field}", - results_path=results_path, - reference_path=reference_path, - threshold=threshold, - parse_results_fn=partial(_parse_results_pairs_for_field, field=field), - parse_reference_fn=partial(_parse_reference_pairs_for_field, field=field), + results_path = _required_path(self._config.results_paths, "timings", label="results_paths") + reference_path = _required_path( + self._config.ground_truth_paths, "timings", label="ground_truth_paths" ) - ) - return tuple(reqs) \ No newline at end of file + + threshold = self._config.similarity_ratio + + # Discover which fields to check from the reference. + try: + ref_obj = _load_json_file(reference_path, label="timings reference") + fields = _discover_reference_fields(ref_obj, label="timings reference") + except ValueError: + fields = () + + if not fields: + # Fallback: compare all fields flattened. + return ( + TimingsJSONSimilarityRequirement( + name="timings", + results_path=results_path, + reference_path=reference_path, + threshold=threshold, + field=None, + ), + ) + + reqs: list[utils.BaseRequirement] = [] + for field in fields: + reqs.append( + TimingsJSONSimilarityRequirement( + name=f"timings_{field}", + results_path=results_path, + reference_path=reference_path, + threshold=threshold, + field=field, + ) + ) + return tuple(reqs) diff --git a/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/egwalker/README.md b/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/egwalker/README.md index b9062b33..6e327aa7 100644 --- a/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/egwalker/README.md +++ b/benchmarks/arteval_bench/data/benchmark/eurosys25_egwalker/egwalker/README.md @@ -35,12 +35,10 @@ You can use `git diff` on `results/timings.json` (and other files) to see how yo **Tools:** You will need the following tools installed on your computer: -- *Rust compiler & toolchain*: Any "recent" version of rust should work. The published version of the paper used rust 1.78. The easiest way to install rust is via [rustup](https://rustup.rs/). -- *NodeJS*: Nodejs is only used for scripting - like extracting benchmarking results into 'clean' JSON files and generating the charts used in the paper. +- *Rust compiler & toolchain*: Any "recent" version of rust should work. The published version of the paper used rust 1.78, but the current evaluation requires at least 1.83.0. The easiest way to install rust is via [rustup](https://rustup.rs/). +- *NodeJS*: Nodejs is only used for scripting - like extracting benchmarking results into 'clean' JSON files and generating the charts used in the paper. We used nodejs v21 for the current evaluation, but any newer version should work. - *(Optional)*: GNU Make 4.3. We have alternative shell scripts if you don't have make available, but its less convenient. -We used rust 1.80 and nodejs v21 when generating the results in the current version of the paper. - This process has only been tested on linux, but it *should* work on other broadly supported platforms (like macos) too.
diff --git a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/main.py b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/main.py index 95967ba7..608262be 100644 --- a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/main.py +++ b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/main.py @@ -2,50 +2,87 @@ """Runs environment setup checks for WASABI.""" from __future__ import annotations + from pathlib import Path from typing import Dict import os import sys - -_AGENT_EVAL_DIR = Path(__file__).resolve().parent -_AGENT_SRC_DIR = _AGENT_EVAL_DIR.parents[3] / "src" -sys.path.append(str(_AGENT_SRC_DIR)) - - from evaluator.utils import ( - EntryConfig, - LoggerConfig, - get_logger, - record_result, + EntryConfig, + LoggerConfig, + get_logger, + record_result, ) from oracle_artifact_build import OracleArtifactBuild from oracle_env_setup import OracleEnvSetup from oracle_benchmark_prep import OracleBenchmarkPrep from oracle_experiment_runs import OracleExperimentRuns - -# NOTE: WASABI bundle layout mirrors the legacy constants, but we build it directly -# from EntryConfig rather than importing legacy globals. -_WASABI_HOME = Path.home() / "sosp24_wasabi" -_WASABI_REPO = _WASABI_HOME / "wasabi" -_WASABI_BENCH = _WASABI_HOME / "benchmarks" - +_AGENT_EVAL_DIR = Path(__file__).resolve().parent +_WASABI_HOME = _AGENT_EVAL_DIR.parent.resolve() +_WASABI_REPO = (_WASABI_HOME / "wasabi" / "wasabi-testing").resolve() +_WASABI_BENCH = (_WASABI_HOME / "benchmarks").resolve() WASABI_CONFIG = EntryConfig( - name = "sosp24-wasabi", - home_dir = _WASABI_HOME, - repository_paths = { - "sosp24-wasabi": _WASABI_REPO, - "benchmarks": _WASABI_BENCH, - }, - results_paths = { - "results_root": _WASABI_REPO / "results", - }, - ground_truth_paths = { - "bugs_ground_truth": _WASABI_REPO / "bugs_ground_truth.txt", - }, - similarity_ratio = 0.75, + name="sosp24-wasabi", + home_dir=_WASABI_HOME, + repository_paths={ + "sosp24-wasabi": _WASABI_REPO, + "benchmarks": _WASABI_BENCH, + }, + results_paths={ + "results_root": _WASABI_REPO / "results", + }, + ground_truth_paths={ + "bugs_ground_truth": _AGENT_EVAL_DIR / "refs" / "bugs_ground_truth.csv", + }, + similarity_ratio=0.75, + metadata={ + "maven_repo_dir": + Path.home() / ".m2" / "repository", + "weaving_plugin_signature": + "aspectj-maven-plugin", + "primary_artifact": + "edu.uchicago.cs.systems:wasabi", + "benchmarks": { + "hadoop": { + "repo_url": "https://github.com/apache/hadoop.git", + "commit": "60867de", + "pom_file": "pom.xml", + "pom_backup": "pom-original.xml", + }, + "hbase": { + "repo_url": "https://github.com/apache/hbase.git", + "commit": "89ca7f4", + "pom_file": "pom.xml", + "pom_backup": "pom-original.xml", + }, + "hive": { + "repo_url": "https://github.com/apache/hive.git", + "commit": "e08a600", + "pom_file": "pom.xml", + "pom_backup": "pom-original.xml", + }, + }, + "aspectj_markers": [ + "ajc$preClinit", + "ajc$initFailureCause", + "ajc$tjp", + "ajc$before$", + "ajc$after$", + "ajc$around$", + "ajc$interField$", + "ajc$interMethod$", + "org.aspectj.runtime.reflect.Factory", + "org.aspectj.runtime.internal.AroundClosure", + "org.aspectj.lang.JoinPoint", + "org.aspectj.lang.JoinPoint$StaticPart", + "org.aspectj.lang.ProceedingJoinPoint", + "org.aspectj.lang.Signature", + "org.aspectj.lang.NoAspectBoundException", + ], + }, ) @@ -56,27 +93,23 @@ def main(argv: list[str]) -> int: score = 0 logger_name = os.environ.get("EVAL_LOGGER_NAME", "WASABI-AGENT-EVALUATOR") - logger = 
get_logger(LoggerConfig(root_name = logger_name)) - - env_checker = OracleEnvSetup(config = WASABI_CONFIG, logger = logger) - score += record_result( - logger, results, type(env_checker).__name__, env_checker.run(verbose = verbose) - ) - - build_checker = OracleArtifactBuild(config = WASABI_CONFIG, logger = logger) - score += record_result( - logger, results, type(build_checker).__name__, build_checker.run(verbose = verbose) - ) - - prep_checker = OracleBenchmarkPrep(config = WASABI_CONFIG, logger = logger) - score += record_result( - logger, results, type(prep_checker).__name__, prep_checker.run(verbose = verbose) - ) - - runs_checker = OracleExperimentRuns(config = WASABI_CONFIG, logger = logger) - score += record_result( - logger, results, type(runs_checker).__name__, runs_checker.run(verbose = verbose) - ) + logger = get_logger(LoggerConfig(root_name=logger_name)) + + env_checker = OracleEnvSetup(config=WASABI_CONFIG, logger=logger) + env_ok = env_checker.run(verbose=verbose) + score += record_result(results, type(env_checker).__name__, env_ok) + + build_checker = OracleArtifactBuild(config=WASABI_CONFIG, logger=logger) + build_ok = build_checker.run(verbose=verbose) + score += record_result(results, type(build_checker).__name__, build_ok) + + prep_checker = OracleBenchmarkPrep(config=WASABI_CONFIG, logger=logger) + prep_ok = prep_checker.run(verbose=verbose) + score += record_result(results, type(prep_checker).__name__, prep_ok) + + runs_checker = OracleExperimentRuns(config=WASABI_CONFIG, logger=logger) + runs_ok = runs_checker.run(verbose=verbose) + score += record_result(results, type(runs_checker).__name__, runs_ok) logger.info("Agent scores: %s", results) return score diff --git a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_artifact_build.py b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_artifact_build.py index 6bf39f2f..8899d646 100644 --- a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_artifact_build.py +++ b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_artifact_build.py @@ -1,116 +1,234 @@ #!/usr/bin/env python3 -import xml.etree.ElementTree as ET +import dataclasses import fnmatch - -from utils import HOME -from utils import REPO_DIR -from utils import logger +import hashlib +import logging +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import Any, Dict, List, Tuple from evaluator.oracle_artifact_build_primitives import OracleArtifactBuildBase +from evaluator.utils import EntryConfig from evaluator import utils +def _required_path(paths: Dict[str, Path], key: str, *, label: str) -> Path: + """Returns a required path from a mapping with a clear error.""" + try: + p = paths[key] + except KeyError as e: + raise ValueError(f"Missing {label}[{key!r}] in EntryConfig") from e + return utils.to_path(p) + + +def _required_meta(meta: Dict[str, Any], key: str, *, label: str) -> Any: + """Returns a required metadata value with a clear error.""" + try: + return meta[key] + except KeyError as e: + raise ValueError(f"Missing {label}[{key!r}] in EntryConfig.metadata") from e + + +def _sha256(path: Path) -> str: + h = hashlib.sha256() + with path.open("rb") as f: + for chunk in iter(lambda: f.read(1024 * 1024), b""): + h.update(chunk) + return h.hexdigest() + + +def _pick_primary_jar(dir_path: Path, artifact_id: str, + version: str) -> Path | None: + """ + Picks a "primary" jar from a directory by matching artifactId/version while + excluding common 
auxiliary jars (e.g., sources/javadoc/tests/original-*). + """ + if not dir_path.is_dir(): + return None + + bad_tokens = ("-sources", "-javadoc", "-tests", "original-") + pattern = f"{artifact_id}-{version}*.jar" + cands = [ + p for p in dir_path.glob("*.jar") + if p.is_file() and fnmatch.fnmatch(p.name, pattern) and not any( + tok in p.name for tok in bad_tokens) + ] + if not cands: + return None + + # Prefer newest (best-effort) + return max(cands, key=lambda p: p.stat().st_mtime) + + +def _strip_ns(tag: str) -> str: + return tag.split("}", 1)[-1] + + @dataclasses.dataclass(frozen=True, slots=True) -class _BuildInputsRequirement(utils.BaseRequirement): +class _BuildInputsRequirement: + name: str oracle: "OracleArtifactBuild" + optional: bool = False - def check(self, ctx: object) -> utils.CheckResult: - del ctx + def check(self, ctx) -> utils.CheckResult: + repo_dir = self.oracle.repo_dir + if not repo_dir.exists() or not repo_dir.is_dir(): + ctx.logger.info("Build: FAIL - base project directory not found") + return utils.CheckResult.failure("base project directory not found", + cwd=repo_dir) - if not REPO_DIR.exists(): - logger.info("Build: FAIL - base project directory not found") - return utils.CheckResult.failure("base project directory not found") - - poms = self.oracle.find_poms(REPO_DIR) + poms = self.oracle.find_poms(repo_dir) if not poms: - logger.info("Build: FAIL - no pom.xml files found under wasabi-testing") - return utils.CheckResult.failure("no pom.xml files found under wasabi-testing") + ctx.logger.info("Build: FAIL - no pom.xml files found under repo") + return utils.CheckResult.failure("no pom.xml files found under repo", + cwd=repo_dir) - root_pom = REPO_DIR / "pom.xml" - top_defaults = {} + root_pom = repo_dir / "pom.xml" + top_defaults: Dict[str, str] = {} if root_pom.exists(): - root_mod = self.oracle.parse_pom(root_pom) + root_mod = self.oracle.parse_pom(root_pom, top_defaults=None) if not root_mod.get("error"): if root_mod.get("groupId"): top_defaults["groupId"] = root_mod["groupId"] if root_mod.get("version"): top_defaults["version"] = root_mod["version"] - modules = [] - errors = [] + modules: List[Dict[str, Any]] = [] + errors: List[Tuple[Path, str]] = [] for pom in poms: m = self.oracle.parse_pom(pom, top_defaults=top_defaults) if m.get("error"): errors.append((pom, m["error"])) continue if not all([m.get("artifactId"), m.get("groupId"), m.get("version")]): - errors.append((pom, "missing groupId/artifactId/version after inheritance")) + errors.append( + (pom, "missing groupId/artifactId/version after inheritance")) else: modules.append(m) if errors: - logger.info("Build: FAIL - POM parsing errors present") + ctx.logger.info("Build: FAIL - POM parsing errors present") for pom, err in errors[:5]: - logger.info(f" - {pom}: {err}") + ctx.logger.info(f" - {pom}: {err}") if len(errors) > 5: - logger.info(f" ... {len(errors)-5} more") - return utils.CheckResult.failure("POM parsing errors present") + ctx.logger.info(f" ... 
{len(errors)-5} more")
+            return utils.CheckResult.failure("POM parsing errors present",
+                                             cwd=repo_dir)
 
         self.oracle._modules = modules
-        return utils.CheckResult.success()
+        return utils.CheckResult.success(cwd=repo_dir)
 
 
 @dataclasses.dataclass(frozen=True, slots=True)
-class _CodeBuildRequirement(utils.BaseRequirement):
+class _PrimaryModuleBuildRequirement:
+    name: str
     oracle: "OracleArtifactBuild"
+    optional: bool = False
 
-    def check(self, ctx: object) -> utils.CheckResult:
-        del ctx
-
+    def check(self, ctx) -> utils.CheckResult:
         modules = getattr(self.oracle, "_modules", None)
         if not modules:
-            return utils.CheckResult.success()
-
-        missing_targets = []
-        missing_installs = []
-
+            return utils.CheckResult.failure("modules not initialized",
+                                             cwd=self.oracle.repo_dir)
+
+        selector = self.oracle.primary_artifact_selector.strip()
+        if ":" in selector:
+            want_gid, want_aid = selector.split(":", 1)
+            want_gid = want_gid.strip()
+            want_aid = want_aid.strip()
+        else:
+            want_gid, want_aid = "", selector.strip()
+
+        chosen = None
         for m in modules:
-            if not self.oracle.has_target_jar(m):
-                missing_targets.append(str(m["dir"]))
-            if not self.oracle.has_installed_artifact(m):
-                missing_installs.append(f"{m['groupId']}:{m['artifactId']}:{m['version']}")
-
-        if missing_targets or missing_installs:
-            logger.info("Code build: FAIL")
-            if missing_targets:
-                logger.info(" Missing built JARs in target/:")
-                for d in missing_targets[:10]:
-                    logger.info(f" - {d}")
-                if len(missing_targets) > 10:
-                    logger.info(f" ... {len(missing_targets)-10} more")
-            if missing_installs:
-                logger.info(" Missing artifacts in local ~/.m2 repository:")
-                for gav in missing_installs[:10]:
-                    logger.info(f" - {gav}")
-                if len(missing_installs) > 10:
-                    logger.info(f" ... {len(missing_installs)-10} more")
-
-            return utils.CheckResult.failure("missing built jars and/or installed artifacts")
-
-        logger.info("Code build: PASS")
-        return utils.CheckResult.success()
+            gid = (m.get("groupId") or "").strip()
+            aid = (m.get("artifactId") or "").strip()
+            if not aid:
+                continue
+            if want_gid:
+                if gid == want_gid and aid == want_aid:
+                    chosen = m
+                    break
+            else:
+                if aid == want_aid:
+                    chosen = m
+                    break
+
+        if not chosen:
+            return utils.CheckResult.failure(
+                f"primary module not found for selector {selector!r}",
+                cwd=self.oracle.repo_dir,
+            )
+
+        packaging = (chosen.get("packaging") or "jar").strip()
+        if packaging == "pom":
+            ctx.logger.info("Code build: FAIL")
+            return utils.CheckResult.failure(
+                "primary module resolved to packaging=pom", cwd=Path(chosen["dir"]))
+
+        gid = (chosen.get("groupId") or "").strip()
+        aid = (chosen.get("artifactId") or "").strip()
+        ver = (chosen.get("version") or "").strip()
+        module_dir = Path(chosen["dir"])
+
+        if not gid or not aid or not ver:
+            return utils.CheckResult.failure(
+                "primary module missing groupId/artifactId/version after inheritance",
+                cwd=module_dir,
+            )
+
+        built = _pick_primary_jar(module_dir / "target", aid, ver)
+        installed_dir = self.oracle.repo_path(gid, aid, ver)
+        installed = _pick_primary_jar(installed_dir, aid, ver)
+
+        if not built or not installed:
+            ctx.logger.info("Code build: FAIL")
+            if not built:
+                ctx.logger.info(" Missing built JARs in target/:")
+                ctx.logger.info(f" - {module_dir}")
+            if not installed:
+                ctx.logger.info(" Missing artifacts in local Maven repository:")
+                ctx.logger.info(f" - {gid}:{aid}:{ver}")
+            return utils.CheckResult.failure(
+                "missing built jar and/or installed artifact", cwd=module_dir)
+
+        hb = _sha256(built)
+        hi = _sha256(installed)
+        if hb != hi:
+            ctx.logger.info("Code build: FAIL")
+            detail = f"built={built} sha256={hb}\ninstalled={installed} sha256={hi}"
+            return utils.CheckResult.failure(
+                "primary artifact mismatch: target/ jar does not match local Maven repo jar",
+                stdout=utils.truncate_text(detail, utils.DEFAULT_MAX_CAPTURE_CHARS),
+                cwd=module_dir,
+            )
+
+        ctx.logger.info("Code build: PASS")
+        return utils.CheckResult.success(cwd=module_dir)
 
 
 class OracleArtifactBuild(OracleArtifactBuildBase):
-    def __init__(self, *, logger=logger):
+
+    def __init__(self, *, config: EntryConfig, logger: logging.Logger):
         super().__init__(logger=logger)
-        self.maven_packages_dir = HOME / ".m2" / "repository"
+        self._config = config
+
+        self.repo_dir = _required_path(config.repository_paths,
+                                       "sosp24-wasabi",
+                                       label="repository_paths").resolve()
+
+        meta: Dict[str, Any] = getattr(config, "metadata", {}) or {}
+        self.maven_packages_dir = utils.to_path(
+            _required_meta(meta, "maven_repo_dir", label="metadata")).resolve()
+        self.primary_artifact_selector = str(
+            _required_meta(meta, "primary_artifact", label="metadata"))
+        self._modules = None
 
     def requirements(self):
         return (
-            _BuildInputsRequirement(name="Build", oracle=self),
-            _CodeBuildRequirement(name="Code build", oracle=self),
+            _BuildInputsRequirement(name="Build", oracle=self),
+            _PrimaryModuleBuildRequirement(name="Code build", oracle=self),
         )
 
     def xget(self, elem, tag):
@@ -136,14 +254,23 @@ def parse_pom(self, pom_path, top_defaults=None):
             tree = ET.parse(pom_path)
             root = tree.getroot()
         except Exception as e:
-            return {"dir": pom_path.parent, "pom": pom_path, "error": f"XML parse error: {e}"}
+            return {
+                "dir": pom_path.parent,
+                "pom": pom_path,
+                "error": f"XML parse error: {e}"
+            }
 
         artifactId = self.xget(root, "artifactId")
         groupId = self.xget(root, "groupId")
         version = self.xget(root, "version")
         packaging = self.xget(root, "packaging") or "jar"
 
-        parent = root.find("parent")
+        parent = None
+        for c in list(root):
+            if _strip_ns(c.tag) == "parent":
+                parent = c
+                break
+
         if parent is not None:
             p_groupId = self.xget(parent, "groupId")
             p_version = self.xget(parent, "version")
@@ -157,12 +284,12 @@ def parse_pom(self, pom_path, top_defaults=None):
             version = version or top_defaults.get("version")
 
         return {
-            "dir": pom_path.parent,
-            "pom": pom_path,
-            "groupId": groupId,
-            "artifactId": artifactId,
-            "version": version,
-            "packaging": packaging
+            "dir": pom_path.parent,
+            "pom": pom_path,
+            "groupId": groupId,
+            "artifactId": artifactId,
+            "version": version,
+            "packaging": packaging
         }
 
     def find_poms(self, base):
@@ -171,20 +298,3 @@ def repo_path(self, groupId, artifactId, version):
         parts = groupId.split(".")
         return self.maven_packages_dir.joinpath(*parts, artifactId, version)
-
-    def has_target_jar(self, module):
-        if module["packaging"] == "pom":
-            return True  # no jar expected
-        target = module["dir"] / "target"
-        if not target.is_dir():
-            return False
-        pattern = f"{module['artifactId']}-{module['version']}*.jar"
-        return any(fnmatch.fnmatch(p.name, pattern) for p in target.glob("*.jar"))
-
-    def has_installed_artifact(self, module):
-        rp = self.repo_path(module["groupId"], module["artifactId"], module["version"])
-        if module["packaging"] == "pom":
-            return (rp / f"{module['artifactId']}-{module['version']}.pom").is_file()
-        return any(p.suffix == ".jar" and fnmatch.fnmatch(
-            p.name, f"{module['artifactId']}-{module['version']}*.jar")
-            for p in rp.glob("*.jar"))
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_benchmark_prep.py b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_benchmark_prep.py
index 96f19eef..8dbc9bd8 100644
--- a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_benchmark_prep.py
+++ b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_benchmark_prep.py
@@ -1,115 +1,149 @@
 #!/usr/bin/env python3
-import sys
-import shlex
 import subprocess
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Any, Dict, List
 
-from utils import BENCH_DIR
-from utils import logger
-
-from evaluator.utils import EntryConfig
 from evaluator.oracle_benchmark_prep_primitives import (
-    OracleBenchmarkPrepBase,
-    Requirement,
+    OracleBenchmarkPrepBase,
+    BenchmarkRequirement,
 )
 from evaluator import utils
 
-
-REPOS = {
-    "hadoop": ("https://github.com/apache/hadoop.git", "60867de"),
-    "hbase": ("https://github.com/apache/hbase.git", "89ca7f4"),
-    "hive": ("https://github.com/apache/hive.git", "e08a600"),
-}
-
-ASPECTJ_MARKERS = [
-    "ajc$preClinit",
-    "ajc$initFailureCause",
-    "ajc$tjp",
-    "ajc$before$",
-    "ajc$after$",
-    "ajc$around$",
-    "ajc$interField$",
-    "ajc$interMethod$",
-    "org.aspectj.runtime.reflect.Factory",
-    "org.aspectj.runtime.internal.AroundClosure",
-    "org.aspectj.lang.JoinPoint",
-    "org.aspectj.lang.JoinPoint$StaticPart",
-    "org.aspectj.lang.ProceedingJoinPoint",
-    "org.aspectj.lang.Signature",
-    "org.aspectj.lang.NoAspectBoundException",
-]
-
-
 def _required_path(paths, key: str, *, label: str) -> Path:
     """Returns a required path from a mapping with a clear error."""
     try:
-        return paths[key]
+        return utils.to_path(paths[key])
     except KeyError as e:
         raise ValueError(f"Missing {label}[{key!r}] in EntryConfig") from e
 
 
-@dataclass(frozen=True, slots=True)
-class _RepoCommitRequirement(utils.BaseRequirement):
-    oracle: "OracleBenchmarkPrep"
-    app: str
-    app_root: Path
-    expected_commit_prefix: str
+def _required_meta(meta: Dict[str, Any], key: str, *, label: str) -> Any:
+    """Returns a required metadata value with a clear error."""
+    try:
+        return meta[key]
+    except KeyError as e:
+        raise ValueError(f"Missing {label}[{key!r}] in EntryConfig.metadata") from e
 
-    def check(self, ctx) -> utils.CheckResult:
-        ok, msg = self.oracle.check_repo_commit(self.app, self.app_root, self.expected_commit_prefix)
-        ctx.logger.info(msg)
-        return utils.CheckResult.success() if ok else utils.CheckResult.failure(msg)
+
+def _as_dict(x: Any) -> Dict[str, Any]:
+    if isinstance(x, dict):
+        return x
+    raise ValueError(f"Expected dict in EntryConfig.metadata, got: {type(x)!r}")
+
+
+def _as_list_str(x: Any) -> List[str]:
+    if isinstance(x, list) and all(isinstance(v, str) for v in x):
+        return x
+    raise ValueError("Expected list[str] in EntryConfig.metadata")
 
 
 @dataclass(frozen=True, slots=True)
-class _WeavingRequirement(utils.BaseRequirement):
+class _WeavingRequirement:
+    name: str
     oracle: "OracleBenchmarkPrep"
     app: str
    app_root: Path
+    optional: bool = False
 
     def check(self, ctx) -> utils.CheckResult:
         ok, msg = self.oracle.check_app_weaving(self.app, self.app_root)
         ctx.logger.info(msg)
-        return utils.CheckResult.success() if ok else utils.CheckResult.failure(msg)
+        return utils.CheckResult.success(
+            cwd=self.app_root) if ok else utils.CheckResult.failure(
+                msg, cwd=self.app_root)
 
 
 class OracleBenchmarkPrep(OracleBenchmarkPrepBase):
-    def __init__(self, *, config: EntryConfig, logger: logger.__class__):
-        super().__init__(logger = logger)
+    def __init__(self, *, config: utils.EntryConfig, logger):
+        super().__init__(logger=logger)
         self._config = config
-        self.max_class_dirs = 200
-        self.max_classess_per_dir = 2000
+        meta = _as_dict(getattr(config, "metadata", {}) or {})
+
+        self._bench_specs = _as_dict(
+            _required_meta(meta, "benchmarks", label="metadata"))
+        self._weaving_plugin_sig = str(
+            _required_meta(meta, "weaving_plugin_signature", label="metadata"))
+        self._aspectj_markers = _as_list_str(
+            _required_meta(meta, "aspectj_markers", label="metadata"))
+
+        # Bounds for the max number of compiled classes checked for instrumentation markers
+        self.max_class_dirs = int(meta.get("max_class_dirs", 200))
+        self.max_classess_per_dir = int(meta.get("max_classess_per_dir", 2000))
 
-    def requirements(self) -> tuple[Requirement, ...]:
-        bench_root = _required_path(self._config.repository_paths, "benchmarks", label="repository_paths")
+    def requirements(self) -> tuple[object, ...]:
+        bench_root = _required_path(self._config.repository_paths,
                                    "benchmarks",
+                                    label="repository_paths")
+        wasabi_root = _required_path(self._config.repository_paths,
+                                     "sosp24-wasabi",
+                                     label="repository_paths")
+
+        reqs: List[object] = []
+
+        for app in sorted(self._bench_specs.keys()):
+            spec = _as_dict(self._bench_specs[app])
 
-        reqs: list[Requirement] = []
-        for app in REPOS:
             app_root = bench_root / app
+            expected_commit = str(
+                _required_meta(spec, "commit", label=f"metadata.benchmarks[{app}]"))
+            pom_file = str(
+                _required_meta(spec, "pom_file", label=f"metadata.benchmarks[{app}]"))
+            pom_backup = str(
+                _required_meta(spec,
                               "pom_backup",
+                               label=f"metadata.benchmarks[{app}]"))
+
+            reqs.append(
+                BenchmarkRequirement(
+                    name=f"{app}: clone",
+                    filepath=app_root,
+                    cmd=["git", "-C", str(app_root), "rev-parse", "HEAD"],
+                    signature=expected_commit,
+                    timeout_seconds=10.0,
+                ))
 
-            expected_commit = REPOS[app][1]
             reqs.append(
-                _RepoCommitRequirement(
-                    name = f"{app}: clone",
-                    oracle = self,
-                    app = app,
-                    app_root = app_root,
-                    expected_commit_prefix = expected_commit,
-                )
-            )
+                BenchmarkRequirement(
+                    name=f"{app}: pom swap",
+                    filepath=app_root,
+                    cmd=[
+                        "bash",
+                        "-lc",
+                        ("set -euo pipefail; "
+                         f"test -f {pom_file}; "
+                         f"test -f {pom_backup}; "
+                         f"! cmp -s {pom_file} {pom_backup}; "
+                         f"grep -a -F -q {self._weaving_plugin_sig} {pom_file}; "
+                         "echo POM_SWAP_OK"),
+                    ],
+                    signature="POM_SWAP_OK",
+                    timeout_seconds=10.0,
+                    use_shell=False,
+                ))
 
             reqs.append(
-                _WeavingRequirement(
-                    name = f"{app}: weaving",
-                    oracle = self,
-                    app = app,
-                    app_root = app_root,
-                )
-            )
+                BenchmarkRequirement(
+                    name=f"{app}: weaving config",
+                    filepath=app_root,
+                    cmd=[
+                        "cat",
+                        "pom.xml",
+                    ],
+                    signature=self._weaving_plugin_sig,
+                    timeout_seconds=120.0,
+                ))
+
+            reqs.append(
+                _WeavingRequirement(
+                    name=f"{app}: weaving",
+                    oracle=self,
+                    app=app,
+                    app_root=app_root,
+                ))
 
         return tuple(reqs)
 
@@ -118,7 +152,10 @@ def run_shell_command(self, cmd):
         Run a bash command given as argument.
         """
         try:
-            cp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+            cp = subprocess.run(cmd,
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE,
+                                text=True)
             return cp.returncode, (cp.stdout or "").strip(), (cp.stderr or "").strip()
         except FileNotFoundError as e:
             return 127, "", str(e)
@@ -127,16 +164,13 @@ def find_class_dirs(self, app_root: Path):
         """
         Find directories that contain .class files.
""" - qroot = shlex.quote(str(app_root)) cmd = [ - "bash", - "-lc", - ( - f"shopt -s nullglob; " - f"find {qroot} -type f -name '*.class' " - f"-not -path '*/.git/*' -not -path '*/.m2/*' -not -path '*/.gradle/*' " - f"-printf '%h\n' | sort -u" - ), + "bash", + "-lc", + (f"shopt -s nullglob; " + f"find {app_root} -type f -name '*.class' " + f"-not -path '*/.git/*' -not -path '*/.m2/*' -not -path '*/.gradle/*' " + f"-printf '%h\n' | sort -u"), ] rc, out, err = self.run_shell_command(cmd) if rc != 0: @@ -149,8 +183,10 @@ def iter_class_files(self, classes_dir: Path, limit: int): Iterate over .class files from a class directory, processing up to a configurable number of files. """ - q = shlex.quote(str(classes_dir)) - cmd = ["bash", "-lc", f"shopt -s nullglob; find {q} -type f -name '*.class' | sort"] + cmd = [ + "bash", "-lc", + f"shopt -s nullglob; find {classes_dir} -type f -name '*.class' | sort" + ] rc, out, err = self.run_shell_command(cmd) if rc != 0 or not out: return [] @@ -160,33 +196,15 @@ def iter_class_files(self, classes_dir: Path, limit: int): files = files[::step][:limit] return files - def check_repo_commit(self, app: str, app_root: Path, expected_commit_prefix: str): - """ - Verify the repo at app_root is a git repo and HEAD matches an expected commit ID prefix. - """ - if not app_root.is_dir(): - return False, f"{app}: FAIL (clone) - directory not found: {app_root}" - - rc, out, err = self.run_shell_command(["git", "-C", str(app_root), "rev-parse", "HEAD"]) - if rc != 0: - return False, f"{app}: FAIL (clone) - not a git repo or unreadable HEAD: {err or out}" - - head = (out or "").strip() - if head.startswith(expected_commit_prefix): - return True, f"{app}: PASS (clone) - commit {head[:12]} matches {expected_commit_prefix}" - else: - return False, f"{app}: FAIL (clone) - HEAD {head[:12]} != expected {expected_commit_prefix}*" - - def classfile_has_aspect_markers(self, class_path: Path): """ Search through a decoded .class for AspectJ markers. 
""" - pattern = "|".join(ASPECTJ_MARKERS) - cmd = ["bash", "-lc", f"strings {shlex.quote(str(class_path))} | grep -a -E '{pattern}' -m 1"] + e_args = " ".join(f"-e {m}" for m in self._aspectj_markers) + cmd = ["bash", "-lc", f"strings {class_path} | grep -a -F -m 1 {e_args}"] rc, out, err = self.run_shell_command(cmd) if rc == 0 and out: - matched = next((m for m in ASPECTJ_MARKERS if m in out), out) + matched = next((m for m in self._aspectj_markers if m in out), out) return True, matched return False, "" @@ -203,7 +221,9 @@ def check_app_weaving(self, app: str, app_root: Path): if not class_dirs: return False, f"{app}: FAIL (waving) - no compiled .class files found under {app_root}" - dirs = class_dirs[:self.max_class_dirs] if (self.max_class_dirs and len(class_dirs) > self.max_class_dirs) else class_dirs + dirs = class_dirs[:self.max_class_dirs] if ( + self.max_class_dirs and + len(class_dirs) > self.max_class_dirs) else class_dirs for cdir in dirs: for cf in self.iter_class_files(cdir, self.max_classess_per_dir): diff --git a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_env_setup.py b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_env_setup.py index 4c6016e2..5333e651 100644 --- a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_env_setup.py +++ b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_env_setup.py @@ -1,185 +1,181 @@ #!/usr/bin/env python3 -import os -import re -import shutil -import subprocess -from dataclasses import dataclass -from typing import Iterable, Optional, Tuple -from pathlib import Path - -from utils import REPO_DIR -from utils import logger as _default_logger - -from evaluator.oracle_env_setup_primitives import OracleEnvSetupBase, Requirement -from evaluator import utils - - -VersionTuple = Tuple[int, ...] - - -@dataclass(frozen=True) -class Dependency: - name: str - binary: str - cmd: Optional[list] = None - parse_regex: Optional[str] = None - require: Optional[VersionTuple] = None - compare: Optional[str] = None +"""Environment setup oracle for WASABI. +This oracle reuses the shared env-setup primitives to validate: + * Toolchain dependencies and versions referenced by the WASABI README. + * Environment variables WASABI_ROOT_DIR and JAVA_HOME (exact string match). + * Updated directory structure for the flattened WASABI repo layout. 
+""" -DEPENDENCIES: list[Dependency] = [ +from __future__ import annotations - Dependency( - name="git", binary="git" - ), - - Dependency( - name="maven", binary="mvn", - cmd=["mvn", "-v"], parse_regex=r"Apache Maven\s+([0-9.]+)", - require=(3, 6, 3), compare="gte", - ), - Dependency( - name="gradle", binary="gradle", - cmd=["gradle", "-v"], parse_regex=r"Gradle\s+([0-9.]+)", - require=(4, 4, 1), compare="gte", - ), - Dependency( - name="ant", binary="ant", - cmd=["ant", "-version"], parse_regex=r"version\s+([0-9.]+)", - require=(1, 10), compare="gte", - ), - Dependency( - name="python3", binary="python3", - cmd=["python3", "--version"], parse_regex=r"Python\s+([0-9.]+)", - require=(3, 10), compare="gte", - ), - Dependency( - name="java", binary="java", - cmd=["java", "-version"], parse_regex=r'version\s+"([^"]+)"', - require=(1, 8), compare="eq", - ), -] - - -@dataclass(frozen=True, slots=True) -class _PrereqsRequirement(utils.BaseRequirement): - oracle: "OracleEnvSetup" - - def check(self, ctx: object) -> utils.CheckResult: - del ctx - ok, why = self.oracle.prereqs_check() - if ok: - return utils.CheckResult.success() - return utils.CheckResult.failure(why or "Prerequisites failed") - - -@dataclass(frozen=True, slots=True) -class _PathsRequirement(utils.BaseRequirement): - oracle: "OracleEnvSetup" +import logging +from pathlib import Path +from typing import Sequence - def check(self, ctx: object) -> utils.CheckResult: - del ctx - ok, why = self.oracle.paths_check() - if ok: - return utils.CheckResult.success() - return utils.CheckResult.failure(why or "Paths failed") +from evaluator import utils +from evaluator.oracle_env_setup_primitives import ( + DependencyVersionRequirement, + EnvironmentVariableRequirement, + FilesystemPathRequirement, + OracleEnvSetupBase, + PathType, + VersionCompare, +) class OracleEnvSetup(OracleEnvSetupBase): + """WASABI environment setup oracle.""" + + _JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64/jre" # Check for Java 1.8 - def __init__(self, *, logger=_default_logger) -> None: + def __init__(self, + *, + config: utils.EntryConfig, + logger=logging.Logger) -> None: super().__init__(logger=logger) + self._config = config + self._wasabi_root = Path( + self._config.repository_paths[self._config.name]).resolve() + self._benchmarks_root = Path( + self._config.repository_paths["benchmarks"]).resolve() - self.expected_root_dir = REPO_DIR - self.expected_java_home = "/usr/lib/jvm/java-8-openjdk-amd64/jre" + def requirements(self) -> Sequence[utils.BaseRequirement]: + wasabi_root_str = str(self._wasabi_root) - def requirements(self) -> Tuple[Requirement, ...]: return ( - _PrereqsRequirement(name="Prerequisites", oracle=self), - _PathsRequirement(name="Paths", oracle=self), + # Dependencies, toolchains, and third-party utilites + DependencyVersionRequirement( + name="git", + cmd=("git", "--version"), + required_version=(0, 0, 0), + compare=VersionCompare.GEQ, + timeout_seconds=5.0, + ), + DependencyVersionRequirement( + name="maven", + cmd=("mvn", "-v"), + required_version=(3, 6, 3), + compare=VersionCompare.GEQ, + version_regex=r"Apache Maven\s+([0-9.]+)", + timeout_seconds=5.0, + ), + DependencyVersionRequirement( + name="gradle", + cmd=("gradle", "-v"), + required_version=(4, 4, 1), + compare=VersionCompare.GEQ, + version_regex=r"Gradle\s+([0-9.]+)", + timeout_seconds=5.0, + ), + DependencyVersionRequirement( + name="ant", + cmd=("ant", "-version"), + required_version=(1, 10, 0), + compare=VersionCompare.GEQ, + version_regex=r"version\s+([0-9.]+)", + 
timeout_seconds=5.0, + ), + DependencyVersionRequirement( + name="python3", + cmd=("python3", "--version"), + required_version=(3, 10, 0), + compare=VersionCompare.GEQ, + version_regex=r"Python\s+([0-9.]+)", + timeout_seconds=5.0, + ), + DependencyVersionRequirement( + name="java", + cmd=("java", "-version"), + required_version=(1, 8, 0), + compare=VersionCompare.EQ, + version_regex=r'version\s+"([^"]+)"', + timeout_seconds=5.0, + ), + DependencyVersionRequirement( + name="tree", + cmd=("tree", "--version"), + required_version=(0, 0, 0), + compare=VersionCompare.GEQ, + optional=True, + timeout_seconds=5.0, + ), + + # Environment variables + EnvironmentVariableRequirement( + name="WASABI_ROOT_DIR matches expected", + env_var="WASABI_ROOT_DIR", + expected=str(self._wasabi_root.resolve().parent), + ), + FilesystemPathRequirement( + name="WASABI root directory exists", + path=self._wasabi_root, + path_type=PathType.DIRECTORY, + ), + EnvironmentVariableRequirement( + name="JAVA_HOME matches expected", + env_var="JAVA_HOME", + expected=self._JAVA_HOME, + ), + FilesystemPathRequirement( + name="JAVA_HOME directory exists", + path=Path(self._JAVA_HOME), + path_type=PathType.DIRECTORY, + ), + + # Directory structure and required exported configs + FilesystemPathRequirement( + name="benchmarks directory exists", + path=self._benchmarks_root, + path_type=PathType.DIRECTORY, + ), + FilesystemPathRequirement( + name="config directory exists", + path=self._wasabi_root / "config", + path_type=PathType.DIRECTORY, + ), + FilesystemPathRequirement( + name="utils directory exists", + path=self._wasabi_root / "utils", + path_type=PathType.DIRECTORY, + ), + FilesystemPathRequirement( + name="pom.xml exists", + path=self._wasabi_root / "pom.xml", + path_type=PathType.FILE, + ), + + # Required build/running scripts + FilesystemPathRequirement( + name="utils/prereqs.sh exists", + path=self._wasabi_root / "utils" / "prereqs.sh", + path_type=PathType.FILE, + ), + FilesystemPathRequirement( + name="utils/run.py exists", + path=self._wasabi_root / "utils" / "run.py", + path_type=PathType.FILE, + ), + FilesystemPathRequirement( + name="utils/display_bug_results.py exists", + path=self._wasabi_root / "utils" / "display_bug_results.py", + path_type=PathType.FILE, + ), + + # Required configuration files + FilesystemPathRequirement( + name="config/hadoop/example.conf exists", + path=self._wasabi_root / "config" / "hadoop" / "example.conf", + path_type=PathType.FILE, + ), + FilesystemPathRequirement( + name="config/hadoop/hadoop.conf exists", + path=self._wasabi_root / "config" / "hadoop" / "hadoop.conf", + path_type=PathType.FILE, + ), + FilesystemPathRequirement( + name="config/hadoop/pom-hadoop.xml exists", + path=self._wasabi_root / "config" / "hadoop" / "pom-hadoop.xml", + path_type=PathType.FILE, + ), ) - - def run_shell_command(self, cmd: Iterable[str]) -> Tuple[int, str, str]: - """ - Run a command and return (rc, stdout, stderr) tuple. - """ - try: - cp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - return cp.returncode, cp.stdout or "", cp.stderr or "" - except FileNotFoundError: - return 127, "", "" - - def parse_version_tuple(self, text: str) -> VersionTuple: - """ - Extract the first version-like token from arbitrary text. 
-        For example, for Java: '1.8.0_422' -> (1, 8, 0)
-        """
-        m = re.search(r"(\d+(?:\.\d+){0,3})", text)
-        return tuple(int(x) for x in m.group(1).split(".")) if m else ()
-
-    def extract_version(self, text: str, pattern: str) -> Tuple[VersionTuple, str]:
-        """
-        Apply regex pattern on a version string.
-        """
-        m = re.search(pattern, text, re.I)
-        if not m:
-            return (), "unknown"
-        ver_str = m.group(1)
-        return self.parse_version_tuple(ver_str), ver_str
-
-    def cmp_versions(self, found: VersionTuple, required: VersionTuple, mode: str) -> bool:
-        """
-        Compare versions either to match exactly ('eq')
-        or the installed version is greather than the reference one ('gte').
-        """
-        if not found:
-            return False
-        f, r = list(found), list(required)
-        while len(f) < len(r): f.append(0)
-        while len(r) < len(f): r.append(0)
-        return (f == r) if mode == "eq" else (f >= r)
-
-    def paths_check(self):
-        wasabi_root = os.environ.get("WASABI_ROOT_DIR", "")
-        if not (wasabi_root == self.expected_root_dir and Path(wasabi_root).exists()):
-            return False, "WASABI_ROOT_DIR incorrect"
-        java_home = os.environ.get("JAVA_HOME", "")
-        if not (java_home == self.expected_java_home and Path(java_home).exists()):
-            return False, "JAVA_HOME incorrect"
-        return True, ""
-
-    def check_dependency(self, dep: Dependency) -> Optional[str]:
-        """
-        Core method that checks whether a certain dependency of a version
-        equal or greather than that specified in the README is installed.
-        """
-        if shutil.which(dep.binary) is None:
-            return f"{dep.name} missing"
-
-
-        if dep.cmd is None and dep.parse_regex is None and dep.require is None:
-            return None
-
-        rc, out, err = self.run_shell_command(dep.cmd or [])
-        text = (out + "\n" + err).strip()
-
-        if dep.parse_regex and dep.require and dep.compare:
-            ver_tuple, ver_str = self.extract_version(text, dep.parse_regex)
-            if not ver_tuple:
-                return f"{dep.name} version unreadable"
-            ok = self.cmp_versions(ver_tuple, dep.require, dep.compare)
-            cmp_word = "==" if dep.compare == "eq" else ">="
-            want = ".".join(map(str, dep.require))
-            return None if ok else f"{dep.name} {cmp_word} {want} not met (got {ver_str})"
-
-        return f"{dep.name} check misconfigured"
-
-    def prereqs_check(self):
-        problems: list[str] = []
-        for dep in DEPENDENCIES:
-            msg = self.check_dependency(dep)
-            if msg:
-                problems.append(msg)
-        if problems:
-            return False, "; ".join(problems)
-        return True, ""
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_experiment_runs.py b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_experiment_runs.py
index e37e0d42..69ab851e 100644
--- a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_experiment_runs.py
+++ b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_experiment_runs.py
@@ -1,121 +1,94 @@
-from collections import defaultdict
-import os
-
-from utils import RESULTS_ROOT_DIR
-from utils import GROUND_TRUTH_FILE
-from utils import SIMILARITY_RATIO
-
-from utils import logger
-
-class OracleExperimentRuns:
-    def __init__(self):
-        pass
-
-    def get_benchmark_name(self, loc):
-        """
-        Classifies the location based on its prefix.
- """ - if loc.startswith("org.apache.hadoop.hdfs") and "SecondaryNameNode.doWork" not in loc: - return "hdfs" - elif loc.startswith("org.apache.hadoop.yarn"): - return "yarn" - elif loc.startswith("org.apache.hadoop.mapreduce") or loc.startswith("org.apache.hadoop.mapred"): - return "mapreduce" - elif loc.startswith("org.apache.hadoop.hbase"): - return "hbase" - elif loc.startswith("org.apache.hadoop.hive"): - return "hive" - elif loc.startswith("org.apache.cassandra"): - return "cassandra" - elif loc.startswith("org.apache.hadoop") or "SecondaryNameNode.doWork" in loc: # initialy found in hadoop-common, added here to match Table 3 - return "hadoop" - elif loc.startswith("org.elasticsearch"): - return "elasticsearch" - else: - return "unknown" - - def aggregate_bugs(self, root_dir): - """ - Searches for bug report files and aggregates bugs based on their type and - which application have been found in. - """ - bugs = defaultdict(lambda: defaultdict(set)) - unique = dict() - - for dirpath, _, files in os.walk(root_dir): - for file in files: - if file.endswith(".csv"): - file_path = os.path.join(dirpath, file) - - with open(file_path, 'r') as f: - for line in f: - if "how-bug" in line or "when-missing-" in line: - tokens = line.strip().split(",") - - bug_type = tokens[1] - bug_loc = tokens[2] - - key = bug_type + bug_loc - if key in unique: - continue - unique[key] = "x" - - benchmark = self.get_benchmark_name(bug_loc) - bugs[bug_type][benchmark].add(bug_loc) - - return bugs - - def get_ground_truth_bugs(self, file_path: str): - """ - Reads the ground truth values from a file into a dictionary. - """ - ground_truth = defaultdict(lambda: defaultdict(set)) - - try: - with open(file_path, 'r') as f: - for line in f: - tokens = line.strip().split(",") - benchmark = tokens[0] - bug_type = tokens[1] - retry_location = tokens[2] - ground_truth[bug_type][benchmark].add(retry_location) - except Exception: - logger.info(f"Cannot open {file_path} or file not present.") - - return ground_truth - - def count_bugs(self, bugs, ground_truth): - """ - Compares the total number of bugs found against the ground truth. 
- """ - total_ground_truth = 0 - total_found = 0 - - for bug_type, benchmarks in ground_truth.items(): - for benchmark, ground_truth_locations in benchmarks.items(): - total_ground_truth += len(ground_truth_locations) - bug_locations = bugs.get(bug_type, {}).get(benchmark, set()) - matching_locations = ground_truth_locations & bug_locations - total_found += len(matching_locations) - - if total_ground_truth == 0: - logger.info("No ground truth bugs available.") - return False - - coverage = total_found / total_ground_truth - logger.info(f"Found {total_found} out of {total_ground_truth} ground truth bugs ({coverage:.2%}).") - - passed = coverage >= SIMILARITY_RATIO - logger.info("Results reproduced: PASS" if passed else "Results reproduced: FAIL") - return passed - - - def run(self): - bugs = self.aggregate_bugs(str(RESULTS_ROOT_DIR)) - ground_truth = self.get_ground_truth_bugs(str(GROUND_TRUTH_FILE)) - passed = self.count_bugs(bugs, ground_truth) - - if passed: - return True - - return False \ No newline at end of file +from __future__ import annotations +from dataclasses import dataclass +from pathlib import Path +import csv + +from evaluator.utils import EntryConfig +from oracle_experiment_runs_primitives import ( + OracleExperimentRunsBase, + ElementwiseSimilarityThresholdRequirement, +) + + +@dataclass(frozen=True) +class _BugKey: + bug_type: str + benchmark: str + location: str + + +class OracleExperimentRuns(OracleExperimentRunsBase): + _ORACLE_NAME = "WasabiExperimentRuns" + + def __init__(self, *, config: EntryConfig, logger) -> None: + super().__init__(logger=logger) + self._config = config + self._results_root = config.results_paths["results_root"] + self._gt_file = config.ground_truth_paths["bugs_ground_truth"] + self._threshold = config.similarity_ratio + + self._prefix_map = config.metadata.get("benchmark_prefix_map", []) + self._contains_rules = config.metadata.get("benchmark_contains_rules", []) + self._glob = config.metadata.get("results_file_glob", "*.csv") + + def _classify_benchmark(self, loc: str) -> str: + for bench, needles in self._contains_rules: + if any(n in loc for n in needles): + return bench + for bench, prefixes in self._prefix_map: + if any(loc.startswith(p) for p in prefixes): + return bench + return "unknown" + + def _load_ground_truth(self) -> dict[tuple[str, str], set[str]]: + # key: (bug_type, benchmark) -> set(loc) + out: dict[tuple[str, str], set[str]] = {} + p = Path(self._gt_file) + with p.open() as f: + for line in f: + bench, bug_type, loc = line.strip().split(",", 2) + out.setdefault((bug_type, bench), set()).add(loc) + return out + + def _load_observed(self) -> dict[tuple[str, str], set[str]]: + out: dict[tuple[str, str], set[str]] = {} + root = Path(self._results_root) + for csv_path in root.rglob(self._glob): + with csv_path.open(newline="") as f: + reader = csv.reader(f) + for row in reader: + if not row: + continue + line = ",".join(row) + if ("how-bug" not in line) and ("when-missing-" not in line): + continue + bug_type = row[1] + bug_loc = row[2] + bench = self._classify_benchmark(bug_loc) + out.setdefault((bug_type, bench), set()).add(bug_loc) + return out + + def requirements(self): + gt = self._load_ground_truth() + obs = self._load_observed() + + # Stable ordering over ground truth bug IDs + buckets = sorted(gt.keys()) + + ref_counts = [] + matched_counts = [] + + for k in buckets: + gt_locs = gt[k] + obs_locs = obs.get(k, set()) + ref_counts.append(float(len(gt_locs))) + matched_counts.append(float(len(gt_locs & obs_locs))) + + return 
[ + ElementwiseSimilarityThresholdRequirement( + name="ground-truth-coverage-by-bucket", + observed=matched_counts, + reference=ref_counts, + threshold=self._threshold, + ), + ]