diff --git a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/main.py b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/main.py
index 95967ba7..fed5fbaa 100644
--- a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/main.py
+++ b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/main.py
@@ -1,23 +1,18 @@
#!/usr/bin/env python3
-"""Runs environment setup checks for WASABI."""
+"""Runs environment setup, build, benchmark prep, and experiment runs checks for WASABI."""
from __future__ import annotations
+
from pathlib import Path
from typing import Dict
import os
import sys
-
-_AGENT_EVAL_DIR = Path(__file__).resolve().parent
-_AGENT_SRC_DIR = _AGENT_EVAL_DIR.parents[3] / "src"
-sys.path.append(str(_AGENT_SRC_DIR))
-
-
from evaluator.utils import (
- EntryConfig,
- LoggerConfig,
- get_logger,
- record_result,
+ EntryConfig,
+ LoggerConfig,
+ get_logger,
+ record_result,
)
from oracle_artifact_build import OracleArtifactBuild
from oracle_env_setup import OracleEnvSetup
@@ -25,28 +20,111 @@
from oracle_experiment_runs import OracleExperimentRuns
-# NOTE: WASABI bundle layout mirrors the legacy constants, but we build it directly
-# from EntryConfig rather than importing legacy globals.
-_WASABI_HOME = Path.home() / "sosp24_wasabi"
-_WASABI_REPO = _WASABI_HOME / "wasabi"
-_WASABI_BENCH = _WASABI_HOME / "benchmarks"
-
-
-WASABI_CONFIG = EntryConfig(
- name = "sosp24-wasabi",
- home_dir = _WASABI_HOME,
- repository_paths = {
- "sosp24-wasabi": _WASABI_REPO,
- "benchmarks": _WASABI_BENCH,
- },
- results_paths = {
- "results_root": _WASABI_REPO / "results",
- },
- ground_truth_paths = {
- "bugs_ground_truth": _WASABI_REPO / "bugs_ground_truth.txt",
- },
- similarity_ratio = 0.75,
-)
+def _resolve_workspace_paths() -> tuple[Path, Path, Path]:
+ """Resolve and validate _agent_eval/ and wasabi/ locations.
+    This expects that either:
+ (1) _agent_eval/ and wasabi/ are located in the same root directory; or
+ (2) _AGENT_EVAL_DIR and _WASABI_HOME are set by the user
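+
+    Example (hypothetical paths) of overriding both locations via environment
+    variables before invoking this runner:
+        export _AGENT_EVAL_DIR=/opt/eval/sosp24_wasabi/_agent_eval
+        export _WASABI_HOME=/opt/eval/sosp24_wasabi
+        python3 main.py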
+ """
+ try:
+ env_agent_eval = os.environ.get("_AGENT_EVAL_DIR")
+ env_wasabi_home = os.environ.get("_WASABI_HOME")
+
+ if env_agent_eval:
+ agent_eval_dir = Path(env_agent_eval).expanduser().resolve()
+ else:
+ agent_eval_dir = Path(__file__).resolve().parent
+
+ if env_wasabi_home:
+ wasabi_home = Path(env_wasabi_home).expanduser().resolve()
+ else:
+ wasabi_home = agent_eval_dir.parent.resolve()
+
+ if not agent_eval_dir.exists() or not agent_eval_dir.is_dir():
+ raise RuntimeError(
+ f"Invalid _agent_eval dir: {agent_eval_dir}\n"
+ f"This runner expects _agent_eval/ and wasabi/ to be located in the same root directory.\n"
+ f"Set _AGENT_EVAL_DIR to the directory containing main.py if needed."
+ )
+
+ wasabi_repo_root = wasabi_home / "wasabi"
+ if not wasabi_repo_root.exists() or not wasabi_repo_root.is_dir():
+ raise RuntimeError(
+ f"Invalid WASABI workspace: {wasabi_home}\n"
+ f"Expected to find a 'wasabi/' directory at: {wasabi_repo_root}\n"
+ f"This runner expects _agent_eval/ and wasabi/ to be located in the same root directory.\n"
+ f"Set _WASABI_HOME to the workspace root if needed."
+ )
+
+ workspace_root = wasabi_home
+ return agent_eval_dir, wasabi_home, workspace_root
+
+ except OSError as exc:
+ raise RuntimeError(f"Failed to resolve workspace paths: {exc}") from exc
+
+
+def _build_configs(*, agent_eval_dir: Path, workspace_root: Path) -> EntryConfig:
+ """Constructs EntryConfig for the WASABI evaluation bundle from resolved paths."""
+ wasabi_repo = (workspace_root / "wasabi").resolve()
+ benchmarks_dir = (workspace_root / "benchmarks").resolve()
+
+ return EntryConfig(
+ name="sosp24-wasabi",
+ home_dir=workspace_root,
+ repository_paths={
+ "sosp24-wasabi": wasabi_repo,
+ "benchmarks": benchmarks_dir,
+ },
+ results_paths={
+ "results_root": wasabi_repo / "results",
+ },
+ ground_truth_paths={
+ "bugs_ground_truth": agent_eval_dir / "refs" / "bugs_ground_truth.csv",
+ },
+ similarity_ratio=0.75,
+ metadata={
+ "maven_repo_dir": Path.home() / ".m2" / "repository",
+ "weaving_plugin_signature": "aspectj-maven-plugin",
+ "primary_artifact": "edu.uchicago.cs.systems:wasabi",
+ "benchmarks": {
+ "hadoop": {
+ "repo_url": "https://github.com/apache/hadoop.git",
+ "commit": "60867de",
+ "pom_file": "pom.xml",
+ "pom_backup": "pom-original.xml",
+ },
+ "hbase": {
+ "repo_url": "https://github.com/apache/hbase.git",
+ "commit": "89ca7f4",
+ "pom_file": "pom.xml",
+ "pom_backup": "pom-original.xml",
+ },
+ "hive": {
+ "repo_url": "https://github.com/apache/hive.git",
+ "commit": "e08a600",
+ "pom_file": "pom.xml",
+ "pom_backup": "pom-original.xml",
+ },
+ },
+ "aspectj_markers": [
+ "ajc$preClinit",
+ "ajc$initFailureCause",
+ "ajc$tjp",
+ "ajc$before$",
+ "ajc$after$",
+ "ajc$around$",
+ "ajc$interField$",
+ "ajc$interMethod$",
+ "org.aspectj.runtime.reflect.Factory",
+ "org.aspectj.runtime.internal.AroundClosure",
+ "org.aspectj.lang.JoinPoint",
+ "org.aspectj.lang.JoinPoint$StaticPart",
+ "org.aspectj.lang.ProceedingJoinPoint",
+ "org.aspectj.lang.Signature",
+ "org.aspectj.lang.NoAspectBoundException",
+ ],
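+            # Classification tables consumed by OracleExperimentRuns; the values
+            # below mirror the legacy get_benchmark_name() rules. Order matters:
+            # contains-rules are checked first, then prefixes, most specific first.
+            "benchmark_contains_rules": [
+                ("hadoop", ["SecondaryNameNode.doWork"]),
+            ],
+            "benchmark_prefix_map": [
+                ("hdfs", ["org.apache.hadoop.hdfs"]),
+                ("yarn", ["org.apache.hadoop.yarn"]),
+                ("mapreduce", ["org.apache.hadoop.mapreduce", "org.apache.hadoop.mapred"]),
+                ("hbase", ["org.apache.hadoop.hbase"]),
+                ("hive", ["org.apache.hadoop.hive"]),
+                ("cassandra", ["org.apache.cassandra"]),
+                ("hadoop", ["org.apache.hadoop"]),
+                ("elasticsearch", ["org.elasticsearch"]),
+            ],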
+ },
+ )
def main(argv: list[str]) -> int:
@@ -56,27 +134,30 @@ def main(argv: list[str]) -> int:
score = 0
logger_name = os.environ.get("EVAL_LOGGER_NAME", "WASABI-AGENT-EVALUATOR")
- logger = get_logger(LoggerConfig(root_name = logger_name))
-
- env_checker = OracleEnvSetup(config = WASABI_CONFIG, logger = logger)
- score += record_result(
- logger, results, type(env_checker).__name__, env_checker.run(verbose = verbose)
- )
-
- build_checker = OracleArtifactBuild(config = WASABI_CONFIG, logger = logger)
- score += record_result(
- logger, results, type(build_checker).__name__, build_checker.run(verbose = verbose)
- )
-
- prep_checker = OracleBenchmarkPrep(config = WASABI_CONFIG, logger = logger)
- score += record_result(
- logger, results, type(prep_checker).__name__, prep_checker.run(verbose = verbose)
- )
-
- runs_checker = OracleExperimentRuns(config = WASABI_CONFIG, logger = logger)
- score += record_result(
- logger, results, type(runs_checker).__name__, runs_checker.run(verbose = verbose)
- )
+ logger = get_logger(LoggerConfig(root_name=logger_name))
+
+ try:
+ agent_eval_dir, _wasabi_home, workspace_root = _resolve_workspace_paths()
+ wasabi_config = _build_configs(agent_eval_dir=agent_eval_dir, workspace_root=workspace_root)
+ except RuntimeError as exc:
+ # Keep failure message clean and actionable
+ raise SystemExit(str(exc)) from exc
+
+ env_checker = OracleEnvSetup(config=wasabi_config, logger=logger)
+ env_ok = env_checker.run(verbose=verbose)
+ score += record_result(results, type(env_checker).__name__, env_ok)
+
+ build_checker = OracleArtifactBuild(config=wasabi_config, logger=logger)
+ build_ok = build_checker.run(verbose=verbose)
+ score += record_result(results, type(build_checker).__name__, build_ok)
+
+ prep_checker = OracleBenchmarkPrep(config=wasabi_config, logger=logger)
+ prep_ok = prep_checker.run(verbose=verbose)
+ score += record_result(results, type(prep_checker).__name__, prep_ok)
+
+ runs_checker = OracleExperimentRuns(config=wasabi_config, logger=logger)
+ runs_ok = runs_checker.run(verbose=verbose)
+ score += record_result(results, type(runs_checker).__name__, runs_ok)
logger.info("Agent scores: %s", results)
return score
diff --git a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_artifact_build.py b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_artifact_build.py
index 6bf39f2f..291d44f9 100644
--- a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_artifact_build.py
+++ b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_artifact_build.py
@@ -1,116 +1,242 @@
-#!/usr/bin/env python3
-import xml.etree.ElementTree as ET
-import fnmatch
+"""Artifact build oracle for Wasabi (SOSP'24).
-from utils import HOME
-from utils import REPO_DIR
-from utils import logger
+Validates:
+ - Required repository and metadata inputs are present
+ - Project structure is Maven-based and modules can be identified from POMs
+ - The configured primary module (WASABI) produces a compiled binary artifact
+ - The primary module (WASABI) is present both as a build output and as an installed Maven artifact
+"""
+
+import dataclasses
+import fnmatch
+import hashlib
+import logging
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
from evaluator.oracle_artifact_build_primitives import OracleArtifactBuildBase
+from evaluator.utils import EntryConfig
from evaluator import utils
+def _required_path(paths: Dict[str, Path], key: str, *, label: str) -> Path:
+ """Returns a required path from a mapping with a clear error."""
+ try:
+ p = paths[key]
+ except KeyError as e:
+ raise ValueError(f"Missing {label}[{key!r}] in EntryConfig") from e
+ return utils.to_path(p)
+
+
+def _required_meta(meta: Dict[str, Any], key: str, *, label: str) -> Any:
+ """Returns a required metadata value with a clear error."""
+ try:
+ return meta[key]
+ except KeyError as e:
+ raise ValueError(f"Missing {label}[{key!r}] in EntryConfig.metadata") from e
+
+
+def _sha256(path: Path) -> str:
+ h = hashlib.sha256()
+ with path.open("rb") as f:
+ for chunk in iter(lambda: f.read(1024 * 1024), b""):
+ h.update(chunk)
+ return h.hexdigest()
+
+
+def _pick_primary_jar(dir_path: Path, artifact_id: str,
+ version: str) -> Path | None:
+ """
+ Picks a "primary" jar from a directory by matching artifactId/version while
+ excluding common auxiliary jars (e.g., sources/javadoc/tests/original-*).
+ """
+ if not dir_path.is_dir():
+ return None
+
+ bad_tokens = ("-sources", "-javadoc", "-tests", "original-")
+ pattern = f"{artifact_id}-{version}*.jar"
+ cands = [
+ p for p in dir_path.glob("*.jar")
+ if p.is_file() and fnmatch.fnmatch(p.name, pattern) and not any(
+ tok in p.name for tok in bad_tokens)
+ ]
+ if not cands:
+ return None
+
+ # Prefer newest (best-effort)
+ return max(cands, key=lambda p: p.stat().st_mtime)
+
+
+def _strip_ns(tag: str) -> str:
+ return tag.split("}", 1)[-1]
+
+
@dataclasses.dataclass(frozen=True, slots=True)
-class _BuildInputsRequirement(utils.BaseRequirement):
+class _BuildInputsRequirement:
+ name: str
oracle: "OracleArtifactBuild"
+ optional: bool = False
- def check(self, ctx: object) -> utils.CheckResult:
- del ctx
-
- if not REPO_DIR.exists():
- logger.info("Build: FAIL - base project directory not found")
- return utils.CheckResult.failure("base project directory not found")
+ def check(self, ctx) -> utils.CheckResult:
+ repo_dir = self.oracle.repo_dir
+ if not repo_dir.exists() or not repo_dir.is_dir():
+ ctx.logger.info("Build: FAIL - base project directory not found")
+ return utils.CheckResult.failure("base project directory not found",
+ cwd=repo_dir)
- poms = self.oracle.find_poms(REPO_DIR)
+ poms = self.oracle.find_poms(repo_dir)
if not poms:
- logger.info("Build: FAIL - no pom.xml files found under wasabi-testing")
- return utils.CheckResult.failure("no pom.xml files found under wasabi-testing")
+ ctx.logger.info("Build: FAIL - no pom.xml files found under repo")
+ return utils.CheckResult.failure("no pom.xml files found under repo",
+ cwd=repo_dir)
- root_pom = REPO_DIR / "pom.xml"
- top_defaults = {}
+ root_pom = repo_dir / "pom.xml"
+ top_defaults: Dict[str, str] = {}
if root_pom.exists():
- root_mod = self.oracle.parse_pom(root_pom)
+ root_mod = self.oracle.parse_pom(root_pom, top_defaults=None)
if not root_mod.get("error"):
if root_mod.get("groupId"):
top_defaults["groupId"] = root_mod["groupId"]
if root_mod.get("version"):
top_defaults["version"] = root_mod["version"]
- modules = []
- errors = []
+ modules: List[Dict[str, Any]] = []
+ errors: List[Tuple[Path, str]] = []
for pom in poms:
m = self.oracle.parse_pom(pom, top_defaults=top_defaults)
if m.get("error"):
errors.append((pom, m["error"]))
continue
if not all([m.get("artifactId"), m.get("groupId"), m.get("version")]):
- errors.append((pom, "missing groupId/artifactId/version after inheritance"))
+ errors.append(
+ (pom, "missing groupId/artifactId/version after inheritance"))
else:
modules.append(m)
if errors:
- logger.info("Build: FAIL - POM parsing errors present")
+ ctx.logger.info("Build: FAIL - POM parsing errors present")
for pom, err in errors[:5]:
- logger.info(f" - {pom}: {err}")
+ ctx.logger.info(f" - {pom}: {err}")
if len(errors) > 5:
- logger.info(f" ... {len(errors)-5} more")
- return utils.CheckResult.failure("POM parsing errors present")
+ ctx.logger.info(f" ... {len(errors)-5} more")
+ return utils.CheckResult.failure("POM parsing errors present",
+ cwd=repo_dir)
self.oracle._modules = modules
- return utils.CheckResult.success()
+ return utils.CheckResult.success(cwd=repo_dir)
@dataclasses.dataclass(frozen=True, slots=True)
-class _CodeBuildRequirement(utils.BaseRequirement):
+class _PrimaryModuleBuildRequirement:
+ name: str
oracle: "OracleArtifactBuild"
+ optional: bool = False
- def check(self, ctx: object) -> utils.CheckResult:
- del ctx
-
+ def check(self, ctx) -> utils.CheckResult:
modules = getattr(self.oracle, "_modules", None)
if not modules:
- return utils.CheckResult.success()
-
- missing_targets = []
- missing_installs = []
-
+ return utils.CheckResult.failure("modules not initialized",
+ cwd=self.oracle.repo_dir)
+
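+        # Selector accepts "groupId:artifactId" or a bare "artifactId"
+        # (e.g. the configured "edu.uchicago.cs.systems:wasabi").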
+ selector = self.oracle.primary_artifact_selector.strip()
+ if ":" in selector:
+ want_gid, want_aid = selector.split(":", 1)
+ want_gid = want_gid.strip()
+ want_aid = want_aid.strip()
+ else:
+ want_gid, want_aid = "", selector.strip()
+
+ chosen = None
for m in modules:
- if not self.oracle.has_target_jar(m):
- missing_targets.append(str(m["dir"]))
- if not self.oracle.has_installed_artifact(m):
- missing_installs.append(f"{m['groupId']}:{m['artifactId']}:{m['version']}")
-
- if missing_targets or missing_installs:
- logger.info("Code build: FAIL")
- if missing_targets:
- logger.info(" Missing built JARs in target/:")
- for d in missing_targets[:10]:
- logger.info(f" - {d}")
- if len(missing_targets) > 10:
- logger.info(f" ... {len(missing_targets)-10} more")
- if missing_installs:
- logger.info(" Missing artifacts in local ~/.m2 repository:")
- for gav in missing_installs[:10]:
- logger.info(f" - {gav}")
- if len(missing_installs) > 10:
- logger.info(f" ... {len(missing_installs)-10} more")
-
- return utils.CheckResult.failure("missing built jars and/or installed artifacts")
-
- logger.info("Code build: PASS")
- return utils.CheckResult.success()
+ gid = (m.get("groupId") or "").strip()
+ aid = (m.get("artifactId") or "").strip()
+ if not aid:
+ continue
+ if want_gid:
+ if gid == want_gid and aid == want_aid:
+ chosen = m
+ break
+ else:
+ if aid == want_aid:
+ chosen = m
+ break
+
+ if not chosen:
+ return utils.CheckResult.failure(
+ f"primary module not found for selector {selector!r}",
+ cwd=self.oracle.repo_dir,
+ )
+
+ packaging = (chosen.get("packaging") or "jar").strip()
+ if packaging == "pom":
+ ctx.logger.info("Code build: FAIL")
+ return utils.CheckResult.failure(
+ "primary module resolved to packaging=pom", cwd=Path(chosen["dir"]))
+
+ gid = (chosen.get("groupId") or "").strip()
+ aid = (chosen.get("artifactId") or "").strip()
+ ver = (chosen.get("version") or "").strip()
+ module_dir = Path(chosen["dir"])
+
+ if not gid or not aid or not ver:
+ return utils.CheckResult.failure(
+ "primary module missing groupId/artifactId/version after inheritance",
+ cwd=module_dir,
+ )
+
+ built = _pick_primary_jar(module_dir / "target", aid, ver)
+ installed_dir = self.oracle.repo_path(gid, aid, ver)
+ installed = _pick_primary_jar(installed_dir, aid, ver)
+
+ if not built or not installed:
+ ctx.logger.info("Code build: FAIL")
+ if not built:
+ ctx.logger.info(" Missing built JARs in target/:")
+ ctx.logger.info(f" - {module_dir}")
+ if not installed:
+ ctx.logger.info(" Missing artifacts in local Maven repository:")
+ ctx.logger.info(f" - {gid}:{aid}:{ver}")
+ return utils.CheckResult.failure(
+ "missing built jar and/or installed artifact", cwd=module_dir)
+
+ hb = _sha256(built)
+ hi = _sha256(installed)
+ if hb != hi:
+ ctx.logger.info("Code build: FAIL")
+ detail = f"built={built} sha256={hb}\ninstalled={installed} sha256={hi}"
+ return utils.CheckResult.failure(
+ "primary artifact mismatch: target/ jar does not match local Maven repo jar",
+ stdout=utils.truncate_text(detail, utils.DEFAULT_MAX_CAPTURE_CHARS),
+ cwd=module_dir,
+ )
+
+ ctx.logger.info("Code build: PASS")
+ return utils.CheckResult.success(cwd=module_dir)
class OracleArtifactBuild(OracleArtifactBuildBase):
- def __init__(self, *, logger=logger):
+
+ def __init__(self, *, config: EntryConfig, logger: logging.Logger):
super().__init__(logger=logger)
- self.maven_packages_dir = HOME / ".m2" / "repository"
+ self._config = config
+
+ self.repo_dir = _required_path(config.repository_paths,
+ "sosp24-wasabi",
+ label="repository_paths").resolve()
+
+ meta: Dict[str, Any] = getattr(config, "metadata", {}) or {}
+ self.maven_packages_dir = utils.to_path(
+ _required_meta(meta, "maven_repo_dir", label="metadata")).resolve()
+ self.primary_artifact_selector = str(
+ _required_meta(meta, "primary_artifact", label="metadata"))
+
self._modules = None
def requirements(self):
return (
- _BuildInputsRequirement(name="Build", oracle=self),
- _CodeBuildRequirement(name="Code build", oracle=self),
+ _BuildInputsRequirement(name="Build", oracle=self),
+ _PrimaryModuleBuildRequirement(name="Code build", oracle=self),
)
def xget(self, elem, tag):
@@ -136,14 +262,23 @@ def parse_pom(self, pom_path, top_defaults=None):
tree = ET.parse(pom_path)
root = tree.getroot()
except Exception as e:
- return {"dir": pom_path.parent, "pom": pom_path, "error": f"XML parse error: {e}"}
+ return {
+ "dir": pom_path.parent,
+ "pom": pom_path,
+ "error": f"XML parse error: {e}"
+ }
artifactId = self.xget(root, "artifactId")
groupId = self.xget(root, "groupId")
version = self.xget(root, "version")
packaging = self.xget(root, "packaging") or "jar"
- parent = root.find("parent")
+ parent = None
+ for c in list(root):
+ if _strip_ns(c.tag) == "parent":
+ parent = c
+ break
+
if parent is not None:
p_groupId = self.xget(parent, "groupId")
p_version = self.xget(parent, "version")
@@ -157,12 +292,12 @@ def parse_pom(self, pom_path, top_defaults=None):
version = version or top_defaults.get("version")
return {
- "dir": pom_path.parent,
- "pom": pom_path,
- "groupId": groupId,
- "artifactId": artifactId,
- "version": version,
- "packaging": packaging
+ "dir": pom_path.parent,
+ "pom": pom_path,
+ "groupId": groupId,
+ "artifactId": artifactId,
+ "version": version,
+ "packaging": packaging
}
def find_poms(self, base):
@@ -171,20 +306,3 @@ def find_poms(self, base):
def repo_path(self, groupId, artifactId, version):
parts = groupId.split(".")
return self.maven_packages_dir.joinpath(*parts, artifactId, version)
-
- def has_target_jar(self, module):
- if module["packaging"] == "pom":
- return True # no jar expected
- target = module["dir"] / "target"
- if not target.is_dir():
- return False
- pattern = f"{module['artifactId']}-{module['version']}*.jar"
- return any(fnmatch.fnmatch(p.name, pattern) for p in target.glob("*.jar"))
-
- def has_installed_artifact(self, module):
- rp = self.repo_path(module["groupId"], module["artifactId"], module["version"])
- if module["packaging"] == "pom":
- return (rp / f"{module['artifactId']}-{module['version']}.pom").is_file()
- return any(p.suffix == ".jar" and fnmatch.fnmatch(
- p.name, f"{module['artifactId']}-{module['version']}*.jar")
- for p in rp.glob("*.jar"))
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_benchmark_prep.py b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_benchmark_prep.py
index 96f19eef..7fdadc48 100644
--- a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_benchmark_prep.py
+++ b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_benchmark_prep.py
@@ -1,115 +1,157 @@
-#!/usr/bin/env python3
-import sys
-import shlex
+"""Experiment runs oracle for Wasabi (SOSP'24).
+
+Validates:
+ - Required benchmark repositories and application directories are present
+ - Each benchmark is checked out at the expected commit
+ - Benchmark build configuration enables the expected weaving/instrumentation plugin
+ - Compiled outputs contain expected AspectJ weaving markers
+"""
+
import subprocess
from dataclasses import dataclass
from pathlib import Path
+from typing import Any, Dict, List
-from utils import BENCH_DIR
-from utils import logger
-
-from evaluator.utils import EntryConfig
from evaluator.oracle_benchmark_prep_primitives import (
- OracleBenchmarkPrepBase,
- Requirement,
+ OracleBenchmarkPrepBase,
+ BenchmarkRequirement,
)
from evaluator import utils
-
-REPOS = {
- "hadoop": ("https://github.com/apache/hadoop.git", "60867de"),
- "hbase": ("https://github.com/apache/hbase.git", "89ca7f4"),
- "hive": ("https://github.com/apache/hive.git", "e08a600"),
-}
-
-ASPECTJ_MARKERS = [
- "ajc$preClinit",
- "ajc$initFailureCause",
- "ajc$tjp",
- "ajc$before$",
- "ajc$after$",
- "ajc$around$",
- "ajc$interField$",
- "ajc$interMethod$",
- "org.aspectj.runtime.reflect.Factory",
- "org.aspectj.runtime.internal.AroundClosure",
- "org.aspectj.lang.JoinPoint",
- "org.aspectj.lang.JoinPoint$StaticPart",
- "org.aspectj.lang.ProceedingJoinPoint",
- "org.aspectj.lang.Signature",
- "org.aspectj.lang.NoAspectBoundException",
-]
-
-
def _required_path(paths, key: str, *, label: str) -> Path:
"""Returns a required path from a mapping with a clear error."""
try:
- return paths[key]
+ return utils.to_path(paths[key])
except KeyError as e:
raise ValueError(f"Missing {label}[{key!r}] in EntryConfig") from e
-@dataclass(frozen=True, slots=True)
-class _RepoCommitRequirement(utils.BaseRequirement):
- oracle: "OracleBenchmarkPrep"
- app: str
- app_root: Path
- expected_commit_prefix: str
+def _required_meta(meta: Dict[str, Any], key: str, *, label: str) -> Any:
+ """Returns a required metadata value with a clear error."""
+ try:
+ return meta[key]
+ except KeyError as e:
+ raise ValueError(f"Missing {label}[{key!r}] in EntryConfig.metadata") from e
- def check(self, ctx) -> utils.CheckResult:
- ok, msg = self.oracle.check_repo_commit(self.app, self.app_root, self.expected_commit_prefix)
- ctx.logger.info(msg)
- return utils.CheckResult.success() if ok else utils.CheckResult.failure(msg)
+
+def _as_dict(x: Any) -> Dict[str, Any]:
+ if isinstance(x, dict):
+ return x
+ raise ValueError(f"Expected dict in EntryConfig.metadata, got: {type(x)!r}")
+
+
+def _as_list_str(x: Any) -> List[str]:
+ if isinstance(x, list) and all(isinstance(v, str) for v in x):
+ return x
+ raise ValueError("Expected list[str] in EntryConfig.metadata")
@dataclass(frozen=True, slots=True)
-class _WeavingRequirement(utils.BaseRequirement):
+class _WeavingRequirement:
+ name: str
oracle: "OracleBenchmarkPrep"
app: str
app_root: Path
+ optional: bool = False
def check(self, ctx) -> utils.CheckResult:
ok, msg = self.oracle.check_app_weaving(self.app, self.app_root)
ctx.logger.info(msg)
- return utils.CheckResult.success() if ok else utils.CheckResult.failure(msg)
+ return utils.CheckResult.success(
+ cwd=self.app_root) if ok else utils.CheckResult.failure(
+ msg, cwd=self.app_root)
class OracleBenchmarkPrep(OracleBenchmarkPrepBase):
- def __init__(self, *, config: EntryConfig, logger: logger.__class__):
- super().__init__(logger = logger)
+ def __init__(self, *, config: utils.EntryConfig, logger):
+ super().__init__(logger=logger)
self._config = config
- self.max_class_dirs = 200
- self.max_classess_per_dir = 2000
+ meta = _as_dict(getattr(config, "metadata", {}) or {})
+
+ self._bench_specs = _as_dict(
+ _required_meta(meta, "benchmarks", label="metadata"))
+ self._weaving_plugin_sig = str(
+ _required_meta(meta, "weaving_plugin_signature", label="metadata"))
+ self._aspectj_markers = _as_list_str(
+ _required_meta(meta, "aspectj_markers", label="metadata"))
- def requirements(self) -> tuple[Requirement, ...]:
- bench_root = _required_path(self._config.repository_paths, "benchmarks", label="repository_paths")
+    # Bounds on how many class directories, and .class files per directory, are scanned for instrumentation markers
+ self.max_class_dirs = int(meta.get("max_class_dirs", 200))
+ self.max_classess_per_dir = int(meta.get("max_classess_per_dir", 2000))
+
+ def requirements(self) -> tuple[object, ...]:
+ bench_root = _required_path(self._config.repository_paths,
+ "benchmarks",
+ label="repository_paths")
+ wasabi_root = _required_path(self._config.repository_paths,
+ "sosp24-wasabi",
+ label="repository_paths")
+
+ reqs: List[object] = []
+
+ for app in sorted(self._bench_specs.keys()):
+ spec = _as_dict(self._bench_specs[app])
- reqs: list[Requirement] = []
- for app in REPOS:
app_root = bench_root / app
+ expected_commit = str(
+ _required_meta(spec, "commit", label=f"metadata.benchmarks[{app}]"))
+ pom_file = str(
+ _required_meta(spec, "pom_file", label=f"metadata.benchmarks[{app}]"))
+ pom_backup = str(
+ _required_meta(spec,
+ "pom_backup",
+ label=f"metadata.benchmarks[{app}]"))
+
+ reqs.append(
+ BenchmarkRequirement(
+ name=f"{app}: clone",
+ filepath=app_root,
+ cmd=["git", "-C", str(app_root), "rev-parse", "HEAD"],
+ signature=expected_commit,
+ timeout_seconds=10.0,
+ ))
- expected_commit = REPOS[app][1]
reqs.append(
- _RepoCommitRequirement(
- name = f"{app}: clone",
- oracle = self,
- app = app,
- app_root = app_root,
- expected_commit_prefix = expected_commit,
- )
- )
+ BenchmarkRequirement(
+ name=f"{app}: pom swap",
+ filepath=app_root,
+ cmd=[
+ "bash",
+ "-lc",
+ ("set -euo pipefail; "
+ f"test -f {pom_file}; "
+ f"test -f {pom_backup}; "
+ f"! cmp -s {pom_file} {pom_backup}; "
+ f"grep -a -F -q {self._weaving_plugin_sig} {pom_file}; "
+ "echo POM_SWAP_OK"),
+ ],
+ signature="POM_SWAP_OK",
+ timeout_seconds=10.0,
+ use_shell=False,
+ ))
reqs.append(
- _WeavingRequirement(
- name = f"{app}: weaving",
- oracle = self,
- app = app,
- app_root = app_root,
- )
- )
+ BenchmarkRequirement(
+ name=f"{app}: weaving config",
+ filepath=app_root,
+ cmd=[
+ "cat",
+ "pom.xml",
+ ],
+ signature=self._weaving_plugin_sig,
+ timeout_seconds=120.0,
+ ))
+
+ reqs.append(
+ _WeavingRequirement(
+ name=f"{app}: weaving",
+ oracle=self,
+ app=app,
+ app_root=app_root,
+ ))
return tuple(reqs)
@@ -118,7 +160,10 @@ def run_shell_command(self, cmd):
Run a bash command given as argument.
"""
try:
- cp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+ cp = subprocess.run(cmd,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ text=True)
return cp.returncode, (cp.stdout or "").strip(), (cp.stderr or "").strip()
except FileNotFoundError as e:
return 127, "", str(e)
@@ -127,16 +172,13 @@ def find_class_dirs(self, app_root: Path):
"""
Find directories that contain .class files.
"""
- qroot = shlex.quote(str(app_root))
cmd = [
- "bash",
- "-lc",
- (
- f"shopt -s nullglob; "
- f"find {qroot} -type f -name '*.class' "
- f"-not -path '*/.git/*' -not -path '*/.m2/*' -not -path '*/.gradle/*' "
- f"-printf '%h\n' | sort -u"
- ),
+ "bash",
+ "-lc",
+ (f"shopt -s nullglob; "
+ f"find {app_root} -type f -name '*.class' "
+ f"-not -path '*/.git/*' -not -path '*/.m2/*' -not -path '*/.gradle/*' "
+ f"-printf '%h\n' | sort -u"),
]
rc, out, err = self.run_shell_command(cmd)
if rc != 0:
@@ -149,8 +191,10 @@ def iter_class_files(self, classes_dir: Path, limit: int):
Iterate over .class files from a class directory, processing up to
a configurable number of files.
"""
- q = shlex.quote(str(classes_dir))
- cmd = ["bash", "-lc", f"shopt -s nullglob; find {q} -type f -name '*.class' | sort"]
+ cmd = [
+ "bash", "-lc",
+ f"shopt -s nullglob; find {classes_dir} -type f -name '*.class' | sort"
+ ]
rc, out, err = self.run_shell_command(cmd)
if rc != 0 or not out:
return []
@@ -160,33 +204,15 @@ def iter_class_files(self, classes_dir: Path, limit: int):
files = files[::step][:limit]
return files
- def check_repo_commit(self, app: str, app_root: Path, expected_commit_prefix: str):
- """
- Verify the repo at app_root is a git repo and HEAD matches an expected commit ID prefix.
- """
- if not app_root.is_dir():
- return False, f"{app}: FAIL (clone) - directory not found: {app_root}"
-
- rc, out, err = self.run_shell_command(["git", "-C", str(app_root), "rev-parse", "HEAD"])
- if rc != 0:
- return False, f"{app}: FAIL (clone) - not a git repo or unreadable HEAD: {err or out}"
-
- head = (out or "").strip()
- if head.startswith(expected_commit_prefix):
- return True, f"{app}: PASS (clone) - commit {head[:12]} matches {expected_commit_prefix}"
- else:
- return False, f"{app}: FAIL (clone) - HEAD {head[:12]} != expected {expected_commit_prefix}*"
-
-
def classfile_has_aspect_markers(self, class_path: Path):
"""
Search through a decoded .class for AspectJ markers.
"""
- pattern = "|".join(ASPECTJ_MARKERS)
- cmd = ["bash", "-lc", f"strings {shlex.quote(str(class_path))} | grep -a -E '{pattern}' -m 1"]
+    # Single-quote each marker so bash does not expand `$` in names like `ajc$tjp`.
+    e_args = " ".join(f"-e '{m}'" for m in self._aspectj_markers)
+    cmd = ["bash", "-lc", f"strings '{class_path}' | grep -a -F -m 1 {e_args}"]
rc, out, err = self.run_shell_command(cmd)
if rc == 0 and out:
- matched = next((m for m in ASPECTJ_MARKERS if m in out), out)
+ matched = next((m for m in self._aspectj_markers if m in out), out)
return True, matched
return False, ""
@@ -203,7 +229,9 @@ def check_app_weaving(self, app: str, app_root: Path):
if not class_dirs:
return False, f"{app}: FAIL (waving) - no compiled .class files found under {app_root}"
- dirs = class_dirs[:self.max_class_dirs] if (self.max_class_dirs and len(class_dirs) > self.max_class_dirs) else class_dirs
+ dirs = class_dirs[:self.max_class_dirs] if (
+ self.max_class_dirs and
+ len(class_dirs) > self.max_class_dirs) else class_dirs
for cdir in dirs:
for cf in self.iter_class_files(cdir, self.max_classess_per_dir):
diff --git a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_env_setup.py b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_env_setup.py
index 4c6016e2..8d23fce8 100644
--- a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_env_setup.py
+++ b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_env_setup.py
@@ -1,185 +1,180 @@
-#!/usr/bin/env python3
-import os
-import re
-import shutil
-import subprocess
-from dataclasses import dataclass
-from typing import Iterable, Optional, Tuple
-from pathlib import Path
-
-from utils import REPO_DIR
-from utils import logger as _default_logger
-
-from evaluator.oracle_env_setup_primitives import OracleEnvSetupBase, Requirement
-from evaluator import utils
-
-
-VersionTuple = Tuple[int, ...]
-
-
-@dataclass(frozen=True)
-class Dependency:
- name: str
- binary: str
- cmd: Optional[list] = None
- parse_regex: Optional[str] = None
- require: Optional[VersionTuple] = None
- compare: Optional[str] = None
+"""Environment setup oracle for Wasabi (SOSP'24).
+Validates:
+ - The required dependencies and other prerequisites are correctly installed
+ - The workspace and key environment variables are set up correctly
+ - Expected directories, directory layout and configuration files are present
+"""
-DEPENDENCIES: list[Dependency] = [
+from __future__ import annotations
- Dependency(
- name="git", binary="git"
- ),
-
- Dependency(
- name="maven", binary="mvn",
- cmd=["mvn", "-v"], parse_regex=r"Apache Maven\s+([0-9.]+)",
- require=(3, 6, 3), compare="gte",
- ),
- Dependency(
- name="gradle", binary="gradle",
- cmd=["gradle", "-v"], parse_regex=r"Gradle\s+([0-9.]+)",
- require=(4, 4, 1), compare="gte",
- ),
- Dependency(
- name="ant", binary="ant",
- cmd=["ant", "-version"], parse_regex=r"version\s+([0-9.]+)",
- require=(1, 10), compare="gte",
- ),
- Dependency(
- name="python3", binary="python3",
- cmd=["python3", "--version"], parse_regex=r"Python\s+([0-9.]+)",
- require=(3, 10), compare="gte",
- ),
- Dependency(
- name="java", binary="java",
- cmd=["java", "-version"], parse_regex=r'version\s+"([^"]+)"',
- require=(1, 8), compare="eq",
- ),
-]
-
-
-@dataclass(frozen=True, slots=True)
-class _PrereqsRequirement(utils.BaseRequirement):
- oracle: "OracleEnvSetup"
-
- def check(self, ctx: object) -> utils.CheckResult:
- del ctx
- ok, why = self.oracle.prereqs_check()
- if ok:
- return utils.CheckResult.success()
- return utils.CheckResult.failure(why or "Prerequisites failed")
-
-
-@dataclass(frozen=True, slots=True)
-class _PathsRequirement(utils.BaseRequirement):
- oracle: "OracleEnvSetup"
+import logging
+from pathlib import Path
+from typing import Sequence
- def check(self, ctx: object) -> utils.CheckResult:
- del ctx
- ok, why = self.oracle.paths_check()
- if ok:
- return utils.CheckResult.success()
- return utils.CheckResult.failure(why or "Paths failed")
+from evaluator import utils
+from evaluator.oracle_env_setup_primitives import (
+ DependencyVersionRequirement,
+ EnvironmentVariableRequirement,
+ FilesystemPathRequirement,
+ OracleEnvSetupBase,
+ PathType,
+ VersionCompare,
+)
class OracleEnvSetup(OracleEnvSetupBase):
+ """WASABI environment setup oracle."""
+
+ _JAVA_HOME = "/usr/lib/jvm/java-8-openjdk-amd64/jre" # Check for Java 1.8
- def __init__(self, *, logger=_default_logger) -> None:
+ def __init__(self,
+ *,
+ config: utils.EntryConfig,
+                 logger: logging.Logger) -> None:
super().__init__(logger=logger)
+ self._config = config
+ self._wasabi_root = Path(
+ self._config.repository_paths[self._config.name]).resolve()
+ self._benchmarks_root = Path(
+ self._config.repository_paths["benchmarks"]).resolve()
- self.expected_root_dir = REPO_DIR
- self.expected_java_home = "/usr/lib/jvm/java-8-openjdk-amd64/jre"
+ def requirements(self) -> Sequence[utils.BaseRequirement]:
+ wasabi_root_str = str(self._wasabi_root)
- def requirements(self) -> Tuple[Requirement, ...]:
return (
- _PrereqsRequirement(name="Prerequisites", oracle=self),
- _PathsRequirement(name="Paths", oracle=self),
+            # Dependencies, toolchains, and third-party utilities
+ DependencyVersionRequirement(
+ name="git",
+ cmd=("git", "--version"),
+ required_version=(0, 0, 0),
+ compare=VersionCompare.GEQ,
+ timeout_seconds=5.0,
+ ),
+ DependencyVersionRequirement(
+ name="maven",
+ cmd=("mvn", "-v"),
+ required_version=(3, 6, 3),
+ compare=VersionCompare.GEQ,
+ version_regex=r"Apache Maven\s+([0-9.]+)",
+ timeout_seconds=5.0,
+ ),
+ DependencyVersionRequirement(
+ name="gradle",
+ cmd=("gradle", "-v"),
+ required_version=(4, 4, 1),
+ compare=VersionCompare.GEQ,
+ version_regex=r"Gradle\s+([0-9.]+)",
+ timeout_seconds=5.0,
+ ),
+ DependencyVersionRequirement(
+ name="ant",
+ cmd=("ant", "-version"),
+ required_version=(1, 10, 0),
+ compare=VersionCompare.GEQ,
+ version_regex=r"version\s+([0-9.]+)",
+ timeout_seconds=5.0,
+ ),
+ DependencyVersionRequirement(
+ name="python3",
+ cmd=("python3", "--version"),
+ required_version=(3, 10, 0),
+ compare=VersionCompare.GEQ,
+ version_regex=r"Python\s+([0-9.]+)",
+ timeout_seconds=5.0,
+ ),
+ DependencyVersionRequirement(
+ name="java",
+ cmd=("java", "-version"),
+ required_version=(1, 8, 0),
+ compare=VersionCompare.EQ,
+ version_regex=r'version\s+"([^"]+)"',
+ timeout_seconds=5.0,
+ ),
+ DependencyVersionRequirement(
+ name="tree",
+ cmd=("tree", "--version"),
+ required_version=(0, 0, 0),
+ compare=VersionCompare.GEQ,
+ optional=True,
+ timeout_seconds=5.0,
+ ),
+
+ # Environment variables
+ EnvironmentVariableRequirement(
+ name="WASABI_ROOT_DIR matches expected",
+ env_var="WASABI_ROOT_DIR",
+                expected=wasabi_root_str,
+ ),
+ FilesystemPathRequirement(
+ name="WASABI root directory exists",
+ path=self._wasabi_root,
+ path_type=PathType.DIRECTORY,
+ ),
+ EnvironmentVariableRequirement(
+ name="JAVA_HOME matches expected",
+ env_var="JAVA_HOME",
+ expected=self._JAVA_HOME,
+ ),
+ FilesystemPathRequirement(
+ name="JAVA_HOME directory exists",
+ path=Path(self._JAVA_HOME),
+ path_type=PathType.DIRECTORY,
+ ),
+
+ # Directory structure and required exported configs
+ FilesystemPathRequirement(
+ name="benchmarks directory exists",
+ path=self._benchmarks_root,
+ path_type=PathType.DIRECTORY,
+ ),
+ FilesystemPathRequirement(
+ name="config directory exists",
+ path=self._wasabi_root / "config",
+ path_type=PathType.DIRECTORY,
+ ),
+ FilesystemPathRequirement(
+ name="utils directory exists",
+ path=self._wasabi_root / "utils",
+ path_type=PathType.DIRECTORY,
+ ),
+ FilesystemPathRequirement(
+ name="pom.xml exists",
+ path=self._wasabi_root / "pom.xml",
+ path_type=PathType.FILE,
+ ),
+
+ # Required build/running scripts
+ FilesystemPathRequirement(
+ name="utils/prereqs.sh exists",
+ path=self._wasabi_root / "utils" / "prereqs.sh",
+ path_type=PathType.FILE,
+ ),
+ FilesystemPathRequirement(
+ name="utils/run.py exists",
+ path=self._wasabi_root / "utils" / "run.py",
+ path_type=PathType.FILE,
+ ),
+ FilesystemPathRequirement(
+ name="utils/display_bug_results.py exists",
+ path=self._wasabi_root / "utils" / "display_bug_results.py",
+ path_type=PathType.FILE,
+ ),
+
+ # Required configuration files
+ FilesystemPathRequirement(
+ name="config/hadoop/example.conf exists",
+ path=self._wasabi_root / "config" / "hadoop" / "example.conf",
+ path_type=PathType.FILE,
+ ),
+ FilesystemPathRequirement(
+ name="config/hadoop/hadoop.conf exists",
+ path=self._wasabi_root / "config" / "hadoop" / "hadoop.conf",
+ path_type=PathType.FILE,
+ ),
+ FilesystemPathRequirement(
+ name="config/hadoop/pom-hadoop.xml exists",
+ path=self._wasabi_root / "config" / "hadoop" / "pom-hadoop.xml",
+ path_type=PathType.FILE,
+ ),
)
-
- def run_shell_command(self, cmd: Iterable[str]) -> Tuple[int, str, str]:
- """
- Run a command and return (rc, stdout, stderr) tuple.
- """
- try:
- cp = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
- return cp.returncode, cp.stdout or "", cp.stderr or ""
- except FileNotFoundError:
- return 127, "", ""
-
- def parse_version_tuple(self, text: str) -> VersionTuple:
- """
- Extract the first version-like token from arbitrary text.
- For example, for Java: '1.8.0_422' -> (1, 8, 0)
- """
- m = re.search(r"(\d+(?:\.\d+){0,3})", text)
- return tuple(int(x) for x in m.group(1).split(".")) if m else ()
-
- def extract_version(self, text: str, pattern: str) -> Tuple[VersionTuple, str]:
- """
- Apply regex pattern on a version string.
- """
- m = re.search(pattern, text, re.I)
- if not m:
- return (), "unknown"
- ver_str = m.group(1)
- return self.parse_version_tuple(ver_str), ver_str
-
- def cmp_versions(self, found: VersionTuple, required: VersionTuple, mode: str) -> bool:
- """
- Compare versions either to match exactly ('eq')
- or the installed version is greather than the reference one ('gte').
- """
- if not found:
- return False
- f, r = list(found), list(required)
- while len(f) < len(r): f.append(0)
- while len(r) < len(f): r.append(0)
- return (f == r) if mode == "eq" else (f >= r)
-
- def paths_check(self):
- wasabi_root = os.environ.get("WASABI_ROOT_DIR", "")
- if not (wasabi_root == self.expected_root_dir and Path(wasabi_root).exists()):
- return False, "WASABI_ROOT_DIR incorrect"
- java_home = os.environ.get("JAVA_HOME", "")
- if not (java_home == self.expected_java_home and Path(java_home).exists()):
- return False, "JAVA_HOME incorrect"
- return True, ""
-
- def check_dependency(self, dep: Dependency) -> Optional[str]:
- """
- Core method that checks whether a certain dependency of a version
- equal or greather than that specified in the README is installed.
- """
- if shutil.which(dep.binary) is None:
- return f"{dep.name} missing"
-
-
- if dep.cmd is None and dep.parse_regex is None and dep.require is None:
- return None
-
- rc, out, err = self.run_shell_command(dep.cmd or [])
- text = (out + "\n" + err).strip()
-
- if dep.parse_regex and dep.require and dep.compare:
- ver_tuple, ver_str = self.extract_version(text, dep.parse_regex)
- if not ver_tuple:
- return f"{dep.name} version unreadable"
- ok = self.cmp_versions(ver_tuple, dep.require, dep.compare)
- cmp_word = "==" if dep.compare == "eq" else ">="
- want = ".".join(map(str, dep.require))
- return None if ok else f"{dep.name} {cmp_word} {want} not met (got {ver_str})"
-
- return f"{dep.name} check misconfigured"
-
- def prereqs_check(self):
- problems: list[str] = []
- for dep in DEPENDENCIES:
- msg = self.check_dependency(dep)
- if msg:
- problems.append(msg)
- if problems:
- return False, "; ".join(problems)
- return True, ""
\ No newline at end of file
diff --git a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_experiment_runs.py b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_experiment_runs.py
index e37e0d42..27d65224 100644
--- a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_experiment_runs.py
+++ b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/_agent_eval/oracle_experiment_runs.py
@@ -1,121 +1,103 @@
-from collections import defaultdict
-import os
-
-from utils import RESULTS_ROOT_DIR
-from utils import GROUND_TRUTH_FILE
-from utils import SIMILARITY_RATIO
-
-from utils import logger
-
-class OracleExperimentRuns:
- def __init__(self):
- pass
-
- def get_benchmark_name(self, loc):
- """
- Classifies the location based on its prefix.
- """
- if loc.startswith("org.apache.hadoop.hdfs") and "SecondaryNameNode.doWork" not in loc:
- return "hdfs"
- elif loc.startswith("org.apache.hadoop.yarn"):
- return "yarn"
- elif loc.startswith("org.apache.hadoop.mapreduce") or loc.startswith("org.apache.hadoop.mapred"):
- return "mapreduce"
- elif loc.startswith("org.apache.hadoop.hbase"):
- return "hbase"
- elif loc.startswith("org.apache.hadoop.hive"):
- return "hive"
- elif loc.startswith("org.apache.cassandra"):
- return "cassandra"
- elif loc.startswith("org.apache.hadoop") or "SecondaryNameNode.doWork" in loc: # initialy found in hadoop-common, added here to match Table 3
- return "hadoop"
- elif loc.startswith("org.elasticsearch"):
- return "elasticsearch"
- else:
- return "unknown"
-
- def aggregate_bugs(self, root_dir):
- """
- Searches for bug report files and aggregates bugs based on their type and
- which application have been found in.
- """
- bugs = defaultdict(lambda: defaultdict(set))
- unique = dict()
-
- for dirpath, _, files in os.walk(root_dir):
- for file in files:
- if file.endswith(".csv"):
- file_path = os.path.join(dirpath, file)
-
- with open(file_path, 'r') as f:
- for line in f:
- if "how-bug" in line or "when-missing-" in line:
- tokens = line.strip().split(",")
-
- bug_type = tokens[1]
- bug_loc = tokens[2]
-
- key = bug_type + bug_loc
- if key in unique:
- continue
- unique[key] = "x"
-
- benchmark = self.get_benchmark_name(bug_loc)
- bugs[bug_type][benchmark].add(bug_loc)
-
- return bugs
-
- def get_ground_truth_bugs(self, file_path: str):
- """
- Reads the ground truth values from a file into a dictionary.
- """
- ground_truth = defaultdict(lambda: defaultdict(set))
-
- try:
- with open(file_path, 'r') as f:
- for line in f:
- tokens = line.strip().split(",")
- benchmark = tokens[0]
- bug_type = tokens[1]
- retry_location = tokens[2]
- ground_truth[bug_type][benchmark].add(retry_location)
- except Exception:
- logger.info(f"Cannot open {file_path} or file not present.")
-
- return ground_truth
-
- def count_bugs(self, bugs, ground_truth):
- """
- Compares the total number of bugs found against the ground truth.
- """
- total_ground_truth = 0
- total_found = 0
-
- for bug_type, benchmarks in ground_truth.items():
- for benchmark, ground_truth_locations in benchmarks.items():
- total_ground_truth += len(ground_truth_locations)
- bug_locations = bugs.get(bug_type, {}).get(benchmark, set())
- matching_locations = ground_truth_locations & bug_locations
- total_found += len(matching_locations)
-
- if total_ground_truth == 0:
- logger.info("No ground truth bugs available.")
- return False
-
- coverage = total_found / total_ground_truth
- logger.info(f"Found {total_found} out of {total_ground_truth} ground truth bugs ({coverage:.2%}).")
-
- passed = coverage >= SIMILARITY_RATIO
- logger.info("Results reproduced: PASS" if passed else "Results reproduced: FAIL")
- return passed
-
-
- def run(self):
- bugs = self.aggregate_bugs(str(RESULTS_ROOT_DIR))
- ground_truth = self.get_ground_truth_bugs(str(GROUND_TRUTH_FILE))
- passed = self.count_bugs(bugs, ground_truth)
-
- if passed:
- return True
-
- return False
\ No newline at end of file
+"""Experiment runs oracle for Wasabi (SOSP'24).
+
+Validates:
+ - Required results and ground-truth inputs are present
+ - Bug reports can be parsed from the configured results directory
+ - Parsed bug reports can be mapped to the expected benchmark categories
+ - Observed bug coverage matches the provided ground-truth dataset within the configured similarity threshold
+"""
+
+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+import csv
+
+from evaluator.utils import EntryConfig
+from evaluator.oracle_experiment_runs_primitives import (
+ OracleExperimentRunsBase,
+ ElementwiseSimilarityThresholdRequirement,
+)
+
+
+@dataclass(frozen=True)
+class _BugKey:
+ bug_type: str
+ benchmark: str
+ location: str
+
+
+class OracleExperimentRuns(OracleExperimentRunsBase):
+ _ORACLE_NAME = "WasabiExperimentRuns"
+
+ def __init__(self, *, config: EntryConfig, logger) -> None:
+ super().__init__(logger=logger)
+ self._config = config
+ self._results_root = config.results_paths["results_root"]
+ self._gt_file = config.ground_truth_paths["bugs_ground_truth"]
+ self._threshold = config.similarity_ratio
+
+ self._prefix_map = config.metadata.get("benchmark_prefix_map", [])
+ self._contains_rules = config.metadata.get("benchmark_contains_rules", [])
+ self._glob = config.metadata.get("results_file_glob", "*.csv")
+
+ def _classify_benchmark(self, loc: str) -> str:
+ for bench, needles in self._contains_rules:
+ if any(n in loc for n in needles):
+ return bench
+ for bench, prefixes in self._prefix_map:
+ if any(loc.startswith(p) for p in prefixes):
+ return bench
+ return "unknown"
+
+ def _load_ground_truth(self) -> dict[tuple[str, str], set[str]]:
+ # key: (bug_type, benchmark) -> set(loc)
+ out: dict[tuple[str, str], set[str]] = {}
+ p = Path(self._gt_file)
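+        # Each ground-truth line is expected as "<benchmark>,<bug_type>,<retry_location>",
+        # matching the column order of the legacy loader.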
+ with p.open() as f:
+ for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                bench, bug_type, loc = line.split(",", 2)
+ out.setdefault((bug_type, bench), set()).add(loc)
+ return out
+
+ def _load_observed(self) -> dict[tuple[str, str], set[str]]:
+ out: dict[tuple[str, str], set[str]] = {}
+ root = Path(self._results_root)
+ for csv_path in root.rglob(self._glob):
+ with csv_path.open(newline="") as f:
+ reader = csv.reader(f)
+ for row in reader:
+ if not row:
+ continue
+ line = ",".join(row)
+ if ("how-bug" not in line) and ("when-missing-" not in line):
+ continue
+ bug_type = row[1]
+ bug_loc = row[2]
+ bench = self._classify_benchmark(bug_loc)
+ out.setdefault((bug_type, bench), set()).add(bug_loc)
+ return out
+
+ def requirements(self):
+ gt = self._load_ground_truth()
+ obs = self._load_observed()
+
+        # Stable ordering over (bug_type, benchmark) buckets from the ground truth
+ buckets = sorted(gt.keys())
+
+ ref_counts = []
+ matched_counts = []
+
+ for k in buckets:
+ gt_locs = gt[k]
+ obs_locs = obs.get(k, set())
+ ref_counts.append(float(len(gt_locs)))
+ matched_counts.append(float(len(gt_locs & obs_locs)))
+
+ return [
+ ElementwiseSimilarityThresholdRequirement(
+ name="ground-truth-coverage-by-bucket",
+ observed=matched_counts,
+ reference=ref_counts,
+ threshold=self._threshold,
+ ),
+ ]
diff --git a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/wasabi/README.md b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/wasabi/README.md
index 050700df..f809b2d1 100644
--- a/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/wasabi/README.md
+++ b/benchmarks/arteval_bench/data/benchmark/sosp24_wasabi/wasabi/README.md
@@ -2,320 +2,143 @@
The testing component of WASABI triggers retry bugs by using a combination of static analysis, large language models (LLMs), fault injection, and testing.
-## 2. Getting Started
+## 2. Getting Started (0.5h, 5min human effort)
-To get started, users should create a new directory structure, clone this repository, work on the `main` branch of the repository, configure and install dependencies.
-
-Start by checking you have 'root' access to the system, and installing `sudo` using `apt-get install`. Then, go through the following three steps:
-
-1. If not already in place, create a the appropriate directory structure:
+> [!NOTE]
+> WASABI was originally developed, compiled, built, and evaluated on an Ubuntu 22.04 distribution running `bash` as its default shell. While WASABI is agnostic to the underlying system, the steps in this README might need to be adapted under different distributions, versions, packages, etc.
-Note that your current working directory where the `README.md` is located id `~/sosp24_wasabi/wasabi`
-```bash
-mkdir -p ~/sosp24_wasabi/benchmarks
-cd ~/sosp24_wasabi/
-ls -la .
+To get started, users should create a `benchmarks/` directory such that `wasabi/` and `benchmarks/` share the same parent directory by running:
```
+mkdir -p $WORKSPACE_DIR/benchmarks
+```
+Note that `$WORKSPACE_DIR` is to be selected by the user.
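+For example, a user might root the workspace in their home directory (the path below is purely illustrative):
+```bash
+export WORKSPACE_DIR=$HOME/sosp24_wasabi
+```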
The working directory structure should look similar to the one below:
```plaintext
-~/sosp24_wasabi
+$WORKSPACE_DIR
├── benchmarks/
└── wasabi/
- ├── config/
- ├── README.md
- ├── config/
- ├── pom-java11.xml
- ├── pom-java8.xml
- ├── pom.xml
- ├── src/
- └── utils/
-```
-The `wasabi` directory contains the codebase of WASABI, while the `benchmarks` directory is where users can add applications that they want to use WASABI to find retry bugs.
-
-2. Set up the `WASABI_ROOT_DIR` environment variable:
-```
-export WASABI_ROOT_DIR=$(echo $HOME)/sosp24_wasabi/wasabi
-```
-3. Installing necessary dependnecies:
-```
-cd ~/sosp24_wasabi/wasabi/wasabi-testing/utils
-sudo ./prereqs.sh
+ └── wasabi-testing
+ ├── README.md
+ ├── config/
+ ├── pom-java11.xml
+ ├── pom-java8.xml
+ ├── pom.xml
+ ├── src/
+ └── utils/
```
-> [!NOTE]
-> WASABI requires the following dependencies:
-> * Ubuntu >=22.04 LTE
-> * Python >=3.10
-> * Java 8 and 11
-> * Maven >=3.6
-> * Gradle >=4.4.1
-> * Ant >=1.10
-> * AspectJ runtime plugin** (`aspectjr`) 1.9.8.M1 for Java 8 and 1.9.19 for Java 11, respectively
-> * AspectJ Maven plugin** (`aspectj-maven-plugin`) 1.13 for Java 8 and 1.13.1 for Java 11, respectively
->
->**both added to WASABI's `pom.xml` as plugin dependencies
->
-> WASABI was developed, built, and tested on a bare metal machine with an Intel i7-8700 CPU, 32 GB of RAM, and 512 GB of disk space, running Ubuntu 22.04 LTE.
-> While we implement WASABI to be agnostic to environment settings (i.e., OS distribution, versions of packages and dependencies), using WASABI in a different environment. Please see "[Known issues](README.md#7-known-issues)".
-
-## 3. Building and installing WASABI
-
-To build and install WASABI, first switch to the appropriate Java distribution. In this tutorial we work with Java 8 as it is the latest distribution required for HDFS.
+Users can check their directory structure by installing the `tree` package
```bash
-sudo update-alternatives --config java
-...(select java 8)
-export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre
+sudo apt-get install tree
```
-
-Next, run Maven's `clean`, `compile`, and `install` Maven from the `wasabi-testing` directory, to build WASABI. Note that the current codebase includes AspectJ for each of the applications used to evaluate WASABI (see Section 4 from our [paper](https://bastoica.github.io/files/papers/2024_sosp_wasabi.pdf)). In this walkthrough we build WASABI for finding bugs in HDFS (Hadoop) and use triggering HDFS-17590 as an example, [below](README.md#6-running-example-reproducing-hdfs-17590).
+and running
```bash
-cd ~/sosp24_wasabi/wasabi/wasabi-testing
-mvn clean install -U -fn -B -Dinstrumentation.target=hadoop -DskipTests 2>&1 | tee wasabi-install.log
+tree -L 3 $WORKSPACE_DIR/
```
-If successful users should see a message similar to
-```bash
-...
-[INFO] ------------------------------------------------------------------------
-[INFO] BUILD SUCCESS
-[INFO] ------------------------------------------------------------------------
-[INFO] Total time: 36.384 s
-[INFO] Finished at: 2024-08-12T19:57:24Z
-[INFO] ------------------------------------------------------------------------
+Next, users should set up the `WASABI_ROOT_DIR` environment variable, which is used by most of the scripts in this artifact:
```
-If users need to use Java 11, they can either modify the `pom.xml` accordingly. We also provide pre-configured `pom` files for [Java 8](pom-java8.xml) and [Java 11](pom-java11.xml`).
-
-> [!NOTE]
-> When building WASABI multiple times, especially under a different Java distribution, it is recommended to first remove Maven's cache directory prior to compiling WASABI.
-```bash
-rm -rf ~/.m2/repository
-```
-
-## 4. Weaving (instrumenting) a target application
-
-WASABI can be woven into or instrument a target applications either at compile- or load-time.
-
-### 4.1 Compile-time weaving (Maven)
-
-To enable compile-time weaving for a target application, users need to modify the original `pom.xml` of the target to include Wasabi as a dependence and invoke the `aspectj` plugin:
-
-```xml
-
-
-
-
-
- org.aspectj
- aspectjrt
- ${aspectj.version}
-
-
-
-
- edu.uchicago.cs.systems
- wasabi
- ${wasabi.version}
-
-
-
-
-
- 1.9.19
- 1.13.1
- 1.0.0
-
-
-
-
-
-
-
-
- dev.aspectj
- aspectj-maven-plugin
- ${aspectj-maven.version}
-
-
-
- edu.uchicago.cs.systems
- wasabi
-
-
- true
- true
-
-
-
-
- compile
- test-compile
-
-
-
-
-
-
-```
-
-Next, build the target application with WASABI woven in:
-```bash
-cd /path/to/target_application
-mvn clean compile -T 8 -fn -DskipTests && mvn install -fn -DskipTests -B 2>&1 | tee wasabi-build.log
-```
-
-Successful weaving should produce log messages like this one:
-```bash
-[INFO] Join point 'method-execution(...)' in Type 'org.apache.hadoop.metrics2.util.SampleStat' ...
+export WASABI_ROOT_DIR=$WORKSPACE_DIR/wasabi
```
-Users should also check out [examples](https://github.com/bastoica/wasabi/tree/sosp24_wasabi/wasabi-testing) of target applications instrumented with WASABI from our `sosp24-ae` branch. These not only include detailed weaving steps, but also the modified `pom.xml` files.
+### 2.1. System Requirements
-### 4.2 Load-time weaving (Gradle, Ant, others)
+WASABI and its benchmarks are compiled using Java 8 on an Ubuntu 22.04 distribution that runs `bash` as its default shell. The default build system is Maven (3.6.3), except for Elasticsearch, which requires Gradle (>=4.4.1), and Cassandra, which needs Ant (>=1.10). Finally, the scripts supporting WASABI require Python (>=3.10).
-Some applications use build systems other than Maven, like Gradle or Ant. In these cases, WASABI can be woven at load-time.
+### 2.2. Installing Prerequisites
-#### Load-time weaving with Gradle
-
-First, add the AspectJ plugin and dependencies to your build.gradle file:
-```xml
-plugins {
- id 'io.freefair.aspectj.post-compile-weaving' version '8.1.0'
- id 'java'
-}
-
-dependencies {
- implementation 'org.aspectj:aspectjrt:1.9.19'
- aspect 'edu.uchicago.cs.systems:wasabi:1.0.0'
-}
+Users can either install these prerequisites manually using `apt-get` or run the `prereqs.sh` script provided by our artifact:
```
-
-Next, configure AspectJ for load-time weaving:
-```xml
-compileJava {
- options.compilerArgs += ['-Xlint:none']
- doLast {
- javaexec {
- main = '-jar'
- args = [configurations.aspectj.getSingleFile(), '-inpath', sourceSets.main.output.classesDirs.asPath, '-aspectpath', configurations.aspect.asPath]
- }
- }
-}
+cd $WORKSPACE_DIR/wasabi/wasabi-testing/utils
+sudo ./prereqs.sh
```
+Note that this command requires `sudo` privileges.
-Finally, compile and build the project:
+As a sanity check, users can verify the version of each installed package. For Maven, this means running
```bash
-gradle clean build -i 2>&1 | tee wasabi-build.log
+mvn -v
```
-
-#### Load-time weaving with Ant
-
-First, make sure AspectJ libraries (`aspectjrt.jar`, `aspectjtools.jar`) are available in your project.
-
-Next, modify `build.xml` by adding the AspectJ tasks and specify WASABI in the aspect path:
-
-```xml
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+which should yield output similar to
+```bash
+Apache Maven 3.6.3
+Maven home: /usr/share/maven
+Java version: 1.8.0_482, vendor: Private Build, runtime: /usr/lib/jvm/java-8-openjdk-amd64/jre
+Default locale: en, platform encoding: UTF-8
+OS name: "linux", version: "6.8.0-64-generic", arch: "amd64", family: "unix"
```
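+Similar checks apply to the remaining toolchains, for example:
+```bash
+java -version
+gradle -v
+ant -version
+python3 --version
+```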
-Finally, compile and build the project:
+Finally, users need to manually switch to Java 8:
```bash
-ant compile 2>&1 | tee wasabi-build.log
+sudo update-alternatives --config java
```
+which presents the following selection menu; users should type the number of the Java 8 entry (here, `3`) and press Enter:
+```bash
+There are 3 choices for the alternative java (providing /usr/bin/java).
-## 5. Configure fault injection policies and metadata
-
-To specify fault injection policies and the precise injection locations, users need to create two types of files—a location data file (`.data`) and a policy configuration file (`.conf`).
-
-A `.data` file describes the retry locations and their respective exceptions to be injected by Wasabi. It has the following format:
-```
-Retry location!!!Enclosing method!!!Retried method!!!Injection site!!!Exception
-https://github.com/apache/hadoop/tree//ee7d178//hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Client.java#L790!!!org.apache.hadoop.ipc.Client$Connection.setupIOstreams!!!org.apache.hadoop.ipc.Client$Connection.writeConnectionContext!!!Client.java:831!!!java.net.SocketException
-https://github.com/apache/hadoop/tree//ee7d178//hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java#L609!!!org.apache.hadoop.hdfs.server.namenode.ha.EditLogTailer$MultipleNameNodeProxy.getActiveNodeProxy!!!org.apache.hadoop.ipc.RPC.getProtocolVersion!!!N/A!!!java.io.IOException
-https://github.com/apache/hadoop/tree//ee7d178//hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/RPC.java#L419!!!org.apache.hadoop.ipc.RPC.waitForProtocolProxy!!!org.apache.hadoop.ipc.RPC.getProtocolProxy!!!RPC.java:421!!!java.net.ConnectException
-...
+ Selection Path Priority Status
+------------------------------------------------------------
+ 0 /usr/lib/jvm/java-17-openjdk-amd64/bin/java 1711 auto mode
+ 1 /usr/lib/jvm/java-11-openjdk-amd64/bin/java 1111 manual mode
+ 2 /usr/lib/jvm/java-17-openjdk-amd64/bin/java 1711 manual mode
+* 3 /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java 1081 manual mode
```
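+
+Alternatively, users who prefer a non-interactive step can point the `java` alternative directly at the Java 8 binary (equivalent to selecting entry `3` in the menu above):
+```bash
+sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
+```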
-where
-* `Retry location` indicates the program locations of a retry (e.g. https://github.com/apache/hadoop/tree//ee7d178//hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Client.java#L790)
-* `Enclosing method` indicates the method from where the retry location is called (e.g. `org.apache.hadoop.ipc.Client$Connection.setupIOstreams`)
-* `Retried method` indicates the method inside the retry logic that ought to be retried (e.g. `org.apache.hadoop.ipc.Client$IpcStreams.setSaslClient`)
-* `Injection site` indicates the source location (source file and line of code) where a retried method is called. Also, this represents the program location where Wasabi injects exceptions.
-* `Exception` indicates the exception that Wasabi should throw at that location (e.g. `java.io.SocketException`)
-
-A `.conf` file instructs WASABI to use a specific injection policy and load injection locations from a particular `.data` file and has the following structure:
-
-```
-retry_data_file: /absolute/path/to/data/file/example_retry_locations.data
-injection_policy: max-count
-max_injection_count: 10
+Also, users need to set the `JAVA_HOME` environment variable to the appropriate path to the Java 8 directory in `/usr/lib`:
+```bash
+export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre
```
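+
+Optionally, to make this setting survive new shells, users can append the export to `~/.bashrc` (a convenience step, not required by the artifact):
+```bash
+echo 'export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre' >> ~/.bashrc
+```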
-where
-* retry_data_file: Absolute path to a .data file specifying injection sites.
-* injection_policy: One of no-injection, forever, or max-count.
-* max_injection_count: Positive integer specifying the upper limit of injections (used with max-count policy).
-
-The users can check out examples of `.data` and `.conf` files in the `./config` directory, or on the `sosp24-ae` [branch](https://github.com/bastoica/wasabi/tree/sosp24_wasabi/wasabi-testing/config).
-
-
-## Find retry bugs
-Once WASABI is successfully built, woven into a target application, and configured, users can instruct WASABI to find potential retry bugs.
-
-To do so, users have two options:
-
-1. Option #1 (recommended): run individual tests and instruct WASABI to inject faults at only one location during the test run. The reason is that, by design, WASABI tries to force the test to either crash or hang. If this happens at the first injection location, subsequent injection locations will not get a chance to execute due to the test terminating (or hanging) early.
+Users can check whether these operations were successful by running
```bash
-cd [target_application_path]
-mvn clean install -U -fn -B -DskipTests 2>&1 | tee wasabi-build.log
-mvn surefire:test -fn -B -DconfigFile="$(echo $HOME)/wasabi/wasabi-testing/config/example_hdfs.conf" -Dtest=[TEST_NAME] 2>&1 | tee wasabi-test.log
+java -version
```
-
-2. Option #2: run the entire test suite and inject faults at multiple locations in the same testing runs. Users can opt to inject faults at multiple locations in the same testing run if they are confident that injecting at an earlier location does not affect the execution of a later location. In this case, users can create a multi-location `.data` file (e.g., like [this one](https://github.com/bastoica/wasabi/blob/sosp24_wasabi/wasabi-testing/config/hadoop/hadoop_retry_locations.data) for Hadoop).
-
+which should yield output similar to
```bash
-cd [target_application_path]
-mvn clean install -U -fn -B -DskipTests 2>&1 | tee wasabi-build.log
-mvn test -fn -B -DconfigFile="$(echo $HOME)/wasabi/wasabi-testing/config/example_hdfs.conf" 2>&1 | tee wasabi-test.log
+openjdk version "1.8.0_422"
+OpenJDK Runtime Environment (build 1.8.0_422-8u422-b05-1~22.04-b05)
+OpenJDK 64-Bit Server VM (build 25.422-b05, mixed mode)
+```
+and
+```bash
+echo $JAVA_HOME
+```
+which should yield
+```bash
+/usr/lib/jvm/java-8-openjdk-amd64/jre
```
-## 6. Running example: reproducing HDFS-17590
-
-To illustrate how WASABI works, we walk users through an example that reproduces [HDFS-17590](https://issues.apache.org/jira/browse/HDFS-17590)—a previously unknown retry bug uncovered by WASABI.
-
-> [!NOTE]
-> Users might observe a "build failure" message when building and testing Hadoop. This is expected as a few testing-related components of Hadoop need more configuration to build properly with the AJC compiler. WASABI does not need those components to find retry bugs. See the "[Known issues](README.md#7-known-issues)" section below for more details.
+## 3. A Minimal Example: Reproducing HDFS-17590 (1.5h, 15min human effort)
+With the prerequisites installed, users can now run a series of `bash` commands that reproduce [HDFS-17590](https://issues.apache.org/jira/browse/HDFS-17590), a previously unknown retry bug uncovered by WASABI. Note that HDFS is a module of Hadoop, so while the bug manifests in HDFS, we first need to clone and build Hadoop from source.
1. Ensure the prerequisites are successfully installed (see "Getting Started" above)
-2. Build and install WASABI (see "Building and installing WASABI" above)
+2. Build and install WASABI by running the following commands:
+```bash
+cd $WORKSPACE_DIR/wasabi/wasabi-testing
+mvn clean install -U -fn -B -Dinstrumentation.target=hadoop -DskipTests 2>&1 | tee wasabi-install.log
+```
+
+If successful, users should see a message similar to
+```bash
+...
+[INFO] ------------------------------------------------------------------------
+[INFO] BUILD SUCCESS
+[INFO] ------------------------------------------------------------------------
+[INFO] Total time: 36.384 s
+[INFO] Finished at: 2024-08-12T19:57:24Z
+[INFO] ------------------------------------------------------------------------
+```
3. Clone Hadoop (note: HDFS is part of Hadoop),
```bash
-cd ~/sosp24_wasabi/benchmarks
+cd $WORKSPACE_DIR/benchmarks
git clone https://github.com/apache/hadoop
```
and check out version/commit `60867de`:
```bash
-cd ~/sosp24_wasabi/benchmarks/hadoop
+cd $WORKSPACE_DIR/benchmarks/hadoop
git checkout 60867de
```
Users can check whether `60867de` was successfully checked out by running
@@ -339,74 +162,10 @@ Date: Mon Aug 21 10:05:34 2023 +0800
mvn install -U -fn -B -DskipTests 2>&1 | tee wasabi-pass-install.log
```
-5. Run the test that WASABI uses to trigger HDFS-17590 to confirm that the bug does not get triggered without fault injection
-```bash
-mvn surefire:test -fn -B -Dtest=TestFSEditLogLoader 2>&1 | tee wasabi-pass-test.log
-```
-by checking that the test runs successfully. First, checking that there is no `NullPointerException`
-```bash
-grep -A10 -B2 "NullPointerException" wasabi-pass-test.log
-```
-which should yield no output, as well as that all such tests passed
-```bash
-grep "Tests run.*TestFSEditLogLoader" wasabi-pass-test.log
-```
-which should yield a line similar to this (note that number of tests might differ slightly)
-```bash
-[INFO] Tests run: 26, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 154.223 s - in org.apache.hadoop.hdfs.server.namenode.TestFSEditLogLoader
-```
-
-6. Copy a modified `pom.xml` file that allows WASABI to instrument (weave) Hadoop by running
-```bash
-cp pom.xml pom-original.xml
-cp ~/sosp24_wasabi/wasabi/wasabi-testing/config/hadoop/pom-hadoop.xml pom.xml
-```
-Note that these commands are making a copy of the original `pom.xml` and replace it with a slightly edited version that instructs the AJC compiler to instrument (weave) WASABI. Also, these alterations are specific to version `60867de`. Checking out another Hadoop commit ID requires adjustments. We provide instructions on how to adapt an original `pom.xml`, [here](README.md#instrumentation-weaving-instructions).
-
-7. Instrument Hadoop with WASABI by running
-```bash
-mvn clean install -U -fn -B -DskipTests 2>&1 | tee wasabi-fail-install.log
-```
-
-8. Run the bug-triggering tests with fault injection
-```bash
-mvn surefire:test -fn -B -DconfigFile="$(echo $HOME)/sosp24_wasabi/wasabi/wasabi-testing/config/hadoop/example.conf" -Dtest=TestFSEditLogLoader 2>&1 | tee wasabi-fail-test.log
-```
-and check the log to for `NullPointerException` errors
-```bash
-grep -A10 -B2 "NullPointerException" wasabi-fail-test.log
-```
-which should yield
-```bash
-[ERROR] Tests run: 26, Failures: 0, Errors: 2, Skipped: 0, Time elapsed: 137.645 s <<< FAILURE! - in org.apache.hadoop.hdfs.server.namenode.TestFSEditLogLoader
-[ERROR] testErasureCodingPolicyOperations[0](org.apache.hadoop.hdfs.server.namenode.TestFSEditLogLoader) Time elapsed: 22.691 s <<< ERROR!
-java.lang.NullPointerException
- at java.base/java.util.concurrent.ConcurrentHashMap.putVal(ConcurrentHashMap.java:1011)
- at java.base/java.util.concurrent.ConcurrentHashMap.put(ConcurrentHashMap.java:1006)
- at org.apache.hadoop.hdfs.DFSInputStream.addToLocalDeadNodes(DFSInputStream.java:184)
- at org.apache.hadoop.hdfs.DFSStripedInputStream.createBlockReader(DFSStripedInputStream.java:279)
- at org.apache.hadoop.hdfs.StripeReader.readChunk(StripeReader.java:304)
- at org.apache.hadoop.hdfs.StripeReader.readStripe(StripeReader.java:335)
- at org.apache.hadoop.hdfs.DFSStripedInputStream.readOneStripe(DFSStripedInputStream.java:320)
- at org.apache.hadoop.hdfs.DFSStripedInputStream.readWithStrategy(DFSStripedInputStream.java:415)
- at org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:919)
- at java.base/java.io.DataInputStream.read(DataInputStream.java:102)
-```
-
-## 7. Known issues
-
-### 7.1 AspectJ Maven plugin circular dependency and versioning issues
-
-WASABI imports plugins that might also be imported by the target application. Users need to manually resolve potential circular dependencies or plugin version incompatibilities. Users could also reference [this](https://github.com/dev-aspectj/aspectj-maven-plugin/issues/143) issue in the `aspectj-maven-plugin` repository for suggestions on how to tackle such issues.
-
-### 7.2 Build failures after weaving
-
-The AspectJ compiler and supporting plugins might not be able to weave (instrument) all modules of a target successfully. While users are encouraged to address this, we recommend disregarding modules that are not critical to the core functionality of the application (e.g., benchmarking modules) or that do not implement or test retry-related code.
-
-For example, when reproducing HDFS-17590, users might observe a "build failure" message at the end of the build and testing processes. This is expected, as a few benchmark-related components of Hadoop require extra configuration for the AJC to compile them successfully. However, WASABI does not need these components to build correctly in order to find retry bugs. For reference, this is the original build log that WASABI encountered when building Hadoop. Note that the core components of Hadoop (common and client), HDFS, Yarn, and MapReduce all built successfully.
+Users might observe a "build failure" message at the end of the build process. This is expected, as a few benchmark-related components of Hadoop need more configuration to build properly with the AJC compiler. WASABI does not need those components to find retry bugs. For reference, we attach our build log below. Note that the core components of Hadoop (common and client), HDFS, Yarn, and MapReduce all build successfully.
-Hadoop `60867de` build log (expand for details):
+Hadoop build log details:
```bash
[INFO] ------------------------------------------------------------------------
@@ -535,12 +294,137 @@ For example, when reproducing HDFS-17590, users might observe a "build failure"
```
-### 7.3 Bare metal versus containerized deployments
+5. Run the test that WASABI uses to trigger HDFS-17590 to confirm that the bug does not get triggered without fault injection
+```bash
+mvn surefire:test -fn -B -Dtest=TestFSEditLogLoader 2>&1 | tee wasabi-pass-test.log
+```
+and verify that the test runs successfully. First, check that there is no `NullPointerException`
+```bash
+grep -A10 -B2 "NullPointerException" wasabi-pass-test.log
+```
+which should yield no output. Then, check that all such tests passed
+```bash
+grep "Tests run.*TestFSEditLogLoader" wasabi-pass-test.log
+```
+which should yield a line similar to this (note that the exact number of tests might differ slightly)
+```bash
+[INFO] Tests run: 26, Failures: 0, Errors: 0, Skipped: 0, Time elapsed: 154.223 s - in org.apache.hadoop.hdfs.server.namenode.TestFSEditLogLoader
+```
+
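+Users who want a single pass/fail signal for this baseline step can combine the two checks (a convenience sketch, not part of the artifact scripts):
+```bash
+! grep -q "NullPointerException" wasabi-pass-test.log \
+  && grep -q "Tests run.*TestFSEditLogLoader" wasabi-pass-test.log \
+  && echo "Baseline OK: tests pass without fault injection"
+```
+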
+6. Copy a modified `pom.xml` file that allows WASABI to instrument (weave) Hadoop by running
+```bash
+cp pom.xml pom-original.xml
+cp $WORKSPACE_DIR/wasabi/wasabi-testing/config/hadoop/pom-hadoop.xml pom.xml
+```
+Note that these commands make a copy of the original `pom.xml` and replace it with a slightly edited version that instructs the AJC compiler to instrument (weave) WASABI. Also, these alterations are specific to version `60867de`; checking out another Hadoop commit ID requires adjustments. We provide instructions on how to adapt an original `pom.xml` [here](README.md#instrumentation-weaving-instructions).
+
+7. Instrument Hadoop with WASABI by running
+```bash
+mvn clean install -U -fn -B -DskipTests 2>&1 | tee wasabi-fail-install.log
+```
-WASABI was tested on a bare metal machine. Fundamentally, there are no limitations to running WASABI in a containerized environment. However, there are known issues related to the Hadoop and HBase benchmarks used to evaluate WASABI in our [paper](https://bastoica.github.io/files/papers/2024_sosp_wasabi.pdf).
+8. Run the bug-triggering tests with fault injection
+```bash
+mvn surefire:test -fn -B -DconfigFile="$WORKSPACE_DIR/wasabi/wasabi-testing/config/hadoop/example.conf" -Dtest=TestFSEditLogLoader 2>&1 | tee wasabi-fail-test.log
+```
+and check the log for `NullPointerException` errors
+```bash
+grep -A10 -B2 "NullPointerException" wasabi-fail-test.log
+```
+which should yield output similar to
+```bash
+[ERROR] Tests run: 26, Failures: 0, Errors: 2, Skipped: 0, Time elapsed: 137.645 s <<< FAILURE! - in org.apache.hadoop.hdfs.server.namenode.TestFSEditLogLoader
+[ERROR] testErasureCodingPolicyOperations[0](org.apache.hadoop.hdfs.server.namenode.TestFSEditLogLoader) Time elapsed: 22.691 s <<< ERROR!
+java.lang.NullPointerException
+ at java.base/java.util.concurrent.ConcurrentHashMap.putVal(ConcurrentHashMap.java:1011)
+ at java.base/java.util.concurrent.ConcurrentHashMap.put(ConcurrentHashMap.java:1006)
+ at org.apache.hadoop.hdfs.DFSInputStream.addToLocalDeadNodes(DFSInputStream.java:184)
+ at org.apache.hadoop.hdfs.DFSStripedInputStream.createBlockReader(DFSStripedInputStream.java:279)
+ at org.apache.hadoop.hdfs.StripeReader.readChunk(StripeReader.java:304)
+ at org.apache.hadoop.hdfs.StripeReader.readStripe(StripeReader.java:335)
+ at org.apache.hadoop.hdfs.DFSStripedInputStream.readOneStripe(DFSStripedInputStream.java:320)
+ at org.apache.hadoop.hdfs.DFSStripedInputStream.readWithStrategy(DFSStripedInputStream.java:415)
+ at org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:919)
+ at java.base/java.io.DataInputStream.read(DataInputStream.java:102)
+```
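+
+To summarize the failures, users can additionally count the erroring tests in the log (a convenience one-liner):
+```bash
+grep -c '<<< ERROR!' wasabi-fail-test.log   # expect a non-zero count (2 in our run)
+```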
+
+## 4. Comprehensive Evaluation (<12h, ~1h human effort)
+
+### 4.1. Running Individual Benchmarks
+
+To run individual benchmarks and trigger the corresponding retry bugs described in our paper [1], we provide `run.py`, a Python script designed to manage the different phases of WASABI. Overall, each benchmark takes approximately two hours to complete.
+
+`run.py` operates in several distinct phases, which closely follow those described in Figure 1 [1]:
+
+1. **Setup**: Clones the necessary repositories and checks out specific versions required for evaluation.
+2. **Preparation**: Manages and customizes the pom.xml files for each benchmark to facilitate instrumented builds.
+3. **Bug triggering**: Executes the tests with WASABI instrumentation to trigger potential bugs.
+4. **Log analysis**: Analyzes the test logs to identify and report bugs.
+
+`run.py` accepts several command-line arguments that allow users to select the phase to execute and the benchmarks to evaluate.
+
+* `--phase`: Determines which phase of the pipeline to execute with the following options available:
+ * `setup`: Clones the necessary repositories and checks out specific versions.
+ * `prep`: Prepares the environment by renaming the original pom.xml files and replacing them with customized versions.
+ * `bug-triggering`: Executes the test cases using WASABI instrumentation to trigger bugs.
+ * `bug-oracles`: Analyzes the test logs for any anomalies or errors that indicate bugs.
+ * `all`: Runs all the above phases in sequence.
+* `--benchmark`: Specifies which benchmarks to evaluate, with the following options available: `hadoop`, `hbase`, `hive`, `cassandra`, and `elasticsearch`.
+
+Users can run all phases with one command, one benchmark at a time:
+```bash
+cd $WORKSPACE_DIR/wasabi/wasabi-testing/utils
+python3 run.py --phase all --benchmark hadoop
+```
+However, we recommend users run all benchmarks (Hadoop-common, HDFS, MapReduce, Yarn, HBase, and Hive) and all phases, using the following one-liner:
+```bash
+cd $WORKSPACE_DIR/wasabi/wasabi-testing/utils
+for target in hadoop hbase hive; do python3 run.py --phase all --benchmark $target 2>&1 | tee -a wasabi-full-eval.log; done
+```
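+
+Since a full run can take several hours, users may want to follow progress from a second terminal (optional):
+```bash
+tail -f $WORKSPACE_DIR/wasabi/wasabi-testing/utils/wasabi-full-eval.log
+```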
-In short, some Hadoop and HBase tests require access to a non-virtualized, physical network. Without this, users might encounter errors such as
+Optionally, users can invoke individual phases of WASABI by running
+```bash
+cd $WORKSPACE_DIR/wasabi/wasabi-testing/utils
+python3 run.py --phase bug-triggering --benchmark hadoop
```
-ERROR regionserver.HRegionServer: Master passed us a different hostname to use; was=grimlock, but now=169.254.3.1
+which yields an output similar to
+```bash
+**************************
+* hadoop: bug triggering *
+**************************
+Running tests for hadoop...
+Job count: 1 / 99
+Executing command: mvn -B -DconfigFile=/home/user/sosp24-ae/wasabi/wasabi-testing/config/hadoop/test_plan.conf -Dtest=Test1 surefire:test
+Running tests for hadoop...
+Job count: 2 / 99
+Executing command: mvn -B -DconfigFile=/home/user/sosp24-ae/wasabi/wasabi-testing/config/hadoop/test_plan.conf -Dtest=Test2 surefire:test
+Running tests for hadoop...
+...
+Job count: 99 / 99
+Executing command: mvn -B -DconfigFile=/home/user/sosp24-ae/wasabi/wasabi-testing/config/hadoop/test_plan.conf -Dtest=Test99 surefire:test
```
-These errors occur due to a hostname-to-IP mismatch in the network setup of your system, not because of an issue with WASABI. The likely cause is a misconfigured `/etc/hosts` file, multiple network interfaces on your machine, or running our tool in a containerized environment (e.g., docker).
+
+### 4.2. Weaving WASABI at load time
+
+> [!NOTE]
+> Cassandra and ElasticSearch use different build systems—Ant and Gradle, respectively—instead of Maven. As a result, integrating WASABI requires a separate, mostly manual process of load-time weaving rather than compile-time weaving (see details below). This process involves compiling, packaging, and making significant modifications to the build configuration files. Once the load-time weaving is successfully completed—essentially the "setup" and "preparation" phases described earlier—users can proceed to the final two phases by running the following commands
+> ```bash
+> python3 run.py --phase bug-triggering --benchmark cassandra
+> python3 run.py --phase bug-oracles --benchmark cassandra
+> ```
+> Because Cassandra and ElasticSearch require more manual effort, we recommend running the other six benchmarks (Hadoop-common, HDFS, MapReduce, Yarn, HBase, and Hive) before tackling them. These six cover 39 out of 42 bugs reported in our evaluation [[1]](README.md#references).
+
+Users who choose to skip `ElasticSearch` and `Cassandra` can ignore the load-time weaving instructions entirely.
+
+### 4.3. Unpacking Results
+
+To generate the results in Table 3 [[1]](README.md#references), users can run the following command
+```bash
+cd $WORKSPACE_DIR/wasabi/wasabi-testing/utils
+python3 display_bug_results.py | less
+```
+
+The script prints out two tables: the original Table 3 from our paper, and a breakdown of the bugs found by type and benchmark, which complements Table 3 [[1]](README.md#references).
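+
+To keep a copy of these tables for later inspection, the output can also be redirected to a file (optional):
+```bash
+cd $WORKSPACE_DIR/wasabi/wasabi-testing/utils
+python3 display_bug_results.py > wasabi-bug-tables.txt
+```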
+
+## 5. References
+[1] "[If At First You Don't Succeed, Try, Try, Again...? Insights and LLM-informed Tooling for Detecting Retry Bugs in Software Systems](https://bastoica.github.io/files/papers/2024_sosp_wasabi.pdf)". Bogdan Alexandru Stoica*, Utsav Sethi*, Yiming Su, Cyrus Zhou, Shan Lu, Jonathan Mace, Madan Musuvathi, Suman Nath (*equal contribution). The 30th Symposium on Operating Systems Principles (SOSP). Austin, TX, USA. November, 2024.