Add venv support for custom evaluators

krisztianfekete · krisztianfekete · commit 777a92746329 · 2026-03-26T12:33:03.000+01:00
diff --git a/examples/custom_evaluators/eval_config.yaml b/examples/custom_evaluators/eval_config.yaml
@@ -33,3 +33,13 @@ evaluators:
     threshold: 0.110
     executor: local
 
+  # TODO: switch this evaluator's path to the GitHub source once the PR is approved.
+  - name: bertscore
+    type: code
+    path: ../evaluators/evaluators/bertscore/bertscore.py
+    threshold: 0.7
+    timeout: 300
+    config:
+      expected: "There are two Helm releases installed in the cluster: kagent in namespace kagent (revision 2, deployed, chart kagent-0.7.14) and kagent-crds in namespace kagent (revision 1, deployed, chart kagent-crds-0.7.14)."
+      metric: "f1"
+
diff --git a/src/agentevals/custom_evaluators.py b/src/agentevals/custom_evaluators.py
@@ -68,8 +68,12 @@ def extensions(self) -> tuple[str, ...]:
         """File extensions this runtime handles (e.g. ``(".py",)``)."""
 
     @abc.abstractmethod
-    def build_command(self, path: Path) -> list[str]:
-        """Return the argv list to execute *path*."""
+    def build_command(self, path: Path, python: Path | None = None) -> list[str]:
+        """Return the argv list to execute *path*.
+
+        For Python runtimes, *python* may point to a venv interpreter.
+        Non-Python runtimes ignore this parameter.
+        """
 
     def is_available(self) -> bool:
         """Return True if the runtime's interpreter is found on the system."""
@@ -89,8 +93,9 @@ def name(self) -> str:
     def extensions(self) -> tuple[str, ...]:
         return (".py",)
 
-    def build_command(self, path: Path) -> list[str]:
-        return [sys.executable, str(path)]
+    def build_command(self, path: Path, python: Path | None = None) -> list[str]:
+        exe = str(python) if python else sys.executable
+        return [exe, str(path)]
 
     def is_available(self) -> bool:
         return True
@@ -105,7 +110,7 @@ def name(self) -> str:
     def extensions(self) -> tuple[str, ...]:
         return (".js", ".ts")
 
-    def build_command(self, path: Path) -> list[str]:
+    def build_command(self, path: Path, python: Path | None = None) -> list[str]:
         node = shutil.which("node")
         if not node:
             raise RuntimeError("Node.js not found on PATH (required for .js/.ts evaluators)")
@@ -203,40 +208,45 @@ class SubprocessBackend(EvaluatorBackend):
     """Runs a local code file (.py, .js, .ts, …) as a subprocess.
 
     The correct interpreter is resolved from the file extension via the
-    :data:`_RUNTIMES` registry.
+    :data:`_RUNTIMES` registry.  When *venv_python* is provided, Python
+    evaluators run inside that virtual environment instead of the host
+    interpreter.
     """
 
-    def __init__(self, path: Path, timeout: int = 30):
+    def __init__(self, path: Path, timeout: int = 30, venv_python: Path | None = None):
         self._path = path.resolve()
         self._runtime = _resolve_runtime(self._path)
         self._timeout = timeout
+        self._venv_python = venv_python
 
         if not self._path.exists():
             raise FileNotFoundError(f"Evaluator file not found: {self._path}")
 
     async def run(self, eval_input: EvalInput, metric_name: str) -> EvalResult:
-        cmd = self._runtime.build_command(self._path)
+        cmd = self._runtime.build_command(self._path, self._venv_python)
         return await _run_subprocess(cmd, eval_input.model_dump_json(), self._timeout, metric_name)
 
 
 # ---------------------------------------------------------------------------
 # Executor factory
 # ---------------------------------------------------------------------------
 
-_EXECUTOR_FACTORIES: dict[str, Callable[[Path, int], EvaluatorBackend]] = {
-    "local": lambda path, timeout: SubprocessBackend(path, timeout),
+_EXECUTOR_FACTORIES: dict[str, Callable[..., EvaluatorBackend]] = {
+    "local": lambda path, timeout, venv_python=None: SubprocessBackend(path, timeout, venv_python),
 }
 
 
-def create_executor(executor_name: str, path: Path, timeout: int = 30) -> EvaluatorBackend:
+def create_executor(
+    executor_name: str, path: Path, timeout: int = 30, venv_python: Path | None = None
+) -> EvaluatorBackend:
     """Construct an EvaluatorBackend by executor name (e.g. 'local', 'docker')."""
     factory = _EXECUTOR_FACTORIES.get(executor_name)
     if factory is None:
         raise ValueError(f"Unknown executor '{executor_name}'. Available: {sorted(_EXECUTOR_FACTORIES.keys())}")
-    return factory(path, timeout)
+    return factory(path, timeout, venv_python)
 
 
-def register_executor(name: str, factory: Callable[[Path, int], EvaluatorBackend]) -> None:
+def register_executor(name: str, factory: Callable[..., EvaluatorBackend]) -> None:
     """Register a new executor factory (e.g. for Docker support)."""
     _EXECUTOR_FACTORIES[name] = factory
 
@@ -425,7 +435,25 @@ async def evaluate_custom_evaluator(
         evaluator_def = await get_default_resolver().resolve(evaluator_def)
 
     if isinstance(evaluator_def, CodeEvaluatorDef):
-        backend = create_executor(evaluator_def.executor, Path(evaluator_def.path), evaluator_def.timeout)
+        evaluator_path = Path(evaluator_def.path)
+
+        # For Python evaluators, set up a cached venv when a requirements.txt sits next to the file.
+        venv_python: Path | None = None
+        if evaluator_path.suffix == ".py":
+            from .evaluator.venv import ensure_venv_async
+
+            try:
+                venv_python = await ensure_venv_async(evaluator_path)
+            except Exception as exc:
+                logger.error("Failed to set up venv for '%s': %s", evaluator_def.name, exc)
+                return MetricResult(
+                    metric_name=evaluator_def.name,
+                    error=f"Dependency installation failed: {exc}",
+                )
+
+        backend = create_executor(
+            evaluator_def.executor, evaluator_path, evaluator_def.timeout, venv_python=venv_python
+        )
     else:
         raise ValueError(f"Unsupported custom evaluator type: {type(evaluator_def).__name__}")
 
diff --git a/src/agentevals/evaluator/sources.py b/src/agentevals/evaluator/sources.py
@@ -216,8 +216,22 @@ async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
             resp = await client.get(url, headers=self._headers(), timeout=30)
             resp.raise_for_status()
 
-        dest.parent.mkdir(parents=True, exist_ok=True)
-        dest.write_text(resp.text, encoding="utf-8")  # noqa: ASYNC240
+            dest.parent.mkdir(parents=True, exist_ok=True)
+            dest.write_text(resp.text, encoding="utf-8")  # noqa: ASYNC240
+
+            # Best-effort: also fetch requirements.txt from the same directory; a non-200 response or HTTP error is ignored.
+            ref_dir = str(Path(ref).parent)
+            req_ref = f"{ref_dir}/requirements.txt"
+            req_url = self._raw_url(req_ref)
+            try:
+                req_resp = await client.get(req_url, headers=self._headers(), timeout=15)
+                if req_resp.status_code == 200:
+                    req_dest = dest.parent / "requirements.txt"
+                    req_dest.write_text(req_resp.text, encoding="utf-8")  # noqa: ASYNC240
+                    logger.info("Downloaded requirements.txt for evaluator")
+            except httpx.HTTPError:
+                logger.debug("No requirements.txt found for evaluator (or download failed)")
+
         return dest
 
 
@@ -267,6 +281,12 @@ async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
 
         dest.parent.mkdir(parents=True, exist_ok=True)
         shutil.copy2(src, dest)
+
+        # Also copy requirements.txt if it exists alongside the source file.
+        req_src = src.parent / "requirements.txt"
+        if req_src.exists():
+            shutil.copy2(req_src, dest.parent / "requirements.txt")
+
         return dest
 
 
diff --git a/src/agentevals/evaluator/venv.py b/src/agentevals/evaluator/venv.py
@@ -0,0 +1,126 @@
+"""Virtual environment management for evaluators with dependencies.
+
+When an evaluator ships a ``requirements.txt`` alongside its entrypoint, we
+create a cached venv, install the dependencies (plus the evaluator SDK), and
+return the path to that venv's Python interpreter so the evaluator subprocess
+runs in isolation.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import logging
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+_VENV_CACHE_DIR = Path.home() / ".cache" / "agentevals" / "venvs"
+_HASH_FILE = ".requirements_hash"
+
+# Locks keyed by ``_venv_key`` (one per evaluator directory) to prevent concurrent creation of the same venv.
+_venv_locks: dict[str, asyncio.Lock] = {}
+
+
+def _venv_python(venv_dir: Path) -> Path:
+    if sys.platform == "win32":
+        return venv_dir / "Scripts" / "python.exe"
+    return venv_dir / "bin" / "python"
+
+
+def _venv_key(evaluator_path: Path) -> str:
+    """Stable cache directory name derived from the evaluator's parent directory."""
+    resolved = evaluator_path.resolve()
+    name = resolved.parent.name
+    path_hash = hashlib.sha256(str(resolved.parent).encode()).hexdigest()[:8]
+    return f"{name}-{path_hash}"
+
+
+def _is_venv_valid(venv_dir: Path, req_hash: str) -> bool:
+    hash_file = venv_dir / _HASH_FILE
+    return (
+        _venv_python(venv_dir).exists()
+        and hash_file.exists()
+        and hash_file.read_text().strip() == req_hash
+    )
+
+
+def _create_venv(venv_dir: Path, uv: str | None) -> None:
+    if venv_dir.exists():
+        shutil.rmtree(venv_dir)
+    cmd = (
+        [uv, "venv", str(venv_dir), "--python", sys.executable]
+        if uv
+        else [sys.executable, "-m", "venv", str(venv_dir)]
+    )
+    subprocess.run(cmd, check=True, capture_output=True)
+
+
+def _install_deps(venv_dir: Path, requirements: Path, uv: str | None) -> None:
+    python = str(_venv_python(venv_dir))
+    sdk_spec = "agentevals-evaluator-sdk"
+
+    if uv:
+        base = [uv, "pip", "install", "--python", python]
+    else:
+        base = [python, "-m", "pip", "install"]
+
+    subprocess.run(base + [sdk_spec], check=True, capture_output=True)
+    logger.info("Installing dependencies from %s ...", requirements.name)
+    subprocess.run(base + ["-r", str(requirements)], check=True)
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def ensure_venv(evaluator_path: Path) -> Path | None:
+    """Ensure a cached venv exists for *evaluator_path* if it has ``requirements.txt``.
+
+    Returns the venv Python path, or ``None`` if no venv is needed.
+    """
+    requirements = evaluator_path.resolve().parent / "requirements.txt"
+    if not requirements.exists():
+        return None
+
+    req_hash = hashlib.sha256(requirements.read_bytes()).hexdigest()
+    venv_dir = _VENV_CACHE_DIR / _venv_key(evaluator_path)
+
+    if _is_venv_valid(venv_dir, req_hash):
+        logger.debug("Using cached venv for %s at %s", evaluator_path.name, venv_dir)
+        return _venv_python(venv_dir)
+
+    uv = shutil.which("uv")
+    logger.info(
+        "Setting up environment for evaluator '%s' (using %s). "
+        "This may take a while on first run...",
+        evaluator_path.stem,
+        "uv" if uv else "venv+pip",
+    )
+
+    try:
+        _create_venv(venv_dir, uv)
+        _install_deps(venv_dir, requirements, uv)
+    except subprocess.CalledProcessError as exc:
+        stderr = exc.stderr.decode() if isinstance(exc.stderr, bytes) else (exc.stderr or "")
+        raise RuntimeError(
+            f"Failed to set up environment for evaluator '{evaluator_path.stem}': {stderr}"
+        ) from exc
+
+    (venv_dir / _HASH_FILE).write_text(req_hash)
+    logger.info("Environment ready for '%s'", evaluator_path.stem)
+    return _venv_python(venv_dir)
+
+
+async def ensure_venv_async(evaluator_path: Path) -> Path | None:
+    """Async wrapper around :func:`ensure_venv`; serializes setup per venv key and runs it in a worker thread."""
+    venv_key = _venv_key(evaluator_path)
+    if venv_key not in _venv_locks:
+        _venv_locks[venv_key] = asyncio.Lock()
+
+    async with _venv_locks[venv_key]:
+        return await asyncio.to_thread(ensure_venv, evaluator_path)