Skip to content

Commit 777a927

Browse files
add venv support for custom evals
1 parent 8919baf commit 777a927

4 files changed

Lines changed: 200 additions & 16 deletions

File tree

examples/custom_evaluators/eval_config.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,13 @@ evaluators:
3333
threshold: 0.110
3434
executor: local
3535

36+
# TODO switch to GitHub once PR is approved.
37+
- name: bertscore
38+
type: code
39+
path: ../evaluators/evaluators/bertscore/bertscore.py
40+
threshold: 0.7
41+
timeout: 300
42+
config:
43+
expected: "There are two Helm releases installed in the cluster: kagent in namespace kagent (revision 2, deployed, chart kagent-0.7.14) and kagent-crds in namespace kagent (revision 1, deployed, chart kagent-crds-0.7.14)."
44+
metric: "f1"
45+

src/agentevals/custom_evaluators.py

Lines changed: 42 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,12 @@ def extensions(self) -> tuple[str, ...]:
6868
"""File extensions this runtime handles (e.g. ``(".py",)``)."""
6969

7070
@abc.abstractmethod
71-
def build_command(self, path: Path) -> list[str]:
72-
"""Return the argv list to execute *path*."""
71+
def build_command(self, path: Path, python: Path | None = None) -> list[str]:
72+
"""Return the argv list to execute *path*.
73+
74+
For Python runtimes, *python* may point to a venv interpreter.
75+
Non-Python runtimes ignore this parameter.
76+
"""
7377

7478
def is_available(self) -> bool:
7579
"""Return True if the runtime's interpreter is found on the system."""
@@ -89,8 +93,9 @@ def name(self) -> str:
8993
def extensions(self) -> tuple[str, ...]:
9094
return (".py",)
9195

92-
def build_command(self, path: Path) -> list[str]:
93-
return [sys.executable, str(path)]
96+
def build_command(self, path: Path, python: Path | None = None) -> list[str]:
97+
exe = str(python) if python else sys.executable
98+
return [exe, str(path)]
9499

95100
def is_available(self) -> bool:
96101
return True
@@ -105,7 +110,7 @@ def name(self) -> str:
105110
def extensions(self) -> tuple[str, ...]:
106111
return (".js", ".ts")
107112

108-
def build_command(self, path: Path) -> list[str]:
113+
def build_command(self, path: Path, python: Path | None = None) -> list[str]:
109114
node = shutil.which("node")
110115
if not node:
111116
raise RuntimeError("Node.js not found on PATH (required for .js/.ts evaluators)")
@@ -203,40 +208,45 @@ class SubprocessBackend(EvaluatorBackend):
203208
"""Runs a local code file (.py, .js, .ts, …) as a subprocess.
204209
205210
The correct interpreter is resolved from the file extension via the
206-
:data:`_RUNTIMES` registry.
211+
:data:`_RUNTIMES` registry. When *venv_python* is provided, Python
212+
evaluators run inside that virtual environment instead of the host
213+
interpreter.
207214
"""
208215

209-
def __init__(self, path: Path, timeout: int = 30):
216+
def __init__(self, path: Path, timeout: int = 30, venv_python: Path | None = None):
210217
self._path = path.resolve()
211218
self._runtime = _resolve_runtime(self._path)
212219
self._timeout = timeout
220+
self._venv_python = venv_python
213221

214222
if not self._path.exists():
215223
raise FileNotFoundError(f"Evaluator file not found: {self._path}")
216224

217225
async def run(self, eval_input: EvalInput, metric_name: str) -> EvalResult:
218-
cmd = self._runtime.build_command(self._path)
226+
cmd = self._runtime.build_command(self._path, self._venv_python)
219227
return await _run_subprocess(cmd, eval_input.model_dump_json(), self._timeout, metric_name)
220228

221229

222230
# ---------------------------------------------------------------------------
223231
# Executor factory
224232
# ---------------------------------------------------------------------------
225233

226-
_EXECUTOR_FACTORIES: dict[str, Callable[[Path, int], EvaluatorBackend]] = {
227-
"local": lambda path, timeout: SubprocessBackend(path, timeout),
234+
_EXECUTOR_FACTORIES: dict[str, Callable[..., EvaluatorBackend]] = {
235+
"local": lambda path, timeout, venv_python=None: SubprocessBackend(path, timeout, venv_python),
228236
}
229237

230238

231-
def create_executor(executor_name: str, path: Path, timeout: int = 30) -> EvaluatorBackend:
239+
def create_executor(
240+
executor_name: str, path: Path, timeout: int = 30, venv_python: Path | None = None
241+
) -> EvaluatorBackend:
232242
"""Construct an EvaluatorBackend by executor name (e.g. 'local', 'docker')."""
233243
factory = _EXECUTOR_FACTORIES.get(executor_name)
234244
if factory is None:
235245
raise ValueError(f"Unknown executor '{executor_name}'. Available: {sorted(_EXECUTOR_FACTORIES.keys())}")
236-
return factory(path, timeout)
246+
return factory(path, timeout, venv_python)
237247

238248

239-
def register_executor(name: str, factory: Callable[[Path, int], EvaluatorBackend]) -> None:
249+
def register_executor(name: str, factory: Callable[..., EvaluatorBackend]) -> None:
240250
"""Register a new executor factory (e.g. for Docker support)."""
241251
_EXECUTOR_FACTORIES[name] = factory
242252

@@ -425,7 +435,25 @@ async def evaluate_custom_evaluator(
425435
evaluator_def = await get_default_resolver().resolve(evaluator_def)
426436

427437
if isinstance(evaluator_def, CodeEvaluatorDef):
428-
backend = create_executor(evaluator_def.executor, Path(evaluator_def.path), evaluator_def.timeout)
438+
evaluator_path = Path(evaluator_def.path)
439+
440+
# Set up a venv if the evaluator ships a requirements.txt.
441+
venv_python: Path | None = None
442+
if evaluator_path.suffix == ".py":
443+
from .evaluator.venv import ensure_venv_async
444+
445+
try:
446+
venv_python = await ensure_venv_async(evaluator_path)
447+
except Exception as exc:
448+
logger.error("Failed to set up venv for '%s': %s", evaluator_def.name, exc)
449+
return MetricResult(
450+
metric_name=evaluator_def.name,
451+
error=f"Dependency installation failed: {exc}",
452+
)
453+
454+
backend = create_executor(
455+
evaluator_def.executor, evaluator_path, evaluator_def.timeout, venv_python=venv_python
456+
)
429457
else:
430458
raise ValueError(f"Unsupported custom evaluator type: {type(evaluator_def).__name__}")
431459

src/agentevals/evaluator/sources.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,8 +216,22 @@ async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
216216
resp = await client.get(url, headers=self._headers(), timeout=30)
217217
resp.raise_for_status()
218218

219-
dest.parent.mkdir(parents=True, exist_ok=True)
220-
dest.write_text(resp.text, encoding="utf-8") # noqa: ASYNC240
219+
dest.parent.mkdir(parents=True, exist_ok=True)
220+
dest.write_text(resp.text, encoding="utf-8") # noqa: ASYNC240
221+
222+
# Also try to fetch requirements.txt from the same directory.
223+
ref_dir = str(Path(ref).parent)
224+
req_ref = f"{ref_dir}/requirements.txt"
225+
req_url = self._raw_url(req_ref)
226+
try:
227+
req_resp = await client.get(req_url, headers=self._headers(), timeout=15)
228+
if req_resp.status_code == 200:
229+
req_dest = dest.parent / "requirements.txt"
230+
req_dest.write_text(req_resp.text, encoding="utf-8") # noqa: ASYNC240
231+
logger.info("Downloaded requirements.txt for evaluator")
232+
except httpx.HTTPError:
233+
logger.debug("No requirements.txt found for evaluator (or download failed)")
234+
221235
return dest
222236

223237

@@ -267,6 +281,12 @@ async def fetch_evaluator(self, ref: str, dest: Path) -> Path:
267281

268282
dest.parent.mkdir(parents=True, exist_ok=True)
269283
shutil.copy2(src, dest)
284+
285+
# Also copy requirements.txt if it exists alongside the source file.
286+
req_src = src.parent / "requirements.txt"
287+
if req_src.exists():
288+
shutil.copy2(req_src, dest.parent / "requirements.txt")
289+
270290
return dest
271291

272292

src/agentevals/evaluator/venv.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
"""Virtual environment management for evaluators with dependencies.
2+
3+
When an evaluator ships a ``requirements.txt`` alongside its entrypoint, we
4+
create a cached venv, install the dependencies (plus the evaluator SDK), and
5+
return the path to that venv's Python interpreter so the evaluator subprocess
6+
runs in isolation.
7+
"""
8+
9+
from __future__ import annotations
10+
11+
import asyncio
12+
import hashlib
13+
import logging
14+
import shutil
15+
import subprocess
16+
import sys
17+
from pathlib import Path
18+
19+
logger = logging.getLogger(__name__)

# Root directory under the user's home where per-evaluator venvs are cached.
_VENV_CACHE_DIR = Path.home() / ".cache" / "agentevals" / "venvs"
# Marker file written inside each venv; it stores the SHA-256 of the
# requirements.txt the venv was built from.
_HASH_FILE = ".requirements_hash"

# Per-evaluator locks to prevent concurrent venv creation for the same evaluator.
_venv_locks: dict[str, asyncio.Lock] = {}
26+
27+
28+
def _venv_python(venv_dir: Path) -> Path:
    """Return the path of the Python interpreter inside *venv_dir*.

    On Windows (``sys.platform == "win32"``) the interpreter is
    ``Scripts/python.exe``; on every other platform it is ``bin/python``.
    The path is only computed here; nothing checks that it exists.
    """
    if sys.platform == "win32":
        return venv_dir / "Scripts" / "python.exe"
    return venv_dir / "bin" / "python"
32+
33+
34+
def _venv_key(evaluator_path: Path) -> str:
    """Stable cache directory name derived from evaluator location.

    The key is ``<parent-dir-name>-<hash>``, where ``<hash>`` is the first
    8 hex characters of the SHA-256 of the resolved parent directory path.
    Only the parent directory contributes, so every evaluator file in the
    same directory maps to the same key.
    """
    resolved = evaluator_path.resolve()
    name = resolved.parent.name
    path_hash = hashlib.sha256(str(resolved.parent).encode()).hexdigest()[:8]
    return f"{name}-{path_hash}"
40+
41+
42+
def _is_venv_valid(venv_dir: Path, req_hash: str) -> bool:
    """Return True if the venv at *venv_dir* can be reused for *req_hash*.

    The venv is reusable when its interpreter exists, the hash marker file
    exists, and the stored hash (ignoring surrounding whitespace) equals
    *req_hash*.
    """
    hash_file = venv_dir / _HASH_FILE
    return (
        _venv_python(venv_dir).exists()
        and hash_file.exists()
        and hash_file.read_text().strip() == req_hash
    )
49+
50+
51+
def _create_venv(venv_dir: Path, uv: str | None) -> None:
    """Create a fresh virtual environment at *venv_dir*.

    Any existing directory at *venv_dir* is deleted first. When *uv* (the
    path to a ``uv`` executable) is given, ``uv venv`` is used and pinned to
    the current interpreter; otherwise the standard-library ``venv`` module
    is run with the current interpreter.

    Raises:
        subprocess.CalledProcessError: if the creation command exits
            non-zero; its output is captured on the exception.
    """
    if venv_dir.exists():
        shutil.rmtree(venv_dir)
    cmd = (
        [uv, "venv", str(venv_dir), "--python", sys.executable]
        if uv
        else [sys.executable, "-m", "venv", str(venv_dir)]
    )
    subprocess.run(cmd, check=True, capture_output=True)
60+
61+
62+
def _install_deps(venv_dir: Path, requirements: Path, uv: str | None) -> None:
    """Install the evaluator SDK and *requirements* into the venv at *venv_dir*.

    Uses ``uv pip install --python <venv python>`` when *uv* is given,
    otherwise the venv's own ``python -m pip install``. The SDK is installed
    first, then the contents of the requirements file.

    Raises:
        subprocess.CalledProcessError: if either install command exits
            non-zero. ``stderr`` is captured on the exception in both cases.
    """
    python = str(_venv_python(venv_dir))
    sdk_spec = "agentevals-evaluator-sdk"

    if uv:
        base = [uv, "pip", "install", "--python", python]
    else:
        base = [python, "-m", "pip", "install"]

    subprocess.run(base + [sdk_spec], check=True, capture_output=True)
    logger.info("Installing dependencies from %s ...", requirements.name)
    # Pipe stderr so a failure carries the installer's error text on the
    # raised CalledProcessError; stdout is left attached to the parent.
    subprocess.run(base + ["-r", str(requirements)], check=True, stderr=subprocess.PIPE)
74+
75+
76+
# ---------------------------------------------------------------------------
77+
# Public API
78+
# ---------------------------------------------------------------------------
79+
80+
81+
def ensure_venv(evaluator_path: Path) -> Path | None:
    """Ensure a cached venv exists for *evaluator_path* if it has ``requirements.txt``.

    A ``requirements.txt`` is looked up next to the resolved evaluator file.
    If a cached venv exists whose stored hash matches the file's SHA-256, it
    is reused; otherwise the venv is recreated and dependencies reinstalled.

    Returns the venv Python path, or ``None`` if no venv is needed.

    Raises:
        RuntimeError: if creating the venv or installing dependencies fails.
    """
    requirements = evaluator_path.resolve().parent / "requirements.txt"
    if not requirements.exists():
        return None

    req_hash = hashlib.sha256(requirements.read_bytes()).hexdigest()
    venv_dir = _VENV_CACHE_DIR / _venv_key(evaluator_path)

    if _is_venv_valid(venv_dir, req_hash):
        logger.debug("Using cached venv for %s at %s", evaluator_path.name, venv_dir)
        return _venv_python(venv_dir)

    uv = shutil.which("uv")
    logger.info(
        "Setting up environment for evaluator '%s' (using %s). "
        "This may take a while on first run...",
        evaluator_path.stem,
        "uv" if uv else "venv+pip",
    )

    try:
        _create_venv(venv_dir, uv)
        _install_deps(venv_dir, requirements, uv)
    except subprocess.CalledProcessError as exc:
        raw = exc.stderr
        if isinstance(raw, bytes):
            # Installer output is not guaranteed to be valid UTF-8; a strict
            # decode could raise here and hide the real failure.
            raw = raw.decode(errors="replace")
        # stderr is None/empty when the failing command did not capture it;
        # fall back to the exception text so the message is never blank.
        detail = (raw or "").strip() or str(exc)
        raise RuntimeError(
            f"Failed to set up environment for evaluator '{evaluator_path.stem}': {detail}"
        ) from exc

    # Written only after a successful install, so an interrupted setup is
    # never mistaken for a valid cached venv.
    (venv_dir / _HASH_FILE).write_text(req_hash)
    logger.info("Environment ready for '%s'", evaluator_path.stem)
    return _venv_python(venv_dir)
117+
118+
119+
async def ensure_venv_async(evaluator_path: Path) -> Path | None:
    """Async wrapper around :func:`ensure_venv` with per-evaluator locking.

    One ``asyncio.Lock`` is kept per venv key, so concurrent calls that map
    to the same key run one at a time. The blocking setup itself runs in a
    worker thread via ``asyncio.to_thread``.

    Returns whatever :func:`ensure_venv` returns for *evaluator_path*.
    """
    venv_key = _venv_key(evaluator_path)
    # setdefault creates the lock on first use and returns the existing one
    # afterwards, replacing the separate membership check and insert.
    lock = _venv_locks.setdefault(venv_key, asyncio.Lock())

    async with lock:
        return await asyncio.to_thread(ensure_venv, evaluator_path)

0 commit comments

Comments
 (0)