Skip to content

Commit a74e734

Browse files
add openai_eval type to delegate eval to OpenAI APIs
1 parent b117a0d commit a74e734

7 files changed

Lines changed: 316 additions & 4 deletions

File tree

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ streaming = [
3030
"opentelemetry-sdk>=1.20.0",
3131
"websockets>=12.0",
3232
]
33+
openai = [
34+
"openai>=2.0",
35+
]
3336

3437
[project.scripts]
3538
agentevals = "agentevals.cli:main"

src/agentevals/api/routes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
CodeEvaluatorDef,
2323
CustomEvaluatorDef,
2424
EvalRunConfig,
25+
OpenAIEvalDef,
2526
)
2627
from ..extraction import get_extractor
2728
from ..runner import RunResult, get_loader, load_eval_set, run_evaluation
@@ -58,6 +59,7 @@ def _camel_keys(obj: Any) -> Any:
5859
_TYPE_TO_MODEL = {
5960
"builtin": BuiltinMetricDef,
6061
"code": CodeEvaluatorDef,
62+
"openai_eval": OpenAIEvalDef,
6163
}
6264

6365

src/agentevals/config.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,51 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
5353
ref: str = Field(description="Source-specific reference (e.g. path within the repo).")
5454

5555

56+
_VALID_SIMILARITY_METRICS = frozenset({
57+
"fuzzy_match",
58+
"bleu",
59+
"gleu",
60+
"meteor",
61+
"cosine",
62+
"rouge_1",
63+
"rouge_2",
64+
"rouge_3",
65+
"rouge_4",
66+
"rouge_5",
67+
"rouge_l",
68+
})
69+
70+
71+
class OpenAIEvalDef(BaseModel):
72+
"""An evaluator that delegates grading to the OpenAI Evals API."""
73+
74+
type: Literal["openai_eval"] = "openai_eval"
75+
name: str
76+
threshold: float = 0.5
77+
timeout: int = Field(default=120, description="Max seconds to wait for the OpenAI eval run to complete.")
78+
grader: dict[str, Any] = Field(description="OpenAI grader config passed to testing_criteria.")
79+
80+
@field_validator("grader")
81+
@classmethod
82+
def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
83+
grader_type = v.get("type")
84+
if grader_type != "text_similarity":
85+
raise ValueError(
86+
f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'"
87+
)
88+
metric = v.get("evaluation_metric")
89+
if not metric:
90+
raise ValueError("'evaluation_metric' is required for text_similarity grader")
91+
if metric not in _VALID_SIMILARITY_METRICS:
92+
raise ValueError(
93+
f"Unknown evaluation_metric '{metric}'. "
94+
f"Valid: {sorted(_VALID_SIMILARITY_METRICS)}"
95+
)
96+
return v
97+
98+
5699
CustomEvaluatorDef = Annotated[
57-
BuiltinMetricDef | CodeEvaluatorDef | RemoteEvaluatorDef,
100+
BuiltinMetricDef | CodeEvaluatorDef | RemoteEvaluatorDef | OpenAIEvalDef,
58101
Field(discriminator="type"),
59102
]
60103

src/agentevals/custom_evaluators.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,9 +425,14 @@ async def evaluate_custom_evaluator(
425425
"""
426426
import inspect as _inspect
427427

428-
from .config import CodeEvaluatorDef, RemoteEvaluatorDef
428+
from .config import CodeEvaluatorDef, OpenAIEvalDef, RemoteEvaluatorDef
429429
from .runner import MetricResult
430430

431+
if isinstance(evaluator_def, OpenAIEvalDef):
432+
from .openai_eval_backend import evaluate_openai_eval
433+
434+
return await evaluate_openai_eval(evaluator_def, actual_invocations, expected_invocations)
435+
431436
if isinstance(evaluator_def, RemoteEvaluatorDef):
432437
from .evaluator.resolver import get_default_resolver
433438

src/agentevals/eval_config_loader.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
CodeEvaluatorDef,
1414
CustomEvaluatorDef,
1515
EvalRunConfig,
16+
OpenAIEvalDef,
1617
RemoteEvaluatorDef,
1718
)
1819

@@ -22,6 +23,7 @@
2223
"builtin": BuiltinMetricDef,
2324
"code": CodeEvaluatorDef,
2425
"remote": RemoteEvaluatorDef,
26+
"openai_eval": OpenAIEvalDef,
2527
}
2628

2729

@@ -42,7 +44,7 @@ def _parse_evaluator_entry(entry: dict[str, Any]) -> tuple[str | None, CustomEva
4244

4345
evaluator_type = entry.get("type")
4446
if not evaluator_type:
45-
raise ValueError(f"Evaluator entry '{name}' must have a 'type' field (builtin, code, or remote)")
47+
raise ValueError(f"Evaluator entry '{name}' must have a 'type' field ({', '.join(_TYPE_TO_MODEL)})")
4648

4749
if evaluator_type not in _TYPE_TO_MODEL:
4850
raise ValueError(
Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
"""OpenAI Evals API backend — delegates grading to the OpenAI Evals API.
2+
3+
Builds testing criteria from the evaluator config, submits invocation pairs
4+
as JSONL items, polls for completion, and maps per-item results back to a
5+
MetricResult.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
import asyncio
11+
import logging
12+
import os
13+
import time
14+
from typing import Any
15+
16+
from google.adk.evaluation.eval_case import Invocation
17+
18+
from .config import OpenAIEvalDef
19+
from .custom_evaluators import _content_to_text
20+
21+
logger = logging.getLogger(__name__)
22+
23+
_POLL_INTERVAL_SECONDS = 2
24+
25+
_TEXT_PAIR_SCHEMA = {
26+
"type": "object",
27+
"properties": {
28+
"actual_response": {"type": "string"},
29+
"expected_response": {"type": "string"},
30+
},
31+
"required": ["actual_response", "expected_response"],
32+
}
33+
34+
35+
def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
36+
"""Build the OpenAI testing_criteria dict from the evaluator config.
37+
38+
Each grader type produces a different shape. Extend this function
39+
when adding support for new OpenAI grader types.
40+
"""
41+
grader = evaluator_def.grader
42+
grader_type = grader["type"]
43+
44+
if grader_type == "text_similarity":
45+
return {
46+
"type": "text_similarity",
47+
"name": evaluator_def.name,
48+
"input": "{{ item.actual_response }}",
49+
"reference": "{{ item.expected_response }}",
50+
"evaluation_metric": grader["evaluation_metric"],
51+
"pass_threshold": evaluator_def.threshold,
52+
}
53+
54+
raise ValueError(f"Unsupported grader type: {grader_type}")
55+
56+
57+
def _build_jsonl_items(
58+
actual_invocations: list[Invocation],
59+
expected_invocations: list[Invocation],
60+
) -> list[dict[str, Any]]:
61+
items = []
62+
for i, actual_inv in enumerate(actual_invocations):
63+
actual_text = _content_to_text(actual_inv.final_response)
64+
if i < len(expected_invocations):
65+
expected_text = _content_to_text(expected_invocations[i].final_response)
66+
else:
67+
expected_text = ""
68+
items.append({
69+
"item": {
70+
"actual_response": actual_text,
71+
"expected_response": expected_text,
72+
}
73+
})
74+
return items
75+
76+
77+
def _get_openai_client():
78+
try:
79+
from openai import OpenAI
80+
except ImportError as exc:
81+
raise ImportError(
82+
"The 'openai' package is required for openai_eval evaluators. "
83+
"Install it with: pip install 'agentevals-cli[openai]'"
84+
) from exc
85+
return OpenAI()
86+
87+
88+
def _extract_item_score(output_item: Any) -> float | None:
89+
results = getattr(output_item, "results", None)
90+
if not results:
91+
return None
92+
for r in results:
93+
if getattr(r, "score", None) is not None:
94+
return float(r.score)
95+
return None
96+
97+
98+
async def evaluate_openai_eval(
99+
evaluator_def: OpenAIEvalDef,
100+
actual_invocations: list[Invocation],
101+
expected_invocations: list[Invocation] | None,
102+
) -> Any:
103+
"""Run an evaluation via the OpenAI Evals API and return a MetricResult."""
104+
from .runner import MetricResult
105+
106+
if not os.environ.get("OPENAI_API_KEY"):
107+
return MetricResult(
108+
metric_name=evaluator_def.name,
109+
error="OPENAI_API_KEY environment variable is not set.",
110+
)
111+
112+
if expected_invocations is None:
113+
return MetricResult(
114+
metric_name=evaluator_def.name,
115+
error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
116+
)
117+
118+
items = _build_jsonl_items(actual_invocations, expected_invocations)
119+
if not items:
120+
return MetricResult(
121+
metric_name=evaluator_def.name,
122+
error="No invocations to evaluate.",
123+
)
124+
125+
testing_criteria = _build_testing_criteria(evaluator_def)
126+
eval_id = None
127+
128+
try:
129+
client = await asyncio.to_thread(_get_openai_client)
130+
131+
eval_obj = await asyncio.to_thread(
132+
client.evals.create,
133+
name=f"agentevals-{evaluator_def.name}",
134+
data_source_config={
135+
"type": "custom",
136+
"item_schema": _TEXT_PAIR_SCHEMA,
137+
"include_sample_schema": False,
138+
},
139+
testing_criteria=[testing_criteria],
140+
)
141+
eval_id = eval_obj.id
142+
logger.info("Created OpenAI eval %s for '%s'", eval_id, evaluator_def.name)
143+
144+
run = await asyncio.to_thread(
145+
client.evals.runs.create,
146+
eval_id=eval_id,
147+
name=f"agentevals-run-{evaluator_def.name}",
148+
data_source={
149+
"type": "jsonl",
150+
"source": {
151+
"type": "file_content",
152+
"content": items,
153+
},
154+
},
155+
)
156+
run_id = run.id
157+
logger.info("Created OpenAI eval run %s", run_id)
158+
159+
run = await _poll_run(client, eval_id, run_id, evaluator_def)
160+
if isinstance(run, MetricResult):
161+
return run
162+
163+
return await _collect_results(client, eval_id, run_id, run, evaluator_def)
164+
165+
except ImportError:
166+
raise
167+
except Exception as exc:
168+
logger.exception("OpenAI eval failed for '%s'", evaluator_def.name)
169+
return MetricResult(
170+
metric_name=evaluator_def.name,
171+
error=f"OpenAI Evals API error: {exc}",
172+
)
173+
finally:
174+
if eval_id:
175+
try:
176+
await asyncio.to_thread(client.evals.delete, eval_id)
177+
logger.debug("Cleaned up OpenAI eval %s", eval_id)
178+
except Exception:
179+
logger.debug("Failed to clean up OpenAI eval %s", eval_id, exc_info=True)
180+
181+
182+
async def _poll_run(client: Any, eval_id: str, run_id: str, evaluator_def: OpenAIEvalDef) -> Any:
183+
"""Poll until the run completes. Returns the run object, or a MetricResult on error/timeout."""
184+
from .runner import MetricResult
185+
186+
start_time = time.monotonic()
187+
while True:
188+
elapsed = time.monotonic() - start_time
189+
if elapsed > evaluator_def.timeout:
190+
return MetricResult(
191+
metric_name=evaluator_def.name,
192+
error=f"OpenAI eval run timed out after {evaluator_def.timeout}s.",
193+
)
194+
195+
run = await asyncio.to_thread(
196+
client.evals.runs.retrieve, run_id, eval_id=eval_id
197+
)
198+
199+
if run.status == "completed":
200+
return run
201+
if run.status in ("failed", "canceled"):
202+
return MetricResult(
203+
metric_name=evaluator_def.name,
204+
error=f"OpenAI eval run {run.status}: {getattr(run, 'error', 'unknown')}",
205+
)
206+
207+
await asyncio.sleep(_POLL_INTERVAL_SECONDS)
208+
209+
210+
async def _collect_results(
211+
client: Any, eval_id: str, run_id: str, run: Any, evaluator_def: OpenAIEvalDef
212+
) -> Any:
213+
"""Extract scores from a completed run and return a MetricResult."""
214+
from .runner import MetricResult
215+
216+
output_items_page = await asyncio.to_thread(
217+
client.evals.runs.output_items.list, run_id=run_id, eval_id=eval_id
218+
)
219+
output_items = list(output_items_page.data) if output_items_page.data else []
220+
221+
per_invocation_scores: list[float | None] = [
222+
_extract_item_score(item) for item in output_items
223+
]
224+
225+
valid_scores = [s for s in per_invocation_scores if s is not None]
226+
overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0
227+
228+
result_counts = run.result_counts
229+
passed = result_counts.passed if result_counts else 0
230+
failed = result_counts.failed if result_counts else 0
231+
total = result_counts.total if result_counts else 0
232+
eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"
233+
234+
details: dict[str, Any] = {
235+
"openai_eval_id": eval_id,
236+
"openai_run_id": run_id,
237+
"evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
238+
"result_counts": {"passed": passed, "failed": failed, "total": total},
239+
}
240+
per_criteria = getattr(run, "per_testing_criteria_results", None)
241+
if per_criteria:
242+
details["per_testing_criteria"] = [
243+
{"name": c.testing_criteria, "passed": c.passed, "failed": c.failed}
244+
for c in per_criteria
245+
]
246+
247+
return MetricResult(
248+
metric_name=evaluator_def.name,
249+
score=overall_score,
250+
eval_status=eval_status,
251+
per_invocation_scores=per_invocation_scores,
252+
details=details,
253+
)

0 commit comments

Comments
 (0)