|
| 1 | +"""OpenAI Evals API backend — delegates grading to the OpenAI Evals API. |
| 2 | +
|
| 3 | +Builds testing criteria from the evaluator config, submits invocation pairs |
| 4 | +as JSONL items, polls for completion, and maps per-item results back to a |
| 5 | +MetricResult. |
| 6 | +""" |
| 7 | + |
| 8 | +from __future__ import annotations |
| 9 | + |
| 10 | +import asyncio |
| 11 | +import logging |
| 12 | +import os |
| 13 | +import time |
| 14 | +from typing import Any |
| 15 | + |
| 16 | +from google.adk.evaluation.eval_case import Invocation |
| 17 | + |
| 18 | +from .config import OpenAIEvalDef |
| 19 | +from .custom_evaluators import _content_to_text |
| 20 | + |
| 21 | +logger = logging.getLogger(__name__) |
| 22 | + |
| 23 | +_POLL_INTERVAL_SECONDS = 2 |
| 24 | + |
| 25 | +_TEXT_PAIR_SCHEMA = { |
| 26 | + "type": "object", |
| 27 | + "properties": { |
| 28 | + "actual_response": {"type": "string"}, |
| 29 | + "expected_response": {"type": "string"}, |
| 30 | + }, |
| 31 | + "required": ["actual_response", "expected_response"], |
| 32 | +} |
| 33 | + |
| 34 | + |
| 35 | +def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]: |
| 36 | + """Build the OpenAI testing_criteria dict from the evaluator config. |
| 37 | +
|
| 38 | + Each grader type produces a different shape. Extend this function |
| 39 | + when adding support for new OpenAI grader types. |
| 40 | + """ |
| 41 | + grader = evaluator_def.grader |
| 42 | + grader_type = grader["type"] |
| 43 | + |
| 44 | + if grader_type == "text_similarity": |
| 45 | + return { |
| 46 | + "type": "text_similarity", |
| 47 | + "name": evaluator_def.name, |
| 48 | + "input": "{{ item.actual_response }}", |
| 49 | + "reference": "{{ item.expected_response }}", |
| 50 | + "evaluation_metric": grader["evaluation_metric"], |
| 51 | + "pass_threshold": evaluator_def.threshold, |
| 52 | + } |
| 53 | + |
| 54 | + raise ValueError(f"Unsupported grader type: {grader_type}") |
| 55 | + |
| 56 | + |
| 57 | +def _build_jsonl_items( |
| 58 | + actual_invocations: list[Invocation], |
| 59 | + expected_invocations: list[Invocation], |
| 60 | +) -> list[dict[str, Any]]: |
| 61 | + items = [] |
| 62 | + for i, actual_inv in enumerate(actual_invocations): |
| 63 | + actual_text = _content_to_text(actual_inv.final_response) |
| 64 | + if i < len(expected_invocations): |
| 65 | + expected_text = _content_to_text(expected_invocations[i].final_response) |
| 66 | + else: |
| 67 | + expected_text = "" |
| 68 | + items.append({ |
| 69 | + "item": { |
| 70 | + "actual_response": actual_text, |
| 71 | + "expected_response": expected_text, |
| 72 | + } |
| 73 | + }) |
| 74 | + return items |
| 75 | + |
| 76 | + |
| 77 | +def _get_openai_client(): |
| 78 | + try: |
| 79 | + from openai import OpenAI |
| 80 | + except ImportError as exc: |
| 81 | + raise ImportError( |
| 82 | + "The 'openai' package is required for openai_eval evaluators. " |
| 83 | + "Install it with: pip install 'agentevals-cli[openai]'" |
| 84 | + ) from exc |
| 85 | + return OpenAI() |
| 86 | + |
| 87 | + |
| 88 | +def _extract_item_score(output_item: Any) -> float | None: |
| 89 | + results = getattr(output_item, "results", None) |
| 90 | + if not results: |
| 91 | + return None |
| 92 | + for r in results: |
| 93 | + if getattr(r, "score", None) is not None: |
| 94 | + return float(r.score) |
| 95 | + return None |
| 96 | + |
| 97 | + |
| 98 | +async def evaluate_openai_eval( |
| 99 | + evaluator_def: OpenAIEvalDef, |
| 100 | + actual_invocations: list[Invocation], |
| 101 | + expected_invocations: list[Invocation] | None, |
| 102 | +) -> Any: |
| 103 | + """Run an evaluation via the OpenAI Evals API and return a MetricResult.""" |
| 104 | + from .runner import MetricResult |
| 105 | + |
| 106 | + if not os.environ.get("OPENAI_API_KEY"): |
| 107 | + return MetricResult( |
| 108 | + metric_name=evaluator_def.name, |
| 109 | + error="OPENAI_API_KEY environment variable is not set.", |
| 110 | + ) |
| 111 | + |
| 112 | + if expected_invocations is None: |
| 113 | + return MetricResult( |
| 114 | + metric_name=evaluator_def.name, |
| 115 | + error="OpenAI text_similarity grader requires expected invocations (golden eval set).", |
| 116 | + ) |
| 117 | + |
| 118 | + items = _build_jsonl_items(actual_invocations, expected_invocations) |
| 119 | + if not items: |
| 120 | + return MetricResult( |
| 121 | + metric_name=evaluator_def.name, |
| 122 | + error="No invocations to evaluate.", |
| 123 | + ) |
| 124 | + |
| 125 | + testing_criteria = _build_testing_criteria(evaluator_def) |
| 126 | + eval_id = None |
| 127 | + |
| 128 | + try: |
| 129 | + client = await asyncio.to_thread(_get_openai_client) |
| 130 | + |
| 131 | + eval_obj = await asyncio.to_thread( |
| 132 | + client.evals.create, |
| 133 | + name=f"agentevals-{evaluator_def.name}", |
| 134 | + data_source_config={ |
| 135 | + "type": "custom", |
| 136 | + "item_schema": _TEXT_PAIR_SCHEMA, |
| 137 | + "include_sample_schema": False, |
| 138 | + }, |
| 139 | + testing_criteria=[testing_criteria], |
| 140 | + ) |
| 141 | + eval_id = eval_obj.id |
| 142 | + logger.info("Created OpenAI eval %s for '%s'", eval_id, evaluator_def.name) |
| 143 | + |
| 144 | + run = await asyncio.to_thread( |
| 145 | + client.evals.runs.create, |
| 146 | + eval_id=eval_id, |
| 147 | + name=f"agentevals-run-{evaluator_def.name}", |
| 148 | + data_source={ |
| 149 | + "type": "jsonl", |
| 150 | + "source": { |
| 151 | + "type": "file_content", |
| 152 | + "content": items, |
| 153 | + }, |
| 154 | + }, |
| 155 | + ) |
| 156 | + run_id = run.id |
| 157 | + logger.info("Created OpenAI eval run %s", run_id) |
| 158 | + |
| 159 | + run = await _poll_run(client, eval_id, run_id, evaluator_def) |
| 160 | + if isinstance(run, MetricResult): |
| 161 | + return run |
| 162 | + |
| 163 | + return await _collect_results(client, eval_id, run_id, run, evaluator_def) |
| 164 | + |
| 165 | + except ImportError: |
| 166 | + raise |
| 167 | + except Exception as exc: |
| 168 | + logger.exception("OpenAI eval failed for '%s'", evaluator_def.name) |
| 169 | + return MetricResult( |
| 170 | + metric_name=evaluator_def.name, |
| 171 | + error=f"OpenAI Evals API error: {exc}", |
| 172 | + ) |
| 173 | + finally: |
| 174 | + if eval_id: |
| 175 | + try: |
| 176 | + await asyncio.to_thread(client.evals.delete, eval_id) |
| 177 | + logger.debug("Cleaned up OpenAI eval %s", eval_id) |
| 178 | + except Exception: |
| 179 | + logger.debug("Failed to clean up OpenAI eval %s", eval_id, exc_info=True) |
| 180 | + |
| 181 | + |
| 182 | +async def _poll_run(client: Any, eval_id: str, run_id: str, evaluator_def: OpenAIEvalDef) -> Any: |
| 183 | + """Poll until the run completes. Returns the run object, or a MetricResult on error/timeout.""" |
| 184 | + from .runner import MetricResult |
| 185 | + |
| 186 | + start_time = time.monotonic() |
| 187 | + while True: |
| 188 | + elapsed = time.monotonic() - start_time |
| 189 | + if elapsed > evaluator_def.timeout: |
| 190 | + return MetricResult( |
| 191 | + metric_name=evaluator_def.name, |
| 192 | + error=f"OpenAI eval run timed out after {evaluator_def.timeout}s.", |
| 193 | + ) |
| 194 | + |
| 195 | + run = await asyncio.to_thread( |
| 196 | + client.evals.runs.retrieve, run_id, eval_id=eval_id |
| 197 | + ) |
| 198 | + |
| 199 | + if run.status == "completed": |
| 200 | + return run |
| 201 | + if run.status in ("failed", "canceled"): |
| 202 | + return MetricResult( |
| 203 | + metric_name=evaluator_def.name, |
| 204 | + error=f"OpenAI eval run {run.status}: {getattr(run, 'error', 'unknown')}", |
| 205 | + ) |
| 206 | + |
| 207 | + await asyncio.sleep(_POLL_INTERVAL_SECONDS) |
| 208 | + |
| 209 | + |
| 210 | +async def _collect_results( |
| 211 | + client: Any, eval_id: str, run_id: str, run: Any, evaluator_def: OpenAIEvalDef |
| 212 | +) -> Any: |
| 213 | + """Extract scores from a completed run and return a MetricResult.""" |
| 214 | + from .runner import MetricResult |
| 215 | + |
| 216 | + output_items_page = await asyncio.to_thread( |
| 217 | + client.evals.runs.output_items.list, run_id=run_id, eval_id=eval_id |
| 218 | + ) |
| 219 | + output_items = list(output_items_page.data) if output_items_page.data else [] |
| 220 | + |
| 221 | + per_invocation_scores: list[float | None] = [ |
| 222 | + _extract_item_score(item) for item in output_items |
| 223 | + ] |
| 224 | + |
| 225 | + valid_scores = [s for s in per_invocation_scores if s is not None] |
| 226 | + overall_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0 |
| 227 | + |
| 228 | + result_counts = run.result_counts |
| 229 | + passed = result_counts.passed if result_counts else 0 |
| 230 | + failed = result_counts.failed if result_counts else 0 |
| 231 | + total = result_counts.total if result_counts else 0 |
| 232 | + eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED" |
| 233 | + |
| 234 | + details: dict[str, Any] = { |
| 235 | + "openai_eval_id": eval_id, |
| 236 | + "openai_run_id": run_id, |
| 237 | + "evaluation_metric": evaluator_def.grader.get("evaluation_metric"), |
| 238 | + "result_counts": {"passed": passed, "failed": failed, "total": total}, |
| 239 | + } |
| 240 | + per_criteria = getattr(run, "per_testing_criteria_results", None) |
| 241 | + if per_criteria: |
| 242 | + details["per_testing_criteria"] = [ |
| 243 | + {"name": c.testing_criteria, "passed": c.passed, "failed": c.failed} |
| 244 | + for c in per_criteria |
| 245 | + ] |
| 246 | + |
| 247 | + return MetricResult( |
| 248 | + metric_name=evaluator_def.name, |
| 249 | + score=overall_score, |
| 250 | + eval_status=eval_status, |
| 251 | + per_invocation_scores=per_invocation_scores, |
| 252 | + details=details, |
| 253 | + ) |
0 commit comments