Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions docs/custom-evaluators.md
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,26 @@ Remote evaluators are cached in `~/.cache/agentevals/evaluators/`. To force a re

You can delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) instead of running scoring logic locally. This requires installing the optional extra with `pip install "agentevals-cli[openai]"` and setting the `OPENAI_API_KEY` environment variable.

### Score Model Grader

Uses a model to assign a numeric score to each response; no golden set is required. The model reads the response and returns a float within the configured `range`.

```yaml
evaluators:
- name: quality_score
type: openai_eval
threshold: 0.7
grader:
type: score_model
model: gpt-4o-mini
input:
- role: user
content: "Rate the quality of this response from 0 to 1: {{ item.actual_response }}"
range: [0, 1]
```

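The `range` field sets the minimum and maximum score the model can return (defaults to `[0, 1]`). The evaluator's `threshold` is sent to OpenAI as the grader's `pass_threshold`, so express it on the same scale as `range`. No eval set is needed.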

### Text Similarity Grader

Compares the agent's response against a golden reference using text similarity metrics. Requires an eval set.
Expand Down
10 changes: 10 additions & 0 deletions examples/custom_evaluators/eval_config_openai_eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,13 @@ evaluators:
content: "Rate this response: {{ item.actual_response }}"
labels: [good, bad]
passing_labels: [good]
- name: quality_score
type: openai_eval
threshold: 0.7
grader:
type: score_model
model: gpt-4o-mini
input:
- role: user
content: "Rate the quality of this response from 0 to 1: {{ item.actual_response }}"
range: [0, 1]
8 changes: 7 additions & 1 deletion src/agentevals/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,14 @@ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
invalid = [lbl for lbl in v["passing_labels"] if lbl not in v["labels"]]
if invalid:
raise ValueError(f"passing_labels contains labels not declared in labels: {invalid}")
elif grader_type == "score_model":
for field in ("model", "input"):
if not v.get(field):
raise ValueError(f"'{field}' is required for score_model grader")
else:
raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: label_model, text_similarity")
raise ValueError(
f"Unsupported grader type: '{grader_type}'. Supported: label_model, score_model, text_similarity"
)
return v


Expand Down
38 changes: 20 additions & 18 deletions src/agentevals/openai_eval_backend.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
"""OpenAI Evals API backend — delegates grading to the OpenAI Evals API.

Builds testing criteria from the evaluator config, submits invocation pairs
as JSONL items, polls for completion, and maps per-item results back to a
MetricResult.
"""
"""OpenAI Evals API backend."""

from __future__ import annotations

Expand Down Expand Up @@ -39,11 +34,6 @@


def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
"""Build the OpenAI testing_criteria dict from the evaluator config.

Each grader type produces a different shape. Extend this function
when adding support for new OpenAI grader types.
"""
grader = evaluator_def.grader
grader_type = grader["type"]

Expand All @@ -67,12 +57,23 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
"passing_labels": grader["passing_labels"],
}

if grader_type == "score_model":
return {
"type": "score_model",
"name": evaluator_def.name,
"model": grader["model"],
"input": grader["input"],
"range": grader.get("range", [0, 1]),
"pass_threshold": evaluator_def.threshold,
}

raise ValueError(f"Unsupported grader type: {grader_type}")


def _build_jsonl_items(
actual_invocations: list[Invocation],
expected_invocations: list[Invocation],
*,
include_expected: bool = True,
) -> list[dict[str, Any]]:
items = []
Expand Down Expand Up @@ -123,16 +124,14 @@ async def evaluate_openai_eval(
)

grader_type = evaluator_def.grader["type"]

if grader_type == "text_similarity" and expected_invocations is None:
needs_expected = grader_type == "text_similarity"
if needs_expected and expected_invocations is None:
return MetricResult(
metric_name=evaluator_def.name,
error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).",
)

items = _build_jsonl_items(
actual_invocations, expected_invocations or [], include_expected=(grader_type != "label_model")
)
items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=needs_expected)
if not items:
return MetricResult(
metric_name=evaluator_def.name,
Expand All @@ -145,7 +144,7 @@ async def evaluate_openai_eval(
try:
client = await asyncio.to_thread(_get_openai_client)

item_schema = _ACTUAL_ONLY_SCHEMA if grader_type == "label_model" else _TEXT_PAIR_SCHEMA
item_schema = _TEXT_PAIR_SCHEMA if needs_expected else _ACTUAL_ONLY_SCHEMA
eval_obj = await asyncio.to_thread(
client.evals.create,
name=f"agentevals-openai-{evaluator_def.name}",
Expand Down Expand Up @@ -252,6 +251,9 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
elif grader["type"] == "label_model":
details["model"] = grader.get("model")
details["passing_labels"] = grader.get("passing_labels")
elif grader["type"] == "score_model":
details["model"] = grader.get("model")
details["range"] = grader.get("range", [0, 1])
per_criteria = getattr(run, "per_testing_criteria_results", None)
if per_criteria:
details["per_testing_criteria"] = [
Expand Down
46 changes: 44 additions & 2 deletions tests/test_openai_eval_backend.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest
from unittest.mock import MagicMock

import pytest

from agentevals.config import OpenAIEvalDef
from agentevals.openai_eval_backend import (
_build_jsonl_items,
Expand All @@ -21,6 +22,16 @@ def _label_grader(**overrides):
return base


def _score_grader(**overrides):
base = {
"type": "score_model",
"model": "gpt-4o-mini",
"input": [{"role": "user", "content": "Rate: {{ item.actual_response }}"}],
}
base.update(overrides)
return base


def _invocation(text: str):
inv = MagicMock()
inv.final_response.parts = [MagicMock(text=text)]
Expand Down Expand Up @@ -55,6 +66,15 @@ def test_label_model_passing_labels_not_in_labels(self):
with pytest.raises(Exception, match="passing_labels"):
OpenAIEvalDef(name="lm", grader=grader)

def test_score_model_valid(self):
    """A score_model grader with ``model`` and ``input`` set constructs cleanly."""
    evaluator = OpenAIEvalDef(name="sc", grader=_score_grader())
    assert evaluator.grader["type"] == "score_model"

@pytest.mark.parametrize("field", ["model", "input"])
def test_score_model_missing_required_field(self, field):
    """Setting a required score_model field to ``None`` is rejected.

    The raised error message must mention the offending field name.
    """
    incomplete_grader = _score_grader(**{field: None})
    with pytest.raises(Exception, match=field):
        OpenAIEvalDef(name="sc", grader=incomplete_grader)

def test_unsupported_grader_type(self):
with pytest.raises(Exception, match="Unsupported grader type"):
OpenAIEvalDef(name="x", grader={"type": "unknown"})
Expand All @@ -80,13 +100,28 @@ def test_label_model_shape(self):
assert c["passing_labels"] == ["good"]
assert c["input"] == grader["input"]

def test_score_model_shape(self):
    """Explicit ``range`` and ``threshold`` are carried into the criteria dict."""
    grader_cfg = _score_grader(range=[0, 5])
    evaluator = OpenAIEvalDef(name="sc", grader=grader_cfg, threshold=0.6)
    criteria = _build_testing_criteria(evaluator)

    # Checked key by key, in the same order as the individual assertions.
    expected_fields = {
        "type": "score_model",
        "model": "gpt-4o-mini",
        "range": [0, 5],
        "pass_threshold": 0.6,
        "input": grader_cfg["input"],
    }
    for key, value in expected_fields.items():
        assert criteria[key] == value

def test_score_model_default_range(self):
    """Omitting ``range`` from the grader yields ``[0, 1]`` in the criteria."""
    evaluator = OpenAIEvalDef(name="sc", grader=_score_grader())
    criteria = _build_testing_criteria(evaluator)
    assert criteria["range"] == [0, 1]


class TestBuildJsonlItems:
def test_text_similarity_includes_expected(self):
    """With ``include_expected=True`` the first item carries ``expected_response``."""
    actual = [_invocation("hello")]
    expected = [_invocation("world")]
    items = _build_jsonl_items(actual, expected, include_expected=True)
    first_item = items[0]["item"]
    assert "expected_response" in first_item

def test_label_model_excludes_expected(self):
def test_excludes_expected_when_not_requested(self):
    """With ``include_expected=False`` the first item has no ``expected_response``."""
    actual = [_invocation("hello")]
    items = _build_jsonl_items(actual, [], include_expected=False)
    first_item = items[0]["item"]
    assert "expected_response" not in first_item

Expand Down Expand Up @@ -114,3 +149,10 @@ async def test_label_model_does_not_require_expected(self, monkeypatch):
d = OpenAIEvalDef(name="lm", grader=_label_grader())
result = await evaluate_openai_eval(d, [_invocation("hi")], None)
assert "expected invocations" not in (result.error or "")

async def test_score_model_does_not_require_expected(self, monkeypatch):
    """score_model evaluation without expected invocations skips the golden-set error.

    The client factory is stubbed to return ``None``, so the run may still
    report some other error; only the "expected invocations" message is
    checked for absence.
    """
    monkeypatch.setenv("OPENAI_API_KEY", "test-key")
    monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
    evaluator = OpenAIEvalDef(name="sc", grader=_score_grader())
    outcome = await evaluate_openai_eval(evaluator, [_invocation("hi")], None)
    error_text = outcome.error or ""
    assert "expected invocations" not in error_text