Skip to content

Commit 970a7c7

Browse files
committed
feat: add score_model grader type to OpenAI Evals backend
Closes #96. The `score_model` grader scores agent responses using a model judge; no golden set is needed. The grader config requires `model` and `input`; `range` defaults to `[0, 1]`.
1 parent 43bc581 commit 970a7c7

5 files changed

Lines changed: 188 additions & 23 deletions

File tree

docs/custom-evaluators.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,26 @@ Remote evaluators are cached in `~/.cache/agentevals/evaluators/`. To force a re
291291

292292
You can delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) instead of running scoring logic locally. To use it, install the extra with `pip install "agentevals-cli[openai]"` and set the `OPENAI_API_KEY` environment variable.
293293

294+
### Score Model Grader
295+
296+
Uses a judge model to score each response; no golden set is required. The agent's response is made available to the prompt as `{{ item.actual_response }}`, and the model returns a numeric score.
297+
298+
```yaml
299+
evaluators:
300+
- name: quality_score
301+
type: openai_eval
302+
threshold: 0.7
303+
grader:
304+
type: score_model
305+
model: gpt-4o-mini
306+
input:
307+
- role: user
308+
content: "Rate the quality of this response from 0 to 1: {{ item.actual_response }}"
309+
range: [0, 1]
310+
```
311+
312+
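The `model` and `input` fields are required. The `range` field sets the minimum and maximum score the model can return (defaults to `[0, 1]`). The evaluator's `threshold` is passed to OpenAI as the grader's `pass_threshold`, so choose it on the same scale as `range` (for example, `threshold: 3.0` with `range: [1, 5]`). No eval set is needed.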
313+
294314
### Text Similarity Grader
295315

296316
Compares the agent's response against a golden reference using text similarity metrics. Requires an eval set.

examples/custom_evaluators/eval_config.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,18 @@ evaluators:
2525
config:
2626
min_response_length: 20
2727

28+
# OpenAI score_model grader (no golden set needed)
29+
- name: quality_score
30+
type: openai_eval
31+
threshold: 0.7
32+
grader:
33+
type: score_model
34+
model: gpt-4o-mini
35+
input:
36+
- role: user
37+
content: "Rate the quality of this response from 0 to 1: {{ item.actual_response }}"
38+
range: [0, 1]
39+
2840
# Reference an evaluator from Github
2941
- name: random_evaluator
3042
type: remote

src/agentevals/config.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,18 @@ class OpenAIEvalDef(BaseModel):
8484
@classmethod
8585
def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
8686
grader_type = v.get("type")
87-
if grader_type != "text_similarity":
88-
raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
89-
metric = v.get("evaluation_metric")
90-
if not metric:
91-
raise ValueError("'evaluation_metric' is required for text_similarity grader")
92-
if metric not in _VALID_SIMILARITY_METRICS:
93-
raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
87+
if grader_type == "text_similarity":
88+
metric = v.get("evaluation_metric")
89+
if not metric:
90+
raise ValueError("'evaluation_metric' is required for text_similarity grader")
91+
if metric not in _VALID_SIMILARITY_METRICS:
92+
raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
93+
elif grader_type == "score_model":
94+
for field in ("model", "input"):
95+
if not v.get(field):
96+
raise ValueError(f"'{field}' is required for score_model grader")
97+
else:
98+
raise ValueError(f"Unsupported grader type: '{grader_type}'")
9499
return v
95100

96101

src/agentevals/openai_eval_backend.py

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,14 @@
3131
"required": ["actual_response", "expected_response"],
3232
}
3333

34+
_ACTUAL_ONLY_SCHEMA = {
35+
"type": "object",
36+
"properties": {
37+
"actual_response": {"type": "string"},
38+
},
39+
"required": ["actual_response"],
40+
}
41+
3442

3543
def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
3644
"""Build the OpenAI testing_criteria dict from the evaluator config.
@@ -51,28 +59,32 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
5159
"pass_threshold": evaluator_def.threshold,
5260
}
5361

62+
if grader_type == "score_model":
63+
return {
64+
"type": "score_model",
65+
"name": evaluator_def.name,
66+
"model": grader["model"],
67+
"input": grader["input"],
68+
"range": grader.get("range", [0, 1]),
69+
"pass_threshold": evaluator_def.threshold,
70+
}
71+
5472
raise ValueError(f"Unsupported grader type: {grader_type}")
5573

5674

5775
def _build_jsonl_items(
5876
actual_invocations: list[Invocation],
5977
expected_invocations: list[Invocation],
78+
*,
79+
include_expected: bool = True,
6080
) -> list[dict[str, Any]]:
6181
items = []
6282
for i, actual_inv in enumerate(actual_invocations):
6383
actual_text = _content_to_text(actual_inv.final_response)
64-
if i < len(expected_invocations):
65-
expected_text = _content_to_text(expected_invocations[i].final_response)
66-
else:
67-
expected_text = ""
68-
items.append(
69-
{
70-
"item": {
71-
"actual_response": actual_text,
72-
"expected_response": expected_text,
73-
}
74-
}
75-
)
84+
item: dict[str, Any] = {"actual_response": actual_text}
85+
if include_expected:
86+
item["expected_response"] = _content_to_text(expected_invocations[i].final_response) if i < len(expected_invocations) else ""
87+
items.append({"item": item})
7688
return items
7789

7890

@@ -111,13 +123,15 @@ async def evaluate_openai_eval(
111123
error="OPENAI_API_KEY environment variable is not set.",
112124
)
113125

114-
if expected_invocations is None:
126+
grader_type = evaluator_def.grader.get("type")
127+
needs_expected = grader_type == "text_similarity"
128+
if needs_expected and expected_invocations is None:
115129
return MetricResult(
116130
metric_name=evaluator_def.name,
117-
error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
131+
error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).",
118132
)
119133

120-
items = _build_jsonl_items(actual_invocations, expected_invocations)
134+
items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=needs_expected)
121135
if not items:
122136
return MetricResult(
123137
metric_name=evaluator_def.name,
@@ -130,12 +144,13 @@ async def evaluate_openai_eval(
130144
try:
131145
client = await asyncio.to_thread(_get_openai_client)
132146

147+
item_schema = _TEXT_PAIR_SCHEMA if needs_expected else _ACTUAL_ONLY_SCHEMA
133148
eval_obj = await asyncio.to_thread(
134149
client.evals.create,
135150
name=f"agentevals-{evaluator_def.name}",
136151
data_source_config={
137152
"type": "custom",
138-
"item_schema": _TEXT_PAIR_SCHEMA,
153+
"item_schema": item_schema,
139154
"include_sample_schema": False,
140155
},
141156
testing_criteria=[testing_criteria],

tests/test_openai_eval_backend.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
import pytest
2+
from unittest.mock import MagicMock
3+
4+
from agentevals.config import OpenAIEvalDef
5+
from agentevals.openai_eval_backend import (
6+
_build_jsonl_items,
7+
_build_testing_criteria,
8+
evaluate_openai_eval,
9+
)
10+
11+
12+
def _score_grader(**overrides):
13+
base = {
14+
"type": "score_model",
15+
"model": "gpt-4o-mini",
16+
"input": [{"role": "user", "content": "Rate: {{ item.actual_response }}"}],
17+
}
18+
base.update(overrides)
19+
return base
20+
21+
22+
def _invocation(text: str):
23+
inv = MagicMock()
24+
inv.final_response.parts = [MagicMock(text=text)]
25+
return inv
26+
27+
28+
class TestOpenAIEvalDefValidation:
29+
def test_text_similarity_valid(self):
30+
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
31+
assert d.grader["type"] == "text_similarity"
32+
33+
def test_text_similarity_missing_metric(self):
34+
with pytest.raises(Exception, match="evaluation_metric"):
35+
OpenAIEvalDef(name="sim", grader={"type": "text_similarity"})
36+
37+
def test_text_similarity_bad_metric(self):
38+
with pytest.raises(Exception, match="Unknown evaluation_metric"):
39+
OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "invalid"})
40+
41+
def test_score_model_valid(self):
42+
d = OpenAIEvalDef(name="sc", grader=_score_grader())
43+
assert d.grader["type"] == "score_model"
44+
45+
@pytest.mark.parametrize("field", ["model", "input"])
46+
def test_score_model_missing_required_field(self, field):
47+
with pytest.raises(Exception, match=field):
48+
OpenAIEvalDef(name="sc", grader=_score_grader(**{field: None}))
49+
50+
def test_unsupported_grader_type(self):
51+
with pytest.raises(Exception, match="Unsupported grader type"):
52+
OpenAIEvalDef(name="x", grader={"type": "unknown"})
53+
54+
55+
class TestBuildTestingCriteria:
56+
def test_text_similarity_shape(self):
57+
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"}, threshold=0.7)
58+
c = _build_testing_criteria(d)
59+
assert c["type"] == "text_similarity"
60+
assert c["evaluation_metric"] == "bleu"
61+
assert c["pass_threshold"] == 0.7
62+
assert "{{ item.actual_response }}" in c["input"]
63+
assert "{{ item.expected_response }}" in c["reference"]
64+
65+
def test_score_model_shape(self):
66+
grader = _score_grader(range=[1, 5])
67+
d = OpenAIEvalDef(name="sc", grader=grader, threshold=3.0)
68+
c = _build_testing_criteria(d)
69+
assert c["type"] == "score_model"
70+
assert c["model"] == "gpt-4o-mini"
71+
assert c["range"] == [1, 5]
72+
assert c["pass_threshold"] == 3.0
73+
assert c["input"] == grader["input"]
74+
75+
def test_score_model_default_range(self):
76+
d = OpenAIEvalDef(name="sc", grader=_score_grader())
77+
c = _build_testing_criteria(d)
78+
assert c["range"] == [0, 1]
79+
80+
81+
class TestBuildJsonlItems:
82+
def test_text_similarity_includes_expected(self):
83+
items = _build_jsonl_items([_invocation("hello")], [_invocation("world")], include_expected=True)
84+
assert "expected_response" in items[0]["item"]
85+
86+
def test_score_model_excludes_expected(self):
87+
items = _build_jsonl_items([_invocation("hello")], [], include_expected=False)
88+
assert "expected_response" not in items[0]["item"]
89+
90+
def test_missing_expected_falls_back_to_empty(self):
91+
items = _build_jsonl_items([_invocation("hello")], [], include_expected=True)
92+
assert items[0]["item"]["expected_response"] == ""
93+
94+
95+
class TestEvaluateOpenAIEval:
96+
async def test_no_api_key_returns_error(self, monkeypatch):
97+
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
98+
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
99+
result = await evaluate_openai_eval(d, [], [])
100+
assert "OPENAI_API_KEY" in (result.error or "")
101+
102+
async def test_text_similarity_requires_expected(self, monkeypatch):
103+
monkeypatch.setenv("OPENAI_API_KEY", "test-key")
104+
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
105+
result = await evaluate_openai_eval(d, [_invocation("hi")], None)
106+
assert "expected invocations" in (result.error or "")
107+
108+
async def test_score_model_does_not_require_expected(self, monkeypatch):
109+
monkeypatch.setenv("OPENAI_API_KEY", "test-key")
110+
monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
111+
d = OpenAIEvalDef(name="sc", grader=_score_grader())
112+
result = await evaluate_openai_eval(d, [_invocation("hi")], None)
113+
assert "expected invocations" not in (result.error or "")

0 commit comments

Comments
 (0)