Skip to content

Commit 9efd28b

Browse files
committed
feat: add LabelModelGrader support for OpenAI Evals backend
Adds label_model grader type, validates passing_labels against labels, moves OpenAI grader example to a separate file.
1 parent 43bc581 commit 9efd28b

6 files changed

Lines changed: 204 additions & 24 deletions

File tree

docs/custom-evaluators.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,26 @@ The `grader.evaluation_metric` field selects the similarity algorithm:
317317
| `rouge_1` through `rouge_5` | Unigram through 5-gram overlap (F-measure) |
318318
| `rouge_l` | Longest common subsequence overlap (F-measure) |
319319

320+
### Label Model Grader
321+
322+
Scores responses without a golden set. The model reads each response and assigns a label from a fixed list. Passing labels are defined in the config.
323+
324+
```yaml
325+
evaluators:
326+
- name: quality_check
327+
type: openai_eval
328+
grader:
329+
type: label_model
330+
model: gpt-4o-mini
331+
input:
332+
- role: user
333+
content: "Rate this response: {{ item.actual_response }}"
334+
labels: [good, bad]
335+
passing_labels: [good]
336+
```
337+
338+
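The `threshold` field is not used for `label_model`. A response passes if its assigned label is in `passing_labels`. The `model`, `input`, `labels`, and `passing_labels` fields are all required, and every entry in `passing_labels` must also appear in `labels`; otherwise the config is rejected when it is loaded.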
339+
320340
### How it works
321341

322342
Under the hood, agentevals creates an ephemeral eval on OpenAI, submits the responses as JSONL items, polls for results, and cleans up. For `text_similarity`, the agent's response and the golden reference are both placed in the `item` namespace (as `item.actual_response` and `item.expected_response`); for `label_model` there is no golden reference, so only `item.actual_response` is submitted. In both cases `include_sample_schema` is `false`, so OpenAI only grades the provided text without generating any model outputs.

examples/custom_evaluators/eval_config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,4 @@ evaluators:
3232
ref: evaluators/random_evaluator/random_evaluator.py
3333
threshold: 0.110
3434
executor: local
35+
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Eval config using OpenAI Evals API graders.
2+
# Requires OPENAI_API_KEY to be set.
3+
#
4+
# Run with:
5+
# agentevals run samples/helm.json \
6+
# --config examples/custom_evaluators/eval_config_openai_eval.yaml
7+
8+
evaluators:
9+
- name: quality_check
10+
type: openai_eval
11+
grader:
12+
type: label_model
13+
model: gpt-4o-mini
14+
input:
15+
- role: user
16+
content: "Rate this response: {{ item.actual_response }}"
17+
labels: [good, bad]
18+
passing_labels: [good]

src/agentevals/config.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,21 @@ class OpenAIEvalDef(BaseModel):
8484
@classmethod
8585
def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
8686
grader_type = v.get("type")
87-
if grader_type != "text_similarity":
88-
raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
89-
metric = v.get("evaluation_metric")
90-
if not metric:
91-
raise ValueError("'evaluation_metric' is required for text_similarity grader")
92-
if metric not in _VALID_SIMILARITY_METRICS:
93-
raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
87+
if grader_type == "text_similarity":
88+
metric = v.get("evaluation_metric")
89+
if not metric:
90+
raise ValueError("'evaluation_metric' is required for text_similarity grader")
91+
if metric not in _VALID_SIMILARITY_METRICS:
92+
raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
93+
elif grader_type == "label_model":
94+
for field in ("model", "input", "labels", "passing_labels"):
95+
if not v.get(field):
96+
raise ValueError(f"'{field}' is required for label_model grader")
97+
invalid = [lbl for lbl in v["passing_labels"] if lbl not in v["labels"]]
98+
if invalid:
99+
raise ValueError(f"passing_labels contains labels not declared in labels: {invalid}")
100+
else:
101+
raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: label_model, text_similarity")
94102
return v
95103

96104

src/agentevals/openai_eval_backend.py

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@
3131
"required": ["actual_response", "expected_response"],
3232
}
3333

34+
_ACTUAL_ONLY_SCHEMA = {
35+
"type": "object",
36+
"properties": {"actual_response": {"type": "string"}},
37+
"required": ["actual_response"],
38+
}
39+
3440

3541
def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
3642
"""Build the OpenAI testing_criteria dict from the evaluator config.
@@ -51,28 +57,31 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
5157
"pass_threshold": evaluator_def.threshold,
5258
}
5359

60+
if grader_type == "label_model":
61+
return {
62+
"type": "label_model",
63+
"name": evaluator_def.name,
64+
"model": grader["model"],
65+
"input": grader["input"],
66+
"labels": grader["labels"],
67+
"passing_labels": grader["passing_labels"],
68+
}
69+
5470
raise ValueError(f"Unsupported grader type: {grader_type}")
5571

5672

5773
def _build_jsonl_items(
    actual_invocations: list[Invocation],
    expected_invocations: list[Invocation],
    include_expected: bool = True,
) -> list[dict[str, Any]]:
    """Build one ``{"item": {...}}`` row per actual invocation.

    Args:
        actual_invocations: Invocations whose final responses are graded.
        expected_invocations: Reference invocations, matched to the actual
            ones by position. May be shorter than ``actual_invocations``.
        include_expected: When true, each row also gets an
            ``expected_response`` key; an actual invocation with no
            reference at the same position gets an empty string.

    Returns:
        A list of rows, each with the response text under
        ``item.actual_response`` (and optionally ``item.expected_response``).
    """
    rows = []
    for index, actual in enumerate(actual_invocations):
        payload: dict[str, Any] = {"actual_response": _content_to_text(actual.final_response)}
        if include_expected:
            if index < len(expected_invocations):
                reference = _content_to_text(expected_invocations[index].final_response)
            else:
                # No reference at this position: fall back to an empty string.
                reference = ""
            payload["expected_response"] = reference
        rows.append({"item": payload})
    return rows
7786

7887

@@ -111,13 +120,15 @@ async def evaluate_openai_eval(
111120
error="OPENAI_API_KEY environment variable is not set.",
112121
)
113122

114-
if expected_invocations is None:
123+
grader_type = evaluator_def.grader["type"]
124+
125+
if grader_type == "text_similarity" and expected_invocations is None:
115126
return MetricResult(
116127
metric_name=evaluator_def.name,
117128
error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
118129
)
119130

120-
items = _build_jsonl_items(actual_invocations, expected_invocations)
131+
items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=(grader_type != "label_model"))
121132
if not items:
122133
return MetricResult(
123134
metric_name=evaluator_def.name,
@@ -130,12 +141,13 @@ async def evaluate_openai_eval(
130141
try:
131142
client = await asyncio.to_thread(_get_openai_client)
132143

144+
item_schema = _ACTUAL_ONLY_SCHEMA if grader_type == "label_model" else _TEXT_PAIR_SCHEMA
133145
eval_obj = await asyncio.to_thread(
134146
client.evals.create,
135147
name=f"agentevals-{evaluator_def.name}",
136148
data_source_config={
137149
"type": "custom",
138-
"item_schema": _TEXT_PAIR_SCHEMA,
150+
"item_schema": item_schema,
139151
"include_sample_schema": False,
140152
},
141153
testing_criteria=[testing_criteria],
@@ -225,12 +237,17 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
225237
total = result_counts.total if result_counts else 0
226238
eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"
227239

240+
grader = evaluator_def.grader
228241
details: dict[str, Any] = {
229242
"openai_eval_id": eval_id,
230243
"openai_run_id": run_id,
231-
"evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
232244
"result_counts": {"passed": passed, "failed": failed, "total": total},
233245
}
246+
if grader["type"] == "text_similarity":
247+
details["evaluation_metric"] = grader.get("evaluation_metric")
248+
elif grader["type"] == "label_model":
249+
details["model"] = grader.get("model")
250+
details["passing_labels"] = grader.get("passing_labels")
234251
per_criteria = getattr(run, "per_testing_criteria_results", None)
235252
if per_criteria:
236253
details["per_testing_criteria"] = [

tests/test_openai_eval_backend.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
import pytest
2+
from unittest.mock import MagicMock
3+
4+
from agentevals.config import OpenAIEvalDef
5+
from agentevals.openai_eval_backend import (
6+
_build_jsonl_items,
7+
_build_testing_criteria,
8+
evaluate_openai_eval,
9+
)
10+
11+
12+
def _label_grader(**overrides):
13+
base = {
14+
"type": "label_model",
15+
"model": "gpt-4o-mini",
16+
"input": [{"role": "user", "content": "Rate: {{ item.actual_response }}"}],
17+
"labels": ["good", "bad"],
18+
"passing_labels": ["good"],
19+
}
20+
base.update(overrides)
21+
return base
22+
23+
24+
def _invocation(text: str):
25+
inv = MagicMock()
26+
inv.final_response.parts = [MagicMock(text=text)]
27+
return inv
28+
29+
30+
class TestOpenAIEvalDefValidation:
    """Validation of the ``grader`` dict when constructing ``OpenAIEvalDef``.

    Failures are asserted as ``ValueError`` rather than bare ``Exception``:
    the validator raises ``ValueError``, and pydantic's ``ValidationError``
    is itself a ``ValueError`` subclass, so unrelated errors (for example a
    ``TypeError`` or ``KeyError`` from a bug) no longer make these tests pass.
    """

    def test_text_similarity_valid(self):
        d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
        assert d.grader["type"] == "text_similarity"

    def test_text_similarity_missing_metric(self):
        with pytest.raises(ValueError, match="evaluation_metric"):
            OpenAIEvalDef(name="sim", grader={"type": "text_similarity"})

    def test_text_similarity_bad_metric(self):
        with pytest.raises(ValueError, match="Unknown evaluation_metric"):
            OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "invalid"})

    def test_label_model_valid(self):
        d = OpenAIEvalDef(name="lm", grader=_label_grader())
        assert d.grader["type"] == "label_model"

    @pytest.mark.parametrize("field", ["model", "input", "labels", "passing_labels"])
    def test_label_model_missing_required_field(self, field):
        # Field present but explicitly null.
        with pytest.raises(ValueError, match=field):
            OpenAIEvalDef(name="lm", grader=_label_grader(**{field: None}))

    @pytest.mark.parametrize("field", ["model", "input", "labels", "passing_labels"])
    def test_label_model_absent_required_field(self, field):
        # Key left out of the grader dict entirely.
        grader = _label_grader()
        del grader[field]
        with pytest.raises(ValueError, match=field):
            OpenAIEvalDef(name="lm", grader=grader)

    def test_label_model_passing_labels_not_in_labels(self):
        grader = _label_grader()
        grader["passing_labels"] = ["unknown"]
        with pytest.raises(ValueError, match="passing_labels"):
            OpenAIEvalDef(name="lm", grader=grader)

    def test_unsupported_grader_type(self):
        with pytest.raises(ValueError, match="Unsupported grader type"):
            OpenAIEvalDef(name="x", grader={"type": "unknown"})
61+
62+
63+
class TestBuildTestingCriteria:
    """Shape checks for the dict returned by ``_build_testing_criteria``."""

    def test_text_similarity_shape(self):
        evaluator = OpenAIEvalDef(
            name="sim",
            grader={"type": "text_similarity", "evaluation_metric": "bleu"},
            threshold=0.7,
        )
        criteria = _build_testing_criteria(evaluator)
        assert criteria["type"] == "text_similarity"
        assert criteria["evaluation_metric"] == "bleu"
        assert criteria["pass_threshold"] == 0.7
        # Both templates must reference the item namespace.
        assert "{{ item.actual_response }}" in criteria["input"]
        assert "{{ item.expected_response }}" in criteria["reference"]

    def test_label_model_shape(self):
        grader_cfg = _label_grader()
        evaluator = OpenAIEvalDef(name="quality", grader=grader_cfg)
        criteria = _build_testing_criteria(evaluator)
        assert criteria["type"] == "label_model"
        assert criteria["model"] == "gpt-4o-mini"
        assert criteria["labels"] == ["good", "bad"]
        assert criteria["passing_labels"] == ["good"]
        assert criteria["input"] == grader_cfg["input"]
82+
83+
84+
class TestBuildJsonlItems:
    """Presence and fallback of ``expected_response`` in built JSONL items."""

    def test_text_similarity_includes_expected(self):
        rows = _build_jsonl_items([_invocation("hello")], [_invocation("world")], include_expected=True)
        first_item = rows[0]["item"]
        assert "expected_response" in first_item

    def test_label_model_excludes_expected(self):
        rows = _build_jsonl_items([_invocation("hello")], [], include_expected=False)
        first_item = rows[0]["item"]
        assert "expected_response" not in first_item

    def test_missing_expected_falls_back_to_empty(self):
        # No reference invocation at index 0, so the expected text is "".
        rows = _build_jsonl_items([_invocation("hello")], [], include_expected=True)
        first_item = rows[0]["item"]
        assert first_item["expected_response"] == ""
96+
97+
98+
class TestEvaluateOpenAIEval:
    """Early-return paths of ``evaluate_openai_eval``.

    NOTE(review): these are coroutine tests with no explicit marker, so they
    presumably rely on the project's pytest async configuration — confirm.
    """

    async def test_no_api_key_returns_error(self, monkeypatch):
        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
        evaluator = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
        outcome = await evaluate_openai_eval(evaluator, [], [])
        message = outcome.error or ""
        assert "OPENAI_API_KEY" in message

    async def test_text_similarity_requires_expected(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
        evaluator = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
        outcome = await evaluate_openai_eval(evaluator, [_invocation("hi")], None)
        message = outcome.error or ""
        assert "expected invocations" in message

    async def test_label_model_does_not_require_expected(self, monkeypatch):
        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
        # Replace the client factory with one returning None so no real client is built.
        monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
        evaluator = OpenAIEvalDef(name="lm", grader=_label_grader())
        outcome = await evaluate_openai_eval(evaluator, [_invocation("hi")], None)
        message = outcome.error or ""
        assert "expected invocations" not in message

0 commit comments

Comments
 (0)