Skip to content

Commit 1853e6a

Browse files
committed
feat: add LabelModelGrader support for OpenAI Evals backend
Adds the label_model grader type alongside text_similarity. Config validation requires the model, input, labels, and passing_labels fields for label_model graders. For label_model, items sent to OpenAI include only actual_response, since the expected behavior is encoded in labels and passing_labels. Result details for label_model include model and passing_labels instead of evaluation_metric; text_similarity results still report evaluation_metric. Closes #97
1 parent 43bc581 commit 1853e6a

3 files changed

Lines changed: 184 additions & 25 deletions

File tree

src/agentevals/config.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,18 @@ class OpenAIEvalDef(BaseModel):
8484
@classmethod
8585
def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
8686
grader_type = v.get("type")
87-
if grader_type != "text_similarity":
88-
raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
89-
metric = v.get("evaluation_metric")
90-
if not metric:
91-
raise ValueError("'evaluation_metric' is required for text_similarity grader")
92-
if metric not in _VALID_SIMILARITY_METRICS:
93-
raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
87+
if grader_type == "text_similarity":
88+
metric = v.get("evaluation_metric")
89+
if not metric:
90+
raise ValueError("'evaluation_metric' is required for text_similarity grader")
91+
if metric not in _VALID_SIMILARITY_METRICS:
92+
raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
93+
elif grader_type == "label_model":
94+
for field in ("model", "input", "labels", "passing_labels"):
95+
if not v.get(field):
96+
raise ValueError(f"'{field}' is required for label_model grader")
97+
else:
98+
raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: label_model, text_similarity")
9499
return v
95100

96101

src/agentevals/openai_eval_backend.py

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@
3131
"required": ["actual_response", "expected_response"],
3232
}
3333

34+
_ACTUAL_ONLY_SCHEMA = {
35+
"type": "object",
36+
"properties": {"actual_response": {"type": "string"}},
37+
"required": ["actual_response"],
38+
}
39+
3440

3541
def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
3642
"""Build the OpenAI testing_criteria dict from the evaluator config.
@@ -51,28 +57,31 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
5157
"pass_threshold": evaluator_def.threshold,
5258
}
5359

60+
if grader_type == "label_model":
61+
return {
62+
"type": "label_model",
63+
"name": evaluator_def.name,
64+
"model": grader["model"],
65+
"input": grader["input"],
66+
"labels": grader["labels"],
67+
"passing_labels": grader["passing_labels"],
68+
}
69+
5470
raise ValueError(f"Unsupported grader type: {grader_type}")
5571

5672

5773
def _build_jsonl_items(
5874
actual_invocations: list[Invocation],
5975
expected_invocations: list[Invocation],
76+
include_expected: bool = True,
6077
) -> list[dict[str, Any]]:
6178
items = []
6279
for i, actual_inv in enumerate(actual_invocations):
63-
actual_text = _content_to_text(actual_inv.final_response)
64-
if i < len(expected_invocations):
65-
expected_text = _content_to_text(expected_invocations[i].final_response)
66-
else:
67-
expected_text = ""
68-
items.append(
69-
{
70-
"item": {
71-
"actual_response": actual_text,
72-
"expected_response": expected_text,
73-
}
74-
}
75-
)
80+
entry: dict[str, Any] = {"actual_response": _content_to_text(actual_inv.final_response)}
81+
if include_expected:
82+
expected_text = _content_to_text(expected_invocations[i].final_response) if i < len(expected_invocations) else ""
83+
entry["expected_response"] = expected_text
84+
items.append({"item": entry})
7685
return items
7786

7887

@@ -111,13 +120,15 @@ async def evaluate_openai_eval(
111120
error="OPENAI_API_KEY environment variable is not set.",
112121
)
113122

114-
if expected_invocations is None:
123+
grader_type = evaluator_def.grader["type"]
124+
125+
if grader_type == "text_similarity" and expected_invocations is None:
115126
return MetricResult(
116127
metric_name=evaluator_def.name,
117-
error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
128+
error="text_similarity grader requires expected invocations (golden eval set).",
118129
)
119130

120-
items = _build_jsonl_items(actual_invocations, expected_invocations)
131+
items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=(grader_type != "label_model"))
121132
if not items:
122133
return MetricResult(
123134
metric_name=evaluator_def.name,
@@ -130,12 +141,13 @@ async def evaluate_openai_eval(
130141
try:
131142
client = await asyncio.to_thread(_get_openai_client)
132143

144+
item_schema = _ACTUAL_ONLY_SCHEMA if grader_type == "label_model" else _TEXT_PAIR_SCHEMA
133145
eval_obj = await asyncio.to_thread(
134146
client.evals.create,
135147
name=f"agentevals-{evaluator_def.name}",
136148
data_source_config={
137149
"type": "custom",
138-
"item_schema": _TEXT_PAIR_SCHEMA,
150+
"item_schema": item_schema,
139151
"include_sample_schema": False,
140152
},
141153
testing_criteria=[testing_criteria],
@@ -225,12 +237,17 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
225237
total = result_counts.total if result_counts else 0
226238
eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"
227239

240+
grader = evaluator_def.grader
228241
details: dict[str, Any] = {
229242
"openai_eval_id": eval_id,
230243
"openai_run_id": run_id,
231-
"evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
232244
"result_counts": {"passed": passed, "failed": failed, "total": total},
233245
}
246+
if grader["type"] == "text_similarity":
247+
details["evaluation_metric"] = grader.get("evaluation_metric")
248+
elif grader["type"] == "label_model":
249+
details["model"] = grader.get("model")
250+
details["passing_labels"] = grader.get("passing_labels")
234251
per_criteria = getattr(run, "per_testing_criteria_results", None)
235252
if per_criteria:
236253
details["per_testing_criteria"] = [

tests/test_openai_eval_backend.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
"""Tests for OpenAI Evals backend config and criteria building."""
2+
3+
from __future__ import annotations
4+
5+
import pytest
6+
from unittest.mock import MagicMock
7+
8+
from agentevals.config import OpenAIEvalDef
9+
from agentevals.openai_eval_backend import (
10+
_ACTUAL_ONLY_SCHEMA,
11+
_TEXT_PAIR_SCHEMA,
12+
_build_jsonl_items,
13+
_build_testing_criteria,
14+
evaluate_openai_eval,
15+
)
16+
17+
18+
def _label_grader(**overrides):
19+
base = {
20+
"type": "label_model",
21+
"model": "gpt-4o-mini",
22+
"input": [{"role": "user", "content": "Rate: {{ item.actual_response }}"}],
23+
"labels": ["good", "bad"],
24+
"passing_labels": ["good"],
25+
}
26+
base.update(overrides)
27+
return base
28+
29+
30+
def _invocation(text: str):
31+
inv = MagicMock()
32+
inv.final_response.parts = [MagicMock(text=text)]
33+
return inv
34+
35+
36+
class TestOpenAIEvalDefValidation:
37+
def test_text_similarity_valid(self):
38+
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
39+
assert d.grader["type"] == "text_similarity"
40+
41+
def test_text_similarity_missing_metric(self):
42+
with pytest.raises(Exception, match="evaluation_metric"):
43+
OpenAIEvalDef(name="sim", grader={"type": "text_similarity"})
44+
45+
def test_text_similarity_bad_metric(self):
46+
with pytest.raises(Exception, match="Unknown evaluation_metric"):
47+
OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "invalid"})
48+
49+
def test_label_model_valid(self):
50+
d = OpenAIEvalDef(name="lm", grader=_label_grader())
51+
assert d.grader["type"] == "label_model"
52+
53+
@pytest.mark.parametrize("field", ["model", "input", "labels", "passing_labels"])
54+
def test_label_model_missing_required_field(self, field):
55+
with pytest.raises(Exception, match=field):
56+
OpenAIEvalDef(name="lm", grader=_label_grader(**{field: None}))
57+
58+
def test_unsupported_grader_type(self):
59+
with pytest.raises(Exception, match="Unsupported grader type"):
60+
OpenAIEvalDef(name="x", grader={"type": "unknown"})
61+
62+
63+
class TestBuildTestingCriteria:
64+
def test_text_similarity_shape(self):
65+
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"}, threshold=0.7)
66+
c = _build_testing_criteria(d)
67+
assert c["type"] == "text_similarity"
68+
assert c["evaluation_metric"] == "bleu"
69+
assert c["pass_threshold"] == 0.7
70+
assert "{{ item.actual_response }}" in c["input"]
71+
assert "{{ item.expected_response }}" in c["reference"]
72+
73+
def test_label_model_shape(self):
74+
grader = _label_grader()
75+
d = OpenAIEvalDef(name="quality", grader=grader)
76+
c = _build_testing_criteria(d)
77+
assert c["type"] == "label_model"
78+
assert c["model"] == "gpt-4o-mini"
79+
assert c["labels"] == ["good", "bad"]
80+
assert c["passing_labels"] == ["good"]
81+
assert c["input"] == grader["input"]
82+
83+
84+
class TestBuildJsonlItems:
85+
def test_text_similarity_includes_expected(self):
86+
actual = [_invocation("hello")]
87+
expected = [_invocation("world")]
88+
items = _build_jsonl_items(actual, expected, include_expected=True)
89+
assert len(items) == 1
90+
assert "expected_response" in items[0]["item"]
91+
92+
def test_label_model_excludes_expected(self):
93+
actual = [_invocation("hello")]
94+
items = _build_jsonl_items(actual, [], include_expected=False)
95+
assert len(items) == 1
96+
assert "expected_response" not in items[0]["item"]
97+
assert items[0]["item"]["actual_response"] is not None
98+
99+
def test_empty_expected_falls_back_to_empty_string(self):
100+
actual = [_invocation("hello")]
101+
items = _build_jsonl_items(actual, [], include_expected=True)
102+
assert items[0]["item"]["expected_response"] == ""
103+
104+
105+
class TestSchemas:
106+
def test_actual_only_has_no_expected(self):
107+
assert "expected_response" not in _ACTUAL_ONLY_SCHEMA["properties"]
108+
109+
def test_text_pair_has_both(self):
110+
assert "actual_response" in _TEXT_PAIR_SCHEMA["properties"]
111+
assert "expected_response" in _TEXT_PAIR_SCHEMA["properties"]
112+
113+
114+
class TestEvaluateOpenAIEval:
115+
@pytest.mark.asyncio
116+
async def test_no_api_key_returns_error(self, monkeypatch):
117+
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
118+
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
119+
result = await evaluate_openai_eval(d, [], [])
120+
assert result.error is not None
121+
assert "OPENAI_API_KEY" in result.error
122+
123+
@pytest.mark.asyncio
124+
async def test_text_similarity_requires_expected(self, monkeypatch):
125+
monkeypatch.setenv("OPENAI_API_KEY", "test-key")
126+
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
127+
result = await evaluate_openai_eval(d, [_invocation("hi")], None)
128+
assert result.error is not None
129+
assert "expected invocations" in result.error
130+
131+
@pytest.mark.asyncio
132+
async def test_label_model_does_not_require_expected(self, monkeypatch):
133+
monkeypatch.setenv("OPENAI_API_KEY", "test-key")
134+
monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
135+
d = OpenAIEvalDef(name="lm", grader=_label_grader())
136+
result = await evaluate_openai_eval(d, [_invocation("hi")], None)
137+
assert "expected invocations" not in (result.error or "")

0 commit comments

Comments
 (0)