Skip to content

Commit c08c345

Browse files
committed
feat: add StringCheckGrader support for OpenAI Evals backend
1 parent 43bc581 commit c08c345

5 files changed

Lines changed: 195 additions & 27 deletions

File tree

docs/custom-evaluators.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,32 @@ The `grader.evaluation_metric` field selects the similarity algorithm:
317317
| `rouge_1` through `rouge_5` | Unigram through 5-gram overlap (F-measure) |
318318
| `rouge_l` | Longest common subsequence overlap (F-measure) |
319319

320+
### String Check Grader
321+
322+
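Checks the agent response against a fixed reference string: whether it equals, does not equal, or contains the reference. No eval set is needed, because the reference comes from the evaluator config rather than from golden responses.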
323+
324+
```yaml
325+
evaluators:
326+
- name: response_contains_hello
327+
type: openai_eval
328+
threshold: 0.8
329+
grader:
330+
type: string_check
331+
reference: "hello"
332+
operation: ilike
333+
```
334+
335+
The `operation` field controls how the check is applied:
336+
337+
| Operation | Description |
338+
|---|---|
339+
| `eq` | Exact match (case-sensitive) |
340+
| `ne` | Does not equal (case-sensitive) |
341+
| `like` | Contains the reference (case-sensitive) |
342+
| `ilike` | Contains the reference (case-insensitive) |
343+
344+
The `threshold` field sets the minimum pass rate across all invocations.
345+
320346
### How it works
321347

322348
Under the hood, agentevals creates an ephemeral eval on OpenAI, submits the responses as JSONL items, polls for results, and cleans up. For `text_similarity`, the agent's response and the golden reference are both placed in the `item` namespace; for `string_check`, only the agent's response is sent and the reference string comes from the evaluator config. In both cases `include_sample_schema: false` is set, so OpenAI only grades the provided text without generating any model outputs.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Eval config using OpenAI Evals API graders.
2+
# Requires OPENAI_API_KEY to be set.
3+
#
4+
# Run with:
5+
# agentevals run samples/helm.json \
6+
# --config examples/custom_evaluators/eval_config_openai_eval.yaml
7+
8+
evaluators:
9+
- name: response_contains_hello
10+
type: openai_eval
11+
threshold: 0.8
12+
grader:
13+
type: string_check
14+
reference: "hello"
15+
operation: ilike

src/agentevals/config.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
5454
ref: str = Field(description="Source-specific reference (e.g. path within the repo).")
5555

5656

57+
_VALID_STRING_CHECK_OPERATIONS = frozenset({"eq", "ne", "like", "ilike"})
58+
5759
_VALID_SIMILARITY_METRICS = frozenset(
5860
{
5961
"fuzzy_match",
@@ -84,13 +86,21 @@ class OpenAIEvalDef(BaseModel):
8486
@classmethod
8587
def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
8688
grader_type = v.get("type")
87-
if grader_type != "text_similarity":
88-
raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
89-
metric = v.get("evaluation_metric")
90-
if not metric:
91-
raise ValueError("'evaluation_metric' is required for text_similarity grader")
92-
if metric not in _VALID_SIMILARITY_METRICS:
93-
raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
89+
if grader_type == "text_similarity":
90+
metric = v.get("evaluation_metric")
91+
if not metric:
92+
raise ValueError("'evaluation_metric' is required for text_similarity grader")
93+
if metric not in _VALID_SIMILARITY_METRICS:
94+
raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
95+
elif grader_type == "string_check":
96+
for field in ("reference", "operation"):
97+
if not v.get(field):
98+
raise ValueError(f"'{field}' is required for string_check grader")
99+
op = v["operation"]
100+
if op not in _VALID_STRING_CHECK_OPERATIONS:
101+
raise ValueError(f"Invalid operation '{op}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}")
102+
else:
103+
raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: string_check, text_similarity")
94104
return v
95105

96106

src/agentevals/openai_eval_backend.py

Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@
3131
"required": ["actual_response", "expected_response"],
3232
}
3333

34+
_ACTUAL_ONLY_SCHEMA = {
35+
"type": "object",
36+
"properties": {"actual_response": {"type": "string"}},
37+
"required": ["actual_response"],
38+
}
39+
3440

3541
def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
3642
"""Build the OpenAI testing_criteria dict from the evaluator config.
@@ -51,28 +57,30 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
5157
"pass_threshold": evaluator_def.threshold,
5258
}
5359

60+
if grader_type == "string_check":
61+
return {
62+
"type": "string_check",
63+
"name": evaluator_def.name,
64+
"input": "{{ item.actual_response }}",
65+
"reference": grader["reference"],
66+
"operation": grader["operation"],
67+
}
68+
5469
raise ValueError(f"Unsupported grader type: {grader_type}")
5570

5671

5772
def _build_jsonl_items(
5873
actual_invocations: list[Invocation],
5974
expected_invocations: list[Invocation],
75+
*,
76+
include_expected: bool = True,
6077
) -> list[dict[str, Any]]:
6178
items = []
6279
for i, actual_inv in enumerate(actual_invocations):
63-
actual_text = _content_to_text(actual_inv.final_response)
64-
if i < len(expected_invocations):
65-
expected_text = _content_to_text(expected_invocations[i].final_response)
66-
else:
67-
expected_text = ""
68-
items.append(
69-
{
70-
"item": {
71-
"actual_response": actual_text,
72-
"expected_response": expected_text,
73-
}
74-
}
75-
)
80+
item: dict[str, Any] = {"actual_response": _content_to_text(actual_inv.final_response)}
81+
if include_expected:
82+
item["expected_response"] = _content_to_text(expected_invocations[i].final_response) if i < len(expected_invocations) else ""
83+
items.append({"item": item})
7684
return items
7785

7886

@@ -111,13 +119,15 @@ async def evaluate_openai_eval(
111119
error="OPENAI_API_KEY environment variable is not set.",
112120
)
113121

114-
if expected_invocations is None:
122+
grader_type = evaluator_def.grader.get("type")
123+
needs_expected = grader_type == "text_similarity"
124+
if needs_expected and expected_invocations is None:
115125
return MetricResult(
116126
metric_name=evaluator_def.name,
117-
error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
127+
error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).",
118128
)
119129

120-
items = _build_jsonl_items(actual_invocations, expected_invocations)
130+
items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=needs_expected)
121131
if not items:
122132
return MetricResult(
123133
metric_name=evaluator_def.name,
@@ -130,12 +140,13 @@ async def evaluate_openai_eval(
130140
try:
131141
client = await asyncio.to_thread(_get_openai_client)
132142

143+
item_schema = _TEXT_PAIR_SCHEMA if needs_expected else _ACTUAL_ONLY_SCHEMA
133144
eval_obj = await asyncio.to_thread(
134145
client.evals.create,
135-
name=f"agentevals-{evaluator_def.name}",
146+
name=f"agentevals-openai-{evaluator_def.name}",
136147
data_source_config={
137148
"type": "custom",
138-
"item_schema": _TEXT_PAIR_SCHEMA,
149+
"item_schema": item_schema,
139150
"include_sample_schema": False,
140151
},
141152
testing_criteria=[testing_criteria],
@@ -146,7 +157,7 @@ async def evaluate_openai_eval(
146157
run = await asyncio.to_thread(
147158
client.evals.runs.create,
148159
eval_id=eval_id,
149-
name=f"agentevals-run-{evaluator_def.name}",
160+
name=f"agentevals-openai-run-{evaluator_def.name}",
150161
data_source={
151162
"type": "jsonl",
152163
"source": {
@@ -225,12 +236,16 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
225236
total = result_counts.total if result_counts else 0
226237
eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"
227238

239+
grader = evaluator_def.grader
228240
details: dict[str, Any] = {
229241
"openai_eval_id": eval_id,
230242
"openai_run_id": run_id,
231-
"evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
232243
"result_counts": {"passed": passed, "failed": failed, "total": total},
233244
}
245+
if grader["type"] == "text_similarity":
246+
details["evaluation_metric"] = grader.get("evaluation_metric")
247+
elif grader["type"] == "string_check":
248+
details["operation"] = grader.get("operation")
234249
per_criteria = getattr(run, "per_testing_criteria_results", None)
235250
if per_criteria:
236251
details["per_testing_criteria"] = [

tests/test_openai_eval_backend.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import pytest
2+
from unittest.mock import MagicMock
3+
4+
from agentevals.config import OpenAIEvalDef
5+
from agentevals.openai_eval_backend import (
6+
_build_jsonl_items,
7+
_build_testing_criteria,
8+
evaluate_openai_eval,
9+
)
10+
11+
12+
def _string_check_grader(**overrides):
13+
base = {"type": "string_check", "reference": "hello", "operation": "ilike"}
14+
base.update(overrides)
15+
return base
16+
17+
18+
def _invocation(text: str):
19+
inv = MagicMock()
20+
inv.final_response.parts = [MagicMock(text=text)]
21+
return inv
22+
23+
24+
class TestOpenAIEvalDefValidation:
25+
def test_text_similarity_valid(self):
26+
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
27+
assert d.grader["type"] == "text_similarity"
28+
29+
def test_text_similarity_missing_metric(self):
30+
with pytest.raises(Exception, match="evaluation_metric"):
31+
OpenAIEvalDef(name="sim", grader={"type": "text_similarity"})
32+
33+
def test_text_similarity_bad_metric(self):
34+
with pytest.raises(Exception, match="Unknown evaluation_metric"):
35+
OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bad"})
36+
37+
def test_string_check_valid(self):
38+
d = OpenAIEvalDef(name="sc", grader=_string_check_grader())
39+
assert d.grader["type"] == "string_check"
40+
41+
@pytest.mark.parametrize("field", ["reference", "operation"])
42+
def test_string_check_missing_required(self, field):
43+
with pytest.raises(Exception, match=field):
44+
OpenAIEvalDef(name="sc", grader=_string_check_grader(**{field: None}))
45+
46+
def test_string_check_bad_operation(self):
47+
with pytest.raises(Exception, match="Invalid operation"):
48+
OpenAIEvalDef(name="sc", grader=_string_check_grader(operation="bad"))
49+
50+
def test_unsupported_grader_type(self):
51+
with pytest.raises(Exception, match="Unsupported grader type"):
52+
OpenAIEvalDef(name="x", grader={"type": "unknown"})
53+
54+
55+
class TestBuildTestingCriteria:
56+
def test_text_similarity_shape(self):
57+
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"}, threshold=0.7)
58+
c = _build_testing_criteria(d)
59+
assert c["type"] == "text_similarity"
60+
assert c["evaluation_metric"] == "bleu"
61+
assert c["pass_threshold"] == 0.7
62+
assert "{{ item.actual_response }}" in c["input"]
63+
assert "{{ item.expected_response }}" in c["reference"]
64+
65+
def test_string_check_shape(self):
66+
d = OpenAIEvalDef(name="sc", grader=_string_check_grader(reference="ok", operation="eq"))
67+
c = _build_testing_criteria(d)
68+
assert c["type"] == "string_check"
69+
assert c["reference"] == "ok"
70+
assert c["operation"] == "eq"
71+
assert "{{ item.actual_response }}" in c["input"]
72+
73+
74+
class TestBuildJsonlItems:
75+
def test_includes_expected_when_requested(self):
76+
items = _build_jsonl_items([_invocation("hi")], [_invocation("bye")], include_expected=True)
77+
assert "expected_response" in items[0]["item"]
78+
79+
def test_excludes_expected_for_string_check(self):
80+
items = _build_jsonl_items([_invocation("hi")], [], include_expected=False)
81+
assert "expected_response" not in items[0]["item"]
82+
83+
84+
class TestEvaluateOpenAIEval:
85+
async def test_no_api_key_returns_error(self, monkeypatch):
86+
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
87+
d = OpenAIEvalDef(name="sc", grader=_string_check_grader())
88+
result = await evaluate_openai_eval(d, [], [])
89+
assert "OPENAI_API_KEY" in (result.error or "")
90+
91+
async def test_text_similarity_requires_expected(self, monkeypatch):
92+
monkeypatch.setenv("OPENAI_API_KEY", "test-key")
93+
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
94+
result = await evaluate_openai_eval(d, [_invocation("hi")], None)
95+
assert "expected invocations" in (result.error or "")
96+
97+
async def test_string_check_does_not_require_expected(self, monkeypatch):
98+
monkeypatch.setenv("OPENAI_API_KEY", "test-key")
99+
monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
100+
d = OpenAIEvalDef(name="sc", grader=_string_check_grader())
101+
result = await evaluate_openai_eval(d, [_invocation("hi")], None)
102+
assert "expected invocations" not in (result.error or "")

0 commit comments

Comments
 (0)