Skip to content

Commit 26844d3

Browse files
committed
feat: add StringCheckGrader support for OpenAI Evals backend
1 parent 43bc581 commit 26844d3

6 files changed

Lines changed: 199 additions & 34 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ evaluators:
240240
threshold: 0.7
241241
```
242242
243-
Evaluators with a `requirements.txt` get automatic virtual environment management. You can also use `type: remote` for community evaluators from GitHub, or `type: openai_eval` to delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) (requires `pip install "agentevals-cli[openai]"`).
243+
Evaluators with a `requirements.txt` get automatic virtual environment management. You can also use `type: remote` for community evaluators from GitHub, or `type: openai_eval` to delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) (requires `pip install "agentevals-cli[openai]"`). Supported grader types: `text_similarity` and `string_check`.
244244

245245
See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protocol reference, SDK helpers, and how to contribute evaluators.
246246

docs/custom-evaluators.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,32 @@ The `grader.evaluation_metric` field selects the similarity algorithm:
317317
| `rouge_1` through `rouge_5` | Unigram through 5-gram overlap (F-measure) |
318318
| `rouge_l` | Longest common subsequence overlap (F-measure) |
319319

320+
### String Check Grader
321+
322+
Checks the agent response against a fixed reference string: whether it equals, does not equal, or contains the reference (see the operations table below). No eval set with expected responses is needed.
323+
324+
```yaml
325+
evaluators:
326+
- name: response_contains_hello
327+
type: openai_eval
328+
threshold: 0.8
329+
grader:
330+
type: string_check
331+
reference: "hello"
332+
operation: ilike
333+
```
334+
335+
The `operation` field controls how the check is applied:
336+
337+
| Operation | Description |
338+
|---|---|
339+
| `eq` | Exact match (case-sensitive) |
340+
| `ne` | Does not equal (case-sensitive) |
341+
| `like` | Contains the reference (case-sensitive) |
342+
| `ilike` | Contains the reference (case-insensitive) |
343+
344+
Each invocation either passes or fails. The `threshold` field is not used by `string_check`, so the `threshold: 0.8` in the example above has no effect on grading and may be omitted.
345+
320346
### How it works
321347

322348
Under the hood, agentevals creates an ephemeral eval on OpenAI, submits the actual and expected responses as JSONL items, polls for results, and cleans up. The agent's response and the golden reference are both placed in the `item` namespace (with `include_sample_schema: false`), so OpenAI only grades the provided text without generating any model outputs.
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Eval config using OpenAI Evals API graders.
2+
# Requires OPENAI_API_KEY to be set.
3+
#
4+
# Run with:
5+
# agentevals run samples/helm.json \
6+
# --config examples/custom_evaluators/eval_config_openai_eval.yaml
7+
8+
evaluators:
9+
- name: response_contains_hello
10+
type: openai_eval
11+
threshold: 0.8
12+
grader:
13+
type: string_check
14+
reference: "hello"
15+
operation: ilike

src/agentevals/config.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
5454
ref: str = Field(description="Source-specific reference (e.g. path within the repo).")
5555

5656

57+
_VALID_STRING_CHECK_OPERATIONS = frozenset({"eq", "ne", "like", "ilike"})
58+
59+
_SUPPORTED_GRADER_TYPES = frozenset({"string_check", "text_similarity"})
60+
5761
_VALID_SIMILARITY_METRICS = frozenset(
5862
{
5963
"fuzzy_match",
@@ -84,13 +88,21 @@ class OpenAIEvalDef(BaseModel):
8488
@classmethod
8589
def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
8690
grader_type = v.get("type")
87-
if grader_type != "text_similarity":
88-
raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
89-
metric = v.get("evaluation_metric")
90-
if not metric:
91-
raise ValueError("'evaluation_metric' is required for text_similarity grader")
92-
if metric not in _VALID_SIMILARITY_METRICS:
93-
raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
91+
if grader_type == "text_similarity":
92+
metric = v.get("evaluation_metric")
93+
if not metric:
94+
raise ValueError("'evaluation_metric' is required for text_similarity grader")
95+
if metric not in _VALID_SIMILARITY_METRICS:
96+
raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
97+
elif grader_type == "string_check":
98+
for field in ("reference", "operation"):
99+
if not v.get(field):
100+
raise ValueError(f"'{field}' is required for string_check grader")
101+
op = v["operation"]
102+
if op not in _VALID_STRING_CHECK_OPERATIONS:
103+
raise ValueError(f"Invalid operation '{op}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}")
104+
else:
105+
raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: {sorted(_SUPPORTED_GRADER_TYPES)}")
94106
return v
95107

96108

src/agentevals/openai_eval_backend.py

Lines changed: 36 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,4 @@
1-
"""OpenAI Evals API backend — delegates grading to the OpenAI Evals API.
2-
3-
Builds testing criteria from the evaluator config, submits invocation pairs
4-
as JSONL items, polls for completion, and maps per-item results back to a
5-
MetricResult.
6-
"""
1+
"""OpenAI Evals API backend."""
72

83
from __future__ import annotations
94

@@ -31,6 +26,12 @@
3126
"required": ["actual_response", "expected_response"],
3227
}
3328

29+
_ACTUAL_ONLY_SCHEMA = {
30+
"type": "object",
31+
"properties": {"actual_response": {"type": "string"}},
32+
"required": ["actual_response"],
33+
}
34+
3435

3536
def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
3637
"""Build the OpenAI testing_criteria dict from the evaluator config.
@@ -51,28 +52,30 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
5152
"pass_threshold": evaluator_def.threshold,
5253
}
5354

55+
if grader_type == "string_check":
56+
return {
57+
"type": "string_check",
58+
"name": evaluator_def.name,
59+
"input": "{{ item.actual_response }}",
60+
"reference": grader["reference"],
61+
"operation": grader["operation"],
62+
}
63+
5464
raise ValueError(f"Unsupported grader type: {grader_type}")
5565

5666

5767
def _build_jsonl_items(
5868
actual_invocations: list[Invocation],
5969
expected_invocations: list[Invocation],
70+
*,
71+
include_expected: bool = True,
6072
) -> list[dict[str, Any]]:
6173
items = []
6274
for i, actual_inv in enumerate(actual_invocations):
63-
actual_text = _content_to_text(actual_inv.final_response)
64-
if i < len(expected_invocations):
65-
expected_text = _content_to_text(expected_invocations[i].final_response)
66-
else:
67-
expected_text = ""
68-
items.append(
69-
{
70-
"item": {
71-
"actual_response": actual_text,
72-
"expected_response": expected_text,
73-
}
74-
}
75-
)
75+
item: dict[str, Any] = {"actual_response": _content_to_text(actual_inv.final_response)}
76+
if include_expected:
77+
item["expected_response"] = _content_to_text(expected_invocations[i].final_response) if i < len(expected_invocations) else ""
78+
items.append({"item": item})
7679
return items
7780

7881

@@ -111,13 +114,15 @@ async def evaluate_openai_eval(
111114
error="OPENAI_API_KEY environment variable is not set.",
112115
)
113116

114-
if expected_invocations is None:
117+
grader_type = evaluator_def.grader.get("type")
118+
needs_expected = grader_type == "text_similarity"
119+
if needs_expected and expected_invocations is None:
115120
return MetricResult(
116121
metric_name=evaluator_def.name,
117-
error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
122+
error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).",
118123
)
119124

120-
items = _build_jsonl_items(actual_invocations, expected_invocations)
125+
items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=needs_expected)
121126
if not items:
122127
return MetricResult(
123128
metric_name=evaluator_def.name,
@@ -130,12 +135,13 @@ async def evaluate_openai_eval(
130135
try:
131136
client = await asyncio.to_thread(_get_openai_client)
132137

138+
item_schema = _TEXT_PAIR_SCHEMA if needs_expected else _ACTUAL_ONLY_SCHEMA
133139
eval_obj = await asyncio.to_thread(
134140
client.evals.create,
135-
name=f"agentevals-{evaluator_def.name}",
141+
name=f"agentevals-openai-{evaluator_def.name}",
136142
data_source_config={
137143
"type": "custom",
138-
"item_schema": _TEXT_PAIR_SCHEMA,
144+
"item_schema": item_schema,
139145
"include_sample_schema": False,
140146
},
141147
testing_criteria=[testing_criteria],
@@ -146,7 +152,7 @@ async def evaluate_openai_eval(
146152
run = await asyncio.to_thread(
147153
client.evals.runs.create,
148154
eval_id=eval_id,
149-
name=f"agentevals-run-{evaluator_def.name}",
155+
name=f"agentevals-openai-run-{evaluator_def.name}",
150156
data_source={
151157
"type": "jsonl",
152158
"source": {
@@ -225,12 +231,16 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
225231
total = result_counts.total if result_counts else 0
226232
eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"
227233

234+
grader = evaluator_def.grader
228235
details: dict[str, Any] = {
229236
"openai_eval_id": eval_id,
230237
"openai_run_id": run_id,
231-
"evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
232238
"result_counts": {"passed": passed, "failed": failed, "total": total},
233239
}
240+
if grader["type"] == "text_similarity":
241+
details["evaluation_metric"] = grader.get("evaluation_metric")
242+
elif grader["type"] == "string_check":
243+
details["operation"] = grader.get("operation")
234244
per_criteria = getattr(run, "per_testing_criteria_results", None)
235245
if per_criteria:
236246
details["per_testing_criteria"] = [

tests/test_openai_eval_backend.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import pytest
2+
from unittest.mock import MagicMock
3+
4+
from agentevals.config import OpenAIEvalDef
5+
from agentevals.openai_eval_backend import (
6+
_build_jsonl_items,
7+
_build_testing_criteria,
8+
evaluate_openai_eval,
9+
)
10+
11+
12+
def _string_check_grader(**overrides):
13+
base = {"type": "string_check", "reference": "hello", "operation": "ilike"}
14+
base.update(overrides)
15+
return base
16+
17+
18+
def _invocation(text: str):
19+
inv = MagicMock()
20+
inv.final_response.parts = [MagicMock(text=text)]
21+
return inv
22+
23+
24+
class TestOpenAIEvalDefValidation:
25+
def test_text_similarity_valid(self):
26+
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
27+
assert d.grader["type"] == "text_similarity"
28+
29+
def test_text_similarity_missing_metric(self):
30+
with pytest.raises(Exception, match="evaluation_metric"):
31+
OpenAIEvalDef(name="sim", grader={"type": "text_similarity"})
32+
33+
def test_text_similarity_bad_metric(self):
34+
with pytest.raises(Exception, match="Unknown evaluation_metric"):
35+
OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bad"})
36+
37+
def test_string_check_valid(self):
38+
d = OpenAIEvalDef(name="sc", grader=_string_check_grader())
39+
assert d.grader["type"] == "string_check"
40+
41+
@pytest.mark.parametrize("field", ["reference", "operation"])
42+
def test_string_check_missing_required(self, field):
43+
with pytest.raises(Exception, match=field):
44+
OpenAIEvalDef(name="sc", grader=_string_check_grader(**{field: None}))
45+
46+
def test_string_check_bad_operation(self):
47+
with pytest.raises(Exception, match="Invalid operation"):
48+
OpenAIEvalDef(name="sc", grader=_string_check_grader(operation="bad"))
49+
50+
def test_unsupported_grader_type(self):
51+
with pytest.raises(Exception, match="Unsupported grader type"):
52+
OpenAIEvalDef(name="x", grader={"type": "unknown"})
53+
54+
55+
class TestBuildTestingCriteria:
56+
def test_text_similarity_shape(self):
57+
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"}, threshold=0.7)
58+
c = _build_testing_criteria(d)
59+
assert c["type"] == "text_similarity"
60+
assert c["evaluation_metric"] == "bleu"
61+
assert c["pass_threshold"] == 0.7
62+
assert "{{ item.actual_response }}" in c["input"]
63+
assert "{{ item.expected_response }}" in c["reference"]
64+
65+
def test_string_check_shape(self):
66+
d = OpenAIEvalDef(name="sc", grader=_string_check_grader(reference="ok", operation="eq"))
67+
c = _build_testing_criteria(d)
68+
assert c["type"] == "string_check"
69+
assert c["reference"] == "ok"
70+
assert c["operation"] == "eq"
71+
assert "{{ item.actual_response }}" in c["input"]
72+
73+
74+
class TestBuildJsonlItems:
75+
def test_includes_expected_when_requested(self):
76+
items = _build_jsonl_items([_invocation("hi")], [_invocation("bye")], include_expected=True)
77+
assert "expected_response" in items[0]["item"]
78+
79+
def test_excludes_expected_for_string_check(self):
80+
items = _build_jsonl_items([_invocation("hi")], [], include_expected=False)
81+
assert "expected_response" not in items[0]["item"]
82+
83+
84+
class TestEvaluateOpenAIEval:
85+
async def test_no_api_key_returns_error(self, monkeypatch):
86+
monkeypatch.delenv("OPENAI_API_KEY", raising=False)
87+
d = OpenAIEvalDef(name="sc", grader=_string_check_grader())
88+
result = await evaluate_openai_eval(d, [], [])
89+
assert "OPENAI_API_KEY" in (result.error or "")
90+
91+
async def test_text_similarity_requires_expected(self, monkeypatch):
92+
monkeypatch.setenv("OPENAI_API_KEY", "test-key")
93+
d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
94+
result = await evaluate_openai_eval(d, [_invocation("hi")], None)
95+
assert "expected invocations" in (result.error or "")
96+
97+
async def test_string_check_does_not_require_expected(self, monkeypatch):
98+
monkeypatch.setenv("OPENAI_API_KEY", "test-key")
99+
monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
100+
d = OpenAIEvalDef(name="sc", grader=_string_check_grader())
101+
result = await evaluate_openai_eval(d, [_invocation("hi")], None)
102+
assert "expected invocations" not in (result.error or "")

0 commit comments

Comments
 (0)