feat: add StringCheckGrader support for OpenAI Evals backend

mesutoezdil · mesutoezdil · commit 26844d3404df · 2026-05-15T12:33:02.000+02:00
diff --git a/README.md b/README.md
@@ -240,7 +240,7 @@ evaluators:
     threshold: 0.7
 ```
 
-Evaluators with a `requirements.txt` get automatic virtual environment management. You can also use `type: remote` for community evaluators from GitHub, or `type: openai_eval` to delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) (requires `pip install "agentevals-cli[openai]"`).
+Evaluators with a `requirements.txt` get automatic virtual environment management. You can also use `type: remote` for community evaluators from GitHub, or `type: openai_eval` to delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) (requires `pip install "agentevals-cli[openai]"`). Supported grader types: `text_similarity` and `string_check`.
 
 See the [Custom Evaluators guide](docs/custom-evaluators.md) for the full protocol reference, SDK helpers, and how to contribute evaluators.
 
diff --git a/docs/custom-evaluators.md b/docs/custom-evaluators.md
@@ -317,6 +317,32 @@ The `grader.evaluation_metric` field selects the similarity algorithm:
 | `rouge_1` through `rouge_5` | Unigram through 5-gram overlap (F-measure) |
 | `rouge_l` | Longest common subsequence overlap (F-measure) |
 
+### String Check Grader
+
+Checks whether the agent response equals, does not equal, or contains a fixed reference string. No golden eval set is needed, because the reference string comes from the evaluator config.
+
+```yaml
+evaluators:
+  - name: response_contains_hello
+    type: openai_eval
+    threshold: 0.8
+    grader:
+      type: string_check
+      reference: "hello"
+      operation: ilike
+```
+
+The `operation` field controls how the check is applied:
+
+| Operation | Description |
+|---|---|
+| `eq` | Exact match (case-sensitive) |
+| `ne` | Does not equal (case-sensitive) |
+| `like` | Contains the reference (case-sensitive) |
+| `ilike` | Contains the reference (case-insensitive) |
+
+Each invocation either passes or fails. The `threshold` field is not used by `string_check`, so the `threshold` value shown in the example above is ignored.
+
 ### How it works
 
 Under the hood, agentevals creates an ephemeral eval on OpenAI, submits the actual and expected responses as JSONL items, polls for results, and cleans up. The agent's response and the golden reference are both placed in the `item` namespace (with `include_sample_schema: false`), so OpenAI only grades the provided text without generating any model outputs.
diff --git a/examples/custom_evaluators/eval_config_openai_eval.yaml b/examples/custom_evaluators/eval_config_openai_eval.yaml
@@ -0,0 +1,15 @@
+# Eval config using OpenAI Evals API graders.
+# Requires OPENAI_API_KEY to be set.
+#
+# Run with:
+#   agentevals run samples/helm.json \
+#     --config examples/custom_evaluators/eval_config_openai_eval.yaml
+
+evaluators:
+  - name: response_contains_hello
+    type: openai_eval
+    threshold: 0.8
+    grader:
+      type: string_check
+      reference: "hello"
+      operation: ilike
diff --git a/src/agentevals/config.py b/src/agentevals/config.py
@@ -54,6 +54,10 @@ class RemoteEvaluatorDef(BaseEvaluatorDef):
     ref: str = Field(description="Source-specific reference (e.g. path within the repo).")
 
 
+_VALID_STRING_CHECK_OPERATIONS = frozenset({"eq", "ne", "like", "ilike"})
+
+_SUPPORTED_GRADER_TYPES = frozenset({"string_check", "text_similarity"})
+
 _VALID_SIMILARITY_METRICS = frozenset(
     {
         "fuzzy_match",
@@ -84,13 +88,21 @@ class OpenAIEvalDef(BaseModel):
     @classmethod
     def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
         grader_type = v.get("type")
-        if grader_type != "text_similarity":
-            raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
-        metric = v.get("evaluation_metric")
-        if not metric:
-            raise ValueError("'evaluation_metric' is required for text_similarity grader")
-        if metric not in _VALID_SIMILARITY_METRICS:
-            raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
+        if grader_type == "text_similarity":
+            metric = v.get("evaluation_metric")
+            if not metric:
+                raise ValueError("'evaluation_metric' is required for text_similarity grader")
+            if metric not in _VALID_SIMILARITY_METRICS:
+                raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
+        elif grader_type == "string_check":
+            for field in ("reference", "operation"):
+                if not v.get(field):
+                    raise ValueError(f"'{field}' is required for string_check grader")
+            op = v["operation"]
+            if op not in _VALID_STRING_CHECK_OPERATIONS:
+                raise ValueError(f"Invalid operation '{op}'. Valid: {sorted(_VALID_STRING_CHECK_OPERATIONS)}")
+        else:
+            raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: {sorted(_SUPPORTED_GRADER_TYPES)}")
         return v
 
 
diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py
@@ -1,9 +1,4 @@
-"""OpenAI Evals API backend — delegates grading to the OpenAI Evals API.
-
-Builds testing criteria from the evaluator config, submits invocation pairs
-as JSONL items, polls for completion, and maps per-item results back to a
-MetricResult.
-"""
+"""OpenAI Evals API backend."""
 
 from __future__ import annotations
 
@@ -31,6 +26,12 @@
     "required": ["actual_response", "expected_response"],
 }
 
+_ACTUAL_ONLY_SCHEMA = {
+    "type": "object",
+    "properties": {"actual_response": {"type": "string"}},
+    "required": ["actual_response"],
+}
+
 
 def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
     """Build the OpenAI testing_criteria dict from the evaluator config.
@@ -51,28 +52,30 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
             "pass_threshold": evaluator_def.threshold,
         }
 
+    if grader_type == "string_check":
+        return {
+            "type": "string_check",
+            "name": evaluator_def.name,
+            "input": "{{ item.actual_response }}",
+            "reference": grader["reference"],
+            "operation": grader["operation"],
+        }
+
     raise ValueError(f"Unsupported grader type: {grader_type}")
 
 
 def _build_jsonl_items(
     actual_invocations: list[Invocation],
     expected_invocations: list[Invocation],
+    *,
+    include_expected: bool = True,
 ) -> list[dict[str, Any]]:
     items = []
     for i, actual_inv in enumerate(actual_invocations):
-        actual_text = _content_to_text(actual_inv.final_response)
-        if i < len(expected_invocations):
-            expected_text = _content_to_text(expected_invocations[i].final_response)
-        else:
-            expected_text = ""
-        items.append(
-            {
-                "item": {
-                    "actual_response": actual_text,
-                    "expected_response": expected_text,
-                }
-            }
-        )
+        item: dict[str, Any] = {"actual_response": _content_to_text(actual_inv.final_response)}
+        if include_expected:
+            item["expected_response"] = _content_to_text(expected_invocations[i].final_response) if i < len(expected_invocations) else ""
+        items.append({"item": item})
     return items
 
 
@@ -111,13 +114,15 @@ async def evaluate_openai_eval(
             error="OPENAI_API_KEY environment variable is not set.",
         )
 
-    if expected_invocations is None:
+    grader_type = evaluator_def.grader.get("type")
+    needs_expected = grader_type == "text_similarity"
+    if needs_expected and expected_invocations is None:
         return MetricResult(
             metric_name=evaluator_def.name,
-            error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
+            error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).",
         )
 
-    items = _build_jsonl_items(actual_invocations, expected_invocations)
+    items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=needs_expected)
     if not items:
         return MetricResult(
             metric_name=evaluator_def.name,
@@ -130,12 +135,13 @@ async def evaluate_openai_eval(
     try:
         client = await asyncio.to_thread(_get_openai_client)
 
+        item_schema = _TEXT_PAIR_SCHEMA if needs_expected else _ACTUAL_ONLY_SCHEMA
         eval_obj = await asyncio.to_thread(
             client.evals.create,
-            name=f"agentevals-{evaluator_def.name}",
+            name=f"agentevals-openai-{evaluator_def.name}",
             data_source_config={
                 "type": "custom",
-                "item_schema": _TEXT_PAIR_SCHEMA,
+                "item_schema": item_schema,
                 "include_sample_schema": False,
             },
             testing_criteria=[testing_criteria],
@@ -146,7 +152,7 @@ async def evaluate_openai_eval(
         run = await asyncio.to_thread(
             client.evals.runs.create,
             eval_id=eval_id,
-            name=f"agentevals-run-{evaluator_def.name}",
+            name=f"agentevals-openai-run-{evaluator_def.name}",
             data_source={
                 "type": "jsonl",
                 "source": {
@@ -225,12 +231,16 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
     total = result_counts.total if result_counts else 0
     eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"
 
+    grader = evaluator_def.grader
     details: dict[str, Any] = {
         "openai_eval_id": eval_id,
         "openai_run_id": run_id,
-        "evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
         "result_counts": {"passed": passed, "failed": failed, "total": total},
     }
+    if grader["type"] == "text_similarity":
+        details["evaluation_metric"] = grader.get("evaluation_metric")
+    elif grader["type"] == "string_check":
+        details["operation"] = grader.get("operation")
     per_criteria = getattr(run, "per_testing_criteria_results", None)
     if per_criteria:
         details["per_testing_criteria"] = [
diff --git a/tests/test_openai_eval_backend.py b/tests/test_openai_eval_backend.py
@@ -0,0 +1,102 @@
+import pytest
+from unittest.mock import MagicMock
+
+from agentevals.config import OpenAIEvalDef
+from agentevals.openai_eval_backend import (
+    _build_jsonl_items,
+    _build_testing_criteria,
+    evaluate_openai_eval,
+)
+
+
+def _string_check_grader(**overrides):
+    base = {"type": "string_check", "reference": "hello", "operation": "ilike"}
+    base.update(overrides)
+    return base
+
+
+def _invocation(text: str):
+    inv = MagicMock()
+    inv.final_response.parts = [MagicMock(text=text)]
+    return inv
+
+
+class TestOpenAIEvalDefValidation:
+    def test_text_similarity_valid(self):
+        d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
+        assert d.grader["type"] == "text_similarity"
+
+    def test_text_similarity_missing_metric(self):
+        with pytest.raises(Exception, match="evaluation_metric"):
+            OpenAIEvalDef(name="sim", grader={"type": "text_similarity"})
+
+    def test_text_similarity_bad_metric(self):
+        with pytest.raises(Exception, match="Unknown evaluation_metric"):
+            OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bad"})
+
+    def test_string_check_valid(self):
+        d = OpenAIEvalDef(name="sc", grader=_string_check_grader())
+        assert d.grader["type"] == "string_check"
+
+    @pytest.mark.parametrize("field", ["reference", "operation"])
+    def test_string_check_missing_required(self, field):
+        with pytest.raises(Exception, match=field):
+            OpenAIEvalDef(name="sc", grader=_string_check_grader(**{field: None}))
+
+    def test_string_check_bad_operation(self):
+        with pytest.raises(Exception, match="Invalid operation"):
+            OpenAIEvalDef(name="sc", grader=_string_check_grader(operation="bad"))
+
+    def test_unsupported_grader_type(self):
+        with pytest.raises(Exception, match="Unsupported grader type"):
+            OpenAIEvalDef(name="x", grader={"type": "unknown"})
+
+
+class TestBuildTestingCriteria:
+    def test_text_similarity_shape(self):
+        d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"}, threshold=0.7)
+        c = _build_testing_criteria(d)
+        assert c["type"] == "text_similarity"
+        assert c["evaluation_metric"] == "bleu"
+        assert c["pass_threshold"] == 0.7
+        assert "{{ item.actual_response }}" in c["input"]
+        assert "{{ item.expected_response }}" in c["reference"]
+
+    def test_string_check_shape(self):
+        d = OpenAIEvalDef(name="sc", grader=_string_check_grader(reference="ok", operation="eq"))
+        c = _build_testing_criteria(d)
+        assert c["type"] == "string_check"
+        assert c["reference"] == "ok"
+        assert c["operation"] == "eq"
+        assert "{{ item.actual_response }}" in c["input"]
+
+
+class TestBuildJsonlItems:
+    def test_includes_expected_when_requested(self):
+        items = _build_jsonl_items([_invocation("hi")], [_invocation("bye")], include_expected=True)
+        assert "expected_response" in items[0]["item"]
+
+    def test_excludes_expected_for_string_check(self):
+        items = _build_jsonl_items([_invocation("hi")], [], include_expected=False)
+        assert "expected_response" not in items[0]["item"]
+
+
+class TestEvaluateOpenAIEval:
+    async def test_no_api_key_returns_error(self, monkeypatch):
+        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+        d = OpenAIEvalDef(name="sc", grader=_string_check_grader())
+        result = await evaluate_openai_eval(d, [], [])
+        assert "OPENAI_API_KEY" in (result.error or "")
+
+    async def test_text_similarity_requires_expected(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
+        d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
+        result = await evaluate_openai_eval(d, [_invocation("hi")], None)
+        assert "expected invocations" in (result.error or "")
+
+    async def test_string_check_does_not_require_expected(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
+        monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
+        d = OpenAIEvalDef(name="sc", grader=_string_check_grader())
+        result = await evaluate_openai_eval(d, [_invocation("hi")], None)
+        assert "expected invocations" not in (result.error or "")