feat: add LabelModelGrader support for OpenAI Evals backend

mesutoezdil · mesutoezdil · commit 1853e6a15030 · 2026-05-12T19:43:33.000+02:00
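Adds the label_model grader type alongside text_similarity. Config validation requires the model, input, labels, and passing_labels fields for label_model graders; any other grader type is rejected as unsupported. For label_model, items sent to OpenAI include only actual_response, since the expected behavior is encoded in labels and passing_labels, and expected invocations (a golden eval set) are no longer required for that grader type. Result details for label_model include model and passing_labels instead of evaluation_metric. Closes #97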
diff --git a/src/agentevals/config.py b/src/agentevals/config.py
@@ -84,13 +84,18 @@ class OpenAIEvalDef(BaseModel):
     @classmethod
     def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
         grader_type = v.get("type")
-        if grader_type != "text_similarity":
-            raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'")
-        metric = v.get("evaluation_metric")
-        if not metric:
-            raise ValueError("'evaluation_metric' is required for text_similarity grader")
-        if metric not in _VALID_SIMILARITY_METRICS:
-            raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
+        if grader_type == "text_similarity":
+            metric = v.get("evaluation_metric")
+            if not metric:
+                raise ValueError("'evaluation_metric' is required for text_similarity grader")
+            if metric not in _VALID_SIMILARITY_METRICS:
+                raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}")
+        elif grader_type == "label_model":
+            for field in ("model", "input", "labels", "passing_labels"):
+                if not v.get(field):
+                    raise ValueError(f"'{field}' is required for label_model grader")
+        else:
+            raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: label_model, text_similarity")
         return v
 
 
diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py
@@ -31,6 +31,12 @@
     "required": ["actual_response", "expected_response"],
 }
 
+_ACTUAL_ONLY_SCHEMA = {
+    "type": "object",
+    "properties": {"actual_response": {"type": "string"}},
+    "required": ["actual_response"],
+}
+
 
 def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
     """Build the OpenAI testing_criteria dict from the evaluator config.
@@ -51,28 +57,31 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
             "pass_threshold": evaluator_def.threshold,
         }
 
+    if grader_type == "label_model":
+        return {
+            "type": "label_model",
+            "name": evaluator_def.name,
+            "model": grader["model"],
+            "input": grader["input"],
+            "labels": grader["labels"],
+            "passing_labels": grader["passing_labels"],
+        }
+
     raise ValueError(f"Unsupported grader type: {grader_type}")
 
 
 def _build_jsonl_items(
     actual_invocations: list[Invocation],
     expected_invocations: list[Invocation],
+    include_expected: bool = True,
 ) -> list[dict[str, Any]]:
     items = []
     for i, actual_inv in enumerate(actual_invocations):
-        actual_text = _content_to_text(actual_inv.final_response)
-        if i < len(expected_invocations):
-            expected_text = _content_to_text(expected_invocations[i].final_response)
-        else:
-            expected_text = ""
-        items.append(
-            {
-                "item": {
-                    "actual_response": actual_text,
-                    "expected_response": expected_text,
-                }
-            }
-        )
+        entry: dict[str, Any] = {"actual_response": _content_to_text(actual_inv.final_response)}
+        if include_expected:
+            expected_text = _content_to_text(expected_invocations[i].final_response) if i < len(expected_invocations) else ""
+            entry["expected_response"] = expected_text
+        items.append({"item": entry})
     return items
 
 
@@ -111,13 +120,15 @@ async def evaluate_openai_eval(
             error="OPENAI_API_KEY environment variable is not set.",
         )
 
-    if expected_invocations is None:
+    grader_type = evaluator_def.grader["type"]
+
+    if grader_type == "text_similarity" and expected_invocations is None:
         return MetricResult(
             metric_name=evaluator_def.name,
-            error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
+            error="text_similarity grader requires expected invocations (golden eval set).",
         )
 
-    items = _build_jsonl_items(actual_invocations, expected_invocations)
+    items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=(grader_type != "label_model"))
     if not items:
         return MetricResult(
             metric_name=evaluator_def.name,
@@ -130,12 +141,13 @@ async def evaluate_openai_eval(
     try:
         client = await asyncio.to_thread(_get_openai_client)
 
+        item_schema = _ACTUAL_ONLY_SCHEMA if grader_type == "label_model" else _TEXT_PAIR_SCHEMA
         eval_obj = await asyncio.to_thread(
             client.evals.create,
             name=f"agentevals-{evaluator_def.name}",
             data_source_config={
                 "type": "custom",
-                "item_schema": _TEXT_PAIR_SCHEMA,
+                "item_schema": item_schema,
                 "include_sample_schema": False,
             },
             testing_criteria=[testing_criteria],
@@ -225,12 +237,17 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
     total = result_counts.total if result_counts else 0
     eval_status = "PASSED" if failed == 0 and total > 0 else "FAILED"
 
+    grader = evaluator_def.grader
     details: dict[str, Any] = {
         "openai_eval_id": eval_id,
         "openai_run_id": run_id,
-        "evaluation_metric": evaluator_def.grader.get("evaluation_metric"),
         "result_counts": {"passed": passed, "failed": failed, "total": total},
     }
+    if grader["type"] == "text_similarity":
+        details["evaluation_metric"] = grader.get("evaluation_metric")
+    elif grader["type"] == "label_model":
+        details["model"] = grader.get("model")
+        details["passing_labels"] = grader.get("passing_labels")
     per_criteria = getattr(run, "per_testing_criteria_results", None)
     if per_criteria:
         details["per_testing_criteria"] = [
diff --git a/tests/test_openai_eval_backend.py b/tests/test_openai_eval_backend.py
@@ -0,0 +1,137 @@
+"""Tests for OpenAI Evals backend config and criteria building."""
+
+from __future__ import annotations
+
+import pytest
+from unittest.mock import MagicMock
+
+from agentevals.config import OpenAIEvalDef
+from agentevals.openai_eval_backend import (
+    _ACTUAL_ONLY_SCHEMA,
+    _TEXT_PAIR_SCHEMA,
+    _build_jsonl_items,
+    _build_testing_criteria,
+    evaluate_openai_eval,
+)
+
+
+def _label_grader(**overrides):
+    base = {
+        "type": "label_model",
+        "model": "gpt-4o-mini",
+        "input": [{"role": "user", "content": "Rate: {{ item.actual_response }}"}],
+        "labels": ["good", "bad"],
+        "passing_labels": ["good"],
+    }
+    base.update(overrides)
+    return base
+
+
+def _invocation(text: str):
+    inv = MagicMock()
+    inv.final_response.parts = [MagicMock(text=text)]
+    return inv
+
+
+class TestOpenAIEvalDefValidation:
+    def test_text_similarity_valid(self):
+        d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
+        assert d.grader["type"] == "text_similarity"
+
+    def test_text_similarity_missing_metric(self):
+        with pytest.raises(Exception, match="evaluation_metric"):
+            OpenAIEvalDef(name="sim", grader={"type": "text_similarity"})
+
+    def test_text_similarity_bad_metric(self):
+        with pytest.raises(Exception, match="Unknown evaluation_metric"):
+            OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "invalid"})
+
+    def test_label_model_valid(self):
+        d = OpenAIEvalDef(name="lm", grader=_label_grader())
+        assert d.grader["type"] == "label_model"
+
+    @pytest.mark.parametrize("field", ["model", "input", "labels", "passing_labels"])
+    def test_label_model_missing_required_field(self, field):
+        with pytest.raises(Exception, match=field):
+            OpenAIEvalDef(name="lm", grader=_label_grader(**{field: None}))
+
+    def test_unsupported_grader_type(self):
+        with pytest.raises(Exception, match="Unsupported grader type"):
+            OpenAIEvalDef(name="x", grader={"type": "unknown"})
+
+
+class TestBuildTestingCriteria:
+    def test_text_similarity_shape(self):
+        d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"}, threshold=0.7)
+        c = _build_testing_criteria(d)
+        assert c["type"] == "text_similarity"
+        assert c["evaluation_metric"] == "bleu"
+        assert c["pass_threshold"] == 0.7
+        assert "{{ item.actual_response }}" in c["input"]
+        assert "{{ item.expected_response }}" in c["reference"]
+
+    def test_label_model_shape(self):
+        grader = _label_grader()
+        d = OpenAIEvalDef(name="quality", grader=grader)
+        c = _build_testing_criteria(d)
+        assert c["type"] == "label_model"
+        assert c["model"] == "gpt-4o-mini"
+        assert c["labels"] == ["good", "bad"]
+        assert c["passing_labels"] == ["good"]
+        assert c["input"] == grader["input"]
+
+
+class TestBuildJsonlItems:
+    def test_text_similarity_includes_expected(self):
+        actual = [_invocation("hello")]
+        expected = [_invocation("world")]
+        items = _build_jsonl_items(actual, expected, include_expected=True)
+        assert len(items) == 1
+        assert "expected_response" in items[0]["item"]
+
+    def test_label_model_excludes_expected(self):
+        actual = [_invocation("hello")]
+        items = _build_jsonl_items(actual, [], include_expected=False)
+        assert len(items) == 1
+        assert "expected_response" not in items[0]["item"]
+        assert items[0]["item"]["actual_response"] is not None
+
+    def test_empty_expected_falls_back_to_empty_string(self):
+        actual = [_invocation("hello")]
+        items = _build_jsonl_items(actual, [], include_expected=True)
+        assert items[0]["item"]["expected_response"] == ""
+
+
+class TestSchemas:
+    def test_actual_only_has_no_expected(self):
+        assert "expected_response" not in _ACTUAL_ONLY_SCHEMA["properties"]
+
+    def test_text_pair_has_both(self):
+        assert "actual_response" in _TEXT_PAIR_SCHEMA["properties"]
+        assert "expected_response" in _TEXT_PAIR_SCHEMA["properties"]
+
+
+class TestEvaluateOpenAIEval:
+    @pytest.mark.asyncio
+    async def test_no_api_key_returns_error(self, monkeypatch):
+        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+        d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
+        result = await evaluate_openai_eval(d, [], [])
+        assert result.error is not None
+        assert "OPENAI_API_KEY" in result.error
+
+    @pytest.mark.asyncio
+    async def test_text_similarity_requires_expected(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
+        d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"})
+        result = await evaluate_openai_eval(d, [_invocation("hi")], None)
+        assert result.error is not None
+        assert "expected invocations" in result.error
+
+    @pytest.mark.asyncio
+    async def test_label_model_does_not_require_expected(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
+        monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
+        d = OpenAIEvalDef(name="lm", grader=_label_grader())
+        result = await evaluate_openai_eval(d, [_invocation("hi")], None)
+        assert "expected invocations" not in (result.error or "")