agentevals-dev · mesutoezdil · May 12, 2026 · May 15, 2026 · May 15, 2026
diff --git a/docs/custom-evaluators.md b/docs/custom-evaluators.md
@@ -291,6 +291,26 @@ Remote evaluators are cached in `~/.cache/agentevals/evaluators/`. To force a re
 
 You can delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) instead of running scoring logic locally. This requires `pip install "agentevals-cli[openai]"` and `OPENAI_API_KEY` to be set.
 
+### Score Model Grader
+
+Uses a model to assign each response a numeric score; no golden eval set is required. The grader model reads the response and returns a float within the configured `range`.
+
+```yaml
+evaluators:
+  - name: quality_score
+    type: openai_eval
+    threshold: 0.7
+    grader:
+      type: score_model
+      model: gpt-4o-mini
+      input:
+        - role: user
+          content: "Rate the quality of this response from 0 to 1: {{ item.actual_response }}"
+      range: [0, 1]
+```
+
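+The `range` field sets the minimum and maximum score the model can return (defaults to `[0, 1]`). The evaluator's `threshold` is passed to OpenAI unchanged as the grader's `pass_threshold`, so express it on the same scale as `range`. No eval set is needed.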
+
 ### Text Similarity Grader
 
 Compares the agent's response against a golden reference using text similarity metrics. Requires an eval set.

diff --git a/examples/custom_evaluators/eval_config_openai_eval.yaml b/examples/custom_evaluators/eval_config_openai_eval.yaml
@@ -16,3 +16,13 @@ evaluators:
           content: "Rate this response: {{ item.actual_response }}"
       labels: [good, bad]
       passing_labels: [good]
+  - name: quality_score
+    type: openai_eval
+    threshold: 0.7
+    grader:
+      type: score_model
+      model: gpt-4o-mini
+      input:
+        - role: user
+          content: "Rate the quality of this response from 0 to 1: {{ item.actual_response }}"
+      range: [0, 1]
diff --git a/src/agentevals/config.py b/src/agentevals/config.py
@@ -113,8 +113,14 @@ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]:
             invalid = [lbl for lbl in v["passing_labels"] if lbl not in v["labels"]]
             if invalid:
                 raise ValueError(f"passing_labels contains labels not declared in labels: {invalid}")
+        elif grader_type == "score_model":
+            for field in ("model", "input"):
+                if not v.get(field):
+                    raise ValueError(f"'{field}' is required for score_model grader")
         else:
-            raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: label_model, text_similarity")
+            raise ValueError(
+                f"Unsupported grader type: '{grader_type}'. Supported: label_model, score_model, text_similarity"
+            )
         return v
 
 

diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py
@@ -1,9 +1,4 @@
-"""OpenAI Evals API backend — delegates grading to the OpenAI Evals API.
-
-Builds testing criteria from the evaluator config, submits invocation pairs
-as JSONL items, polls for completion, and maps per-item results back to a
-MetricResult.
-"""
+"""OpenAI Evals API backend."""
 
 from __future__ import annotations
 
@@ -39,11 +34,6 @@
 
 
 def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
-    """Build the OpenAI testing_criteria dict from the evaluator config.
-
-    Each grader type produces a different shape.  Extend this function
-    when adding support for new OpenAI grader types.
-    """
     grader = evaluator_def.grader
     grader_type = grader["type"]
 
@@ -67,12 +57,23 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]:
             "passing_labels": grader["passing_labels"],
         }
 
+    if grader_type == "score_model":
+        return {
+            "type": "score_model",
+            "name": evaluator_def.name,
+            "model": grader["model"],
+            "input": grader["input"],
+            "range": grader.get("range", [0, 1]),
+            "pass_threshold": evaluator_def.threshold,
+        }
+
     raise ValueError(f"Unsupported grader type: {grader_type}")
 
 
 def _build_jsonl_items(
     actual_invocations: list[Invocation],
     expected_invocations: list[Invocation],
+    *,
     include_expected: bool = True,
 ) -> list[dict[str, Any]]:
     items = []
@@ -123,16 +124,14 @@ async def evaluate_openai_eval(
         )
 
     grader_type = evaluator_def.grader["type"]
-
-    if grader_type == "text_similarity" and expected_invocations is None:
+    needs_expected = grader_type == "text_similarity"
+    if needs_expected and expected_invocations is None:
         return MetricResult(
             metric_name=evaluator_def.name,
-            error="OpenAI text_similarity grader requires expected invocations (golden eval set).",
+            error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).",
         )
 
-    items = _build_jsonl_items(
-        actual_invocations, expected_invocations or [], include_expected=(grader_type != "label_model")
-    )
+    items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=needs_expected)
     if not items:
         return MetricResult(
             metric_name=evaluator_def.name,
@@ -145,7 +144,7 @@ async def evaluate_openai_eval(
     try:
         client = await asyncio.to_thread(_get_openai_client)
 
-        item_schema = _ACTUAL_ONLY_SCHEMA if grader_type == "label_model" else _TEXT_PAIR_SCHEMA
+        item_schema = _TEXT_PAIR_SCHEMA if needs_expected else _ACTUAL_ONLY_SCHEMA
         eval_obj = await asyncio.to_thread(
             client.evals.create,
             name=f"agentevals-openai-{evaluator_def.name}",
@@ -252,6 +251,9 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva
     elif grader["type"] == "label_model":
         details["model"] = grader.get("model")
         details["passing_labels"] = grader.get("passing_labels")
+    elif grader["type"] == "score_model":
+        details["model"] = grader.get("model")
+        details["range"] = grader.get("range", [0, 1])
     per_criteria = getattr(run, "per_testing_criteria_results", None)
     if per_criteria:
         details["per_testing_criteria"] = [

diff --git a/tests/test_openai_eval_backend.py b/tests/test_openai_eval_backend.py
@@ -1,6 +1,7 @@
-import pytest
 from unittest.mock import MagicMock
 
+import pytest
+
 from agentevals.config import OpenAIEvalDef
 from agentevals.openai_eval_backend import (
     _build_jsonl_items,
@@ -21,6 +22,16 @@ def _label_grader(**overrides):
     return base
 
 
+def _score_grader(**overrides):
+    base = {
+        "type": "score_model",
+        "model": "gpt-4o-mini",
+        "input": [{"role": "user", "content": "Rate: {{ item.actual_response }}"}],
+    }
+    base.update(overrides)
+    return base
+
+
 def _invocation(text: str):
     inv = MagicMock()
     inv.final_response.parts = [MagicMock(text=text)]
@@ -55,6 +66,15 @@ def test_label_model_passing_labels_not_in_labels(self):
         with pytest.raises(Exception, match="passing_labels"):
             OpenAIEvalDef(name="lm", grader=grader)
 
+    def test_score_model_valid(self):
+        d = OpenAIEvalDef(name="sc", grader=_score_grader())
+        assert d.grader["type"] == "score_model"
+
+    @pytest.mark.parametrize("field", ["model", "input"])
+    def test_score_model_missing_required_field(self, field):
+        with pytest.raises(Exception, match=field):
+            OpenAIEvalDef(name="sc", grader=_score_grader(**{field: None}))
+
     def test_unsupported_grader_type(self):
         with pytest.raises(Exception, match="Unsupported grader type"):
             OpenAIEvalDef(name="x", grader={"type": "unknown"})
@@ -80,13 +100,28 @@ def test_label_model_shape(self):
         assert c["passing_labels"] == ["good"]
         assert c["input"] == grader["input"]
 
+    def test_score_model_shape(self):
+        grader = _score_grader(range=[0, 5])
+        d = OpenAIEvalDef(name="sc", grader=grader, threshold=0.6)
+        c = _build_testing_criteria(d)
+        assert c["type"] == "score_model"
+        assert c["model"] == "gpt-4o-mini"
+        assert c["range"] == [0, 5]
+        assert c["pass_threshold"] == 0.6
+        assert c["input"] == grader["input"]
+
+    def test_score_model_default_range(self):
+        d = OpenAIEvalDef(name="sc", grader=_score_grader())
+        c = _build_testing_criteria(d)
+        assert c["range"] == [0, 1]
+
 
 class TestBuildJsonlItems:
     def test_text_similarity_includes_expected(self):
         items = _build_jsonl_items([_invocation("hello")], [_invocation("world")], include_expected=True)
         assert "expected_response" in items[0]["item"]
 
-    def test_label_model_excludes_expected(self):
+    def test_excludes_expected_when_not_requested(self):
         items = _build_jsonl_items([_invocation("hello")], [], include_expected=False)
         assert "expected_response" not in items[0]["item"]
 
@@ -114,3 +149,10 @@ async def test_label_model_does_not_require_expected(self, monkeypatch):
         d = OpenAIEvalDef(name="lm", grader=_label_grader())
         result = await evaluate_openai_eval(d, [_invocation("hi")], None)
         assert "expected invocations" not in (result.error or "")
+
+    async def test_score_model_does_not_require_expected(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
+        monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
+        d = OpenAIEvalDef(name="sc", grader=_score_grader())
+        result = await evaluate_openai_eval(d, [_invocation("hi")], None)
+        assert "expected invocations" not in (result.error or "")