From cdd39e16d7dbec4fb5221359f527c213f82fa12c Mon Sep 17 00:00:00 2001 From: mesutoezdil Date: Tue, 12 May 2026 20:17:12 +0200 Subject: [PATCH 1/2] feat: add score_model grader type to OpenAI Evals backend Closes #96 score_model scores agent responses using a model judge, no golden set needed. Config requires model and input. range defaults to [0, 1]. --- docs/custom-evaluators.md | 20 ++++ examples/custom_evaluators/eval_config.yaml | 12 +++ src/agentevals/config.py | 19 ++-- src/agentevals/openai_eval_backend.py | 59 +++++----- tests/test_openai_eval_backend.py | 113 ++++++++++++++++++++ 5 files changed, 189 insertions(+), 34 deletions(-) create mode 100644 tests/test_openai_eval_backend.py diff --git a/docs/custom-evaluators.md b/docs/custom-evaluators.md index 592dd25..9f1443d 100644 --- a/docs/custom-evaluators.md +++ b/docs/custom-evaluators.md @@ -291,6 +291,26 @@ Remote evaluators are cached in `~/.cache/agentevals/evaluators/`. To force a re You can delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) instead of running scoring logic locally. This requires `pip install "agentevals-cli[openai]"` and `OPENAI_API_KEY` to be set. +### Score Model Grader + +Uses a model to score each response without a golden set. The model reads the response and returns a float. + +```yaml +evaluators: + - name: quality_score + type: openai_eval + threshold: 0.7 + grader: + type: score_model + model: gpt-4o-mini + input: + - role: user + content: "Rate the quality of this response from 0 to 1: {{ item.actual_response }}" + range: [0, 1] +``` + +The `range` field sets the min and max the model can return (defaults to `[0, 1]`). No eval set is needed. + ### Text Similarity Grader Compares the agent's response against a golden reference using text similarity metrics. Requires an eval set. diff --git a/examples/custom_evaluators/eval_config.yaml b/examples/custom_evaluators/eval_config.yaml index d3bd261..b7eac09 100644 --- a/examples/custom_evaluators/eval_config.yaml +++ b/examples/custom_evaluators/eval_config.yaml @@ -25,6 +25,18 @@ evaluators: config: min_response_length: 20 + # OpenAI score_model grader (no golden set needed) + - name: quality_score + type: openai_eval + threshold: 0.7 + grader: + type: score_model + model: gpt-4o-mini + input: + - role: user + content: "Rate the quality of this response from 0 to 1: {{ item.actual_response }}" + range: [0, 1] + # Reference an evaluator from Github - name: random_evaluator type: remote diff --git a/src/agentevals/config.py b/src/agentevals/config.py index da8e776..cffe5a1 100644 --- a/src/agentevals/config.py +++ b/src/agentevals/config.py @@ -84,13 +84,18 @@ class OpenAIEvalDef(BaseModel): @classmethod def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]: grader_type = v.get("type") - if grader_type != "text_similarity": - raise ValueError(f"Only 'text_similarity' grader type is currently supported, got '{grader_type}'") - metric = v.get("evaluation_metric") - if not metric: - raise ValueError("'evaluation_metric' is required for text_similarity grader") - if metric not in _VALID_SIMILARITY_METRICS: - raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}") + if grader_type == "text_similarity": + metric = v.get("evaluation_metric") + if not metric: + raise ValueError("'evaluation_metric' is required for text_similarity grader") + if metric not in _VALID_SIMILARITY_METRICS: + raise ValueError(f"Unknown evaluation_metric '{metric}'. Valid: {sorted(_VALID_SIMILARITY_METRICS)}") + elif grader_type == "score_model": + for field in ("model", "input"): + if not v.get(field): + raise ValueError(f"'{field}' is required for score_model grader") + else: + raise ValueError(f"Unsupported grader type: '{grader_type}'") return v diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py index a6e9c00..dbd8fc5 100644 --- a/src/agentevals/openai_eval_backend.py +++ b/src/agentevals/openai_eval_backend.py @@ -1,9 +1,4 @@ -"""OpenAI Evals API backend — delegates grading to the OpenAI Evals API. - -Builds testing criteria from the evaluator config, submits invocation pairs -as JSONL items, polls for completion, and maps per-item results back to a -MetricResult. -""" +"""OpenAI Evals API backend.""" from __future__ import annotations @@ -31,13 +26,16 @@ "required": ["actual_response", "expected_response"], } +_ACTUAL_ONLY_SCHEMA = { + "type": "object", + "properties": { + "actual_response": {"type": "string"}, + }, + "required": ["actual_response"], +} -def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]: - """Build the OpenAI testing_criteria dict from the evaluator config. - Each grader type produces a different shape. Extend this function - when adding support for new OpenAI grader types. - """ +def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]: grader = evaluator_def.grader grader_type = grader["type"] @@ -51,28 +49,32 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]: "pass_threshold": evaluator_def.threshold, } + if grader_type == "score_model": + return { + "type": "score_model", + "name": evaluator_def.name, + "model": grader["model"], + "input": grader["input"], + "range": grader.get("range", [0, 1]), + "pass_threshold": evaluator_def.threshold, + } + raise ValueError(f"Unsupported grader type: {grader_type}") def _build_jsonl_items( actual_invocations: list[Invocation], expected_invocations: list[Invocation], + *, + include_expected: bool = True, ) -> list[dict[str, Any]]: items = [] for i, actual_inv in enumerate(actual_invocations): actual_text = _content_to_text(actual_inv.final_response) - if i < len(expected_invocations): - expected_text = _content_to_text(expected_invocations[i].final_response) - else: - expected_text = "" - items.append( - { - "item": { - "actual_response": actual_text, - "expected_response": expected_text, - } - } - ) + item: dict[str, Any] = {"actual_response": actual_text} + if include_expected: + item["expected_response"] = _content_to_text(expected_invocations[i].final_response) if i < len(expected_invocations) else "" + items.append({"item": item}) return items @@ -111,13 +113,15 @@ async def evaluate_openai_eval( error="OPENAI_API_KEY environment variable is not set.", ) - if expected_invocations is None: + grader_type = evaluator_def.grader.get("type") + needs_expected = grader_type == "text_similarity" + if needs_expected and expected_invocations is None: return MetricResult( metric_name=evaluator_def.name, - error="OpenAI text_similarity grader requires expected invocations (golden eval set).", + error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).", ) - items = _build_jsonl_items(actual_invocations, expected_invocations) + items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=needs_expected) if not items: return MetricResult( metric_name=evaluator_def.name, @@ -130,12 +134,13 @@ async def evaluate_openai_eval( try: client = await asyncio.to_thread(_get_openai_client) + item_schema = _TEXT_PAIR_SCHEMA if needs_expected else _ACTUAL_ONLY_SCHEMA eval_obj = await asyncio.to_thread( client.evals.create, name=f"agentevals-{evaluator_def.name}", data_source_config={ "type": "custom", - "item_schema": _TEXT_PAIR_SCHEMA, + "item_schema": item_schema, "include_sample_schema": False, }, testing_criteria=[testing_criteria], diff --git a/tests/test_openai_eval_backend.py b/tests/test_openai_eval_backend.py new file mode 100644 index 0000000..0c3d44d --- /dev/null +++ b/tests/test_openai_eval_backend.py @@ -0,0 +1,113 @@ +import pytest +from unittest.mock import MagicMock + +from agentevals.config import OpenAIEvalDef +from agentevals.openai_eval_backend import ( + _build_jsonl_items, + _build_testing_criteria, + evaluate_openai_eval, +) + + +def _score_grader(**overrides): + base = { + "type": "score_model", + "model": "gpt-4o-mini", + "input": [{"role": "user", "content": "Rate: {{ item.actual_response }}"}], + } + base.update(overrides) + return base + + +def _invocation(text: str): + inv = MagicMock() + inv.final_response.parts = [MagicMock(text=text)] + return inv + + +class TestOpenAIEvalDefValidation: + def test_text_similarity_valid(self): + d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"}) + assert d.grader["type"] == "text_similarity" + + def test_text_similarity_missing_metric(self): + with pytest.raises(Exception, match="evaluation_metric"): + OpenAIEvalDef(name="sim", grader={"type": "text_similarity"}) + + def test_text_similarity_bad_metric(self): + with pytest.raises(Exception, match="Unknown evaluation_metric"): + OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "invalid"}) + + def test_score_model_valid(self): + d = OpenAIEvalDef(name="sc", grader=_score_grader()) + assert d.grader["type"] == "score_model" + + @pytest.mark.parametrize("field", ["model", "input"]) + def test_score_model_missing_required_field(self, field): + with pytest.raises(Exception, match=field): + OpenAIEvalDef(name="sc", grader=_score_grader(**{field: None})) + + def test_unsupported_grader_type(self): + with pytest.raises(Exception, match="Unsupported grader type"): + OpenAIEvalDef(name="x", grader={"type": "unknown"}) + + +class TestBuildTestingCriteria: + def test_text_similarity_shape(self): + d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"}, threshold=0.7) + c = _build_testing_criteria(d) + assert c["type"] == "text_similarity" + assert c["evaluation_metric"] == "bleu" + assert c["pass_threshold"] == 0.7 + assert "{{ item.actual_response }}" in c["input"] + assert "{{ item.expected_response }}" in c["reference"] + + def test_score_model_shape(self): + grader = _score_grader(range=[1, 5]) + d = OpenAIEvalDef(name="sc", grader=grader, threshold=3.0) + c = _build_testing_criteria(d) + assert c["type"] == "score_model" + assert c["model"] == "gpt-4o-mini" + assert c["range"] == [1, 5] + assert c["pass_threshold"] == 3.0 + assert c["input"] == grader["input"] + + def test_score_model_default_range(self): + d = OpenAIEvalDef(name="sc", grader=_score_grader()) + c = _build_testing_criteria(d) + assert c["range"] == [0, 1] + + +class TestBuildJsonlItems: + def test_text_similarity_includes_expected(self): + items = _build_jsonl_items([_invocation("hello")], [_invocation("world")], include_expected=True) + assert "expected_response" in items[0]["item"] + + def test_score_model_excludes_expected(self): + items = _build_jsonl_items([_invocation("hello")], [], include_expected=False) + assert "expected_response" not in items[0]["item"] + + def test_missing_expected_falls_back_to_empty(self): + items = _build_jsonl_items([_invocation("hello")], [], include_expected=True) + assert items[0]["item"]["expected_response"] == "" + + +class TestEvaluateOpenAIEval: + async def test_no_api_key_returns_error(self, monkeypatch): + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"}) + result = await evaluate_openai_eval(d, [], []) + assert "OPENAI_API_KEY" in (result.error or "") + + async def test_text_similarity_requires_expected(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-key") + d = OpenAIEvalDef(name="sim", grader={"type": "text_similarity", "evaluation_metric": "bleu"}) + result = await evaluate_openai_eval(d, [_invocation("hi")], None) + assert "expected invocations" in (result.error or "") + + async def test_score_model_does_not_require_expected(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-key") + monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None) + d = OpenAIEvalDef(name="sc", grader=_score_grader()) + result = await evaluate_openai_eval(d, [_invocation("hi")], None) + assert "expected invocations" not in (result.error or "") From f95e793adad8c87ff4f4084ddb85a207377f4595 Mon Sep 17 00:00:00 2001 From: mesutoezdil Date: Fri, 15 May 2026 10:27:08 +0200 Subject: [PATCH 2/2] move score_model example to eval_config_openai_eval.yaml --- examples/custom_evaluators/eval_config.yaml | 12 ------------ .../eval_config_openai_eval.yaml | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 12 deletions(-) create mode 100644 examples/custom_evaluators/eval_config_openai_eval.yaml diff --git a/examples/custom_evaluators/eval_config.yaml b/examples/custom_evaluators/eval_config.yaml index b7eac09..d3bd261 100644 --- a/examples/custom_evaluators/eval_config.yaml +++ b/examples/custom_evaluators/eval_config.yaml @@ -25,18 +25,6 @@ evaluators: config: min_response_length: 20 - # OpenAI score_model grader (no golden set needed) - - name: quality_score - type: openai_eval - threshold: 0.7 - grader: - type: score_model - model: gpt-4o-mini - input: - - role: user - content: "Rate the quality of this response from 0 to 1: {{ item.actual_response }}" - range: [0, 1] - # Reference an evaluator from Github - name: random_evaluator type: remote diff --git a/examples/custom_evaluators/eval_config_openai_eval.yaml b/examples/custom_evaluators/eval_config_openai_eval.yaml new file mode 100644 index 0000000..97a82a6 --- /dev/null +++ b/examples/custom_evaluators/eval_config_openai_eval.yaml @@ -0,0 +1,18 @@ +# Eval config using OpenAI Evals API graders. +# Requires OPENAI_API_KEY to be set. +# +# Run with: +# agentevals run samples/helm.json \ +# --config examples/custom_evaluators/eval_config_openai_eval.yaml + +evaluators: + - name: quality_score + type: openai_eval + threshold: 0.7 + grader: + type: score_model + model: gpt-4o-mini + input: + - role: user + content: "Rate the quality of this response from 0 to 1: {{ item.actual_response }}" + range: [0, 1]