diff --git a/docs/custom-evaluators.md b/docs/custom-evaluators.md index fe6bb81..cf157fe 100644 --- a/docs/custom-evaluators.md +++ b/docs/custom-evaluators.md @@ -291,6 +291,26 @@ Remote evaluators are cached in `~/.cache/agentevals/evaluators/`. To force a re You can delegate grading to the [OpenAI Evals API](https://developers.openai.com/api/reference/resources/evals/methods/create) instead of running scoring logic locally. This requires `pip install "agentevals-cli[openai]"` and `OPENAI_API_KEY` to be set. +### Score Model Grader + +Uses a model to score each response without a golden eval set. The model reads the response and returns a numeric score within `range`. + +```yaml +evaluators: + - name: quality_score + type: openai_eval + threshold: 0.7 + grader: + type: score_model + model: gpt-4o-mini + input: + - role: user + content: "Rate the quality of this response from 0 to 1: {{ item.actual_response }}" + range: [0, 1] +``` + +The `range` field sets the minimum and maximum score the model can return (defaults to `[0, 1]`). The evaluator's `threshold` is sent to OpenAI as the grader's `pass_threshold`, so express it on the same scale as `range`. No eval set is needed. + ### Text Similarity Grader Compares the agent's response against a golden reference using text similarity metrics. Requires an eval set. 
diff --git a/examples/custom_evaluators/eval_config_openai_eval.yaml b/examples/custom_evaluators/eval_config_openai_eval.yaml index fb04802..d509d85 100644 --- a/examples/custom_evaluators/eval_config_openai_eval.yaml +++ b/examples/custom_evaluators/eval_config_openai_eval.yaml @@ -16,3 +16,13 @@ evaluators: content: "Rate this response: {{ item.actual_response }}" labels: [good, bad] passing_labels: [good] + - name: quality_score + type: openai_eval + threshold: 0.7 + grader: + type: score_model + model: gpt-4o-mini + input: + - role: user + content: "Rate the quality of this response from 0 to 1: {{ item.actual_response }}" + range: [0, 1] diff --git a/src/agentevals/config.py b/src/agentevals/config.py index 38cedcf..a1df4b4 100644 --- a/src/agentevals/config.py +++ b/src/agentevals/config.py @@ -113,8 +113,14 @@ def _validate_grader(cls, v: dict[str, Any]) -> dict[str, Any]: invalid = [lbl for lbl in v["passing_labels"] if lbl not in v["labels"]] if invalid: raise ValueError(f"passing_labels contains labels not declared in labels: {invalid}") + elif grader_type == "score_model": + for field in ("model", "input"): + if not v.get(field): + raise ValueError(f"'{field}' is required for score_model grader") else: - raise ValueError(f"Unsupported grader type: '{grader_type}'. Supported: label_model, text_similarity") + raise ValueError( + f"Unsupported grader type: '{grader_type}'. Supported: label_model, score_model, text_similarity" + ) return v diff --git a/src/agentevals/openai_eval_backend.py b/src/agentevals/openai_eval_backend.py index e3c5cab..17b5bf3 100644 --- a/src/agentevals/openai_eval_backend.py +++ b/src/agentevals/openai_eval_backend.py @@ -1,9 +1,4 @@ -"""OpenAI Evals API backend — delegates grading to the OpenAI Evals API. - -Builds testing criteria from the evaluator config, submits invocation pairs -as JSONL items, polls for completion, and maps per-item results back to a -MetricResult. 
-""" +"""OpenAI Evals API backend.""" from __future__ import annotations @@ -39,11 +34,6 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]: - """Build the OpenAI testing_criteria dict from the evaluator config. - - Each grader type produces a different shape. Extend this function - when adding support for new OpenAI grader types. - """ grader = evaluator_def.grader grader_type = grader["type"] @@ -67,12 +57,23 @@ def _build_testing_criteria(evaluator_def: OpenAIEvalDef) -> dict[str, Any]: "passing_labels": grader["passing_labels"], } + if grader_type == "score_model": + return { + "type": "score_model", + "name": evaluator_def.name, + "model": grader["model"], + "input": grader["input"], + "range": grader.get("range", [0, 1]), + "pass_threshold": evaluator_def.threshold, + } + raise ValueError(f"Unsupported grader type: {grader_type}") def _build_jsonl_items( actual_invocations: list[Invocation], expected_invocations: list[Invocation], + *, include_expected: bool = True, ) -> list[dict[str, Any]]: items = [] @@ -123,16 +124,14 @@ async def evaluate_openai_eval( ) grader_type = evaluator_def.grader["type"] - - if grader_type == "text_similarity" and expected_invocations is None: + needs_expected = grader_type == "text_similarity" + if needs_expected and expected_invocations is None: return MetricResult( metric_name=evaluator_def.name, - error="OpenAI text_similarity grader requires expected invocations (golden eval set).", + error=f"OpenAI {grader_type} grader requires expected invocations (golden eval set).", ) - items = _build_jsonl_items( - actual_invocations, expected_invocations or [], include_expected=(grader_type != "label_model") - ) + items = _build_jsonl_items(actual_invocations, expected_invocations or [], include_expected=needs_expected) if not items: return MetricResult( metric_name=evaluator_def.name, @@ -145,7 +144,7 @@ async def evaluate_openai_eval( try: client = await asyncio.to_thread(_get_openai_client) - item_schema 
= _ACTUAL_ONLY_SCHEMA if grader_type == "label_model" else _TEXT_PAIR_SCHEMA + item_schema = _TEXT_PAIR_SCHEMA if needs_expected else _ACTUAL_ONLY_SCHEMA eval_obj = await asyncio.to_thread( client.evals.create, name=f"agentevals-openai-{evaluator_def.name}", @@ -252,6 +251,9 @@ async def _collect_results(client: Any, eval_id: str, run_id: str, run: Any, eva elif grader["type"] == "label_model": details["model"] = grader.get("model") details["passing_labels"] = grader.get("passing_labels") + elif grader["type"] == "score_model": + details["model"] = grader.get("model") + details["range"] = grader.get("range", [0, 1]) per_criteria = getattr(run, "per_testing_criteria_results", None) if per_criteria: details["per_testing_criteria"] = [ diff --git a/tests/test_openai_eval_backend.py b/tests/test_openai_eval_backend.py index c58f0bd..22e21ba 100644 --- a/tests/test_openai_eval_backend.py +++ b/tests/test_openai_eval_backend.py @@ -1,6 +1,7 @@ -import pytest from unittest.mock import MagicMock +import pytest + from agentevals.config import OpenAIEvalDef from agentevals.openai_eval_backend import ( _build_jsonl_items, @@ -21,6 +22,16 @@ def _label_grader(**overrides): return base +def _score_grader(**overrides): + base = { + "type": "score_model", + "model": "gpt-4o-mini", + "input": [{"role": "user", "content": "Rate: {{ item.actual_response }}"}], + } + base.update(overrides) + return base + + def _invocation(text: str): inv = MagicMock() inv.final_response.parts = [MagicMock(text=text)] @@ -55,6 +66,15 @@ def test_label_model_passing_labels_not_in_labels(self): with pytest.raises(Exception, match="passing_labels"): OpenAIEvalDef(name="lm", grader=grader) + def test_score_model_valid(self): + d = OpenAIEvalDef(name="sc", grader=_score_grader()) + assert d.grader["type"] == "score_model" + + @pytest.mark.parametrize("field", ["model", "input"]) + def test_score_model_missing_required_field(self, field): + with pytest.raises(Exception, match=field): + 
OpenAIEvalDef(name="sc", grader=_score_grader(**{field: None})) + def test_unsupported_grader_type(self): with pytest.raises(Exception, match="Unsupported grader type"): OpenAIEvalDef(name="x", grader={"type": "unknown"}) @@ -80,13 +100,28 @@ def test_label_model_shape(self): assert c["passing_labels"] == ["good"] assert c["input"] == grader["input"] + def test_score_model_shape(self): + grader = _score_grader(range=[0, 5]) + d = OpenAIEvalDef(name="sc", grader=grader, threshold=0.6) + c = _build_testing_criteria(d) + assert c["type"] == "score_model" + assert c["model"] == "gpt-4o-mini" + assert c["range"] == [0, 5] + assert c["pass_threshold"] == 0.6 + assert c["input"] == grader["input"] + + def test_score_model_default_range(self): + d = OpenAIEvalDef(name="sc", grader=_score_grader()) + c = _build_testing_criteria(d) + assert c["range"] == [0, 1] + class TestBuildJsonlItems: def test_text_similarity_includes_expected(self): items = _build_jsonl_items([_invocation("hello")], [_invocation("world")], include_expected=True) assert "expected_response" in items[0]["item"] - def test_label_model_excludes_expected(self): + def test_excludes_expected_when_not_requested(self): items = _build_jsonl_items([_invocation("hello")], [], include_expected=False) assert "expected_response" not in items[0]["item"] @@ -114,3 +149,10 @@ async def test_label_model_does_not_require_expected(self, monkeypatch): d = OpenAIEvalDef(name="lm", grader=_label_grader()) result = await evaluate_openai_eval(d, [_invocation("hi")], None) assert "expected invocations" not in (result.error or "") + + async def test_score_model_does_not_require_expected(self, monkeypatch): + monkeypatch.setenv("OPENAI_API_KEY", "test-key") + monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None) + d = OpenAIEvalDef(name="sc", grader=_score_grader()) + result = await evaluate_openai_eval(d, [_invocation("hi")], None) + assert "expected invocations" not in (result.error or "")