agentevals/tests/test_openai_eval_backend.py at 8ecb9a419ac018057175d86ea0a021dcf2f8136a · agentevals-dev/agentevals · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from unittest.mock import MagicMock

import pytest

from agentevals.config import OpenAIEvalDef
from agentevals.openai_eval_backend import (
    _build_jsonl_items,
    _build_testing_criteria,
    evaluate_openai_eval,
)


def _label_grader(**overrides):
    base = {
        "type": "label_model",
        "model": "gpt-4o-mini",
        "input": [{"role": "user", "content": "Rate: {{ item.actual_response }}"}],
        "labels": ["good", "bad"],
        "passing_labels": ["good"],
    }
    base.update(overrides)
    return base


def _string_check_grader(**overrides):
    base = {"type": "string_check", "reference": "hello", "operation": "ilike"}
    base.update(overrides)
    return base


def _invocation(text: str):
    inv = MagicMock()
    inv.final_response.parts = [MagicMock(text=text)]
    return inv


class TestOpenAIEvalDefValidation:
    """Checks which grader configs ``OpenAIEvalDef`` accepts or rejects on construction."""

    def test_text_similarity_valid(self):
        """A text_similarity grader with the ``bleu`` metric constructs cleanly."""
        grader = {"type": "text_similarity", "evaluation_metric": "bleu"}
        eval_def = OpenAIEvalDef(name="sim", grader=grader)
        assert eval_def.grader["type"] == "text_similarity"

    def test_text_similarity_missing_metric(self):
        """Omitting ``evaluation_metric`` raises an error that names the field."""
        grader = {"type": "text_similarity"}
        with pytest.raises(Exception, match="evaluation_metric"):
            OpenAIEvalDef(name="sim", grader=grader)

    def test_text_similarity_bad_metric(self):
        """An unrecognised metric name is rejected."""
        grader = {"type": "text_similarity", "evaluation_metric": "invalid"}
        with pytest.raises(Exception, match="Unknown evaluation_metric"):
            OpenAIEvalDef(name="sim", grader=grader)

    def test_label_model_valid(self):
        """The default label_model grader constructs cleanly."""
        eval_def = OpenAIEvalDef(name="lm", grader=_label_grader())
        assert eval_def.grader["type"] == "label_model"

    @pytest.mark.parametrize("field", ["model", "input", "labels", "passing_labels"])
    def test_label_model_missing_required_field(self, field):
        """Setting any required label_model field to ``None`` raises an error naming it."""
        grader = _label_grader(**{field: None})
        with pytest.raises(Exception, match=field):
            OpenAIEvalDef(name="lm", grader=grader)

    def test_label_model_passing_labels_not_in_labels(self):
        """A passing label that is absent from ``labels`` is rejected."""
        grader = _label_grader(passing_labels=["unknown"])
        with pytest.raises(Exception, match="passing_labels"):
            OpenAIEvalDef(name="lm", grader=grader)

    def test_string_check_valid(self):
        """The default string_check grader constructs cleanly."""
        eval_def = OpenAIEvalDef(name="sc", grader=_string_check_grader())
        assert eval_def.grader["type"] == "string_check"

    @pytest.mark.parametrize("field", ["reference", "operation"])
    def test_string_check_missing_required(self, field):
        """Setting any required string_check field to ``None`` raises an error naming it."""
        grader = _string_check_grader(**{field: None})
        with pytest.raises(Exception, match=field):
            OpenAIEvalDef(name="sc", grader=grader)

    def test_string_check_bad_operation(self):
        """An unrecognised string_check operation is rejected."""
        grader = _string_check_grader(operation="bad")
        with pytest.raises(Exception, match="Invalid operation"):
            OpenAIEvalDef(name="sc", grader=grader)

    def test_unsupported_grader_type(self):
        """A grader whose ``type`` is not one of the handled kinds is rejected."""
        grader = {"type": "unknown"}
        with pytest.raises(Exception, match="Unsupported grader type"):
            OpenAIEvalDef(name="x", grader=grader)


class TestBuildTestingCriteria:
    """Checks the criteria dict ``_build_testing_criteria`` produces per grader type."""

    def test_text_similarity_shape(self):
        """text_similarity criteria carry the metric, the threshold and both item templates."""
        eval_def = OpenAIEvalDef(
            name="sim",
            grader={"type": "text_similarity", "evaluation_metric": "bleu"},
            threshold=0.7,
        )
        criteria = _build_testing_criteria(eval_def)
        assert criteria["type"] == "text_similarity"
        assert criteria["evaluation_metric"] == "bleu"
        assert criteria["pass_threshold"] == 0.7
        assert "{{ item.actual_response }}" in criteria["input"]
        assert "{{ item.expected_response }}" in criteria["reference"]

    def test_label_model_shape(self):
        """label_model criteria mirror the grader's model, labels and input messages."""
        grader = _label_grader()
        eval_def = OpenAIEvalDef(name="quality", grader=grader)
        criteria = _build_testing_criteria(eval_def)
        assert criteria["type"] == "label_model"
        assert criteria["model"] == "gpt-4o-mini"
        assert criteria["labels"] == ["good", "bad"]
        assert criteria["passing_labels"] == ["good"]
        assert criteria["input"] == grader["input"]

    def test_string_check_shape(self):
        """string_check criteria keep reference/operation and use the actual-response template as input."""
        grader = _string_check_grader(reference="ok", operation="eq")
        eval_def = OpenAIEvalDef(name="sc", grader=grader)
        criteria = _build_testing_criteria(eval_def)
        assert criteria["type"] == "string_check"
        assert criteria["reference"] == "ok"
        assert criteria["operation"] == "eq"
        assert "{{ item.actual_response }}" in criteria["input"]


class TestBuildJsonlItems:
    """Checks how ``_build_jsonl_items`` handles the ``expected_response`` field."""

    def test_includes_expected_when_requested(self):
        """With ``include_expected=True`` the first item carries an ``expected_response`` key."""
        actual = [_invocation("hello")]
        expected = [_invocation("world")]
        items = _build_jsonl_items(actual, expected, include_expected=True)
        assert "expected_response" in items[0]["item"]

    def test_excludes_expected_when_not_requested(self):
        """With ``include_expected=False`` the key is left out of the item."""
        actual = [_invocation("hello")]
        items = _build_jsonl_items(actual, [], include_expected=False)
        assert "expected_response" not in items[0]["item"]

    def test_missing_expected_falls_back_to_empty(self):
        """When expected is requested but none is supplied, the value is an empty string."""
        actual = [_invocation("hello")]
        items = _build_jsonl_items(actual, [], include_expected=True)
        assert items[0]["item"]["expected_response"] == ""


class TestEvaluateOpenAIEval:
    """Checks the early error paths of ``evaluate_openai_eval``.

    NOTE(review): these are coroutine tests with no explicit asyncio marker;
    presumably the project's pytest configuration runs them — confirm.
    """

    async def test_no_api_key_returns_error(self, monkeypatch):
        """Without ``OPENAI_API_KEY`` in the environment, the result error mentions the variable."""
        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
        grader = {"type": "text_similarity", "evaluation_metric": "bleu"}
        eval_def = OpenAIEvalDef(name="sim", grader=grader)
        outcome = await evaluate_openai_eval(eval_def, [], [])
        assert "OPENAI_API_KEY" in (outcome.error or "")

    async def test_text_similarity_requires_expected(self, monkeypatch):
        """text_similarity with ``None`` for expected invocations reports that they are needed."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
        grader = {"type": "text_similarity", "evaluation_metric": "bleu"}
        eval_def = OpenAIEvalDef(name="sim", grader=grader)
        outcome = await evaluate_openai_eval(eval_def, [_invocation("hi")], None)
        assert "expected invocations" in (outcome.error or "")

    async def test_label_model_does_not_require_expected(self, monkeypatch):
        """label_model with ``None`` for expected invocations does not hit the expected-invocations error."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
        # The client factory is swapped for a stub returning None; only the
        # absence of the expected-invocations error is asserted here.
        monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
        eval_def = OpenAIEvalDef(name="lm", grader=_label_grader())
        outcome = await evaluate_openai_eval(eval_def, [_invocation("hi")], None)
        assert "expected invocations" not in (outcome.error or "")

    async def test_string_check_does_not_require_expected(self, monkeypatch):
        """string_check with ``None`` for expected invocations does not hit the expected-invocations error."""
        monkeypatch.setenv("OPENAI_API_KEY", "test-key")
        # The client factory is swapped for a stub returning None; only the
        # absence of the expected-invocations error is asserted here.
        monkeypatch.setattr("agentevals.openai_eval_backend._get_openai_client", lambda: None)
        eval_def = OpenAIEvalDef(name="sc", grader=_string_check_grader())
        outcome = await evaluate_openai_eval(eval_def, [_invocation("hi")], None)
        assert "expected invocations" not in (outcome.error or "")