Skip to content

Commit afa88fa

Browse files
committed
test: add unit tests for openai_eval backend (string_check + text_similarity)
Covers config validation, item schema selection, testing criteria building, JSONL item construction, score extraction, and full evaluate_openai_eval flow for both grader types. Mocks the OpenAI client to avoid API calls. Addresses review feedback requesting tests to catch regressions.
1 parent ec33312 commit afa88fa

1 file changed

Lines changed: 341 additions & 0 deletions

File tree

tests/test_openai_eval_backend.py

Lines changed: 341 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,341 @@
1+
"""Unit tests for the OpenAI Evals backend — covers both text_similarity and string_check graders."""
2+
3+
import asyncio
4+
from unittest.mock import MagicMock, patch, AsyncMock
5+
6+
import pytest
7+
from pydantic import ValidationError
8+
9+
from agentevals.config import OpenAIEvalDef
10+
from agentevals.openai_eval_backend import (
11+
_build_jsonl_items,
12+
_build_testing_criteria,
13+
_extract_item_score,
14+
_get_item_schema,
15+
evaluate_openai_eval,
16+
)
17+
18+
19+
# ── Helpers ────────────────────────────────────────────────────────────────────
20+
21+
22+
def _make_invocation(text: str):
23+
"""Build a minimal Invocation-like object with a final_response."""
24+
inv = MagicMock()
25+
inv.final_response = text
26+
return inv
27+
28+
29+
def _make_text_similarity_def(**overrides) -> OpenAIEvalDef:
    """Build an ``OpenAIEvalDef`` configured with a fuzzy-match ``text_similarity`` grader.

    Each keyword argument replaces the default top-level field of the same
    name wholesale; the nested ``grader`` dict is not merged key by key.
    """
    fields = {
        "name": "test_similarity",
        "threshold": 0.7,
        "grader": {
            "type": "text_similarity",
            "evaluation_metric": "fuzzy_match",
        },
        **overrides,
    }
    return OpenAIEvalDef(**fields)
40+
41+
42+
def _make_string_check_def(**overrides) -> OpenAIEvalDef:
    """Build an ``OpenAIEvalDef`` configured with an ``eq``/``"Paris"`` ``string_check`` grader.

    Each keyword argument replaces the default top-level field of the same
    name wholesale; the nested ``grader`` dict is not merged key by key.
    """
    fields = {
        "name": "test_check",
        "grader": {
            "type": "string_check",
            "operation": "eq",
            "reference": "Paris",
        },
        **overrides,
    }
    return OpenAIEvalDef(**fields)
53+
54+
55+
# ── Config validation tests ────────────────────────────────────────────────────
56+
57+
58+
class TestOpenAIEvalDefValidation:
    """Verify that the pydantic validator enforces correct grader configs."""

    def test_text_similarity_requires_evaluation_metric(self):
        """A text_similarity grader without ``evaluation_metric`` must be rejected."""
        with pytest.raises(ValidationError, match="evaluation_metric"):
            OpenAIEvalDef(name="x", grader={"type": "text_similarity"})

    def test_text_similarity_rejects_invalid_metric(self):
        """An unrecognised ``evaluation_metric`` value must be rejected."""
        with pytest.raises(ValidationError, match="Unknown evaluation_metric"):
            OpenAIEvalDef(
                name="x",
                grader={"type": "text_similarity", "evaluation_metric": "bogus"},
            )

    # Parametrized so every metric is reported as its own test case; a loop
    # inside one test would stop at the first failing metric and hide the rest.
    @pytest.mark.parametrize("metric", ["fuzzy_match", "bleu", "cosine", "rouge_l"])
    def test_text_similarity_accepts_valid_metrics(self, metric):
        """Each supported metric validates and is kept unchanged on the grader."""
        d = OpenAIEvalDef(
            name="x",
            grader={"type": "text_similarity", "evaluation_metric": metric},
        )
        assert d.grader["evaluation_metric"] == metric

    def test_string_check_requires_operation(self):
        """A string_check grader without ``operation`` must be rejected."""
        with pytest.raises(ValidationError, match="operation"):
            OpenAIEvalDef(name="x", grader={"type": "string_check", "reference": "hi"})

    def test_string_check_requires_reference(self):
        """A string_check grader without ``reference`` must be rejected."""
        with pytest.raises(ValidationError, match="reference"):
            OpenAIEvalDef(name="x", grader={"type": "string_check", "operation": "eq"})

    def test_string_check_rejects_invalid_operation(self):
        """An unrecognised ``operation`` value must be rejected."""
        with pytest.raises(ValidationError, match="Unknown operation"):
            OpenAIEvalDef(
                name="x",
                grader={"type": "string_check", "operation": "contains", "reference": "hi"},
            )

    # Parametrized for the same reason as the metric test above: one reported
    # case per operation instead of a single all-or-nothing loop.
    @pytest.mark.parametrize("op", ["eq", "ne", "like", "ilike"])
    def test_string_check_accepts_valid_operations(self, op):
        """Each supported operation validates and is kept unchanged on the grader."""
        d = OpenAIEvalDef(
            name="x",
            grader={"type": "string_check", "operation": op, "reference": "val"},
        )
        assert d.grader["operation"] == op

    def test_unsupported_grader_type_raises(self):
        """A grader ``type`` outside the supported set must be rejected."""
        with pytest.raises(ValidationError, match="Unsupported grader type"):
            OpenAIEvalDef(name="x", grader={"type": "model_graded"})
106+
107+
108+
# ── Item schema tests ──────────────────────────────────────────────────────────
109+
110+
111+
class TestGetItemSchema:
    """Check which item fields ``_get_item_schema`` declares for each grader type."""

    def test_string_check_schema_has_actual_only(self):
        """string_check items require only the actual response."""
        schema = _get_item_schema("string_check")
        required, properties = schema["required"], schema["properties"]
        assert required == ["actual_response"]
        assert "expected_response" not in properties

    def test_text_similarity_schema_has_both(self):
        """text_similarity items require both the actual and the expected response."""
        required = _get_item_schema("text_similarity")["required"]
        assert "actual_response" in required
        assert "expected_response" in required
121+
122+
123+
# ── Testing criteria tests ─────────────────────────────────────────────────────
124+
125+
126+
class TestBuildTestingCriteria:
    """Check the criteria dict ``_build_testing_criteria`` produces for each grader type."""

    def test_text_similarity_criteria(self):
        """text_similarity criteria carry the metric, the threshold and a templated reference."""
        criteria = _build_testing_criteria(_make_text_similarity_def(threshold=0.8))
        assert criteria["type"] == "text_similarity"
        assert criteria["evaluation_metric"] == "fuzzy_match"
        assert criteria["pass_threshold"] == 0.8
        assert criteria["reference"] == "{{ item.expected_response }}"

    def test_string_check_criteria(self):
        """string_check criteria carry the operation, a literal reference and a templated input."""
        criteria = _build_testing_criteria(_make_string_check_def())
        assert criteria["type"] == "string_check"
        assert criteria["operation"] == "eq"
        assert criteria["reference"] == "Paris"
        assert criteria["input"] == "{{ item.actual_response }}"
        assert "pass_threshold" not in criteria

    def test_unsupported_grader_raises(self):
        """An unknown grader type reaching the builder raises ``ValueError``."""
        eval_def = _make_text_similarity_def()
        # Assign after construction so the model validator does not reject the
        # bogus type before the function under test sees it.
        eval_def.grader = {"type": "unknown"}
        with pytest.raises(ValueError, match="Unsupported grader type"):
            _build_testing_criteria(eval_def)
150+
151+
152+
# ── JSONL item building tests ──────────────────────────────────────────────────
153+
154+
155+
class TestBuildJsonlItems:
    """Check the per-invocation items produced by ``_build_jsonl_items``."""

    @staticmethod
    def _passthrough_content_to_text():
        """Return a patcher that makes ``_content_to_text`` hand back its argument unchanged."""
        return patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x)

    def test_text_similarity_includes_expected(self):
        """text_similarity items carry both the actual and the expected response."""
        actual_invs = [_make_invocation("hello")]
        expected_invs = [_make_invocation("hi")]

        with self._passthrough_content_to_text():
            items = _build_jsonl_items(actual_invs, expected_invs, grader_type="text_similarity")

        assert len(items) == 1
        payload = items[0]["item"]
        assert payload["actual_response"] == "hello"
        assert payload["expected_response"] == "hi"

    def test_string_check_excludes_expected(self):
        """string_check items carry the actual response only."""
        actual_invs = [_make_invocation("Paris")]
        expected_invs = [_make_invocation("ignored")]

        with self._passthrough_content_to_text():
            items = _build_jsonl_items(actual_invs, expected_invs, grader_type="string_check")

        assert len(items) == 1
        payload = items[0]["item"]
        assert payload["actual_response"] == "Paris"
        assert "expected_response" not in payload

    def test_missing_expected_uses_empty_string(self):
        """An actual invocation with no expected counterpart gets an empty expected response."""
        actual_invs = [_make_invocation("a"), _make_invocation("b")]
        expected_invs = [_make_invocation("x")]

        with self._passthrough_content_to_text():
            items = _build_jsonl_items(actual_invs, expected_invs, grader_type="text_similarity")

        second_payload = items[1]["item"]
        assert second_payload["expected_response"] == ""

    def test_empty_invocations_returns_empty(self):
        """No invocations in means no items out."""
        with self._passthrough_content_to_text():
            items = _build_jsonl_items([], [], grader_type="string_check")
        assert items == []
191+
192+
193+
# ── Item score extraction ──────────────────────────────────────────────────────
194+
195+
196+
class TestExtractItemScore:
    """Check how ``_extract_item_score`` reads a score from an output item."""

    def test_returns_score(self):
        """The score of the item's single result is returned."""
        output_item = MagicMock(results=[MagicMock(score=0.85)])
        assert _extract_item_score(output_item) == 0.85

    def test_returns_none_when_no_results(self):
        """An item with an empty ``results`` list yields ``None``."""
        output_item = MagicMock(results=[])
        assert _extract_item_score(output_item) is None

    def test_returns_none_when_results_attr_missing(self):
        """An item with no ``results`` attribute at all yields ``None``."""
        # spec=[] gives a mock with no attributes, so reading .results raises AttributeError.
        bare_item = MagicMock(spec=[])
        assert _extract_item_score(bare_item) is None
212+
213+
214+
# ── Integration tests (mocked OpenAI client) ───────────────────────────────────
215+
216+
217+
class TestEvaluateOpenAIEval:
    """Run ``evaluate_openai_eval`` end to end against a mocked OpenAI client."""

    def _make_mock_client(self, run_status="completed", scores=None):
        """Create a fully mocked OpenAI client.

        ``run_status`` is the status reported by the retrieved run. ``scores``
        supplies one score per mocked output item; every item is counted as
        passed in ``result_counts``.
        """
        item_scores = scores or []
        item_count = len(item_scores)

        client = MagicMock()

        # evals.create
        client.evals.create.return_value = MagicMock(id="eval_123")

        # evals.runs.create
        client.evals.runs.create.return_value = MagicMock(id="run_456")

        # evals.runs.retrieve
        client.evals.runs.retrieve.return_value = MagicMock(
            status=run_status,
            result_counts=MagicMock(passed=item_count, failed=0, total=item_count),
            per_testing_criteria_results=None,
        )

        # evals.runs.output_items.list
        output_items = [MagicMock(results=[MagicMock(score=value)]) for value in item_scores]
        client.evals.runs.output_items.list.return_value = MagicMock(data=output_items)

        # evals.delete
        client.evals.delete.return_value = None

        return client

    @staticmethod
    def _run(eval_def, actual, expected):
        """Drive the async ``evaluate_openai_eval`` coroutine to completion and return its result."""
        return asyncio.run(evaluate_openai_eval(eval_def, actual, expected))

    @patch.dict("os.environ", {"OPENAI_API_KEY": "test-key"})
    @patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x)
    @patch("agentevals.openai_eval_backend._get_openai_client")
    def test_string_check_success(self, mock_get_client, mock_content):
        """A single passing string_check item yields score 1.0 and PASSED status."""
        mock_get_client.return_value = self._make_mock_client(scores=[1.0])

        outcome = self._run(_make_string_check_def(), [_make_invocation("Paris")], None)

        assert outcome.error is None
        assert outcome.score == 1.0
        assert outcome.eval_status == "PASSED"
        assert outcome.details["operation"] == "eq"

    @patch.dict("os.environ", {"OPENAI_API_KEY": "test-key"})
    @patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x)
    @patch("agentevals.openai_eval_backend._get_openai_client")
    def test_text_similarity_requires_expected(self, mock_get_client, mock_content):
        """text_similarity without expected invocations reports an error."""
        outcome = self._run(_make_text_similarity_def(), [_make_invocation("hello")], None)

        assert outcome.error is not None
        assert "expected invocations" in outcome.error

    @patch.dict("os.environ", {"OPENAI_API_KEY": "test-key"})
    @patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x)
    @patch("agentevals.openai_eval_backend._get_openai_client")
    def test_text_similarity_success(self, mock_get_client, mock_content):
        """Two scored items produce the mean score (0.9 and 0.8 give 0.85)."""
        mock_get_client.return_value = self._make_mock_client(scores=[0.9, 0.8])

        actual_invs = [_make_invocation("hello"), _make_invocation("world")]
        expected_invs = [_make_invocation("hi"), _make_invocation("earth")]

        outcome = self._run(_make_text_similarity_def(), actual_invs, expected_invs)

        assert outcome.error is None
        assert outcome.score == pytest.approx(0.85)
        assert outcome.details["evaluation_metric"] == "fuzzy_match"

    @patch.dict("os.environ", {"OPENAI_API_KEY": ""})
    def test_missing_api_key_returns_error(self):
        """An empty OPENAI_API_KEY is reported as an error naming the variable."""
        outcome = self._run(_make_string_check_def(), [_make_invocation("Paris")], None)

        assert outcome.error is not None
        assert "OPENAI_API_KEY" in outcome.error

    @patch.dict("os.environ", {"OPENAI_API_KEY": "test-key"})
    @patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x)
    @patch("agentevals.openai_eval_backend._get_openai_client")
    def test_string_check_no_expected_needed(self, mock_get_client, mock_content):
        """string_check grader should work without expected_invocations (None)."""
        mock_get_client.return_value = self._make_mock_client(scores=[1.0])

        outcome = self._run(_make_string_check_def(), [_make_invocation("Paris")], None)

        # Verify it didn't short-circuit with an error
        assert outcome.error is None
        assert outcome.eval_status == "PASSED"

    @patch.dict("os.environ", {"OPENAI_API_KEY": "test-key"})
    @patch("agentevals.openai_eval_backend._content_to_text", side_effect=lambda x: x)
    @patch("agentevals.openai_eval_backend._get_openai_client")
    def test_empty_invocations_returns_error(self, mock_get_client, mock_content):
        """An empty list of actual invocations is reported as an error."""
        outcome = self._run(_make_string_check_def(), [], None)

        assert outcome.error is not None
        assert "No invocations" in outcome.error

0 commit comments

Comments
 (0)