Skip to content

Commit deaf52c

Browse files
committed
refactor: remove Custom deterministic evaluator
Remove the Custom evaluator class and all associated exports and tests. The Custom evaluator allowed users to pass arbitrary callback functions, but this functionality can be achieved by subclassing the base Evaluator directly, making the Custom wrapper unnecessary.
1 parent 0a7168b commit deaf52c

File tree

4 files changed

+4
-82
lines changed

4 files changed

+4
-82
lines changed

src/strands_evals/evaluators/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
from .coherence_evaluator import CoherenceEvaluator
22
from .conciseness_evaluator import ConcisenessEvaluator
3-
from .deterministic import Contains, Custom, Equals, StartsWith, ToolCalled
3+
from .deterministic import Contains, Equals, StartsWith, ToolCalled
44
from .evaluator import Evaluator
55
from .faithfulness_evaluator import FaithfulnessEvaluator
66
from .goal_success_rate_evaluator import GoalSuccessRateEvaluator
@@ -28,7 +28,6 @@
2828
"ConcisenessEvaluator",
2929
"CoherenceEvaluator",
3030
"Contains",
31-
"Custom",
3231
"Equals",
3332
"StartsWith",
3433
"ToolCalled",

src/strands_evals/evaluators/deterministic/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,8 @@
1-
from .output import Contains, Custom, Equals, StartsWith
1+
from .output import Contains, Equals, StartsWith
22
from .trajectory import ToolCalled
33

44
__all__ = [
55
"Contains",
6-
"Custom",
76
"Equals",
87
"StartsWith",
98
"ToolCalled",

src/strands_evals/evaluators/deterministic/output.py

Lines changed: 0 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,3 @@
1-
from collections.abc import Callable
2-
31
from typing_extensions import Any
42

53
from ...types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
@@ -82,15 +80,3 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
8280
return self.evaluate(evaluation_case)
8381

8482

85-
class Custom(Evaluator[InputT, OutputT]):
86-
"""Evaluates using a user-provided callback function."""
87-
88-
def __init__(self, fn: Callable[[EvaluationData[InputT, OutputT]], list[EvaluationOutput]]):
89-
super().__init__()
90-
self._fn = fn
91-
92-
def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
93-
return self._fn(evaluation_case)
94-
95-
async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
96-
return self.evaluate(evaluation_case)

tests/strands_evals/evaluators/deterministic/test_output.py

Lines changed: 2 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import pytest
22

3-
from strands_evals.evaluators.deterministic.output import Contains, Custom, Equals, StartsWith
4-
from strands_evals.types import EvaluationData, EvaluationOutput
3+
from strands_evals.evaluators.deterministic.output import Contains, Equals, StartsWith
4+
from strands_evals.types import EvaluationData
55

66

77
class TestEquals:
@@ -225,65 +225,3 @@ def test_to_dict(self):
225225
assert d["case_sensitive"] is False
226226

227227

228-
class TestCustom:
229-
def test_callback_pass(self):
230-
def check(case):
231-
return [EvaluationOutput(score=1.0, test_pass=True, reason="ok")]
232-
233-
evaluator = Custom(fn=check)
234-
data = EvaluationData(input="q", actual_output="anything")
235-
results = evaluator.evaluate(data)
236-
assert len(results) == 1
237-
assert results[0].test_pass is True
238-
assert results[0].score == 1.0
239-
240-
def test_callback_fail(self):
241-
def check(case):
242-
return [EvaluationOutput(score=0.0, test_pass=False, reason="bad")]
243-
244-
evaluator = Custom(fn=check)
245-
data = EvaluationData(input="q", actual_output="anything")
246-
results = evaluator.evaluate(data)
247-
assert results[0].test_pass is False
248-
249-
def test_callback_multiple_outputs(self):
250-
def check(case):
251-
return [
252-
EvaluationOutput(score=1.0, test_pass=True, reason="check 1"),
253-
EvaluationOutput(score=0.0, test_pass=False, reason="check 2"),
254-
]
255-
256-
evaluator = Custom(fn=check)
257-
data = EvaluationData(input="q", actual_output="anything")
258-
results = evaluator.evaluate(data)
259-
assert len(results) == 2
260-
assert results[0].test_pass is True
261-
assert results[1].test_pass is False
262-
263-
def test_callback_receives_full_evaluation_data(self):
264-
def check(case):
265-
has_metadata = case.metadata is not None and case.metadata.get("key") == "val"
266-
return [
267-
EvaluationOutput(score=1.0 if has_metadata else 0.0, test_pass=has_metadata, reason="metadata check")
268-
]
269-
270-
evaluator = Custom(fn=check)
271-
data = EvaluationData(input="q", actual_output="x", metadata={"key": "val"})
272-
results = evaluator.evaluate(data)
273-
assert results[0].test_pass is True
274-
275-
@pytest.mark.asyncio
276-
async def test_evaluate_async_delegates_to_evaluate(self):
277-
def check(case):
278-
return [EvaluationOutput(score=1.0, test_pass=True, reason="ok")]
279-
280-
evaluator = Custom(fn=check)
281-
data = EvaluationData(input="q", actual_output="anything")
282-
results = await evaluator.evaluate_async(data)
283-
assert results[0].test_pass is True
284-
285-
def test_to_dict_excludes_fn(self):
286-
evaluator = Custom(fn=lambda case: [])
287-
d = evaluator.to_dict()
288-
assert d["evaluator_type"] == "Custom"
289-
assert "fn" not in d

0 commit comments

Comments
 (0)