
Commit ab93593

add tests for evaluating using raw criteria, putting the evaluator in charge of the validation

1 parent 2a6ff55 commit ab93593

2 files changed

Lines changed: 195 additions & 1 deletion

src/uipath/eval/coded_evaluators/base_evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -255,7 +255,7 @@ def _canonical_json(self, obj: Any) -> str:
             ensure_ascii=False,
         )

-    async def evaluate_from_raw_criteria(
+    async def validate_and_evaluate_criteria(
         self, agent_execution: AgentExecution, evaluation_criteria: Any
     ) -> EvaluationResult:
         """Evaluate the given data and return a result from a raw evaluation criteria."""

tests/evaluators/test_evaluator_methods.py

Lines changed: 194 additions & 0 deletions
@@ -153,6 +153,25 @@ async def test_exact_match_negated(
         assert isinstance(result, NumericEvaluationResult)
         assert result.score == 0.0

+    @pytest.mark.asyncio
+    async def test_exact_match_validate_and_evaluate_criteria(
+        self, sample_agent_execution: AgentExecution
+    ) -> None:
+        """Test exact match using validate_and_evaluate_criteria."""
+        config = {
+            "name": "ExactMatchTest",
+            "case_sensitive": True,
+        }
+        evaluator = ExactMatchEvaluator.model_validate({"config": config})
+        raw_criteria = {"expected_output": {"output": "Test output"}}
+
+        result = await evaluator.validate_and_evaluate_criteria(
+            sample_agent_execution, raw_criteria
+        )
+
+        assert isinstance(result, NumericEvaluationResult)
+        assert result.score == 1.0
+

 class TestJsonSimilarityEvaluator:
     """Test JsonSimilarityEvaluator.evaluate() method."""
@@ -200,6 +219,25 @@ async def test_json_similarity_partial_match(self) -> None:
         assert isinstance(result, NumericEvaluationResult)
         assert math.isclose(result.score, 0.666, abs_tol=1e-3)

+    @pytest.mark.asyncio
+    async def test_json_similarity_validate_and_evaluate_criteria(self) -> None:
+        """Test JSON similarity using validate_and_evaluate_criteria."""
+        execution = AgentExecution(
+            agent_input={"input": "Test"},
+            agent_output={"name": "John", "age": 30, "city": "NYC"},
+            agent_trace=[],
+        )
+        config = {
+            "name": "JsonSimilarityTest",
+        }
+        evaluator = JsonSimilarityEvaluator.model_validate({"config": config})
+        raw_criteria = {"expected_output": {"name": "John", "age": 30, "city": "NYC"}}
+
+        result = await evaluator.validate_and_evaluate_criteria(execution, raw_criteria)
+
+        assert isinstance(result, NumericEvaluationResult)
+        assert result.score == 1.0
+

 class TestToolCallOrderEvaluator:
     """Test ToolCallOrderEvaluator.evaluate() method."""
@@ -266,6 +304,25 @@ async def test_tool_call_order_lcs_match(
         assert isinstance(result, NumericEvaluationResult)
         assert result.score == 0.75

+    @pytest.mark.asyncio
+    async def test_tool_call_order_validate_and_evaluate_criteria(
+        self, sample_agent_execution_with_trace: AgentExecution
+    ) -> None:
+        """Test tool call order using validate_and_evaluate_criteria."""
+        config = {
+            "name": "ToolOrderTest",
+            "strict": True,
+        }
+        evaluator = ToolCallOrderEvaluator.model_validate({"config": config})
+        raw_criteria = {"tool_calls_order": ["tool1", "tool2", "tool1", "tool2"]}
+
+        result = await evaluator.validate_and_evaluate_criteria(
+            sample_agent_execution_with_trace, raw_criteria
+        )
+
+        assert isinstance(result, NumericEvaluationResult)
+        assert result.score == 1.0
+

 class TestToolCallCountEvaluator:
     """Test ToolCallCountEvaluator.evaluate() method."""
@@ -346,6 +403,25 @@ async def test_tool_call_count_partial_match(
         assert isinstance(result, NumericEvaluationResult)
         assert result.score == 0.5

+    @pytest.mark.asyncio
+    async def test_tool_call_count_validate_and_evaluate_criteria(
+        self, sample_agent_execution_with_trace: AgentExecution
+    ) -> None:
+        """Test tool call count using validate_and_evaluate_criteria."""
+        config = {
+            "name": "ToolCountTest",
+            "strict": True,
+        }
+        evaluator = ToolCallCountEvaluator.model_validate({"config": config})
+        raw_criteria = {"tool_calls_count": {"tool1": ("=", 2), "tool2": ("=", 2)}}
+
+        result = await evaluator.validate_and_evaluate_criteria(
+            sample_agent_execution_with_trace, raw_criteria
+        )
+
+        assert isinstance(result, NumericEvaluationResult)
+        assert result.score == 1.0
+

 class TestToolCallArgsEvaluator:
     """Test ToolCallArgsEvaluator.evaluate() method."""
@@ -398,6 +474,32 @@ async def test_tool_call_args_partial_match(
         assert isinstance(result, NumericEvaluationResult)
         assert result.score == 0.75

+    @pytest.mark.asyncio
+    async def test_tool_call_args_validate_and_evaluate_criteria(
+        self, sample_agent_execution_with_trace: AgentExecution
+    ) -> None:
+        """Test tool call args using validate_and_evaluate_criteria."""
+        config = {
+            "name": "ToolArgsTest",
+            "strict": True,
+        }
+        evaluator = ToolCallArgsEvaluator.model_validate({"config": config})
+        raw_criteria = {
+            "tool_calls": [
+                {"name": "tool1", "args": {"arg1": "value1"}},
+                {"name": "tool2", "args": {"arg2": "value2"}},
+                {"name": "tool1", "args": {"arg1": "value1"}},
+                {"name": "tool2", "args": {"arg2": "value2"}},
+            ]
+        }
+
+        result = await evaluator.validate_and_evaluate_criteria(
+            sample_agent_execution_with_trace, raw_criteria
+        )
+
+        assert isinstance(result, NumericEvaluationResult)
+        assert result.score == 1.0
+

 class TestLlmAsAJudgeEvaluator:
     """Test LlmAsAJudgeEvaluator.evaluate() method."""
@@ -447,6 +549,52 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any:
         assert isinstance(result, NumericEvaluationResult), f"Result is {result}"
         assert result.score == 0.8, f"Result score is {result.score}"

+    @pytest.mark.asyncio
+    async def test_llm_judge_validate_and_evaluate_criteria(
+        self, sample_agent_execution: AgentExecution, mocker: "MockerFixture"
+    ) -> None:
+        """Test LLM judge using validate_and_evaluate_criteria."""
+        # Mock the UiPath constructor to avoid authentication
+        mock_uipath = mocker.MagicMock()
+        mock_llm = mocker.MagicMock()
+        mock_uipath.llm = mock_llm
+
+        # Mock the chat completions response as an async method
+        mock_response = mocker.MagicMock()
+        mock_response.choices = [
+            mocker.MagicMock(
+                message=mocker.MagicMock(
+                    content='{"score": 75, "justification": "Good response using raw criteria"}'
+                )
+            )
+        ]
+
+        # Make chat_completions an async method
+        async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any:
+            return mock_response
+
+        mock_llm.chat_completions = mock_chat_completions
+
+        # Mock the UiPath import and constructor
+        mocker.patch("uipath.UiPath", return_value=mock_uipath)
+
+        config = {
+            "name": "LlmJudgeTest",
+            "prompt": "Rate this output: {{ActualOutput}} vs {{ExpectedOutput}}",
+            "model": "gpt-4",
+        }
+        evaluator = LLMJudgeOutputEvaluator.model_validate({"config": config})
+        raw_criteria = {"expected_output": "Expected output"}
+
+        result = await evaluator.validate_and_evaluate_criteria(
+            sample_agent_execution, raw_criteria
+        )
+
+        # Verify the result
+        assert hasattr(result, "score")
+        assert isinstance(result, NumericEvaluationResult)
+        assert result.score == 0.75
+

 class TestLlmJudgeTrajectoryEvaluator:
     """Test LlmJudgeTrajectoryEvaluator.evaluate() method."""
@@ -498,6 +646,52 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any:
         assert isinstance(result, NumericEvaluationResult)
         assert result.score == 0.9

+    @pytest.mark.asyncio
+    async def test_llm_trajectory_validate_and_evaluate_criteria(
+        self, sample_agent_execution: AgentExecution, mocker: "MockerFixture"
+    ) -> None:
+        """Test LLM trajectory judge using validate_and_evaluate_criteria."""
+        # Mock the UiPath constructor to avoid authentication
+        mock_uipath = mocker.MagicMock()
+        mock_llm = mocker.MagicMock()
+        mock_uipath.llm = mock_llm
+
+        # Mock the chat completions response as an async method
+        mock_response = mocker.MagicMock()
+        mock_response.choices = [
+            mocker.MagicMock(
+                message=mocker.MagicMock(
+                    content='{"score": 85, "justification": "The agent behavior was good using raw criteria"}'
+                )
+            )
+        ]
+
+        # Make chat_completions an async method
+        async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any:
+            return mock_response
+
+        mock_llm.chat_completions = mock_chat_completions
+
+        # Mock the UiPath import and constructor
+        mocker.patch("uipath.UiPath", return_value=mock_uipath)
+
+        config = {
+            "name": "LlmTrajectoryTest",
+            "prompt": "Evaluate this trajectory: {{AgentRunHistory}} vs {{ExpectedAgentBehavior}} given the following input: {{UserOrSyntheticInput}} instructions: {{SimulationInstructions}}",
+            "model": "gpt-4",
+        }
+        evaluator = LLMJudgeTrajectoryEvaluator.model_validate({"config": config})
+        raw_criteria = {"expected_agent_behavior": "Agent should respond helpfully"}
+
+        result = await evaluator.validate_and_evaluate_criteria(
+            sample_agent_execution, raw_criteria
+        )
+
+        # Verify the result
+        assert hasattr(result, "score")
+        assert isinstance(result, NumericEvaluationResult)
+        assert result.score == 0.85
+

 class TestEvaluatorErrorHandling:
     """Test error handling in evaluators."""
