@@ -153,6 +153,25 @@ async def test_exact_match_negated(
153153 assert isinstance (result , NumericEvaluationResult )
154154 assert result .score == 0.0
155155
@pytest.mark.asyncio
async def test_exact_match_validate_and_evaluate_criteria(
    self, sample_agent_execution: AgentExecution
) -> None:
    """Run ExactMatchEvaluator through validate_and_evaluate_criteria.

    The criteria are handed over as a plain dict, and the test expects a
    numeric result whose score is 1.0.
    """
    # Case-sensitive evaluator built from a raw config mapping.
    exact_matcher = ExactMatchEvaluator.model_validate(
        {"config": {"name": "ExactMatchTest", "case_sensitive": True}}
    )
    # Presumably mirrors the output held by the sample_agent_execution
    # fixture -- confirm against the fixture definition.
    criteria = {"expected_output": {"output": "Test output"}}

    outcome = await exact_matcher.validate_and_evaluate_criteria(
        sample_agent_execution, criteria
    )

    assert isinstance(outcome, NumericEvaluationResult)
    assert outcome.score == 1.0
174+
156175
157176class TestJsonSimilarityEvaluator :
158177 """Test JsonSimilarityEvaluator.evaluate() method."""
@@ -200,6 +219,25 @@ async def test_json_similarity_partial_match(self) -> None:
200219 assert isinstance (result , NumericEvaluationResult )
201220 assert math .isclose (result .score , 0.666 , abs_tol = 1e-3 )
202221
@pytest.mark.asyncio
async def test_json_similarity_validate_and_evaluate_criteria(self) -> None:
    """Run JsonSimilarityEvaluator through validate_and_evaluate_criteria.

    The agent output and the expected output carry identical content, and
    the test expects a numeric result whose score is 1.0.
    """
    # One literal feeds both sides; each side receives its own copy.
    matching_payload = {"name": "John", "age": 30, "city": "NYC"}

    run = AgentExecution(
        agent_input={"input": "Test"},
        agent_output=dict(matching_payload),
        agent_trace=[],
    )
    json_evaluator = JsonSimilarityEvaluator.model_validate(
        {"config": {"name": "JsonSimilarityTest"}}
    )
    criteria = {"expected_output": dict(matching_payload)}

    outcome = await json_evaluator.validate_and_evaluate_criteria(run, criteria)

    assert isinstance(outcome, NumericEvaluationResult)
    assert outcome.score == 1.0
240+
203241
204242class TestToolCallOrderEvaluator :
205243 """Test ToolCallOrderEvaluator.evaluate() method."""
@@ -266,6 +304,25 @@ async def test_tool_call_order_lcs_match(
266304 assert isinstance (result , NumericEvaluationResult )
267305 assert result .score == 0.75
268306
@pytest.mark.asyncio
async def test_tool_call_order_validate_and_evaluate_criteria(
    self, sample_agent_execution_with_trace: AgentExecution
) -> None:
    """Run ToolCallOrderEvaluator through validate_and_evaluate_criteria.

    The expected order is handed over as a plain dict, and the test expects
    a numeric result whose score is 1.0.
    """
    order_evaluator = ToolCallOrderEvaluator.model_validate(
        {"config": {"name": "ToolOrderTest", "strict": True}}
    )
    # Expands to tool1, tool2, tool1, tool2 -- presumably the call sequence
    # recorded in the fixture's trace; confirm against the fixture.
    criteria = {"tool_calls_order": ["tool1", "tool2"] * 2}

    outcome = await order_evaluator.validate_and_evaluate_criteria(
        sample_agent_execution_with_trace, criteria
    )

    assert isinstance(outcome, NumericEvaluationResult)
    assert outcome.score == 1.0
325+
269326
270327class TestToolCallCountEvaluator :
271328 """Test ToolCallCountEvaluator.evaluate() method."""
@@ -346,6 +403,25 @@ async def test_tool_call_count_partial_match(
346403 assert isinstance (result , NumericEvaluationResult )
347404 assert result .score == 0.5
348405
@pytest.mark.asyncio
async def test_tool_call_count_validate_and_evaluate_criteria(
    self, sample_agent_execution_with_trace: AgentExecution
) -> None:
    """Run ToolCallCountEvaluator through validate_and_evaluate_criteria.

    The expected counts are handed over as a plain dict, and the test
    expects a numeric result whose score is 1.0.
    """
    count_evaluator = ToolCallCountEvaluator.model_validate(
        {"config": {"name": "ToolCountTest", "strict": True}}
    )
    # Each tool maps to an (operator, count) pair of ("=", 2).
    expected_counts = {tool: ("=", 2) for tool in ("tool1", "tool2")}
    criteria = {"tool_calls_count": expected_counts}

    outcome = await count_evaluator.validate_and_evaluate_criteria(
        sample_agent_execution_with_trace, criteria
    )

    assert isinstance(outcome, NumericEvaluationResult)
    assert outcome.score == 1.0
424+
349425
350426class TestToolCallArgsEvaluator :
351427 """Test ToolCallArgsEvaluator.evaluate() method."""
@@ -398,6 +474,32 @@ async def test_tool_call_args_partial_match(
398474 assert isinstance (result , NumericEvaluationResult )
399475 assert result .score == 0.75
400476
@pytest.mark.asyncio
async def test_tool_call_args_validate_and_evaluate_criteria(
    self, sample_agent_execution_with_trace: AgentExecution
) -> None:
    """Run ToolCallArgsEvaluator through validate_and_evaluate_criteria.

    The expected tool calls are handed over as plain dicts, and the test
    expects a numeric result whose score is 1.0.
    """
    args_evaluator = ToolCallArgsEvaluator.model_validate(
        {"config": {"name": "ToolArgsTest", "strict": True}}
    )

    # Two rounds of (tool1, tool2), each call carrying one argument; a fresh
    # dict is built for every entry so no two entries share state.
    call_specs = (("tool1", "arg1", "value1"), ("tool2", "arg2", "value2"))
    expected_calls = [
        {"name": tool, "args": {arg_name: arg_value}}
        for _ in range(2)
        for tool, arg_name, arg_value in call_specs
    ]
    criteria = {"tool_calls": expected_calls}

    outcome = await args_evaluator.validate_and_evaluate_criteria(
        sample_agent_execution_with_trace, criteria
    )

    assert isinstance(outcome, NumericEvaluationResult)
    assert outcome.score == 1.0
502+
401503
402504class TestLlmAsAJudgeEvaluator :
403505 """Test LlmAsAJudgeEvaluator.evaluate() method."""
@@ -447,6 +549,52 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any:
447549 assert isinstance (result , NumericEvaluationResult ), f"Result is { result } "
448550 assert result .score == 0.8 , f"Result score is { result .score } "
449551
@pytest.mark.asyncio
async def test_llm_judge_validate_and_evaluate_criteria(
    self, sample_agent_execution: AgentExecution, mocker: "MockerFixture"
) -> None:
    """Test LLM judge using validate_and_evaluate_criteria.

    ``uipath.UiPath`` is patched so no real client is constructed, and the
    stubbed chat completion returns a JSON payload with a score of 75. The
    test expects a numeric result whose score is 0.75.
    """
    # Stand-in client whose ``llm`` attribute is under the test's control.
    mock_uipath = mocker.MagicMock()
    mock_llm = mocker.MagicMock()
    mock_uipath.llm = mock_llm

    # Canned completion: a single choice whose message content is JSON text.
    mock_response = mocker.MagicMock()
    mock_response.choices = [
        mocker.MagicMock(
            message=mocker.MagicMock(
                content='{"score": 75, "justification": "Good response using raw criteria"}'
            )
        )
    ]

    # chat_completions must be awaitable, so use a real coroutine function.
    async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any:
        return mock_response

    mock_llm.chat_completions = mock_chat_completions

    # Patch the constructor so instantiating UiPath yields the stand-in.
    mocker.patch("uipath.UiPath", return_value=mock_uipath)

    config = {
        "name": "LlmJudgeTest",
        "prompt": "Rate this output: {{ActualOutput}} vs {{ExpectedOutput}}",
        "model": "gpt-4",
    }
    evaluator = LLMJudgeOutputEvaluator.model_validate({"config": config})
    raw_criteria = {"expected_output": "Expected output"}

    result = await evaluator.validate_and_evaluate_criteria(
        sample_agent_execution, raw_criteria
    )

    # The type check plus the attribute read below already fail if ``score``
    # is missing, so no separate hasattr assertion is needed.
    assert isinstance(result, NumericEvaluationResult)
    # The score is a float rescaled from the stubbed 75; compare with a
    # tolerance rather than exact equality.
    assert math.isclose(result.score, 0.75, abs_tol=1e-9), (
        f"Result score is {result.score}"
    )
597+
450598
451599class TestLlmJudgeTrajectoryEvaluator :
452600 """Test LlmJudgeTrajectoryEvaluator.evaluate() method."""
@@ -498,6 +646,52 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any:
498646 assert isinstance (result , NumericEvaluationResult )
499647 assert result .score == 0.9
500648
@pytest.mark.asyncio
async def test_llm_trajectory_validate_and_evaluate_criteria(
    self, sample_agent_execution: AgentExecution, mocker: "MockerFixture"
) -> None:
    """Test LLM trajectory judge using validate_and_evaluate_criteria.

    ``uipath.UiPath`` is patched so no real client is constructed, and the
    stubbed chat completion returns a JSON payload with a score of 85. The
    test expects a numeric result whose score is 0.85.
    """
    # Stand-in client whose ``llm`` attribute is under the test's control.
    mock_uipath = mocker.MagicMock()
    mock_llm = mocker.MagicMock()
    mock_uipath.llm = mock_llm

    # Canned completion: a single choice whose message content is JSON text.
    mock_response = mocker.MagicMock()
    mock_response.choices = [
        mocker.MagicMock(
            message=mocker.MagicMock(
                content='{"score": 85, "justification": "The agent behavior was good using raw criteria"}'
            )
        )
    ]

    # chat_completions must be awaitable, so use a real coroutine function.
    async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any:
        return mock_response

    mock_llm.chat_completions = mock_chat_completions

    # Patch the constructor so instantiating UiPath yields the stand-in.
    mocker.patch("uipath.UiPath", return_value=mock_uipath)

    config = {
        "name": "LlmTrajectoryTest",
        "prompt": "Evaluate this trajectory: {{AgentRunHistory}} vs {{ExpectedAgentBehavior}} given the following input: {{UserOrSyntheticInput}} instructions: {{SimulationInstructions}}",
        "model": "gpt-4",
    }
    evaluator = LLMJudgeTrajectoryEvaluator.model_validate({"config": config})
    raw_criteria = {"expected_agent_behavior": "Agent should respond helpfully"}

    result = await evaluator.validate_and_evaluate_criteria(
        sample_agent_execution, raw_criteria
    )

    # The type check plus the attribute read below already fail if ``score``
    # is missing, so no separate hasattr assertion is needed.
    assert isinstance(result, NumericEvaluationResult)
    # 0.85 has no exact binary representation and the score is rescaled from
    # the stubbed 85; compare with a tolerance rather than exact equality.
    assert math.isclose(result.score, 0.85, abs_tol=1e-9), (
        f"Result score is {result.score}"
    )
694+
501695
502696class TestEvaluatorErrorHandling :
503697 """Test error handling in evaluators."""
0 commit comments