diff --git a/deepeval/evaluate/execute.py b/deepeval/evaluate/execute.py
index ab850e9048..9d6b59d993 100644
--- a/deepeval/evaluate/execute.py
+++ b/deepeval/evaluate/execute.py
@@ -1189,7 +1189,7 @@ def dfs(
                     if has_task_completion or requires_trace:
                         if llm_test_case is None:
                             llm_test_case = LLMTestCase(input="None")
-                        llm_test_case._trace_dict = (
+                        llm_test_case.trace_dict = (
                             trace_manager.create_nested_spans_dict(span)
                         )
                     else:
@@ -1291,7 +1291,7 @@ def dfs(
                     if has_task_completion or requires_trace:
                         if llm_test_case is None:
                             llm_test_case = LLMTestCase(input="None")
-                        llm_test_case._trace_dict = (
+                        llm_test_case.trace_dict = (
                             trace_manager.create_nested_spans_dict(
                                 current_trace.root_spans[0]
                             )
@@ -1305,15 +1305,15 @@
                             TraceSpanApiStatus.ERRORED
                         )
                         if current_trace.root_spans:
-                            current_trace.root_spans[0].status = (
-                                TraceSpanStatus.ERRORED
-                            )
-                            current_trace.root_spans[0].error = (
-                                format_error_text(
-                                    DeepEvalError(
-                                        "Trace has metrics but no LLMTestCase (missing input/output). "
-                                        "Are you sure you called `update_current_trace()`?"
-                                    )
+                            current_trace.root_spans[
+                                0
+                            ].status = TraceSpanStatus.ERRORED
+                            current_trace.root_spans[
+                                0
+                            ].error = format_error_text(
+                                DeepEvalError(
+                                    "Trace has metrics but no LLMTestCase (missing input/output). "
+                                    "Are you sure you called `update_current_trace()`?"
                                 )
                             )
                         if progress and pbar_eval_id is not None:
@@ -1997,7 +1997,7 @@ async def _a_execute_span_test_case(
     if requires_trace:
         if test_case is None:
             test_case = LLMTestCase(input="None")
-        test_case._trace_dict = trace_manager.create_nested_spans_dict(span)
+        test_case.trace_dict = trace_manager.create_nested_spans_dict(span)
 
     for metric in metrics:
         metric.skipped = False
@@ -2096,7 +2096,7 @@ async def _a_execute_trace_test_case(
     if requires_trace:
         if test_case is None:
             test_case = LLMTestCase(input="None")
-        test_case._trace_dict = trace_manager.create_nested_spans_dict(
+        test_case.trace_dict = trace_manager.create_nested_spans_dict(
            trace.root_spans[0]
        )
 
@@ -2284,7 +2284,7 @@ def dfs(
                     if requires_trace:
                         if llm_test_case is None:
                             llm_test_case = LLMTestCase(input="None")
-                        llm_test_case._trace_dict = (
+                        llm_test_case.trace_dict = (
                             trace_manager.create_nested_spans_dict(span)
                         )
                     else:
@@ -2383,7 +2383,7 @@ def dfs(
                     if requires_trace:
                         if llm_test_case is None:
                             llm_test_case = LLMTestCase(input="None")
-                        llm_test_case._trace_dict = (
+                        llm_test_case.trace_dict = (
                             trace_manager.create_nested_spans_dict(
                                 current_trace.root_spans[0]
                             )
@@ -2393,15 +2393,15 @@
                         current_trace.status = TraceSpanStatus.ERRORED
                         trace_api.status = TraceSpanApiStatus.ERRORED
                         if current_trace.root_spans:
-                            current_trace.root_spans[0].status = (
-                                TraceSpanStatus.ERRORED
-                            )
-                            current_trace.root_spans[0].error = (
-                                format_error_text(
-                                    DeepEvalError(
-                                        "Trace has metrics but no LLMTestCase (missing input/output). "
-                                        "Are you sure you called `update_current_trace()`?"
-                                    )
+                            current_trace.root_spans[
+                                0
+                            ].status = TraceSpanStatus.ERRORED
+                            current_trace.root_spans[
+                                0
+                            ].error = format_error_text(
+                                DeepEvalError(
+                                    "Trace has metrics but no LLMTestCase (missing input/output). "
+                                    "Are you sure you called `update_current_trace()`?"
                                 )
                             )
                         if progress and pbar_eval_id is not None:
diff --git a/deepeval/metrics/plan_adherence/plan_adherence.py b/deepeval/metrics/plan_adherence/plan_adherence.py
index 8e46a28710..0e4b73c23f 100644
--- a/deepeval/metrics/plan_adherence/plan_adherence.py
+++ b/deepeval/metrics/plan_adherence/plan_adherence.py
@@ -179,7 +179,7 @@ async def a_measure(
 
     def _get_plan_adherence_score(self, task, plan, test_case):
         prompt = PlanAdherenceTemplate.evaluate_adherence(
-            task, "\n".join(plan), test_case._trace_dict
+            task, "\n".join(plan), test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -191,7 +191,7 @@ def _get_plan_adherence_score(self, task, plan, test_case):
 
     async def _a_get_plan_adherence_score(self, task, plan, test_case):
         prompt = PlanAdherenceTemplate.evaluate_adherence(
-            task, "\n".join(plan), test_case._trace_dict
+            task, "\n".join(plan), test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
@@ -203,7 +203,7 @@ async def _a_get_plan_adherence_score(self, task, plan, test_case):
 
     def _extract_plan_from_trace(self, test_case: LLMTestCase) -> AgentPlan:
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -217,7 +217,7 @@ async def _a_extract_plan_from_trace(
         self, test_case: LLMTestCase
     ) -> AgentPlan:
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
@@ -229,7 +229,7 @@ async def _a_extract_plan_from_trace(
 
     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -241,7 +241,7 @@ def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
 
     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
diff --git a/deepeval/metrics/plan_quality/plan_quality.py b/deepeval/metrics/plan_quality/plan_quality.py
index 014a7a5efc..442f3a04ef 100644
--- a/deepeval/metrics/plan_quality/plan_quality.py
+++ b/deepeval/metrics/plan_quality/plan_quality.py
@@ -205,7 +205,7 @@ async def _a_get_plan_quality_score(self, task, plan):
 
     def _extract_plan_from_trace(self, test_case: LLMTestCase) -> AgentPlan:
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -219,7 +219,7 @@ async def _a_extract_plan_from_trace(
         self, test_case: LLMTestCase
     ) -> AgentPlan:
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
@@ -231,7 +231,7 @@ async def _a_extract_plan_from_trace(
 
     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -243,7 +243,7 @@ def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
 
     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
diff --git a/deepeval/metrics/step_efficiency/step_efficiency.py b/deepeval/metrics/step_efficiency/step_efficiency.py
index e56f89b3fa..58dc104ce9 100644
--- a/deepeval/metrics/step_efficiency/step_efficiency.py
+++ b/deepeval/metrics/step_efficiency/step_efficiency.py
@@ -156,9 +156,9 @@ async def a_measure(
     def _get_score(
         self, task: str, test_case: LLMTestCase
     ) -> EfficiencyVerdict:
-        if test_case._trace_dict is not None:
+        if test_case.trace_dict is not None:
             prompt = StepEfficiencyTemplate.get_execution_efficiency(
-                task, test_case._trace_dict
+                task, test_case.trace_dict
             )
 
         return generate_with_schema_and_extract(
@@ -172,9 +172,9 @@ def _get_score(
     async def _a_get_score(
         self, task: str, test_case: LLMTestCase
     ) -> EfficiencyVerdict:
-        if test_case._trace_dict is not None:
+        if test_case.trace_dict is not None:
             prompt = StepEfficiencyTemplate.get_execution_efficiency(
-                task, test_case._trace_dict
+                task, test_case.trace_dict
             )
 
         return await a_generate_with_schema_and_extract(
@@ -187,7 +187,7 @@ async def _a_get_score(
 
     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -199,7 +199,7 @@ def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
 
     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
            metric=self,
diff --git a/deepeval/metrics/task_completion/task_completion.py b/deepeval/metrics/task_completion/task_completion.py
index 3ea982256c..3c3c565b49 100644
--- a/deepeval/metrics/task_completion/task_completion.py
+++ b/deepeval/metrics/task_completion/task_completion.py
@@ -187,10 +187,10 @@ async def _a_extract_task_and_outcome(
         self,
         test_case: LLMTestCase,
     ) -> Tuple:
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
+        has_trace: bool = isinstance(test_case.trace_dict, Dict)
         if has_trace:
             prompt = TaskCompletionTemplate.extract_task_and_outcome_from_trace(
-                trace=test_case._trace_dict
+                trace=test_case.trace_dict
             )
         else:
             # TODO: Deprecate this soon
@@ -211,10 +211,10 @@ def _extract_task_and_outcome(
         self,
         test_case: LLMTestCase,
     ) -> Tuple:
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
+        has_trace: bool = isinstance(test_case.trace_dict, Dict)
         if has_trace:
             prompt = TaskCompletionTemplate.extract_task_and_outcome_from_trace(
-                trace=test_case._trace_dict
+                trace=test_case.trace_dict
             )
         else:
             # TODO: Deprecate this soon
diff --git a/deepeval/test_case/llm_test_case.py b/deepeval/test_case/llm_test_case.py
index 44ffc72a93..84e386c590 100644
--- a/deepeval/test_case/llm_test_case.py
+++ b/deepeval/test_case/llm_test_case.py
@@ -26,9 +26,9 @@
     validate_mcp_servers,
 )
 
-_MLLM_IMAGE_REGISTRY: weakref.WeakValueDictionary[str, "MLLMImage"] = (
-    weakref.WeakValueDictionary()
-)
+_MLLM_IMAGE_REGISTRY: weakref.WeakValueDictionary[
+    str, "MLLMImage"
+] = weakref.WeakValueDictionary()
 
 
 @dataclass
@@ -374,7 +374,16 @@ class LLMTestCase(BaseModel):
             "customColumnKeyValues", "custom_column_key_values"
         ),
     )
-    _trace_dict: Optional[Dict] = PrivateAttr(default=None)
+    trace_dict: Optional[Dict] = Field(
+        default=None,
+        description=(
+            "Nested span tree from tracing (same shape as TraceManager."
+            "create_nested_spans_dict). Pass a pre-recorded trace for "
+            "post-hoc evaluation with metrics that require_trace=True."
+        ),
+        serialization_alias="traceDict",
+        validation_alias=AliasChoices("traceDict", "trace_dict"),
+    )
     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)
diff --git a/examples/tracing/test_posthoc_evaluation.py b/examples/tracing/test_posthoc_evaluation.py
new file mode 100644
index 0000000000..7268ad4cf6
--- /dev/null
+++ b/examples/tracing/test_posthoc_evaluation.py
@@ -0,0 +1,116 @@
+"""
+Post-hoc evaluation using LLMTestCase.trace_dict
+-------------------------------------------------
+DeepEval's agentic trace metrics (TaskCompletionMetric, StepEfficiencyMetric,
+PlanQualityMetric, PlanAdherenceMetric) work with two approaches:
+
+APPROACH 1 — @observe at runtime (standard DeepEval way)
+    Instrument functions with @observe before they run. DeepEval captures the
+    trace automatically and writes it into LLMTestCase.trace_dict after the call.
+
+APPROACH 2 — post-hoc via trace_dict (this file)
+    Your agent has *already* run and you have a saved trace (from a log file,
+    database, observability system, or a previous @observe run that was
+    serialised). Pass it directly as trace_dict= when constructing LLMTestCase.
+    No @observe, no live agent execution required.
+
+This is useful for:
+    - Agents you don't own or can't re-run (3rd-party pipelines)
+    - Offline / batch evaluation from logs
+    - CI workflows that replay saved traces
+    - Post-mortem analysis of production runs
+
+The trace dict shape must match what TraceManager.create_nested_spans_dict()
+produces:
+
+    {
+        "name": str,
+        "type": "agent" | "tool" | "llm" | "retriever" | "custom",
+        "input": {...},
+        "output": {...},
+        "children": [ ]
+    }
+
+Run:
+    python examples/tracing/test_posthoc_evaluation.py
+"""
+
+import json
+
+from deepeval import evaluate
+from deepeval.metrics import TaskCompletionMetric
+from deepeval.test_case import LLMTestCase
+
+
+# ---------------------------------------------------------------------------
+# 1. A saved trace (in practice you'd load this from a JSON file / DB / etc.)
+# ---------------------------------------------------------------------------
+def load_saved_trace() -> dict:
+    """
+    Simulates loading a pre-recorded agent trace.
+    In real usage: json.load(open("traces/run_2024-01-15.json"))
+    """
+    return {
+        "name": "trip_planner_agent",
+        "type": "agent",
+        "input": {"input": "Plan a 2-day trip to Paris with restaurants."},
+        "output": {
+            "output": [
+                "Eiffel Tower",
+                "Louvre Museum",
+                "Le Jules Verne",
+                "Angelina Paris",
+                "Septime",
+            ]
+        },
+        "children": [
+            {
+                "name": "itinerary_generator",
+                "type": "tool",
+                "input": {
+                    "inputParameters": {"destination": "Paris", "days": 2}
+                },
+                "output": {"output": ["Eiffel Tower", "Louvre Museum"]},
+                "children": [],
+            },
+            {
+                "name": "restaurant_finder",
+                "type": "tool",
+                "input": {"inputParameters": {"city": "Paris"}},
+                "output": {
+                    "output": ["Le Jules Verne", "Angelina Paris", "Septime"]
+                },
+                "children": [],
+            },
+        ],
+    }
+
+
+# ---------------------------------------------------------------------------
+# 2. Build LLMTestCase with the saved trace — no @observe needed
+# ---------------------------------------------------------------------------
+saved_trace = load_saved_trace()
+
+test_case = LLMTestCase(
+    input="Plan a 2-day trip to Paris with restaurants.",
+    actual_output=(
+        "Eiffel Tower, Louvre Museum, Le Jules Verne, Angelina Paris, Septime"
+    ),
+    trace_dict=saved_trace,  # post-hoc trace injection
+)
+
+# trace_dict is also accepted via its camelCase alias for JSON round-trips:
+#   LLMTestCase.model_validate({"input": "...", "traceDict": saved_trace})
+assert test_case.trace_dict == saved_trace
+
+# ---------------------------------------------------------------------------
+# 3. Evaluate — TaskCompletionMetric detects trace_dict and uses the
+#    trace-aware prompt path (same as it would with @observe at runtime)
+# ---------------------------------------------------------------------------
+task_completion = TaskCompletionMetric(threshold=0.5)
+
+print("Running post-hoc evaluation with saved trace …")
+evaluate([test_case], [task_completion])
+
+print(f"\nScore : {task_completion.score}")
+print(f"Reason: {task_completion.reason}")
diff --git a/tests/test_core/test_test_case/test_single_turn.py b/tests/test_core/test_test_case/test_single_turn.py
index b61878b119..aca8bb6684 100644
--- a/tests/test_core/test_test_case/test_single_turn.py
+++ b/tests/test_core/test_test_case/test_single_turn.py
@@ -13,7 +13,6 @@
 
 
 class TestLLMTestCaseInitialization:
-
     def test_minimal_initialization(self):
         test_case = LLMTestCase(input="What is the capital of France?")
 
@@ -35,13 +34,36 @@ def test_minimal_initialization(self):
         assert test_case.mcp_resources_called is None
         assert test_case.mcp_prompts_called is None
 
-        # Test private attributes have defaults
-        assert test_case._trace_dict is None
+        # trace_dict is public (optional nested span tree for post-hoc eval)
+        assert test_case.trace_dict is None
+        # Other private attributes have defaults
         assert test_case._dataset_rank is None
         assert test_case._dataset_alias is None
         assert test_case._dataset_id is None
         assert isinstance(test_case._identifier, str)
 
+    def test_trace_dict_constructor_and_alias(self):
+        nested = {
+            "name": "root_agent",
+            "type": "agent",
+            "input": {"input": "Book a flight to NYC"},
+            "output": {"summary": "done"},
+            "children": [],
+        }
+        tc = LLMTestCase(
+            input="Book a flight to NYC",
+            actual_output="Flight options listed.",
+            trace_dict=nested,
+        )
+        assert tc.trace_dict == nested
+        tc2 = LLMTestCase.model_validate(
+            {
+                "input": "hi",
+                "traceDict": {"name": "a", "type": "agent", "children": []},
+            }
+        )
+        assert tc2.trace_dict["name"] == "a"
+
     def test_full_initialization(self):
         tool_call = ToolCall(
             name="search_tool",
@@ -96,7 +118,6 @@ def test_full_initialization(self):
 
 
 class TestLLMTestCaseCamelCaseInitialization:
-
     def test_camelcase_field_initialization(self):
         input_text = "What is artificial intelligence?"
         actual_output_text = "AI is a branch of computer science..."
@@ -235,7 +256,6 @@ def test_tool_call_camelcase_initialization(self):
 
 
 class TestLLMTestCaseTypeValidation:
-
     def test_input_must_be_string(self):
         with pytest.raises(TypeError, match="'input' must be a string"):
             LLMTestCase(input=123)
@@ -477,7 +497,6 @@ def test_tool_call_repr_minimal(self):
 
 
 class TestEdgeCases:
-
     def test_empty_strings(self):
         test_case = LLMTestCase(
             input="", actual_output="", expected_output="", comments=""
@@ -580,7 +599,6 @@ def test_deeply_nested_structures(self):
 
 
 class TestSerialization:
-
     def test_serialization_aliases(self):
         test_case = LLMTestCase(
             input="test",
@@ -655,7 +673,8 @@ def test_private_attributes_not_in_model_dump(self):
 
         model_dict = test_case.model_dump()
 
-        assert "_trace_dict" not in model_dict
+        assert "trace_dict" in model_dict
+        assert model_dict["trace_dict"] is None
         assert "_dataset_rank" not in model_dict
         assert "_dataset_alias" not in model_dict
         assert "_dataset_id" not in model_dict
@@ -664,18 +683,18 @@
     def test_private_attributes_accessible(self):
         test_case = LLMTestCase(input="test")
 
-        assert test_case._trace_dict is None
+        assert test_case.trace_dict is None
         assert test_case._dataset_rank is None
         assert test_case._dataset_alias is None
         assert test_case._dataset_id is None
         assert isinstance(test_case._identifier, str)
 
-        test_case._trace_dict = {"key": "value"}
+        test_case.trace_dict = {"key": "value"}
        test_case._dataset_rank = 1
        test_case._dataset_alias = "test_alias"
        test_case._dataset_id = "test_id"
 
-        assert test_case._trace_dict == {"key": "value"}
+        assert test_case.trace_dict == {"key": "value"}
        assert test_case._dataset_rank == 1
        assert test_case._dataset_alias == "test_alias"
        assert test_case._dataset_id == "test_id"