48 changes: 24 additions & 24 deletions deepeval/evaluate/execute.py
@@ -1189,7 +1189,7 @@ def dfs(
         if has_task_completion or requires_trace:
             if llm_test_case is None:
                 llm_test_case = LLMTestCase(input="None")
-            llm_test_case._trace_dict = (
+            llm_test_case.trace_dict = (
                 trace_manager.create_nested_spans_dict(span)
             )
         else:
@@ -1291,7 +1291,7 @@ def dfs(
         if has_task_completion or requires_trace:
             if llm_test_case is None:
                 llm_test_case = LLMTestCase(input="None")
-            llm_test_case._trace_dict = (
+            llm_test_case.trace_dict = (
                 trace_manager.create_nested_spans_dict(
                     current_trace.root_spans[0]
                 )
@@ -1305,15 +1305,15 @@
                     TraceSpanApiStatus.ERRORED
                 )
                 if current_trace.root_spans:
-                    current_trace.root_spans[0].status = (
-                        TraceSpanStatus.ERRORED
-                    )
-                    current_trace.root_spans[0].error = (
-                        format_error_text(
-                            DeepEvalError(
-                                "Trace has metrics but no LLMTestCase (missing input/output). "
-                                "Are you sure you called `update_current_trace()`?"
-                            )
-                        )
-                    )
+                    current_trace.root_spans[
+                        0
+                    ].status = TraceSpanStatus.ERRORED
+                    current_trace.root_spans[
+                        0
+                    ].error = format_error_text(
+                        DeepEvalError(
+                            "Trace has metrics but no LLMTestCase (missing input/output). "
+                            "Are you sure you called `update_current_trace()`?"
+                        )
+                    )
         if progress and pbar_eval_id is not None:
@@ -1997,7 +1997,7 @@ async def _a_execute_span_test_case(
     if requires_trace:
         if test_case is None:
             test_case = LLMTestCase(input="None")
-        test_case._trace_dict = trace_manager.create_nested_spans_dict(span)
+        test_case.trace_dict = trace_manager.create_nested_spans_dict(span)

     for metric in metrics:
         metric.skipped = False
@@ -2096,7 +2096,7 @@ async def _a_execute_trace_test_case(
     if requires_trace:
         if test_case is None:
             test_case = LLMTestCase(input="None")
-        test_case._trace_dict = trace_manager.create_nested_spans_dict(
+        test_case.trace_dict = trace_manager.create_nested_spans_dict(
             trace.root_spans[0]
         )

@@ -2284,7 +2284,7 @@ def dfs(
         if requires_trace:
             if llm_test_case is None:
                 llm_test_case = LLMTestCase(input="None")
-            llm_test_case._trace_dict = (
+            llm_test_case.trace_dict = (
                 trace_manager.create_nested_spans_dict(span)
             )
         else:
@@ -2383,7 +2383,7 @@ def dfs(
         if requires_trace:
             if llm_test_case is None:
                 llm_test_case = LLMTestCase(input="None")
-            llm_test_case._trace_dict = (
+            llm_test_case.trace_dict = (
                 trace_manager.create_nested_spans_dict(
                     current_trace.root_spans[0]
                 )
@@ -2393,15 +2393,15 @@
             current_trace.status = TraceSpanStatus.ERRORED
             trace_api.status = TraceSpanApiStatus.ERRORED
             if current_trace.root_spans:
-                current_trace.root_spans[0].status = (
-                    TraceSpanStatus.ERRORED
-                )
-                current_trace.root_spans[0].error = (
-                    format_error_text(
-                        DeepEvalError(
-                            "Trace has metrics but no LLMTestCase (missing input/output). "
-                            "Are you sure you called `update_current_trace()`?"
-                        )
-                    )
-                )
+                current_trace.root_spans[
+                    0
+                ].status = TraceSpanStatus.ERRORED
+                current_trace.root_spans[
+                    0
+                ].error = format_error_text(
+                    DeepEvalError(
+                        "Trace has metrics but no LLMTestCase (missing input/output). "
+                        "Are you sure you called `update_current_trace()`?"
+                    )
+                )
         if progress and pbar_eval_id is not None:
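Note: the ERRORED branches above fire when a trace carries metrics but no LLMTestCase was ever attached. A minimal sketch of the calling pattern the error message points at, assuming deepeval's @observe/update_current_trace tracing API (not part of this diff; signature from memory, so treat as approximate):

    from deepeval.tracing import observe, update_current_trace

    @observe()
    def answer(question: str) -> str:
        response = "..."  # call your LLM or agent here
        # Attaching input/output gives the evaluator a real LLMTestCase,
        # so trace-level metrics no longer hit the ERRORED branch above.
        update_current_trace(input=question, output=response)
        return response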
12 changes: 6 additions & 6 deletions deepeval/metrics/plan_adherence/plan_adherence.py
@@ -179,7 +179,7 @@ async def a_measure(

     def _get_plan_adherence_score(self, task, plan, test_case):
         prompt = PlanAdherenceTemplate.evaluate_adherence(
-            task, "\n".join(plan), test_case._trace_dict
+            task, "\n".join(plan), test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -191,7 +191,7 @@ def _get_plan_adherence_score(self, task, plan, test_case):

     async def _a_get_plan_adherence_score(self, task, plan, test_case):
         prompt = PlanAdherenceTemplate.evaluate_adherence(
-            task, "\n".join(plan), test_case._trace_dict
+            task, "\n".join(plan), test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
@@ -203,7 +203,7 @@ async def _a_get_plan_adherence_score(self, task, plan, test_case):

     def _extract_plan_from_trace(self, test_case: LLMTestCase) -> AgentPlan:
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -217,7 +217,7 @@ async def _a_extract_plan_from_trace(
         self, test_case: LLMTestCase
     ) -> AgentPlan:
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
@@ -229,7 +229,7 @@ async def _a_extract_plan_from_trace(

     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -241,7 +241,7 @@ def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:

     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
8 changes: 4 additions & 4 deletions deepeval/metrics/plan_quality/plan_quality.py
@@ -205,7 +205,7 @@ async def _a_get_plan_quality_score(self, task, plan):

     def _extract_plan_from_trace(self, test_case: LLMTestCase) -> AgentPlan:
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -219,7 +219,7 @@ async def _a_extract_plan_from_trace(
         self, test_case: LLMTestCase
     ) -> AgentPlan:
         prompt = PlanAdherenceTemplate.extract_plan_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
@@ -231,7 +231,7 @@ async def _a_extract_plan_from_trace(

     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -243,7 +243,7 @@ def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:

     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
12 changes: 6 additions & 6 deletions deepeval/metrics/step_efficiency/step_efficiency.py
@@ -156,9 +156,9 @@ async def a_measure(
     def _get_score(
         self, task: str, test_case: LLMTestCase
     ) -> EfficiencyVerdict:
-        if test_case._trace_dict is not None:
+        if test_case.trace_dict is not None:
             prompt = StepEfficiencyTemplate.get_execution_efficiency(
-                task, test_case._trace_dict
+                task, test_case.trace_dict
             )

             return generate_with_schema_and_extract(
@@ -172,9 +172,9 @@ def _get_score(
     async def _a_get_score(
         self, task: str, test_case: LLMTestCase
     ) -> EfficiencyVerdict:
-        if test_case._trace_dict is not None:
+        if test_case.trace_dict is not None:
             prompt = StepEfficiencyTemplate.get_execution_efficiency(
-                task, test_case._trace_dict
+                task, test_case.trace_dict
             )

             return await a_generate_with_schema_and_extract(
@@ -187,7 +187,7 @@ async def _a_get_score(

     def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return generate_with_schema_and_extract(
             metric=self,
@@ -199,7 +199,7 @@ def _extract_task_from_trace(self, test_case: LLMTestCase) -> str:

     async def _a_extract_task_from_trace(self, test_case: LLMTestCase) -> str:
         prompt = StepEfficiencyTemplate.extract_task_from_trace(
-            test_case._trace_dict
+            test_case.trace_dict
         )
         return await a_generate_with_schema_and_extract(
             metric=self,
8 changes: 4 additions & 4 deletions deepeval/metrics/task_completion/task_completion.py
@@ -187,10 +187,10 @@ async def _a_extract_task_and_outcome(
         self,
         test_case: LLMTestCase,
     ) -> Tuple:
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
+        has_trace: bool = isinstance(test_case.trace_dict, Dict)
         if has_trace:
             prompt = TaskCompletionTemplate.extract_task_and_outcome_from_trace(
-                trace=test_case._trace_dict
+                trace=test_case.trace_dict
             )
         else:
             # TODO: Deprecate this soon
@@ -211,10 +211,10 @@ def _extract_task_and_outcome(
         self,
         test_case: LLMTestCase,
     ) -> Tuple:
-        has_trace: bool = isinstance(test_case._trace_dict, Dict)
+        has_trace: bool = isinstance(test_case.trace_dict, Dict)
         if has_trace:
             prompt = TaskCompletionTemplate.extract_task_and_outcome_from_trace(
-                trace=test_case._trace_dict
+                trace=test_case.trace_dict
             )
         else:
             # TODO: Deprecate this soon
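With trace_dict public, the trace-aware branch above can be exercised post hoc without touching a private attribute. A minimal sketch; the payload shape is a hypothetical stand-in for what trace_manager.create_nested_spans_dict produces:

    from deepeval.metrics import TaskCompletionMetric
    from deepeval.test_case import LLMTestCase

    # Hypothetical nested-span payload; the real shape comes from
    # trace_manager.create_nested_spans_dict(...).
    trace = {
        "name": "booking_agent",
        "input": "Book a table for two at 7pm",
        "output": "Booked Luigi's for two at 19:00.",
        "children": [],
    }

    test_case = LLMTestCase(
        input="Book a table for two at 7pm",
        actual_output="Booked Luigi's for two at 19:00.",
    )
    test_case.trace_dict = trace  # plain attribute assignment now works

    metric = TaskCompletionMetric()
    metric.measure(test_case)
    print(metric.score, metric.reason)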
17 changes: 13 additions & 4 deletions deepeval/test_case/llm_test_case.py
@@ -26,9 +26,9 @@
     validate_mcp_servers,
 )

-_MLLM_IMAGE_REGISTRY: weakref.WeakValueDictionary[str, "MLLMImage"] = (
-    weakref.WeakValueDictionary()
-)
+_MLLM_IMAGE_REGISTRY: weakref.WeakValueDictionary[
+    str, "MLLMImage"
+] = weakref.WeakValueDictionary()


 @dataclass
@@ -374,7 +374,16 @@ class LLMTestCase(BaseModel):
             "customColumnKeyValues", "custom_column_key_values"
         ),
     )
-    _trace_dict: Optional[Dict] = PrivateAttr(default=None)
+    trace_dict: Optional[Dict] = Field(
+        default=None,
+        description=(
+            "Nested span tree from tracing (same shape as TraceManager."
+            "create_nested_spans_dict). Pass a pre-recorded trace for "
+            "post-hoc evaluation with metrics that require_trace=True."
+        ),
+        serialization_alias="traceDict",
+        validation_alias=AliasChoices("traceDict", "trace_dict"),
+    )
     _dataset_rank: Optional[int] = PrivateAttr(default=None)
     _dataset_alias: Optional[str] = PrivateAttr(default=None)
     _dataset_id: Optional[str] = PrivateAttr(default=None)
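Because the new field declares both a serialization alias and validation aliases, it round-trips through the same camelCase JSON convention as the other columns. A short sketch assuming pydantic v2 semantics:

    from deepeval.test_case import LLMTestCase

    tc = LLMTestCase(input="hi", trace_dict={"name": "root", "children": []})

    # serialization_alias="traceDict" -> camelCase key on dump
    payload = tc.model_dump(by_alias=True)
    assert "traceDict" in payload

    # AliasChoices("traceDict", "trace_dict") -> either key validates
    restored = LLMTestCase.model_validate(
        {"input": "hi", "traceDict": payload["traceDict"]}
    )
    assert restored.trace_dict == tc.trace_dict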