Skip to content

Commit d5f3e92

Browse files
PSU3
authored and committed
Merge branch 'main' into fix/preserve-local-model-names-in-parse_model_name
2 parents 7082066 + 1a5f92e commit d5f3e92

17 files changed

Lines changed: 474 additions & 382 deletions

File tree

CITATION.cff

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ authors:
66
- family-names: Vongthongsri
77
given-names: Kritin
88
title: deepeval
9-
version: 3.0.2
9+
version: 3.0.3
1010
date-released: "2025-05-28"
1111
url: https://confident-ai.com
1212
repository-code: https://github.com/confident-ai/deepeval

deepeval/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__: str = "3.0.2"
1+
__version__: str = "3.0.3"

deepeval/benchmarks/gsm8k/gsm8k.py

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Optional, Dict
1+
from typing import List, Optional, Dict, Union
22
from tqdm import tqdm
33

44
from deepeval.dataset import Golden
@@ -52,7 +52,10 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict:
5252
for idx, golden in enumerate(
5353
tqdm(goldens, desc=f"Processing {self.n_problems} problems")
5454
):
55-
prediction, score = self.predict(model, golden).values()
55+
result = self.predict(model, golden)
56+
prediction = result["prediction"]
57+
score = result["score"]
58+
5659
if score:
5760
overall_correct_predictions += 1
5861
predictions_row.append(
@@ -94,14 +97,17 @@ def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
9497
)
9598

9699
# Enforced model generation
100+
prediction = None
97101
try:
98102
res: NumberSchema = model.generate(
99103
prompt=prompt, schema=NumberSchema
100104
)
101-
prediction = str(res.answer)
102-
except TypeError:
105+
prediction = self._extract_prediction_from_response(res)
106+
except (TypeError, AttributeError) as e:
107+
103108
prompt += f"\n\n{self.confinement_instructions}"
104-
prediction = model.generate(prompt)
109+
res = model.generate(prompt)
110+
prediction = self._extract_prediction_from_response(res)
105111

106112
# For native models, shouldn't happen but just in case
107113
if isinstance(prediction, tuple):
@@ -114,6 +120,29 @@ def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
114120

115121
return {"prediction": prediction, "score": score}
116122

123+
def _extract_prediction_from_response(self, res) -> str:
124+
"""
125+
Extract prediction from model response, handling various response types.
126+
"""
127+
# Case 1: Response has .answer attribute (NumberSchema case)
128+
if hasattr(res, "answer"):
129+
return str(res.answer)
130+
131+
# Case 2: Response is a tuple
132+
elif isinstance(res, tuple):
133+
return self._extract_from_tuple(res)
134+
135+
else:
136+
return str(res)
137+
138+
def _extract_from_tuple(self, res: tuple) -> str:
139+
"""Extract prediction from tuple response."""
140+
if len(res) == 0:
141+
return ""
142+
first_elem = res[0]
143+
if hasattr(first_elem, "answer"):
144+
return str(first_elem.answer)
145+
117146
def load_benchmark_dataset(self) -> List[Golden]:
118147
from datasets import load_dataset
119148

deepeval/evaluate/execute.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
from deepeval.tracing.tracing import (
1010
Observer,
11-
get_current_trace,
1211
trace_manager,
1312
Trace,
1413
BaseSpan,
@@ -19,6 +18,7 @@
1918
perf_counter_to_datetime,
2019
to_zod_compatible_iso,
2120
)
21+
from deepeval.tracing.context import current_trace_context
2222
from deepeval.tracing.api import (
2323
TraceApi,
2424
BaseApiSpan,
@@ -818,7 +818,7 @@ def evaluate_test_cases(
818818
loop.run_until_complete(observed_callback(golden.input))
819819
else:
820820
observed_callback(golden.input)
821-
current_trace: Trace = get_current_trace()
821+
current_trace: Trace = current_trace_context.get()
822822

823823
if pbar_callback is not None:
824824
pbar_callback.update(1)
@@ -1117,7 +1117,7 @@ async def a_execute_agentic_test_case(
11171117
await observed_callback(golden.input)
11181118
else:
11191119
observed_callback(golden.input)
1120-
current_trace: Trace = get_current_trace()
1120+
current_trace: Trace = current_trace_context.get()
11211121

11221122
if pbar_callback is not None:
11231123
pbar_callback.update(1)

deepeval/metrics/answer_relevancy/template.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
class AnswerRelevancyTemplate:
55
@staticmethod
66
def generate_statements(actual_output: str):
7-
return f"""Given the text, breakdown and generate a list of statements presented. Ambiguous statements and single words can also be considered as statements.
7+
return f"""Given the text, breakdown and generate a list of statements presented. Ambiguous statements and single words can be considered as statements, but only if outside of a coherent statement.
88
99
Example:
1010
Example text:
@@ -22,7 +22,7 @@ def generate_statements(actual_output: str):
2222
===== END OF EXAMPLE ======
2323
2424
**
25-
IMPORTANT: Please make sure to only return in JSON format, with the "statements" key mapping to a list of strings. No words or explanation is needed.
25+
IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "statements" key mapping to a list of strings. No words or explanation are needed. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
2626
**
2727
2828
Text:
@@ -41,10 +41,11 @@ def generate_verdicts(input: str, statements: str):
4141
The provided statements are statements made in the actual output.
4242
4343
**
44-
IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key mapping to a list of JSON objects.
44+
IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the 'verdicts' key mapping to a list of JSON objects. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
4545
Example input:
4646
What features does the new laptop have?
4747
48+
Example:
4849
Example statements:
4950
[
5051
"The new laptop model has a high-resolution Retina display.",
@@ -81,6 +82,7 @@ def generate_verdicts(input: str, statements: str):
8182
}}
8283
]
8384
}}
85+
===== END OF EXAMPLE ======
8486
8587
Since you are going to generate a verdict for each statement, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of `statements`.
8688
**
@@ -104,13 +106,17 @@ def generate_reason(
104106
105107
106108
**
107-
IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
109+
IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
110+
111+
Example:
108112
Example JSON:
109113
{{
110114
"reason": "The score is <answer_relevancy_score> because <your_reason>."
111115
}}
116+
===== END OF EXAMPLE ======
112117
**
113118
119+
114120
Answer Relevancy Score:
115121
{score}
116122

deepeval/metrics/g_eval/g_eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ def is_successful(self) -> bool:
348348
self.success = False
349349
else:
350350
try:
351-
self.score >= self.threshold
351+
self.success = self.score >= self.threshold
352352
except:
353353
self.success = False
354354
return self.success

deepeval/tracing/__init__.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
1-
from .tracing import (
2-
observe,
3-
update_current_span,
4-
update_current_trace,
1+
from .context import update_current_span, update_current_trace
2+
from .attributes import (
53
LlmAttributes,
64
RetrieverAttributes,
75
ToolAttributes,
86
AgentAttributes,
9-
get_current_trace,
10-
trace_manager,
117
)
8+
from .types import BaseSpan, Trace
9+
from .tracing import observe, trace_manager

deepeval/tracing/api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class MetricData(BaseModel):
3737
class SpanTestCase(BaseModel):
3838
input: str
3939
actual_output: str = Field(alias="actualOutput")
40-
expected_output: Optional[str] = Field(None, lias="expectedOutput")
40+
expected_output: Optional[str] = Field(None, alias="expectedOutput")
4141
retrieval_context: Optional[List[str]] = Field(
4242
None, alias="retrievalContext"
4343
)

deepeval/tracing/attributes.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from pydantic import BaseModel, Field
2+
from typing import Any, Dict, List, Optional, Union
3+
from deepeval.prompt import Prompt
4+
5+
6+
class AgentAttributes(BaseModel):
7+
# input
8+
input: Union[str, Dict, list]
9+
# output
10+
output: Union[str, Dict, list]
11+
12+
13+
class LlmAttributes(BaseModel):
14+
# input
15+
input: Union[str, List[Dict[str, Any]]]
16+
# output
17+
output: str
18+
prompt: Optional[Prompt] = None
19+
20+
# Optional variables
21+
input_token_count: Optional[int] = Field(
22+
None, serialization_alias="inputTokenCount"
23+
)
24+
output_token_count: Optional[int] = Field(
25+
None, serialization_alias="outputTokenCount"
26+
)
27+
28+
model_config = {"arbitrary_types_allowed": True}
29+
30+
31+
class RetrieverAttributes(BaseModel):
32+
# input
33+
embedding_input: str = Field(serialization_alias="embeddingInput")
34+
# output
35+
retrieval_context: List[str] = Field(serialization_alias="retrievalContext")
36+
37+
# Optional variables
38+
top_k: Optional[int] = Field(None, serialization_alias="topK")
39+
chunk_size: Optional[int] = Field(None, serialization_alias="chunkSize")
40+
41+
42+
# Don't have to call this manually, will be taken as input and output of function
43+
# Can be overridden by user
44+
class ToolAttributes(BaseModel):
45+
# input
46+
input_parameters: Optional[Dict[str, Any]] = Field(
47+
None, serialization_alias="inputParameters"
48+
)
49+
# output
50+
output: Optional[Any] = None
51+
52+
53+
Attributes = Union[
54+
AgentAttributes, LlmAttributes, RetrieverAttributes, ToolAttributes
55+
]

deepeval/tracing/context.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from typing import Any, Dict, List, Optional
2+
from contextvars import ContextVar
3+
from deepeval.tracing.types import BaseSpan, Trace
4+
from deepeval.test_case import LLMTestCase
5+
from deepeval.tracing.attributes import Attributes
6+
7+
8+
current_span_context: ContextVar[Optional[BaseSpan]] = ContextVar(
9+
"current_span", default=None
10+
)
11+
12+
current_trace_context: ContextVar[Optional[Trace]] = ContextVar(
13+
"current_trace", default=None
14+
)
15+
16+
17+
def update_current_span(
18+
test_case: Optional[LLMTestCase] = None,
19+
attributes: Optional[Attributes] = None,
20+
metadata: Optional[Dict[str, Any]] = None,
21+
):
22+
current_span = current_span_context.get()
23+
if not current_span:
24+
return
25+
if attributes:
26+
current_span.set_attributes(attributes)
27+
if test_case:
28+
current_span.llm_test_case = test_case
29+
if metadata:
30+
current_span.metadata = metadata
31+
32+
33+
def update_current_trace(
34+
tags: Optional[List[str]] = None,
35+
metadata: Optional[Dict[str, Any]] = None,
36+
thread_id: Optional[str] = None,
37+
user_id: Optional[str] = None,
38+
input: Optional[Any] = None,
39+
output: Optional[Any] = None,
40+
):
41+
current_trace = current_trace_context.get()
42+
if not current_trace:
43+
return
44+
if tags:
45+
current_trace.tags = tags
46+
if metadata:
47+
current_trace.metadata = metadata
48+
if thread_id:
49+
current_trace.thread_id = thread_id
50+
if user_id:
51+
current_trace.user_id = user_id
52+
if input:
53+
current_trace.input = input
54+
if output:
55+
current_trace.output = output

0 commit comments

Comments (0)