Skip to content

Commit d5f3e92

Browse files
PSU3
authored and committed
Merge branch 'main' into fix/preserve-local-model-names-in-parse_model_name
2 parents 7082066 + 1a5f92e commit d5f3e92

17 files changed

Lines changed: 474 additions & 382 deletions

File tree

CITATION.cff

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ authors:
66
- family-names: Vongthongsri
77
given-names: Kritin
88
title: deepeval
9-
version: 3.0.2
9+
version: 3.0.3
1010
date-released: "2025-05-28"
1111
url: https://confident-ai.com
1212
repository-code: https://github.com/confident-ai/deepeval

deepeval/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__: str = "3.0.2"
1+
__version__: str = "3.0.3"

deepeval/benchmarks/gsm8k/gsm8k.py

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Optional, Dict
1+
from typing import List, Optional, Dict, Union
22
from tqdm import tqdm
33

44
from deepeval.dataset import Golden
@@ -52,7 +52,10 @@ def evaluate(self, model: DeepEvalBaseLLM) -> Dict:
5252
for idx, golden in enumerate(
5353
tqdm(goldens, desc=f"Processing {self.n_problems} problems")
5454
):
55-
prediction, score = self.predict(model, golden).values()
55+
result = self.predict(model, golden)
56+
prediction = result["prediction"]
57+
score = result["score"]
58+
5659
if score:
5760
overall_correct_predictions += 1
5861
predictions_row.append(
@@ -94,14 +97,17 @@ def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
9497
)
9598

9699
# Enforced model generation
100+
prediction = None
97101
try:
98102
res: NumberSchema = model.generate(
99103
prompt=prompt, schema=NumberSchema
100104
)
101-
prediction = str(res.answer)
102-
except TypeError:
105+
prediction = self._extract_prediction_from_response(res)
106+
except (TypeError, AttributeError) as e:
107+
103108
prompt += f"\n\n{self.confinement_instructions}"
104-
prediction = model.generate(prompt)
109+
res = model.generate(prompt)
110+
prediction = self._extract_prediction_from_response(res)
105111

106112
# For native models, shouldn't happen but just in case
107113
if isinstance(prediction, tuple):
@@ -114,6 +120,29 @@ def predict(self, model: DeepEvalBaseLLM, golden: Golden) -> Dict:
114120

115121
return {"prediction": prediction, "score": score}
116122

123+
def _extract_prediction_from_response(self, res) -> str:
124+
"""
125+
Extract prediction from model response, handling various response types.
126+
"""
127+
# Case 1: Response has .answer attribute (NumberSchema case)
128+
if hasattr(res, "answer"):
129+
return str(res.answer)
130+
131+
# Case 2: Response is a tuple
132+
elif isinstance(res, tuple):
133+
return self._extract_from_tuple(res)
134+
135+
else:
136+
return str(res)
137+
138+
def _extract_from_tuple(self, res: tuple) -> str:
139+
"""Extract prediction from tuple response."""
140+
if len(res) == 0:
141+
return ""
142+
first_elem = res[0]
143+
if hasattr(first_elem, "answer"):
144+
return str(first_elem.answer)
145+
117146
def load_benchmark_dataset(self) -> List[Golden]:
118147
from datasets import load_dataset
119148

deepeval/evaluate/execute.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
from deepeval.tracing.tracing import (
1010
Observer,
11-
get_current_trace,
1211
trace_manager,
1312
Trace,
1413
BaseSpan,
@@ -19,6 +18,7 @@
1918
perf_counter_to_datetime,
2019
to_zod_compatible_iso,
2120
)
21+
from deepeval.tracing.context import current_trace_context
2222
from deepeval.tracing.api import (
2323
TraceApi,
2424
BaseApiSpan,
@@ -818,7 +818,7 @@ def evaluate_test_cases(
818818
loop.run_until_complete(observed_callback(golden.input))
819819
else:
820820
observed_callback(golden.input)
821-
current_trace: Trace = get_current_trace()
821+
current_trace: Trace = current_trace_context.get()
822822

823823
if pbar_callback is not None:
824824
pbar_callback.update(1)
@@ -1117,7 +1117,7 @@ async def a_execute_agentic_test_case(
11171117
await observed_callback(golden.input)
11181118
else:
11191119
observed_callback(golden.input)
1120-
current_trace: Trace = get_current_trace()
1120+
current_trace: Trace = current_trace_context.get()
11211121

11221122
if pbar_callback is not None:
11231123
pbar_callback.update(1)

deepeval/metrics/answer_relevancy/template.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
class AnswerRelevancyTemplate:
55
@staticmethod
66
def generate_statements(actual_output: str):
7-
return f"""Given the text, breakdown and generate a list of statements presented. Ambiguous statements and single words can also be considered as statements.
7+
return f"""Given the text, breakdown and generate a list of statements presented. Ambiguous statements and single words can be considered as statements, but only if outside of a coherent statement.
88
99
Example:
1010
Example text:
@@ -22,7 +22,7 @@ def generate_statements(actual_output: str):
2222
===== END OF EXAMPLE ======
2323
2424
**
25-
IMPORTANT: Please make sure to only return in JSON format, with the "statements" key mapping to a list of strings. No words or explanation is needed.
25+
IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the "statements" key mapping to a list of strings. No words or explanation are needed. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
2626
**
2727
2828
Text:
@@ -41,10 +41,11 @@ def generate_verdicts(input: str, statements: str):
4141
The provided statements are statements made in the actual output.
4242
4343
**
44-
IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key mapping to a list of JSON objects.
44+
IMPORTANT: Please make sure to only return in valid and parseable JSON format, with the 'verdicts' key mapping to a list of JSON objects. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
4545
Example input:
4646
What features does the new laptop have?
4747
48+
Example:
4849
Example statements:
4950
[
5051
"The new laptop model has a high-resolution Retina display.",
@@ -81,6 +82,7 @@ def generate_verdicts(input: str, statements: str):
8182
}}
8283
]
8384
}}
85+
===== END OF EXAMPLE ======
8486
8587
Since you are going to generate a verdict for each statement, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of `statements`.
8688
**
@@ -104,13 +106,17 @@ def generate_reason(
104106
105107
106108
**
107-
IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
109+
IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason. Ensure all strings are closed appropriately. Repair any invalid JSON before you output it.
110+
111+
Example:
108112
Example JSON:
109113
{{
110114
"reason": "The score is <answer_relevancy_score> because <your_reason>."
111115
}}
116+
===== END OF EXAMPLE ======
112117
**
113118
119+
114120
Answer Relevancy Score:
115121
{score}
116122

deepeval/metrics/g_eval/g_eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ def is_successful(self) -> bool:
348348
self.success = False
349349
else:
350350
try:
351-
self.score >= self.threshold
351+
self.success = self.score >= self.threshold
352352
except:
353353
self.success = False
354354
return self.success

deepeval/tracing/__init__.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,9 @@
1-
from .tracing import (
2-
observe,
3-
update_current_span,
4-
update_current_trace,
1+
from .context import update_current_span, update_current_trace
2+
from .attributes import (
53
LlmAttributes,
64
RetrieverAttributes,
75
ToolAttributes,
86
AgentAttributes,
9-
get_current_trace,
10-
trace_manager,
117
)
8+
from .types import BaseSpan, Trace
9+
from .tracing import observe, trace_manager

deepeval/tracing/api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class MetricData(BaseModel):
3737
class SpanTestCase(BaseModel):
3838
input: str
3939
actual_output: str = Field(alias="actualOutput")
40-
expected_output: Optional[str] = Field(None, lias="expectedOutput")
40+
expected_output: Optional[str] = Field(None, alias="expectedOutput")
4141
retrieval_context: Optional[List[str]] = Field(
4242
None, alias="retrievalContext"
4343
)

deepeval/tracing/attributes.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from pydantic import BaseModel, Field
2+
from typing import Any, Dict, List, Optional, Union
3+
from deepeval.prompt import Prompt
4+
5+
6+
class AgentAttributes(BaseModel):
7+
# input
8+
input: Union[str, Dict, list]
9+
# output
10+
output: Union[str, Dict, list]
11+
12+
13+
class LlmAttributes(BaseModel):
14+
# input
15+
input: Union[str, List[Dict[str, Any]]]
16+
# output
17+
output: str
18+
prompt: Optional[Prompt] = None
19+
20+
# Optional variables
21+
input_token_count: Optional[int] = Field(
22+
None, serialization_alias="inputTokenCount"
23+
)
24+
output_token_count: Optional[int] = Field(
25+
None, serialization_alias="outputTokenCount"
26+
)
27+
28+
model_config = {"arbitrary_types_allowed": True}
29+
30+
31+
class RetrieverAttributes(BaseModel):
32+
# input
33+
embedding_input: str = Field(serialization_alias="embeddingInput")
34+
# output
35+
retrieval_context: List[str] = Field(serialization_alias="retrievalContext")
36+
37+
# Optional variables
38+
top_k: Optional[int] = Field(None, serialization_alias="topK")
39+
chunk_size: Optional[int] = Field(None, serialization_alias="chunkSize")
40+
41+
42+
# Don't have to call this manually, will be taken as input and output of function
43+
# Can be overridden by user
44+
class ToolAttributes(BaseModel):
45+
# input
46+
input_parameters: Optional[Dict[str, Any]] = Field(
47+
None, serialization_alias="inputParameters"
48+
)
49+
# output
50+
output: Optional[Any] = None
51+
52+
53+
Attributes = Union[
54+
AgentAttributes, LlmAttributes, RetrieverAttributes, ToolAttributes
55+
]

deepeval/tracing/context.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from typing import Any, Dict, List, Optional
2+
from contextvars import ContextVar
3+
from deepeval.tracing.types import BaseSpan, Trace
4+
from deepeval.test_case import LLMTestCase
5+
from deepeval.tracing.attributes import Attributes
6+
7+
8+
current_span_context: ContextVar[Optional[BaseSpan]] = ContextVar(
9+
"current_span", default=None
10+
)
11+
12+
current_trace_context: ContextVar[Optional[Trace]] = ContextVar(
13+
"current_trace", default=None
14+
)
15+
16+
17+
def update_current_span(
18+
test_case: Optional[LLMTestCase] = None,
19+
attributes: Optional[Attributes] = None,
20+
metadata: Optional[Dict[str, Any]] = None,
21+
):
22+
current_span = current_span_context.get()
23+
if not current_span:
24+
return
25+
if attributes:
26+
current_span.set_attributes(attributes)
27+
if test_case:
28+
current_span.llm_test_case = test_case
29+
if metadata:
30+
current_span.metadata = metadata
31+
32+
33+
def update_current_trace(
34+
tags: Optional[List[str]] = None,
35+
metadata: Optional[Dict[str, Any]] = None,
36+
thread_id: Optional[str] = None,
37+
user_id: Optional[str] = None,
38+
input: Optional[Any] = None,
39+
output: Optional[Any] = None,
40+
):
41+
current_trace = current_trace_context.get()
42+
if not current_trace:
43+
return
44+
if tags:
45+
current_trace.tags = tags
46+
if metadata:
47+
current_trace.metadata = metadata
48+
if thread_id:
49+
current_trace.thread_id = thread_id
50+
if user_id:
51+
current_trace.user_id = user_id
52+
if input:
53+
current_trace.input = input
54+
if output:
55+
current_trace.output = output

0 commit comments

Comments (0)