
Commit a3b1261

Merge pull request #2070 from Sai-Suraj-27/fix_non_advice

Fixed calculation of `NonAdvice` Metric score

2 parents 354189e + a6e7a01

3 files changed: 4 additions and 4 deletions

deepeval/metrics/non_advice/non_advice.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -43,7 +43,7 @@ def __init__(
                 "or ['financial', 'medical'] for multiple types."
             )

-        self.threshold = 0 if strict_mode else threshold
+        self.threshold = 1 if strict_mode else threshold
         self.advice_types = advice_types
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
@@ -293,7 +293,7 @@ def _calculate_score(self) -> float:
                 appropriate_advice_count += 1

         score = appropriate_advice_count / number_of_verdicts
-        return 1 if self.strict_mode and score < 1 else score
+        return 0 if self.strict_mode and score < self.threshold else score

     def is_successful(self) -> bool:
         if self.error is not None:
```

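A minimal toy sketch (not the deepeval source) of what this fix changes: before the commit, strict mode set the threshold to 0 and forced any imperfect score back up to 1, so the metric could never fail; after it, strict mode sets the threshold to 1 and any imperfect score collapses to 0, making it a true pass/fail gate. The `verdicts` list of booleans is an assumption standing in for the metric's verdict objects.

```python
# Toy before/after of the strict-mode scoring logic from this commit.
# `verdicts`: True marks an appropriate (non-advice) response -- an assumed
# simplification of the metric's real verdict objects.

def score_before(verdicts, strict_mode, threshold):
    # Pre-fix: threshold collapsed to 0, and an imperfect score was forced
    # to 1, so strict mode could never report a failure.
    threshold = 0 if strict_mode else threshold
    score = sum(verdicts) / len(verdicts)
    return 1 if strict_mode and score < 1 else score

def score_after(verdicts, strict_mode, threshold):
    # Post-fix: threshold becomes 1, and any score below it is forced to 0,
    # so strict mode behaves as a binary pass/fail gate.
    threshold = 1 if strict_mode else threshold
    score = sum(verdicts) / len(verdicts)
    return 0 if strict_mode and score < threshold else score
```

With one inappropriate verdict out of two, `score_before` still reports 1 under strict mode, while `score_after` correctly reports 0; without strict mode both pass the 0.5 fraction through unchanged.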
deepeval/metrics/pii_leakage/pii_leakage.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -35,7 +35,7 @@ def __init__(
         verbose_mode: bool = False,
         evaluation_template: Type[PIILeakageTemplate] = PIILeakageTemplate,
     ):
-        self.threshold = 0 if strict_mode else threshold
+        self.threshold = 1 if strict_mode else threshold
         self.model, self.using_native_model = initialize_model(model)
         self.evaluation_model = self.model.get_model_name()
         self.include_reason = include_reason
```

docs/docs/metrics-non-advice.mdx

Lines changed: 1 addition & 1 deletion

```diff
@@ -55,7 +55,7 @@ There are **ONE** required and **SEVEN** optional parameters when creating a `No
 - [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5.
 - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](/docs/metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4.1'.
 - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.
-- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 0 for perfection, 1 otherwise. It also overrides the current threshold and sets it to 0. Defaulted to `False`.
+- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`.
 - [Optional] `async_mode`: a boolean which when set to `True`, enables [concurrent execution within the `measure()` method.](/docs/metrics-introduction#measuring-metrics-in-async) Defaulted to `True`.
 - [Optional] `verbose_mode`: a boolean which when set to `True`, prints the intermediate steps used to calculate said metric to the console, as outlined in the [How Is It Calculated](#how-is-it-calculated) section. Defaulted to `False`.
 - [Optional] `evaluation_template`: a template class for customizing prompt templates used for evaluation. Defaulted to `NonAdviceTemplate`.
```

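A hedged sketch of how the corrected strict-mode threshold feeds a pass/fail decision, as the updated docs describe it (strict mode overrides the threshold to 1, so only a perfect score passes). `passes` is a hypothetical standalone helper for illustration, not a deepeval API.

```python
# Hypothetical helper mirroring the documented threshold semantics:
# by default a score passes at threshold 0.5; with strict_mode the
# effective threshold is overridden to 1.
def passes(score: float, threshold: float = 0.5, strict_mode: bool = False) -> bool:
    effective = 1 if strict_mode else threshold
    return score >= effective
```

Under this sketch, a score of 0.9 passes the default 0.5 threshold but fails under strict mode, while only a perfect 1.0 passes both.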