confident-ai · RitwijParmar · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/deepeval/cli/inspect.py b/deepeval/cli/inspect.py
@@ -14,7 +14,6 @@
 import typer
 from rich import print
 
-
 _INSTALL_HINT = (
     "[bold red]deepeval inspect[/bold red] requires extras that are not "
     "installed.\n"

diff --git a/deepeval/cli/test/command.py b/deepeval/cli/test/command.py
@@ -7,7 +7,7 @@
 import typer
 from typing_extensions import Annotated
 
-from deepeval.deepeval.config.settings import get_settings
+from deepeval.config.settings import get_settings
 from deepeval.telemetry import capture_evaluation_run
 from deepeval.test_run import (
     TEMP_FILE_PATH,

diff --git a/deepeval/inspect/widgets/_styling.py b/deepeval/inspect/widgets/_styling.py
@@ -12,7 +12,6 @@
 
 from deepeval.inspect.types import Trace, TraceOrSpan
 
-
 # `(glyph, tag, rich style)` per span type. Tags are full words rather
 # than abbreviations because the tree pane is wide enough to spell them
 # out, and "RETRIEVER" reads instantly while "RET" trips users into

diff --git a/deepeval/inspect/widgets/details.py b/deepeval/inspect/widgets/details.py
@@ -30,7 +30,6 @@
     type_prefix,
 )
 
-
 # Matches the TRACE tag so the eye learns "cyan = structure markers".
 _HEADER_ACCENT = "#8be9fd"
 _CTA_ACCENT = "#bd93f9"

diff --git a/deepeval/inspect/widgets/help_modal.py b/deepeval/inspect/widgets/help_modal.py
@@ -9,7 +9,6 @@
 from textual.screen import ModalScreen
 from textual.widgets import Static
 
-
 _HELP_ROWS = [
     ("↑ ↓ / k j", "move selection in the tree"),
     ("h / l", "go to parent / select child in the tree"),

diff --git a/deepeval/inspect/widgets/span_tree.py b/deepeval/inspect/widgets/span_tree.py
@@ -25,7 +25,6 @@
     type_prefix,
 )
 
-
 # Minimum gap (in cells) between the left content (name + metric badge +
 # optional ERRORED pill) and the right-aligned duration. Below this the
 # right column gives up trying to right-align and just leaves the

diff --git a/deepeval/metrics/arena_g_eval/template.py b/deepeval/metrics/arena_g_eval/template.py
@@ -46,8 +46,7 @@ def generate_arena_winner(
             "Be specific and grounded in the evaluation steps."
         )
 
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             You are a judge. Given the following evaluation steps, select the single contestant that best aligns with the evaluation steps.
 
             {ArenaGEvalTemplate.multimodal_rules if multimodal else ""}
@@ -88,16 +87,14 @@ def generate_arena_winner(
             }}
 
             JSON:
-        """
-        )
+        """)
 
     @staticmethod
     def rewrite_reason(
         reason: str,
         dummy_to_real_names: Dict[str, str],
     ):
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             Given the following reason that explains which contestant is the winner, rewrite the reason to REPLACE all contestant names with their real names.
 
             The contestant names are wrapped in $name$ format (e.g., $Alice$, $Bob$, $Charlie$).
@@ -129,5 +126,4 @@ def rewrite_reason(
             }}
 
             JSON:
-            """
-        )
+            """)
diff --git a/deepeval/metrics/argument_correctness/template.py b/deepeval/metrics/argument_correctness/template.py
@@ -19,8 +19,7 @@ def generate_verdicts(
 
         stringified_tools_called = repr(tools_called)
 
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             For the provided list of tool calls, determine whether each tool call input parameter is relevantly and correctly addresses the input.
 
             Please generate a list of JSON with two keys: `verdict` and `reason`.
@@ -99,8 +98,7 @@ def generate_verdicts(
             {stringified_tools_called}
 
             JSON:
-            """
-        )
+            """)
 
     @staticmethod
     def generate_reason(

diff --git a/deepeval/metrics/contextual_relevancy/template.py b/deepeval/metrics/contextual_relevancy/template.py
@@ -55,12 +55,10 @@ def generate_verdicts(
         # Conditional instructions based on mode
         extraction_instructions = ""
         if multimodal:
-            extraction_instructions = textwrap.dedent(
-                """
+            extraction_instructions = textwrap.dedent("""
                 If the context is textual, you should first extract the statements found in the context if the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement.
                 If the context is an image, `statement` should be a description of the image. Do not assume any information not visibly available.
-                """
-            ).strip()
+                """).strip()
         else:
             extraction_instructions = "You should first extract statements found in the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement."
 

diff --git a/deepeval/metrics/conversational_dag/templates.py b/deepeval/metrics/conversational_dag/templates.py
@@ -73,8 +73,7 @@ def generate_task_output(instructions: str, text: str):
 class ConversationalBinaryJudgementTemplate:
     @staticmethod
     def generate_binary_verdict(criteria: str, text: str):
-        return dedent(
-            f"""{criteria}
+        return dedent(f"""{criteria}
 
                 Below is the full conversation you should evaluate. Consider dialogue context, speaker roles, and how responses were handled.
 
@@ -95,17 +94,15 @@ def generate_binary_verdict(criteria: str, text: str):
                 }}
                 **
                 JSON:
-            """
-        )
+            """)
 
 
 class ConversationalNonBinaryJudgementTemplate:
     @staticmethod
     def generate_non_binary_verdict(
         criteria: str, text: str, options: List[str]
     ):
-        return dedent(
-            f"""{criteria}
+        return dedent(f"""{criteria}
 
                 You are evaluating the following conversation. Choose one of the options that best reflects the assistant's behavior.
 
@@ -128,5 +125,4 @@ def generate_non_binary_verdict(
                 }}
                 **
                 JSON:
-            """
-        )
+            """)
diff --git a/deepeval/metrics/dag/serialization/registry.py b/deepeval/metrics/dag/serialization/registry.py
@@ -15,7 +15,6 @@
 
 from .types import NodeType
 
-
 NODE_CLASSES: Dict[bool, Dict[NodeType, Type]] = {
     False: {
         NodeType.TASK: TaskNode,

diff --git a/deepeval/metrics/dag/serialization/serialization.py b/deepeval/metrics/dag/serialization/serialization.py
@@ -61,7 +61,6 @@
 from .registry import CLASS_TO_NODE_TYPE, NODE_CLASSES
 from .types import ChildType, NodeType
 
-
 # ----------------------------------------------------------------------------
 # Public API
 # ----------------------------------------------------------------------------

diff --git a/deepeval/metrics/faithfulness/template.py b/deepeval/metrics/faithfulness/template.py
@@ -93,8 +93,7 @@ def generate_verdicts(
     ):
         example_section = ""
         if multimodal:
-            example_section = textwrap.dedent(
-                """
+            example_section = textwrap.dedent("""
                 Example retrieval contexts: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1968. Einstein is a German Scientist."
                 Example claims: ["Barack Obama is a caucasian male.", "Zurich is a city in London", "Einstein won the Nobel Prize for the discovery of the photoelectric effect which may have contributed to his fame.", "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect.", "Einstein was a German chef."]
 
@@ -123,11 +122,9 @@ def generate_verdicts(
                     ]  
                 }}
                 ===== END OF EXAMPLE ======
-                """
-            )
+                """)
 
-        format_instruction = textwrap.dedent(
-            """
+        format_instruction = textwrap.dedent("""
             Expected JSON format:
             {{
                 "verdicts": [
@@ -144,31 +141,26 @@ def generate_verdicts(
                     }}
                 ]  
             }}
-            """
-        )
+            """)
 
         guidelines = ""
         if multimodal:
-            guidelines = textwrap.dedent(
-                """
+            guidelines = textwrap.dedent("""
                 The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
                 You DON'T have to provide a reason if the answer is 'yes'.
                 ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
                 Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
                 Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE.
                 If there are clear contradictions or any data or images that's not mentioned in the retrieval context, just provide 'no'.
-                """
-            )
+                """)
         else:
-            guidelines = textwrap.dedent(
-                """
+            guidelines = textwrap.dedent("""
                 Generate ONE verdict per claim - length of 'verdicts' MUST equal number of claims.
                 No 'reason' needed for 'yes' verdicts.
                 Only use 'no' if retrieval context DIRECTLY CONTRADICTS the claim - never use prior knowledge.
                 Use 'idk' for claims not backed up by context OR factually incorrect but non-contradictory - do not assume your knowledge.
                 Vague/speculative language in claims (e.g. 'may have', 'possibility') does NOT count as contradiction.
-                """
-            )
+                """)
 
         return textwrap.dedent(
             f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.

diff --git a/deepeval/metrics/g_eval/__init__.py b/deepeval/metrics/g_eval/__init__.py
@@ -1,4 +1,15 @@
-from .utils import Rubric
+from .utils import (
+    RetrievalContextBudgetReport,
+    RetrievalContextChunkBudget,
+    RetrievalContextEvidenceCoverage,
+    Rubric,
+)
 from .template import GEvalTemplate
 
-__all__ = ["Rubric", "GEvalTemplate"]
+__all__ = [
+    "RetrievalContextBudgetReport",
+    "RetrievalContextChunkBudget",
+    "RetrievalContextEvidenceCoverage",
+    "Rubric",
+    "GEvalTemplate",
+]