confident-ai · luochen211 · Jun 28, 2026 · Jul 1, 2026 · Jul 1, 2026
diff --git a/deepeval/cli/inspect.py b/deepeval/cli/inspect.py
@@ -14,7 +14,6 @@
 import typer
 from rich import print
 
-
 _INSTALL_HINT = (
     "[bold red]deepeval inspect[/bold red] requires extras that are not "
     "installed.\n"

diff --git a/deepeval/inspect/widgets/_styling.py b/deepeval/inspect/widgets/_styling.py
@@ -12,7 +12,6 @@
 
 from deepeval.inspect.types import Trace, TraceOrSpan
 
-
 # `(glyph, tag, rich style)` per span type. Tags are full words rather
 # than abbreviations because the tree pane is wide enough to spell them
 # out, and "RETRIEVER" reads instantly while "RET" trips users into

diff --git a/deepeval/inspect/widgets/details.py b/deepeval/inspect/widgets/details.py
@@ -30,7 +30,6 @@
     type_prefix,
 )
 
-
 # Matches the TRACE tag so the eye learns "cyan = structure markers".
 _HEADER_ACCENT = "#8be9fd"
 _CTA_ACCENT = "#bd93f9"

diff --git a/deepeval/inspect/widgets/help_modal.py b/deepeval/inspect/widgets/help_modal.py
@@ -9,7 +9,6 @@
 from textual.screen import ModalScreen
 from textual.widgets import Static
 
-
 _HELP_ROWS = [
     ("↑ ↓ / k j", "move selection in the tree"),
     ("h / l", "go to parent / select child in the tree"),

diff --git a/deepeval/inspect/widgets/span_tree.py b/deepeval/inspect/widgets/span_tree.py
@@ -25,7 +25,6 @@
     type_prefix,
 )
 
-
 # Minimum gap (in cells) between the left content (name + metric badge +
 # optional ERRORED pill) and the right-aligned duration. Below this the
 # right column gives up trying to right-align and just leaves the

diff --git a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py
@@ -30,12 +30,10 @@ def _contextual_relevancy_verdict_kwargs(multimodal: bool) -> Dict[str, str]:
     context_type = "context (image or string)" if multimodal else "context"
     statement_or_image = "statement or image" if multimodal else "statement"
     if multimodal:
-        extraction_instructions = textwrap.dedent(
-            """
+        extraction_instructions = textwrap.dedent("""
             If the context is textual, you should first extract the statements found in the context if the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement.
             If the context is an image, `statement` should be a description of the image. Do not assume any information not visibly available.
-            """
-        ).strip()
+            """).strip()
         empty_context_instruction = ""
     else:
         extraction_instructions = (

diff --git a/deepeval/metrics/dag/serialization/registry.py b/deepeval/metrics/dag/serialization/registry.py
@@ -15,7 +15,6 @@
 
 from .types import NodeType
 
-
 NODE_CLASSES: Dict[bool, Dict[NodeType, Type]] = {
     False: {
         NodeType.TASK: TaskNode,

diff --git a/deepeval/metrics/dag/serialization/serialization.py b/deepeval/metrics/dag/serialization/serialization.py
@@ -61,7 +61,6 @@
 from .registry import CLASS_TO_NODE_TYPE, NODE_CLASSES
 from .types import ChildType, NodeType
 
-
 # ----------------------------------------------------------------------------
 # Public API
 # ----------------------------------------------------------------------------

diff --git a/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py b/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py
@@ -287,15 +287,13 @@ def is_successful(self) -> bool:
     def _generate_reason(
         self,
     ) -> str:
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)} 
             and the lowest score from perceptual quality was {min(self.PQ_scores)}. These scores were combined to reflect the 
             overall effectiveness and quality of the AI-generated image(s).
             Reason for Semantic Consistency score: {self.SC_reasoning}
             Reason for Perceptual Quality score: {self.PQ_reasoning}
-        """
-        )
+        """)
 
     @property
     def __name__(self):

diff --git a/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py b/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py
@@ -289,15 +289,13 @@ def is_successful(self) -> bool:
         return self.success
 
     def _generate_reason(self) -> str:
-        return textwrap.dedent(
-            f"""
+        return textwrap.dedent(f"""
             The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)} 
             and the lowest score from perceptual quality was {min(self.PQ_scores)}. These scores were combined to reflect the 
             overall effectiveness and quality of the AI-generated image(s).
             Reason for Semantic Consistency score: {self.SC_reasoning}
             Reason for Perceptual Quality score: {self.PQ_reasoning}
-        """
-        )
+        """)
 
     @property
     def __name__(self):

diff --git a/deepeval/metrics/task_completion/task_completion.py b/deepeval/metrics/task_completion/task_completion.py
@@ -5,6 +5,7 @@
     construct_verbose_logs,
     check_llm_test_case_params,
     initialize_model,
+    print_tools_called,
     a_generate_with_schema_and_extract,
     generate_with_schema_and_extract,
 )
@@ -194,6 +195,9 @@ async def _a_extract_task_and_outcome(
                 input=test_case.input,
                 actual_output=test_case.actual_output,
                 tools_called=test_case.tools_called,
+                tools_called_formatted=print_tools_called(
+                    test_case.tools_called
+                ),
             )
         return await a_generate_with_schema_and_extract(
             metric=self,
@@ -220,6 +224,9 @@ def _extract_task_and_outcome(
                 input=test_case.input,
                 actual_output=test_case.actual_output,
                 tools_called=test_case.tools_called,
+                tools_called_formatted=print_tools_called(
+                    test_case.tools_called
+                ),
             )
         return generate_with_schema_and_extract(
             metric=self,

diff --git a/deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py b/deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py
@@ -31,12 +31,10 @@ def _contextual_relevancy_verdict_kwargs(multimodal: bool) -> Dict[str, str]:
     context_type = "context (image or string)" if multimodal else "context"
     statement_or_image = "statement or image" if multimodal else "statement"
     if multimodal:
-        extraction_instructions = textwrap.dedent(
-            """
+        extraction_instructions = textwrap.dedent("""
             If the context is textual, you should first extract the statements found in the context if the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement.
             If the context is an image, `statement` should be a description of the image. Do not assume any information not visibly available.
-            """
-        ).strip()
+            """).strip()
         empty_context_instruction = ""
     else:
         extraction_instructions = (

diff --git a/deepeval/scorer/scorer.py b/deepeval/scorer/scorer.py
@@ -444,8 +444,7 @@ def squad_score(
         evaluation_model: DeepEvalBaseLLM,
         using_native_evaluation_model: bool,
     ):
-        prompt = textwrap.dedent(
-            f"""
+        prompt = textwrap.dedent(f"""
             Given the question and context, evaluate if the prediction is correct based on the expected output.
             Ensure to account for cases where the prediction and expected output might differ in form, such as '2' versus 'two'.
 
@@ -459,8 +458,7 @@ def squad_score(
             {{
                 "answer": <number>
             }}
-        """
-        )
+        """)
 
         # Generate the score using the model
         if using_native_evaluation_model:

diff --git a/deepeval/simulator/controller/template.py b/deepeval/simulator/controller/template.py
@@ -6,8 +6,7 @@ class SimulatorControllerTemplate:
     def check_expected_outcome(
         previous_conversation: str, expected_outcome: str
     ) -> str:
-        prompt = textwrap.dedent(
-            f"""You are a Conversation Completion Checker.
+        prompt = textwrap.dedent(f"""You are a Conversation Completion Checker.
             Your task is to determine whether the conversation has achieved the expected outcome and should be terminated.
 
             Guidelines:
@@ -34,6 +33,5 @@ def check_expected_outcome(
             Conversation History:
             {previous_conversation}
             JSON Output:
-            """
-        )
+            """)
         return prompt
diff --git a/deepeval/simulator/template.py b/deepeval/simulator/template.py
@@ -59,8 +59,7 @@ def simulate_user_turn(
         previous_conversation = serialize_to_json(
             turns, indent=4, ensure_ascii=False
         )
-        prompt = textwrap.dedent(
-            f"""
+        prompt = textwrap.dedent(f"""
             Pretend you are a user of an LLM app. Your task is to generate the next user input in {language} 
             based on the provided scenario, user profile, and the previous conversation.
 
@@ -95,6 +94,5 @@ def simulate_user_turn(
             {previous_conversation}
 
             JSON Output:
-        """
-        )
+        """)
         return prompt