Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion deepeval/cli/inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import typer
from rich import print


_INSTALL_HINT = (
"[bold red]deepeval inspect[/bold red] requires extras that are not "
"installed.\n"
Expand Down
1 change: 0 additions & 1 deletion deepeval/inspect/widgets/_styling.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

from deepeval.inspect.types import Trace, TraceOrSpan


# `(glyph, tag, rich style)` per span type. Tags are full words rather
# than abbreviations because the tree pane is wide enough to spell them
# out, and "RETRIEVER" reads instantly while "RET" trips users into
Expand Down
1 change: 0 additions & 1 deletion deepeval/inspect/widgets/details.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
type_prefix,
)


# Matches the TRACE tag so the eye learns "cyan = structure markers".
_HEADER_ACCENT = "#8be9fd"
_CTA_ACCENT = "#bd93f9"
Expand Down
1 change: 0 additions & 1 deletion deepeval/inspect/widgets/help_modal.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from textual.screen import ModalScreen
from textual.widgets import Static


_HELP_ROWS = [
("↑ ↓ / k j", "move selection in the tree"),
("h / l", "go to parent / select child in the tree"),
Expand Down
1 change: 0 additions & 1 deletion deepeval/inspect/widgets/span_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
type_prefix,
)


# Minimum gap (in cells) between the left content (name + metric badge +
# optional ERRORED pill) and the right-aligned duration. Below this the
# right column gives up trying to right-align and just leaves the
Expand Down
6 changes: 2 additions & 4 deletions deepeval/metrics/contextual_relevancy/contextual_relevancy.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,10 @@ def _contextual_relevancy_verdict_kwargs(multimodal: bool) -> Dict[str, str]:
context_type = "context (image or string)" if multimodal else "context"
statement_or_image = "statement or image" if multimodal else "statement"
if multimodal:
extraction_instructions = textwrap.dedent(
"""
extraction_instructions = textwrap.dedent("""
If the context is textual, you should first extract the statements found in the context if the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement.
If the context is an image, `statement` should be a description of the image. Do not assume any information not visibly available.
"""
).strip()
""").strip()
empty_context_instruction = ""
else:
extraction_instructions = (
Expand Down
1 change: 0 additions & 1 deletion deepeval/metrics/dag/serialization/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@

from .types import NodeType


NODE_CLASSES: Dict[bool, Dict[NodeType, Type]] = {
False: {
NodeType.TASK: TaskNode,
Expand Down
1 change: 0 additions & 1 deletion deepeval/metrics/dag/serialization/serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@
from .registry import CLASS_TO_NODE_TYPE, NODE_CLASSES
from .types import ChildType, NodeType


# ----------------------------------------------------------------------------
# Public API
# ----------------------------------------------------------------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -287,15 +287,13 @@ def is_successful(self) -> bool:
def _generate_reason(
self,
) -> str:
return textwrap.dedent(
f"""
return textwrap.dedent(f"""
The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)}
and the lowest score from perceptual quality was {min(self.PQ_scores)}. These scores were combined to reflect the
overall effectiveness and quality of the AI-generated image(s).
Reason for Semantic Consistency score: {self.SC_reasoning}
Reason for Perceptual Quality score: {self.PQ_reasoning}
"""
)
""")

@property
def __name__(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -289,15 +289,13 @@ def is_successful(self) -> bool:
return self.success

def _generate_reason(self) -> str:
return textwrap.dedent(
f"""
return textwrap.dedent(f"""
The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)}
and the lowest score from perceptual quality was {min(self.PQ_scores)}. These scores were combined to reflect the
overall effectiveness and quality of the AI-generated image(s).
Reason for Semantic Consistency score: {self.SC_reasoning}
Reason for Perceptual Quality score: {self.PQ_reasoning}
"""
)
""")

@property
def __name__(self):
Expand Down
7 changes: 7 additions & 0 deletions deepeval/metrics/task_completion/task_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
construct_verbose_logs,
check_llm_test_case_params,
initialize_model,
print_tools_called,
a_generate_with_schema_and_extract,
generate_with_schema_and_extract,
)
Expand Down Expand Up @@ -194,6 +195,9 @@ async def _a_extract_task_and_outcome(
input=test_case.input,
actual_output=test_case.actual_output,
tools_called=test_case.tools_called,
tools_called_formatted=print_tools_called(
test_case.tools_called
),
)
return await a_generate_with_schema_and_extract(
metric=self,
Expand All @@ -220,6 +224,9 @@ def _extract_task_and_outcome(
input=test_case.input,
actual_output=test_case.actual_output,
tools_called=test_case.tools_called,
tools_called_formatted=print_tools_called(
test_case.tools_called
),
)
return generate_with_schema_and_extract(
metric=self,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,10 @@ def _contextual_relevancy_verdict_kwargs(multimodal: bool) -> Dict[str, str]:
context_type = "context (image or string)" if multimodal else "context"
statement_or_image = "statement or image" if multimodal else "statement"
if multimodal:
extraction_instructions = textwrap.dedent(
"""
extraction_instructions = textwrap.dedent("""
If the context is textual, you should first extract the statements found in the context if the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement.
If the context is an image, `statement` should be a description of the image. Do not assume any information not visibly available.
"""
).strip()
""").strip()
empty_context_instruction = ""
else:
extraction_instructions = (
Expand Down
6 changes: 2 additions & 4 deletions deepeval/scorer/scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,8 +444,7 @@ def squad_score(
evaluation_model: DeepEvalBaseLLM,
using_native_evaluation_model: bool,
):
prompt = textwrap.dedent(
f"""
prompt = textwrap.dedent(f"""
Given the question and context, evaluate if the prediction is correct based on the expected output.
Ensure to account for cases where the prediction and expected output might differ in form, such as '2' versus 'two'.

Expand All @@ -459,8 +458,7 @@ def squad_score(
{{
"answer": <number>
}}
"""
)
""")

# Generate the score using the model
if using_native_evaluation_model:
Expand Down
6 changes: 2 additions & 4 deletions deepeval/simulator/controller/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@ class SimulatorControllerTemplate:
def check_expected_outcome(
previous_conversation: str, expected_outcome: str
) -> str:
prompt = textwrap.dedent(
f"""You are a Conversation Completion Checker.
prompt = textwrap.dedent(f"""You are a Conversation Completion Checker.
Your task is to determine whether the conversation has achieved the expected outcome and should be terminated.

Guidelines:
Expand All @@ -34,6 +33,5 @@ def check_expected_outcome(
Conversation History:
{previous_conversation}
JSON Output:
"""
)
""")
return prompt
6 changes: 2 additions & 4 deletions deepeval/simulator/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,7 @@ def simulate_user_turn(
previous_conversation = serialize_to_json(
turns, indent=4, ensure_ascii=False
)
prompt = textwrap.dedent(
f"""
prompt = textwrap.dedent(f"""
Pretend you are a user of an LLM app. Your task is to generate the next user input in {language}
based on the provided scenario, user profile, and the previous conversation.

Expand Down Expand Up @@ -95,6 +94,5 @@ def simulate_user_turn(
{previous_conversation}

JSON Output:
"""
)
""")
return prompt
Loading
Loading