Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion deepeval/cli/inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import typer
from rich import print


_INSTALL_HINT = (
"[bold red]deepeval inspect[/bold red] requires extras that are not "
"installed.\n"
Expand Down
2 changes: 1 addition & 1 deletion deepeval/cli/test/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import typer
from typing_extensions import Annotated

from deepeval.deepeval.config.settings import get_settings
from deepeval.config.settings import get_settings
from deepeval.telemetry import capture_evaluation_run
from deepeval.test_run import (
TEMP_FILE_PATH,
Expand Down
1 change: 0 additions & 1 deletion deepeval/inspect/widgets/_styling.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

from deepeval.inspect.types import Trace, TraceOrSpan


# `(glyph, tag, rich style)` per span type. Tags are full words rather
# than abbreviations because the tree pane is wide enough to spell them
# out, and "RETRIEVER" reads instantly while "RET" trips users into
Expand Down
1 change: 0 additions & 1 deletion deepeval/inspect/widgets/details.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
type_prefix,
)


# Matches the TRACE tag so the eye learns "cyan = structure markers".
_HEADER_ACCENT = "#8be9fd"
_CTA_ACCENT = "#bd93f9"
Expand Down
1 change: 0 additions & 1 deletion deepeval/inspect/widgets/help_modal.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from textual.screen import ModalScreen
from textual.widgets import Static


_HELP_ROWS = [
("↑ ↓ / k j", "move selection in the tree"),
("h / l", "go to parent / select child in the tree"),
Expand Down
1 change: 0 additions & 1 deletion deepeval/inspect/widgets/span_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
type_prefix,
)


# Minimum gap (in cells) between the left content (name + metric badge +
# optional ERRORED pill) and the right-aligned duration. Below this the
# right column gives up trying to right-align and just leaves the
Expand Down
12 changes: 4 additions & 8 deletions deepeval/metrics/arena_g_eval/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,7 @@ def generate_arena_winner(
"Be specific and grounded in the evaluation steps."
)

return textwrap.dedent(
f"""
return textwrap.dedent(f"""
You are a judge. Given the following evaluation steps, select the single contestant that best aligns with the evaluation steps.

{ArenaGEvalTemplate.multimodal_rules if multimodal else ""}
Expand Down Expand Up @@ -88,16 +87,14 @@ def generate_arena_winner(
}}

JSON:
"""
)
""")

@staticmethod
def rewrite_reason(
reason: str,
dummy_to_real_names: Dict[str, str],
):
return textwrap.dedent(
f"""
return textwrap.dedent(f"""
Given the following reason that explains which contestant is the winner, rewrite the reason to REPLACE all contestant names with their real names.

The contestant names are wrapped in $name$ format (e.g., $Alice$, $Bob$, $Charlie$).
Expand Down Expand Up @@ -129,5 +126,4 @@ def rewrite_reason(
}}

JSON:
"""
)
""")
6 changes: 2 additions & 4 deletions deepeval/metrics/argument_correctness/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ def generate_verdicts(

stringified_tools_called = repr(tools_called)

return textwrap.dedent(
f"""
return textwrap.dedent(f"""
For the provided list of tool calls, determine whether each tool call input parameter is relevantly and correctly addresses the input.

Please generate a list of JSON with two keys: `verdict` and `reason`.
Expand Down Expand Up @@ -99,8 +98,7 @@ def generate_verdicts(
{stringified_tools_called}

JSON:
"""
)
""")

@staticmethod
def generate_reason(
Expand Down
6 changes: 2 additions & 4 deletions deepeval/metrics/contextual_relevancy/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,10 @@ def generate_verdicts(
# Conditional instructions based on mode
extraction_instructions = ""
if multimodal:
extraction_instructions = textwrap.dedent(
"""
extraction_instructions = textwrap.dedent("""
If the context is textual, you should first extract the statements found in the context if the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement.
If the context is an image, `statement` should be a description of the image. Do not assume any information not visibly available.
"""
).strip()
""").strip()
else:
extraction_instructions = "You should first extract statements found in the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement."

Expand Down
12 changes: 4 additions & 8 deletions deepeval/metrics/conversational_dag/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,7 @@ def generate_task_output(instructions: str, text: str):
class ConversationalBinaryJudgementTemplate:
@staticmethod
def generate_binary_verdict(criteria: str, text: str):
return dedent(
f"""{criteria}
return dedent(f"""{criteria}

Below is the full conversation you should evaluate. Consider dialogue context, speaker roles, and how responses were handled.

Expand All @@ -95,17 +94,15 @@ def generate_binary_verdict(criteria: str, text: str):
}}
**
JSON:
"""
)
""")


class ConversationalNonBinaryJudgementTemplate:
@staticmethod
def generate_non_binary_verdict(
criteria: str, text: str, options: List[str]
):
return dedent(
f"""{criteria}
return dedent(f"""{criteria}

You are evaluating the following conversation. Choose one of the options that best reflects the assistant's behavior.

Expand All @@ -128,5 +125,4 @@ def generate_non_binary_verdict(
}}
**
JSON:
"""
)
""")
1 change: 0 additions & 1 deletion deepeval/metrics/dag/serialization/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@

from .types import NodeType


NODE_CLASSES: Dict[bool, Dict[NodeType, Type]] = {
False: {
NodeType.TASK: TaskNode,
Expand Down
1 change: 0 additions & 1 deletion deepeval/metrics/dag/serialization/serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@
from .registry import CLASS_TO_NODE_TYPE, NODE_CLASSES
from .types import ChildType, NodeType


# ----------------------------------------------------------------------------
# Public API
# ----------------------------------------------------------------------------
Expand Down
24 changes: 8 additions & 16 deletions deepeval/metrics/faithfulness/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,7 @@ def generate_verdicts(
):
example_section = ""
if multimodal:
example_section = textwrap.dedent(
"""
example_section = textwrap.dedent("""
Example retrieval contexts: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1968. Einstein is a German Scientist."
Example claims: ["Barack Obama is a caucasian male.", "Zurich is a city in London", "Einstein won the Nobel Prize for the discovery of the photoelectric effect which may have contributed to his fame.", "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect.", "Einstein was a German chef."]

Expand Down Expand Up @@ -123,11 +122,9 @@ def generate_verdicts(
]
}}
===== END OF EXAMPLE ======
"""
)
""")

format_instruction = textwrap.dedent(
"""
format_instruction = textwrap.dedent("""
Expected JSON format:
{{
"verdicts": [
Expand All @@ -144,31 +141,26 @@ def generate_verdicts(
}}
]
}}
"""
)
""")

guidelines = ""
if multimodal:
guidelines = textwrap.dedent(
"""
guidelines = textwrap.dedent("""
The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
You DON'T have to provide a reason if the answer is 'yes'.
ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE.
If there are clear contradictions or any data or images that's not mentioned in the retrieval context, just provide 'no'.
"""
)
""")
else:
guidelines = textwrap.dedent(
"""
guidelines = textwrap.dedent("""
Generate ONE verdict per claim - length of 'verdicts' MUST equal number of claims.
No 'reason' needed for 'yes' verdicts.
Only use 'no' if retrieval context DIRECTLY CONTRADICTS the claim - never use prior knowledge.
Use 'idk' for claims not backed up by context OR factually incorrect but non-contradictory - do not assume your knowledge.
Vague/speculative language in claims (e.g. 'may have', 'possibility') does NOT count as contradiction.
"""
)
""")

return textwrap.dedent(
f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
Expand Down
15 changes: 13 additions & 2 deletions deepeval/metrics/g_eval/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,15 @@
from .utils import Rubric
from .utils import (
RetrievalContextBudgetReport,
RetrievalContextChunkBudget,
RetrievalContextEvidenceCoverage,
Rubric,
)
from .template import GEvalTemplate

__all__ = ["Rubric", "GEvalTemplate"]
__all__ = [
"RetrievalContextBudgetReport",
"RetrievalContextChunkBudget",
"RetrievalContextEvidenceCoverage",
"Rubric",
"GEvalTemplate",
]
Loading
Loading