diff --git a/deepeval/cli/inspect.py b/deepeval/cli/inspect.py index 788506e58..9129cb5c9 100644 --- a/deepeval/cli/inspect.py +++ b/deepeval/cli/inspect.py @@ -14,7 +14,6 @@ import typer from rich import print - _INSTALL_HINT = ( "[bold red]deepeval inspect[/bold red] requires extras that are not " "installed.\n" diff --git a/deepeval/cli/test/command.py b/deepeval/cli/test/command.py index e438c304e..e87044e8c 100644 --- a/deepeval/cli/test/command.py +++ b/deepeval/cli/test/command.py @@ -7,7 +7,7 @@ import typer from typing_extensions import Annotated -from deepeval.deepeval.config.settings import get_settings +from deepeval.config.settings import get_settings from deepeval.telemetry import capture_evaluation_run from deepeval.test_run import ( TEMP_FILE_PATH, diff --git a/deepeval/inspect/widgets/_styling.py b/deepeval/inspect/widgets/_styling.py index c655a284b..f3be2aa62 100644 --- a/deepeval/inspect/widgets/_styling.py +++ b/deepeval/inspect/widgets/_styling.py @@ -12,7 +12,6 @@ from deepeval.inspect.types import Trace, TraceOrSpan - # `(glyph, tag, rich style)` per span type. Tags are full words rather # than abbreviations because the tree pane is wide enough to spell them # out, and "RETRIEVER" reads instantly while "RET" trips users into diff --git a/deepeval/inspect/widgets/details.py b/deepeval/inspect/widgets/details.py index f2bb3bd3e..c1408c625 100644 --- a/deepeval/inspect/widgets/details.py +++ b/deepeval/inspect/widgets/details.py @@ -30,7 +30,6 @@ type_prefix, ) - # Matches the TRACE tag so the eye learns "cyan = structure markers". 
_HEADER_ACCENT = "#8be9fd" _CTA_ACCENT = "#bd93f9" diff --git a/deepeval/inspect/widgets/help_modal.py b/deepeval/inspect/widgets/help_modal.py index c64f783b7..d659aac02 100644 --- a/deepeval/inspect/widgets/help_modal.py +++ b/deepeval/inspect/widgets/help_modal.py @@ -9,7 +9,6 @@ from textual.screen import ModalScreen from textual.widgets import Static - _HELP_ROWS = [ ("↑ ↓ / k j", "move selection in the tree"), ("h / l", "go to parent / select child in the tree"), diff --git a/deepeval/inspect/widgets/span_tree.py b/deepeval/inspect/widgets/span_tree.py index 255eccb55..097db93cc 100644 --- a/deepeval/inspect/widgets/span_tree.py +++ b/deepeval/inspect/widgets/span_tree.py @@ -25,7 +25,6 @@ type_prefix, ) - # Minimum gap (in cells) between the left content (name + metric badge + # optional ERRORED pill) and the right-aligned duration. Below this the # right column gives up trying to right-align and just leaves the diff --git a/deepeval/metrics/arena_g_eval/template.py b/deepeval/metrics/arena_g_eval/template.py index d65ed5542..394717d49 100644 --- a/deepeval/metrics/arena_g_eval/template.py +++ b/deepeval/metrics/arena_g_eval/template.py @@ -46,8 +46,7 @@ def generate_arena_winner( "Be specific and grounded in the evaluation steps." ) - return textwrap.dedent( - f""" + return textwrap.dedent(f""" You are a judge. Given the following evaluation steps, select the single contestant that best aligns with the evaluation steps. {ArenaGEvalTemplate.multimodal_rules if multimodal else ""} @@ -88,16 +87,14 @@ def generate_arena_winner( }} JSON: - """ - ) + """) @staticmethod def rewrite_reason( reason: str, dummy_to_real_names: Dict[str, str], ): - return textwrap.dedent( - f""" + return textwrap.dedent(f""" Given the following reason that explains which contestant is the winner, rewrite the reason to REPLACE all contestant names with their real names. The contestant names are wrapped in $name$ format (e.g., $Alice$, $Bob$, $Charlie$). 
@@ -129,5 +126,4 @@ def rewrite_reason( }} JSON: - """ - ) + """) diff --git a/deepeval/metrics/argument_correctness/template.py b/deepeval/metrics/argument_correctness/template.py index 5a52a0b64..1f5a5307e 100644 --- a/deepeval/metrics/argument_correctness/template.py +++ b/deepeval/metrics/argument_correctness/template.py @@ -19,8 +19,7 @@ def generate_verdicts( stringified_tools_called = repr(tools_called) - return textwrap.dedent( - f""" + return textwrap.dedent(f""" For the provided list of tool calls, determine whether each tool call input parameter is relevantly and correctly addresses the input. Please generate a list of JSON with two keys: `verdict` and `reason`. @@ -99,8 +98,7 @@ def generate_verdicts( {stringified_tools_called} JSON: - """ - ) + """) @staticmethod def generate_reason( diff --git a/deepeval/metrics/contextual_relevancy/template.py b/deepeval/metrics/contextual_relevancy/template.py index bf231ac26..283e90eb4 100644 --- a/deepeval/metrics/contextual_relevancy/template.py +++ b/deepeval/metrics/contextual_relevancy/template.py @@ -55,12 +55,10 @@ def generate_verdicts( # Conditional instructions based on mode extraction_instructions = "" if multimodal: - extraction_instructions = textwrap.dedent( - """ + extraction_instructions = textwrap.dedent(""" If the context is textual, you should first extract the statements found in the context if the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement. If the context is an image, `statement` should be a description of the image. Do not assume any information not visibly available. - """ - ).strip() + """).strip() else: extraction_instructions = "You should first extract statements found in the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement." 
diff --git a/deepeval/metrics/conversational_dag/templates.py b/deepeval/metrics/conversational_dag/templates.py index 4a29f6b66..3f81574ad 100644 --- a/deepeval/metrics/conversational_dag/templates.py +++ b/deepeval/metrics/conversational_dag/templates.py @@ -73,8 +73,7 @@ def generate_task_output(instructions: str, text: str): class ConversationalBinaryJudgementTemplate: @staticmethod def generate_binary_verdict(criteria: str, text: str): - return dedent( - f"""{criteria} + return dedent(f"""{criteria} Below is the full conversation you should evaluate. Consider dialogue context, speaker roles, and how responses were handled. @@ -95,8 +94,7 @@ def generate_binary_verdict(criteria: str, text: str): }} ** JSON: - """ - ) + """) class ConversationalNonBinaryJudgementTemplate: @@ -104,8 +102,7 @@ class ConversationalNonBinaryJudgementTemplate: def generate_non_binary_verdict( criteria: str, text: str, options: List[str] ): - return dedent( - f"""{criteria} + return dedent(f"""{criteria} You are evaluating the following conversation. Choose one of the options that best reflects the assistant's behavior. 
@@ -128,5 +125,4 @@ def generate_non_binary_verdict( }} ** JSON: - """ - ) + """) diff --git a/deepeval/metrics/dag/serialization/registry.py b/deepeval/metrics/dag/serialization/registry.py index 4b019ad79..d350e53f2 100644 --- a/deepeval/metrics/dag/serialization/registry.py +++ b/deepeval/metrics/dag/serialization/registry.py @@ -15,7 +15,6 @@ from .types import NodeType - NODE_CLASSES: Dict[bool, Dict[NodeType, Type]] = { False: { NodeType.TASK: TaskNode, diff --git a/deepeval/metrics/dag/serialization/serialization.py b/deepeval/metrics/dag/serialization/serialization.py index f8569d3eb..b2157a300 100644 --- a/deepeval/metrics/dag/serialization/serialization.py +++ b/deepeval/metrics/dag/serialization/serialization.py @@ -61,7 +61,6 @@ from .registry import CLASS_TO_NODE_TYPE, NODE_CLASSES from .types import ChildType, NodeType - # ---------------------------------------------------------------------------- # Public API # ---------------------------------------------------------------------------- diff --git a/deepeval/metrics/faithfulness/template.py b/deepeval/metrics/faithfulness/template.py index 7282720f0..76718c489 100644 --- a/deepeval/metrics/faithfulness/template.py +++ b/deepeval/metrics/faithfulness/template.py @@ -93,8 +93,7 @@ def generate_verdicts( ): example_section = "" if multimodal: - example_section = textwrap.dedent( - """ + example_section = textwrap.dedent(""" Example retrieval contexts: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1968. Einstein is a German Scientist." 
Example claims: ["Barack Obama is a caucasian male.", "Zurich is a city in London", "Einstein won the Nobel Prize for the discovery of the photoelectric effect which may have contributed to his fame.", "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect.", "Einstein was a German chef."] @@ -123,11 +122,9 @@ def generate_verdicts( ] }} ===== END OF EXAMPLE ====== - """ - ) + """) - format_instruction = textwrap.dedent( - """ + format_instruction = textwrap.dedent(""" Expected JSON format: {{ "verdicts": [ @@ -144,31 +141,26 @@ def generate_verdicts( }} ] }} - """ - ) + """) guidelines = "" if multimodal: - guidelines = textwrap.dedent( - """ + guidelines = textwrap.dedent(""" The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims. You DON'T have to provide a reason if the answer is 'yes'. ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT. Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction. Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE. If there are clear contradictions or any data or images that's not mentioned in the retrieval context, just provide 'no'. - """ - ) + """) else: - guidelines = textwrap.dedent( - """ + guidelines = textwrap.dedent(""" Generate ONE verdict per claim - length of 'verdicts' MUST equal number of claims. No 'reason' needed for 'yes' verdicts. Only use 'no' if retrieval context DIRECTLY CONTRADICTS the claim - never use prior knowledge. Use 'idk' for claims not backed up by context OR factually incorrect but non-contradictory - do not assume your knowledge. Vague/speculative language in claims (e.g. 'may have', 'possibility') does NOT count as contradiction. 
- """ - ) + """) return textwrap.dedent( f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'. diff --git a/deepeval/metrics/g_eval/__init__.py b/deepeval/metrics/g_eval/__init__.py index 3d4b94165..54cf06c64 100644 --- a/deepeval/metrics/g_eval/__init__.py +++ b/deepeval/metrics/g_eval/__init__.py @@ -1,4 +1,15 @@ -from .utils import Rubric +from .utils import ( + RetrievalContextBudgetReport, + RetrievalContextChunkBudget, + RetrievalContextEvidenceCoverage, + Rubric, +) from .template import GEvalTemplate -__all__ = ["Rubric", "GEvalTemplate"] +__all__ = [ + "RetrievalContextBudgetReport", + "RetrievalContextChunkBudget", + "RetrievalContextEvidenceCoverage", + "Rubric", + "GEvalTemplate", +] diff --git a/deepeval/metrics/g_eval/g_eval.py b/deepeval/metrics/g_eval/g_eval.py index 21fc08110..ab76f7a01 100644 --- a/deepeval/metrics/g_eval/g_eval.py +++ b/deepeval/metrics/g_eval/g_eval.py @@ -38,6 +38,10 @@ construct_geval_pull_evaluation_params, ensure_required_params, G_EVAL_API_PARAMS, + RetrievalContextBudgetReport, + RetrievalContextEvidenceCoverage, + build_retrieval_context_budget_report, + build_retrieval_relevance_query, ) from deepeval.config.settings import get_settings from deepeval.confident.api import Api, Endpoints, HttpMethods @@ -57,6 +61,7 @@ def __init__( async_mode: bool = True, strict_mode: bool = False, verbose_mode: bool = False, + max_retrieval_context_tokens: Optional[int] = None, evaluation_template: Type[GEvalTemplate] = GEvalTemplate, _include_g_eval_suffix: bool = True, ): @@ -66,6 +71,14 @@ def __init__( if criteria is not None or evaluation_steps is not None: validate_criteria_and_evaluation_steps(criteria, evaluation_steps) + if ( + max_retrieval_context_tokens is not None + and max_retrieval_context_tokens <= 0 + ): + raise ValueError( + "max_retrieval_context_tokens 
must be greater than 0." + ) + self.name = name self.evaluation_params = evaluation_params self.criteria = criteria @@ -84,6 +97,7 @@ def __init__( self.strict_mode = strict_mode self.async_mode = async_mode self.verbose_mode = verbose_mode + self.max_retrieval_context_tokens = max_retrieval_context_tokens self._include_g_eval_suffix = _include_g_eval_suffix self.evaluation_template = evaluation_template @@ -275,35 +289,12 @@ async def _a_evaluate( multimodal: bool, _additional_context: Optional[str] = None, ) -> Tuple[Union[int, float], str]: - test_case_content = construct_test_case_string( - self.evaluation_params, test_case - ) - g_eval_params_str = construct_g_eval_params_string( - self.evaluation_params + prompt = self._build_evaluation_prompt( + test_case=test_case, + evaluation_steps=self.evaluation_steps, + multimodal=multimodal, + _additional_context=_additional_context, ) - if not self.strict_mode: - rubric_str = format_rubrics(self.rubric) if self.rubric else None - prompt = self.evaluation_template.generate_evaluation_results( - evaluation_steps=number_evaluation_steps(self.evaluation_steps), - test_case_content=test_case_content, - parameters=g_eval_params_str, - rubric=rubric_str, - score_range=self.score_range, - _additional_context=_additional_context, - multimodal=multimodal, - ) - else: - prompt = ( - self.evaluation_template.generate_strict_evaluation_results( - evaluation_steps=number_evaluation_steps( - self.evaluation_steps - ), - test_case_content=test_case_content, - parameters=g_eval_params_str, - _additional_context=_additional_context, - multimodal=multimodal, - ) - ) try: # don't use log probabilities for unsupported gpt models if no_log_prob_support(self.model): @@ -348,37 +339,13 @@ def _evaluate( multimodal: bool, _additional_context: Optional[str] = None, ) -> Tuple[Union[int, float], str]: - test_case_content = construct_test_case_string( - self.evaluation_params, test_case - ) - g_eval_params_str = construct_g_eval_params_string( 
- self.evaluation_params + prompt = self._build_evaluation_prompt( + test_case=test_case, + evaluation_steps=self.evaluation_steps, + multimodal=multimodal, + _additional_context=_additional_context, ) - if not self.strict_mode: - rubric_str = format_rubrics(self.rubric) if self.rubric else None - prompt = self.evaluation_template.generate_evaluation_results( - evaluation_steps=number_evaluation_steps(self.evaluation_steps), - test_case_content=test_case_content, - parameters=g_eval_params_str, - rubric=rubric_str, - score_range=self.score_range, - _additional_context=_additional_context, - multimodal=multimodal, - ) - else: - prompt = ( - self.evaluation_template.generate_strict_evaluation_results( - evaluation_steps=number_evaluation_steps( - self.evaluation_steps - ), - test_case_content=test_case_content, - parameters=g_eval_params_str, - _additional_context=_additional_context, - multimodal=multimodal, - ) - ) - try: # don't use log probabilities for unsupported gpt models if no_log_prob_support(self.model): @@ -413,6 +380,106 @@ def _evaluate( extract_json=lambda d: (d["score"], d["reason"]), ) + def _build_evaluation_prompt( + self, + test_case: LLMTestCase, + evaluation_steps: List[str], + multimodal: bool, + _additional_context: Optional[str] = None, + ) -> str: + test_case_content = construct_test_case_string( + self.evaluation_params, + test_case, + self.max_retrieval_context_tokens, + ) + g_eval_params_str = construct_g_eval_params_string( + self.evaluation_params + ) + + if not self.strict_mode: + rubric_str = format_rubrics(self.rubric) if self.rubric else None + prompt = self.evaluation_template.generate_evaluation_results( + evaluation_steps=number_evaluation_steps(evaluation_steps), + test_case_content=test_case_content, + parameters=g_eval_params_str, + rubric=rubric_str, + score_range=self.score_range, + _additional_context=_additional_context, + multimodal=multimodal, + ) + else: + prompt = ( + 
self.evaluation_template.generate_strict_evaluation_results( + evaluation_steps=number_evaluation_steps(evaluation_steps), + test_case_content=test_case_content, + parameters=g_eval_params_str, + _additional_context=_additional_context, + multimodal=multimodal, + ) + ) + return prompt + + def preview_evaluation_prompt( + self, + test_case: LLMTestCase, + evaluation_steps: Optional[List[str]] = None, + _additional_context: Optional[str] = None, + ) -> str: + ensure_required_params( + self.evaluation_params, self.criteria, self.evaluation_steps + ) + multimodal = test_case.multimodal + check_llm_test_case_params( + test_case, + self.evaluation_params, + None, + None, + self, + self.model, + multimodal, + ) + prompt_steps = evaluation_steps or self.evaluation_steps + if not prompt_steps: + raise ValueError( + "preview_evaluation_prompt requires evaluation_steps. " + "Pass evaluation_steps explicitly or initialize GEval with " + "evaluation_steps to avoid an LLM call." + ) + + return self._build_evaluation_prompt( + test_case=test_case, + evaluation_steps=prompt_steps, + multimodal=multimodal, + _additional_context=_additional_context, + ) + + def get_retrieval_context_budget_report( + self, test_case: LLMTestCase + ) -> Optional[RetrievalContextBudgetReport]: + if ( + self.max_retrieval_context_tokens is None + or self.evaluation_params is None + or SingleTurnParams.RETRIEVAL_CONTEXT not in self.evaluation_params + or test_case.retrieval_context is None + ): + return None + + return build_retrieval_context_budget_report( + test_case.retrieval_context, + self.max_retrieval_context_tokens, + relevance_query=build_retrieval_relevance_query( + self.evaluation_params, test_case + ), + ) + + def get_retrieval_context_evidence_coverage( + self, test_case: LLMTestCase + ) -> Optional[RetrievalContextEvidenceCoverage]: + report = self.get_retrieval_context_budget_report(test_case) + if report is None: + return None + return report.evidence_coverage + def is_successful(self) -> 
bool: if self.error is not None: self.success = False diff --git a/deepeval/metrics/g_eval/utils.py b/deepeval/metrics/g_eval/utils.py index 7bcb16244..d71212100 100644 --- a/deepeval/metrics/g_eval/utils.py +++ b/deepeval/metrics/g_eval/utils.py @@ -1,24 +1,22 @@ from typing import List, Optional, Union, Tuple, Dict from openai.types.chat.chat_completion import ChatCompletion import math +import re from deepeval.models import DeepEvalBaseLLM, GPTModel, AzureOpenAIModel from deepeval.test_case import ( SingleTurnParams, MultiTurnParams, LLMTestCase, + RetrievedContextData, ToolCall, ) -from pydantic import BaseModel, field_validator +from pydantic import BaseModel, Field, field_validator from deepeval.models.llms.constants import OPENAI_MODELS_DATA from deepeval.test_case.conversational_test_case import ConversationalTestCase -from pydantic import BaseModel, Field -from typing import Optional, List, Tuple - - class APIRubric(BaseModel): scoreRange: Tuple[float, float] expectedOutcome: str @@ -32,6 +30,37 @@ class MetricPullResponse(BaseModel): rubric: Optional[List[APIRubric]] = None +class RetrievalContextChunkBudget(BaseModel): + index: int + source: Optional[str] + original_tokens: int + rendered_tokens: int + relevance_score: float = 0.0 + omitted: bool = False + + +class RetrievalContextEvidenceCoverage(BaseModel): + query_terms_count: int + covered_terms: List[str] + missing_terms: List[str] + coverage_ratio: float + warning: Optional[str] = None + + +class RetrievalContextBudgetReport(BaseModel): + original_tokens: int + rendered_tokens: int + budget_tokens: int + total_chunks: int + visible_chunks: int + omitted_chunks: int + compression_ratio: float + exceeded_budget: bool + rendered_context: str + chunks: List[RetrievalContextChunkBudget] + evidence_coverage: RetrievalContextEvidenceCoverage + + class Rubric(BaseModel): score_range: Tuple[int, int] expected_outcome: str @@ -321,14 +350,377 @@ def construct_non_turns_test_case_string( return 
f"Conversation-level fields:\n{body}" +TOKEN_CHAR_RATIO = 4 +MIN_CONTEXT_WINDOW_TOKENS = 32 +RELEVANCE_STOPWORDS = { + "about", + "after", + "also", + "because", + "before", + "being", + "between", + "could", + "does", + "from", + "have", + "into", + "only", + "should", + "that", + "their", + "there", + "these", + "they", + "this", + "through", + "what", + "when", + "where", + "which", + "with", + "would", +} + + +def estimate_token_count(text: str) -> int: + return max(1, math.ceil(len(text) / TOKEN_CHAR_RATIO)) + + +def _normalize_retrieval_context_item( + item: Union[str, RetrievedContextData], +) -> Tuple[Optional[str], str]: + if isinstance(item, RetrievedContextData): + return item.source, item.context + return None, str(item) + + +def _relevance_terms(text: Optional[str]) -> set[str]: + if not text: + return set() + + return { + _normalize_relevance_term(term) + for term in re.findall(r"[A-Za-z0-9_]{3,}", text.lower()) + if _normalize_relevance_term(term) not in RELEVANCE_STOPWORDS + } + + +def _normalize_relevance_term(term: str) -> str: + if len(term) > 4 and term.endswith("s"): + return term[:-1] + return term + + +def _retrieval_context_relevance_score( + context: str, + relevance_terms: set[str], +) -> float: + if not relevance_terms: + return 0.0 + + context_terms = _relevance_terms(context) + if not context_terms: + return 0.0 + + overlap = context_terms & relevance_terms + coverage = len(overlap) / len(relevance_terms) + density = len(overlap) / max(1, len(context_terms)) + return round(coverage + density, 4) + + +def _build_evidence_coverage( + original_context: str, + rendered_context: str, + relevance_terms: set[str], +) -> RetrievalContextEvidenceCoverage: + if not relevance_terms: + return RetrievalContextEvidenceCoverage( + query_terms_count=0, + covered_terms=[], + missing_terms=[], + coverage_ratio=1.0, + ) + + original_terms = _relevance_terms(original_context) + rendered_terms = _relevance_terms(rendered_context) + evidence_terms = 
relevance_terms & original_terms + if not evidence_terms: + return RetrievalContextEvidenceCoverage( + query_terms_count=len(relevance_terms), + covered_terms=[], + missing_terms=[], + coverage_ratio=1.0, + ) + + covered_terms = sorted(evidence_terms & rendered_terms) + missing_terms = sorted(evidence_terms - rendered_terms) + coverage_ratio = round(len(covered_terms) / len(evidence_terms), 4) + warning = None + if missing_terms: + warning = ( + "Some relevance-query terms appeared in the original retrieval " + "context but were not present after GEval compaction." + ) + return RetrievalContextEvidenceCoverage( + query_terms_count=len(relevance_terms), + covered_terms=covered_terms, + missing_terms=missing_terms, + coverage_ratio=coverage_ratio, + warning=warning, + ) + + +def build_retrieval_relevance_query( + evaluation_params: List[SingleTurnParams], + test_case: LLMTestCase, +) -> str: + query_parts: List[str] = [] + for param in ( + SingleTurnParams.INPUT, + SingleTurnParams.ACTUAL_OUTPUT, + SingleTurnParams.EXPECTED_OUTPUT, + ): + if param not in evaluation_params: + continue + value = getattr(test_case, param.value, None) + if value: + query_parts.append(str(value)) + + return "\n".join(query_parts) + + +def _truncate_middle(text: str, max_tokens: int, label: str) -> str: + max_chars = max_tokens * TOKEN_CHAR_RATIO + if len(text) <= max_chars: + return text + + marker = ( + f"\n\n[... 
omitted ~{estimate_token_count(text) - max_tokens} " + f"tokens from {label} to fit GEval retrieval context budget ...]\n\n" + ) + if max_chars <= len(marker) + 16: + return text[:max_chars] + + remaining_chars = max_chars - len(marker) + head_chars = math.ceil(remaining_chars * 0.6) + tail_chars = remaining_chars - head_chars + + return f"{text[:head_chars]}{marker}{text[-tail_chars:]}" + + +def format_retrieval_context_with_budget( + retrieval_context: List[Union[str, RetrievedContextData]], + max_retrieval_context_tokens: int, + relevance_query: Optional[str] = None, +) -> str: + return build_retrieval_context_budget_report( + retrieval_context, + max_retrieval_context_tokens, + relevance_query=relevance_query, + ).rendered_context + + +def build_retrieval_context_budget_report( + retrieval_context: List[Union[str, RetrievedContextData]], + max_retrieval_context_tokens: int, + relevance_query: Optional[str] = None, +) -> RetrievalContextBudgetReport: + if max_retrieval_context_tokens <= 0: + raise ValueError("max_retrieval_context_tokens must be greater than 0.") + + relevance_terms = _relevance_terms(relevance_query) + normalized_contexts = [ + ( + index, + *_normalize_retrieval_context_item(item), + ) + for index, item in enumerate(retrieval_context, start=1) + ] + total_tokens = sum( + estimate_token_count(context) for _, _, context in normalized_contexts + ) + original_context = "\n\n".join( + context for _, _, context in normalized_contexts + ) + + if total_tokens <= max_retrieval_context_tokens: + rendered_context = str(retrieval_context) + rendered_tokens = estimate_token_count(rendered_context) + return RetrievalContextBudgetReport( + original_tokens=total_tokens, + rendered_tokens=rendered_tokens, + budget_tokens=max_retrieval_context_tokens, + total_chunks=len(normalized_contexts), + visible_chunks=len(normalized_contexts), + omitted_chunks=0, + compression_ratio=1.0, + exceeded_budget=False, + rendered_context=rendered_context, + chunks=[ + 
RetrievalContextChunkBudget( + index=index, + source=source, + original_tokens=estimate_token_count(context), + rendered_tokens=estimate_token_count(context), + relevance_score=_retrieval_context_relevance_score( + context, relevance_terms + ), + omitted=False, + ) + for index, source, context in normalized_contexts + ], + evidence_coverage=_build_evidence_coverage( + original_context, + rendered_context, + relevance_terms, + ), + ) + + if not normalized_contexts: + rendered_context = str(retrieval_context) + return RetrievalContextBudgetReport( + original_tokens=0, + rendered_tokens=estimate_token_count(rendered_context), + budget_tokens=max_retrieval_context_tokens, + total_chunks=0, + visible_chunks=0, + omitted_chunks=0, + compression_ratio=1.0, + exceeded_budget=False, + rendered_context=rendered_context, + chunks=[], + evidence_coverage=_build_evidence_coverage( + original_context, + rendered_context, + relevance_terms, + ), + ) + + context_count = len(normalized_contexts) + visible_context_count = min( + context_count, + max(1, max_retrieval_context_tokens // MIN_CONTEXT_WINDOW_TOKENS), + ) + scored_contexts = [ + ( + index, + source, + context, + _retrieval_context_relevance_score(context, relevance_terms), + ) + for index, source, context in normalized_contexts + ] + ranked_contexts = sorted( + scored_contexts, + key=lambda item: (-item[3], item[0]), + ) + selected_indices = { + index for index, _, _, _ in ranked_contexts[:visible_context_count] + } + visible_contexts = [ + item for item in scored_contexts if item[0] in selected_indices + ] + omitted_context_count = context_count - visible_context_count + context_token_budget = max( + 1, + max_retrieval_context_tokens // visible_context_count, + ) + + rendered_contexts = [ + ( + "[retrieval_context compacted for GEval: " + f"estimated {total_tokens} tokens across {context_count} chunks; " + f"budget {max_retrieval_context_tokens} tokens; " + f"kept {visible_context_count} highest-relevance chunks]" + ) + ] 
+ chunk_reports: List[RetrievalContextChunkBudget] = [] + for index, source, context, relevance_score in visible_contexts: + label = f"retrieval chunk {index}" + source_label = f" source={source}" if source else "" + rendered_chunk = _truncate_middle(context, context_token_budget, label) + rendered_contexts.append( + f"[{index}{source_label}]\n" f"{rendered_chunk}" + ) + chunk_reports.append( + RetrievalContextChunkBudget( + index=index, + source=source, + original_tokens=estimate_token_count(context), + rendered_tokens=estimate_token_count(rendered_chunk), + relevance_score=relevance_score, + omitted=estimate_token_count(context) + > estimate_token_count(rendered_chunk), + ) + ) + + if omitted_context_count > 0: + rendered_contexts.append( + f"[... omitted {omitted_context_count} retrieval chunks because " + "the GEval retrieval context budget was reached ...]" + ) + for index, source, context, relevance_score in scored_contexts: + if index in selected_indices: + continue + chunk_reports.append( + RetrievalContextChunkBudget( + index=index, + source=source, + original_tokens=estimate_token_count(context), + rendered_tokens=0, + relevance_score=relevance_score, + omitted=True, + ) + ) + + rendered_context = "\n\n".join(rendered_contexts) + rendered_tokens = estimate_token_count(rendered_context) + return RetrievalContextBudgetReport( + original_tokens=total_tokens, + rendered_tokens=rendered_tokens, + budget_tokens=max_retrieval_context_tokens, + total_chunks=context_count, + visible_chunks=visible_context_count, + omitted_chunks=omitted_context_count, + compression_ratio=( + round(rendered_tokens / total_tokens, 4) + if total_tokens > 0 + else 1.0 + ), + exceeded_budget=True, + rendered_context=rendered_context, + chunks=chunk_reports, + evidence_coverage=_build_evidence_coverage( + original_context, + rendered_context, + relevance_terms, + ), + ) + + def construct_test_case_string( - evaluation_params: List[SingleTurnParams], test_case: LLMTestCase + 
evaluation_params: List[SingleTurnParams], + test_case: LLMTestCase, + max_retrieval_context_tokens: Optional[int] = None, ) -> str: text = """""" for param in evaluation_params: value = getattr(test_case, param.value) if isinstance(value, ToolCall): value = repr(value) + elif ( + param == SingleTurnParams.RETRIEVAL_CONTEXT + and max_retrieval_context_tokens is not None + and isinstance(value, list) + ): + value = format_retrieval_context_with_budget( + value, + max_retrieval_context_tokens, + relevance_query=build_retrieval_relevance_query( + evaluation_params, test_case + ), + ) text += f"{G_EVAL_PARAMS[param]}:\n{value} \n\n" return text diff --git a/deepeval/metrics/multimodal_metrics/image_coherence/template.py b/deepeval/metrics/multimodal_metrics/image_coherence/template.py index 5cd5e0e74..b0a6e7e79 100644 --- a/deepeval/metrics/multimodal_metrics/image_coherence/template.py +++ b/deepeval/metrics/multimodal_metrics/image_coherence/template.py @@ -5,8 +5,7 @@ class ImageCoherenceTemplate: @staticmethod def evaluate_image_coherence(context_above, context_below): - return textwrap.dedent( - f""" + return textwrap.dedent(f""" # Task Description You are a multi-modal document evaluation assistant. You will receive an image and its textual context. Your task is to evaluate the coherence between the image and the text (context above and below) it accompanies. 
@@ -40,5 +39,4 @@ def evaluate_image_coherence(context_above, context_below): # Image [Insert Image Here] - """ - ) + """) diff --git a/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py b/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py index 99d998d15..3785a7436 100644 --- a/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +++ b/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py @@ -284,15 +284,13 @@ def is_successful(self) -> bool: def _generate_reason( self, ) -> str: - return textwrap.dedent( - f""" + return textwrap.dedent(f""" The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)} and the lowest score from perceptual quality was {min(self.PQ_scores)}. These scores were combined to reflect the overall effectiveness and quality of the AI-generated image(s). Reason for Semantic Consistency score: {self.SC_reasoning} Reason for Perceptual Quality score: {self.PQ_reasoning} - """ - ) + """) @property def __name__(self): diff --git a/deepeval/metrics/multimodal_metrics/image_editing/template.py b/deepeval/metrics/multimodal_metrics/image_editing/template.py index 90911fb4a..39d03c05f 100644 --- a/deepeval/metrics/multimodal_metrics/image_editing/template.py +++ b/deepeval/metrics/multimodal_metrics/image_editing/template.py @@ -3,8 +3,7 @@ class ImageEditingTemplate: - context = textwrap.dedent( - """ + context = textwrap.dedent(""" You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. @@ -13,13 +12,11 @@ class ImageEditingTemplate: "score" : [...], "reasoning" : "..." 
} - """ - ) + """) @staticmethod def generate_semantic_consistency_evaluation_results(text_prompt: str): - return textwrap.dedent( - f""" + return textwrap.dedent(f""" {ImageEditingTemplate.context} RULES: @@ -33,13 +30,11 @@ def generate_semantic_consistency_evaluation_results(text_prompt: str): Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting. Editing instruction: {text_prompt} - """ - ) + """) @staticmethod def generate_perceptual_quality_evaluation_results(): - return textwrap.dedent( - f""" + return textwrap.dedent(f""" {ImageEditingTemplate.context} RULES: @@ -59,5 +54,4 @@ def generate_perceptual_quality_evaluation_results(): 10 indicates the image has no artifacts. ) Put the score in a list such that output score = [naturalness, artifacts] - """ - ) + """) diff --git a/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py b/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py index c63ff5023..07e58bdad 100644 --- a/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py +++ b/deepeval/metrics/multimodal_metrics/image_helpfulness/template.py @@ -5,8 +5,7 @@ class ImageHelpfulnessTemplate: @staticmethod def evaluate_image_helpfulness(context_above, context_below): - return textwrap.dedent( - f""" + return textwrap.dedent(f""" # Task Description You are a multi-modal document evaluation assistant. You will receive an image and its textual context. Your task is to evaluate the helpfulness of the image in enabling human readers to comprehend the text (context above and below) it accompanies. 
@@ -40,5 +39,4 @@ def evaluate_image_helpfulness(context_above, context_below): # Image [Insert Image Here] - """ - ) + """) diff --git a/deepeval/metrics/multimodal_metrics/image_reference/template.py b/deepeval/metrics/multimodal_metrics/image_reference/template.py index 3be3cb18f..183889d13 100644 --- a/deepeval/metrics/multimodal_metrics/image_reference/template.py +++ b/deepeval/metrics/multimodal_metrics/image_reference/template.py @@ -5,8 +5,7 @@ class ImageReferenceTemplate: @staticmethod def evaluate_image_reference(context_above, context_below): - return textwrap.dedent( - f""" + return textwrap.dedent(f""" # Task Description You are a multi-modal document quality assessment assistant. You will receive an image and its accompanying textual context. Your task is to determine whether the image is explicitly referenced or explained within the surrounding text (both above and below the image). @@ -39,5 +38,4 @@ def evaluate_image_reference(context_above, context_below): # Image [Insert Image Here] - """ - ) + """) diff --git a/deepeval/metrics/multimodal_metrics/text_to_image/template.py b/deepeval/metrics/multimodal_metrics/text_to_image/template.py index 8488469f7..d2cfe3fe9 100644 --- a/deepeval/metrics/multimodal_metrics/text_to_image/template.py +++ b/deepeval/metrics/multimodal_metrics/text_to_image/template.py @@ -3,8 +3,7 @@ class TextToImageTemplate: - context = textwrap.dedent( - """ + context = textwrap.dedent(""" You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules. All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials. @@ -13,13 +12,11 @@ class TextToImageTemplate: "score" : [...], "reasoning" : "..." 
} - """ - ) + """) @staticmethod def generate_semantic_consistency_evaluation_results(text_prompt: str): - return textwrap.dedent( - f""" + return textwrap.dedent(f""" {TextToImageTemplate.context} RULES: @@ -34,13 +31,11 @@ def generate_semantic_consistency_evaluation_results(text_prompt: str): Put the score in a list such that output score = [score]. Text Prompt: {text_prompt} - """ - ) + """) @staticmethod def generate_perceptual_quality_evaluation_results(): - return textwrap.dedent( - f""" + return textwrap.dedent(f""" {TextToImageTemplate.context} RULES: @@ -60,5 +55,4 @@ def generate_perceptual_quality_evaluation_results(): 10 indicates the image has no artifacts. ) Put the score in a list such that output score = [naturalness, artifacts] - """ - ) + """) diff --git a/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py b/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py index c7eb60db2..4bd62085c 100644 --- a/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py +++ b/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py @@ -286,15 +286,13 @@ def is_successful(self) -> bool: return self.success def _generate_reason(self) -> str: - return textwrap.dedent( - f""" + return textwrap.dedent(f""" The overall score is {self.score:.2f} because the lowest score from semantic consistency was {min(self.SC_scores)} and the lowest score from perceptual quality was {min(self.PQ_scores)}. These scores were combined to reflect the overall effectiveness and quality of the AI-generated image(s). 
Reason for Semantic Consistency score: {self.SC_reasoning} Reason for Perceptual Quality score: {self.PQ_reasoning} - """ - ) + """) @property def __name__(self): diff --git a/deepeval/metrics/topic_adherence/template.py b/deepeval/metrics/topic_adherence/template.py index 0e0a17e6e..5131cc3bd 100644 --- a/deepeval/metrics/topic_adherence/template.py +++ b/deepeval/metrics/topic_adherence/template.py @@ -79,8 +79,7 @@ def get_qa_pair_verdict( question: str, response: str, ) -> str: - return textwrap.dedent( - f"""You are given: + return textwrap.dedent(f"""You are given: - A list of **relevant topics** - A **user question** - An **assistant response** @@ -138,8 +137,7 @@ def get_qa_pair_verdict( Response: {response} JSON: - """ - ) + """) @staticmethod def generate_reason(success, score, threshold, TP, TN, FP, FN) -> str: diff --git a/deepeval/metrics/turn_contextual_relevancy/template.py b/deepeval/metrics/turn_contextual_relevancy/template.py index 4b984e652..f1e95bd12 100644 --- a/deepeval/metrics/turn_contextual_relevancy/template.py +++ b/deepeval/metrics/turn_contextual_relevancy/template.py @@ -67,12 +67,10 @@ def generate_verdicts( # Conditional instructions based on mode extraction_instructions = "" if multimodal: - extraction_instructions = textwrap.dedent( - """ + extraction_instructions = textwrap.dedent(""" If the context is textual, you should first extract the statements found in the context if the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement. If the context is an image, `statement` should be a description of the image. Do not assume any information not visibly available. - """ - ).strip() + """).strip() else: extraction_instructions = "You should first extract statements found in the context, which are high level information found in the context, before deciding on a verdict and optionally a reason for each statement." 
diff --git a/deepeval/metrics/turn_faithfulness/template.py b/deepeval/metrics/turn_faithfulness/template.py index ee7f047e3..fdd597e44 100644 --- a/deepeval/metrics/turn_faithfulness/template.py +++ b/deepeval/metrics/turn_faithfulness/template.py @@ -17,8 +17,7 @@ class TurnFaithfulnessTemplate: def generate_claims( input: str, assistant_output: str, multimodal: bool = False ): - return textwrap.dedent( - f""" + return textwrap.dedent(f""" Extract every factual-sounding claim asserted in the ASSISTANT'S OUTPUT. A claim is any statement presented as fact, even if it is incorrect, vague, implied, or unverifiable. @@ -46,8 +45,7 @@ def generate_claims( {assistant_output} JSON: - """ - ) + """) @staticmethod def generate_truths( @@ -62,8 +60,7 @@ def generate_truths( else: limit_description = f"{extraction_limit} factual, explicit truths" - return textwrap.dedent( - f""" + return textwrap.dedent(f""" Extract {limit_description} from the REFERENCE CONTEXT. RULES: @@ -86,15 +83,13 @@ def generate_truths( {reference_context} JSON: - """ - ) + """) @staticmethod def generate_verdicts( claims: List[str], reference_context: str, multimodal: bool = False ): - return textwrap.dedent( - f""" + return textwrap.dedent(f""" For each claim, determine whether it is supported, contradicted, or not addressed by the reference context. DEFINITIONS: @@ -138,15 +133,13 @@ def generate_verdicts( {claims} JSON: - """ - ) + """) @staticmethod def generate_reason( score: float, contradictions: List[str], multimodal: bool = False ): - return textwrap.dedent( - f""" + return textwrap.dedent(f""" Below is a list of contradictions extracted from verdicts. Write a concise justification of the score. 
RULES: @@ -174,8 +167,7 @@ def generate_reason( {contradictions} JSON: - """ - ) + """) @staticmethod def generate_final_reason( diff --git a/deepeval/scorer/scorer.py b/deepeval/scorer/scorer.py index 17d037a84..bb74997e7 100644 --- a/deepeval/scorer/scorer.py +++ b/deepeval/scorer/scorer.py @@ -444,8 +444,7 @@ def squad_score( evaluation_model: DeepEvalBaseLLM, using_native_evaluation_model: bool, ): - prompt = textwrap.dedent( - f""" + prompt = textwrap.dedent(f""" Given the question and context, evaluate if the prediction is correct based on the expected output. Ensure to account for cases where the prediction and expected output might differ in form, such as '2' versus 'two'. @@ -459,8 +458,7 @@ def squad_score( {{ "answer": }} - """ - ) + """) # Generate the score using the model if using_native_evaluation_model: diff --git a/deepeval/simulator/controller/template.py b/deepeval/simulator/controller/template.py index 276045ecf..2539401ad 100644 --- a/deepeval/simulator/controller/template.py +++ b/deepeval/simulator/controller/template.py @@ -6,8 +6,7 @@ class SimulatorControllerTemplate: def check_expected_outcome( previous_conversation: str, expected_outcome: str ) -> str: - prompt = textwrap.dedent( - f"""You are a Conversation Completion Checker. + prompt = textwrap.dedent(f"""You are a Conversation Completion Checker. Your task is to determine whether the conversation has achieved the expected outcome and should be terminated. Guidelines: @@ -34,6 +33,5 @@ def check_expected_outcome( Conversation History: {previous_conversation} JSON Output: - """ - ) + """) return prompt diff --git a/deepeval/simulator/template.py b/deepeval/simulator/template.py index d8a760eac..a0a172e74 100644 --- a/deepeval/simulator/template.py +++ b/deepeval/simulator/template.py @@ -61,8 +61,7 @@ def simulate_user_turn( indent=4, ensure_ascii=False, ) - prompt = textwrap.dedent( - f""" + prompt = textwrap.dedent(f""" Pretend you are a user of an LLM app. 
Your task is to generate the next user input in {language} based on the provided scenario, user profile, and the previous conversation. @@ -97,6 +96,5 @@ def simulate_user_turn( {previous_conversation} JSON Output: - """ - ) + """) return prompt diff --git a/deepeval/synthesizer/templates/template.py b/deepeval/synthesizer/templates/template.py index d9ae93307..7d2294b65 100644 --- a/deepeval/synthesizer/templates/template.py +++ b/deepeval/synthesizer/templates/template.py @@ -721,9 +721,7 @@ class EvolutionTemplate: @staticmethod def multi_context_evolution(input, context): - return ( - EvolutionTemplate.base_instruction - + f""" + return EvolutionTemplate.base_instruction + f""" 1. `Input` should be rewritten to require readers to use information from all elements of `Context`. 2. `Rewritten Input` must be fully answerable from information in `Context`. 3. `Rewritten Input` should be concise and understandable by humans. @@ -765,13 +763,10 @@ def multi_context_evolution(input, context): {input} Rewritten Input: """ - ) @staticmethod def reasoning_evolution(input, context): - return ( - EvolutionTemplate.base_instruction - + f""" + return EvolutionTemplate.base_instruction + f""" 1. If `Input` can be solved with just a few simple thinking processes, you can rewrite it to explicitly request multiple-step reasoning. 2. `Rewritten Input` should require readers to make multiple logical connections or inferences. 3. `Rewritten Input` should be concise and understandable by humans. @@ -814,13 +809,10 @@ def reasoning_evolution(input, context): {input} Rewritten Input: """ - ) @staticmethod def concretizing_evolution(input, context): - return ( - EvolutionTemplate.base_instruction - + f""" + return EvolutionTemplate.base_instruction + f""" 1. Rewrite `Input` by replacing general concepts/inquiries with more specific ones. 2. `Rewritten Input` should be concise and understandable by humans. 3. 
`Rewritten Input` should not contain phrases like 'based on the provided context' or 'according to the context'. @@ -864,13 +856,10 @@ def concretizing_evolution(input, context): {context} Rewritten Input: """ - ) @staticmethod def constrained_evolution(input, context): - return ( - EvolutionTemplate.base_instruction - + f""" + return EvolutionTemplate.base_instruction + f""" 1. Rewrite `Input` by adding at least one more constraints/requirements. 2. `Rewritten Input` must be fully answerable from information in `Context`. 5. `Rewritten Input` should not contain more than 15 words. Use abbreviation wherever possible. @@ -912,13 +901,10 @@ def constrained_evolution(input, context): {input} Rewritten Input: """ - ) @staticmethod def comparative_question_evolution(input, context): - return ( - EvolutionTemplate.base_instruction - + f""" + return EvolutionTemplate.base_instruction + f""" 1. Rewrite `Input` to focus on comparing two or more entities, concepts, or processes. 2. `Rewritten Input` should encourage a detailed comparison that highlights similarities and differences. 3. `Rewritten Input` must be fully answerable from information in `Context`. @@ -961,13 +947,10 @@ def comparative_question_evolution(input, context): {input} Rewritten Input: """ - ) @staticmethod def hypothetical_scenario_evolution(input, context): - return ( - EvolutionTemplate.base_instruction - + f""" + return EvolutionTemplate.base_instruction + f""" 1. Rewrite `Input` to include a hypothetical or speculative scenario that is relevant to the `Context`. 2. `Rewritten Input` should encourage the reader to apply knowledge from the `Context` to imagine or deduce outcomes. 3. `Rewritten Input` should be concise, clear, and understandable by humans. 
@@ -1010,13 +993,10 @@ def hypothetical_scenario_evolution(input, context): {input} Rewritten Input: """ - ) @staticmethod def in_breadth_evolution(input, context): - return ( - EvolutionTemplate.base_instruction - + f""" + return EvolutionTemplate.base_instruction + f""" 1. Rewrite `Input` to create a brand new prompt. 2. `Rewritten Input` should belong to the same domain as the `input` but be even more rare. 3. `Rewritten Input` should be concise, clear, and understandable by humans. @@ -1058,7 +1038,6 @@ def in_breadth_evolution(input, context): {input} Rewritten Input: """ - ) class ConversationalEvolutionTemplate: @@ -1069,9 +1048,7 @@ class ConversationalEvolutionTemplate: @staticmethod def multi_context_evolution(scenario, context): - return ( - ConversationalEvolutionTemplate.base_instruction - + f""" + return ConversationalEvolutionTemplate.base_instruction + f""" 1. `Scenario` must be rewritten so participants must naturally rely on **all elements of `Context`** during the conversation. 2. `Rewritten Scenario` MUST remain a realistic multi-turn conversation setup. 3. Keep the rewritten scenario under **60 words**. @@ -1108,13 +1085,10 @@ def multi_context_evolution(scenario, context): {scenario} Rewritten Scenario: """ - ) @staticmethod def reasoning_evolution(scenario, context): - return ( - ConversationalEvolutionTemplate.base_instruction - + f""" + return ConversationalEvolutionTemplate.base_instruction + f""" 1. Rewrite `Scenario` so the resulting conversation requires multi-step reasoning between participants. 2. Add layered inferential or analytical demands grounded in `Context`. 3. Keep the rewritten scenario under **60 words**. @@ -1148,13 +1122,10 @@ def reasoning_evolution(scenario, context): {scenario} Rewritten Scenario: """ - ) @staticmethod def concretizing_evolution(scenario, context): - return ( - ConversationalEvolutionTemplate.base_instruction - + f""" + return ConversationalEvolutionTemplate.base_instruction + f""" 1. 
Rewrite `Scenario` by replacing general conversational settings with **highly specific**, concrete circumstances tied to `Context`. 2. Add situational cues, named events, or explicit constraints. 3. Keep the rewritten scenario under **60 words**. @@ -1187,13 +1158,10 @@ def concretizing_evolution(scenario, context): {scenario} Rewritten Scenario: """ - ) @staticmethod def constrained_evolution(scenario, context): - return ( - ConversationalEvolutionTemplate.base_instruction - + f""" + return ConversationalEvolutionTemplate.base_instruction + f""" 1. Rewrite `Scenario` by adding at least **one new constraint** that shapes how the conversation unfolds. 2. The constraint must logically follow from `Context`. 3. Keep the rewritten scenario under **60 words**. @@ -1226,13 +1194,10 @@ def constrained_evolution(scenario, context): {scenario} Rewritten Scenario: """ - ) @staticmethod def comparative_question_evolution(scenario, context): - return ( - ConversationalEvolutionTemplate.base_instruction - + f""" + return ConversationalEvolutionTemplate.base_instruction + f""" 1. Rewrite `Scenario` so the conversation naturally compares two or more concepts, tools, or approaches. 2. The comparison must be central to the multi-turn dialogue. 3. Keep the rewritten scenario under **60 words**. @@ -1264,13 +1229,10 @@ def comparative_question_evolution(scenario, context): {scenario} Rewritten Scenario: """ - ) @staticmethod def hypothetical_scenario_evolution(scenario, context): - return ( - ConversationalEvolutionTemplate.base_instruction - + f""" + return ConversationalEvolutionTemplate.base_instruction + f""" 1. Rewrite `Scenario` by adding a hypothetical twist grounded in `Context`. 2. The speculative change MUST drive the conversation. 3. Must remain realistic and multi-turn. 
@@ -1303,13 +1265,10 @@ def hypothetical_scenario_evolution(scenario, context): {scenario} Rewritten Scenario: """ - ) @staticmethod def in_breadth_evolution(scenario, context): - return ( - ConversationalEvolutionTemplate.base_instruction - + f""" + return ConversationalEvolutionTemplate.base_instruction + f""" 1. Rewrite `Scenario` into a brand-new conversational setup. 2. It must remain in the **same domain** but shift toward a **rarer or niche** topic. 3. Must remain a realistic multi-turn dialogue setup. @@ -1342,4 +1301,3 @@ def in_breadth_evolution(scenario, context): {scenario} Rewritten Scenario: """ - ) diff --git a/deepeval/synthesizer/templates/template_prompt.py b/deepeval/synthesizer/templates/template_prompt.py index de5970e9d..aab76539a 100644 --- a/deepeval/synthesizer/templates/template_prompt.py +++ b/deepeval/synthesizer/templates/template_prompt.py @@ -116,9 +116,7 @@ class PromptEvolutionTemplate: @staticmethod def reasoning_evolution(input): - return ( - PromptEvolutionTemplate.base_instruction - + f""" + return PromptEvolutionTemplate.base_instruction + f""" 1. If `Input` can be solved with just a few simple thinking processes, you can rewrite it to explicitly request multiple-step reasoning. 2. `Rewritten Input` should require readers to make multiple logical connections or inferences. 3. `Rewritten Input` should be concise and understandable by humans. @@ -151,13 +149,10 @@ def reasoning_evolution(input): {input} Rewritten Input: """ - ) @staticmethod def concretizing_evolution(input): - return ( - PromptEvolutionTemplate.base_instruction - + f""" + return PromptEvolutionTemplate.base_instruction + f""" 1. Rewrite `Input` by replacing general concepts/inquiries with more specific ones. 2. `Rewritten Input` should be concise and understandable by humans. 3. `Rewritten Input` should not contain more than 15 words. Use abbreviation wherever possible. 
@@ -189,13 +184,10 @@ def concretizing_evolution(input): {input} Rewritten Input: """ - ) @staticmethod def constrained_evolution(input): - return ( - PromptEvolutionTemplate.base_instruction - + f""" + return PromptEvolutionTemplate.base_instruction + f""" 1. Rewrite `Input` by adding at least one more constraints/requirements. 2. `Rewritten Input` should not contain more than 15 words. Use abbreviation wherever possible. @@ -226,13 +218,10 @@ def constrained_evolution(input): {input} Rewritten Input: """ - ) @staticmethod def comparative_question_evolution(input): - return ( - PromptEvolutionTemplate.base_instruction - + f""" + return PromptEvolutionTemplate.base_instruction + f""" 1. Rewrite `Input` to focus on comparing two or more entities, concepts, or processes. 2. `Rewritten Input` should encourage a detailed comparison that highlights similarities and differences. 3. `Rewritten Input` should be concise and understandable by humans. @@ -266,13 +255,10 @@ def comparative_question_evolution(input): {input} Rewritten Input: """ - ) @staticmethod def hypothetical_scenario_evolution(input): - return ( - PromptEvolutionTemplate.base_instruction - + f""" + return PromptEvolutionTemplate.base_instruction + f""" 1. Rewrite `Input` to include a hypothetical or speculative scenario. 2. `Rewritten Input` should encourage the reader to apply knowledge to imagine or deduce outcomes. 3. `Rewritten Input` should be concise, clear, and understandable by humans. @@ -305,13 +291,10 @@ def hypothetical_scenario_evolution(input): {input} Rewritten Input: """ - ) @staticmethod def in_breadth_evolution(input): - return ( - PromptEvolutionTemplate.base_instruction - + f""" + return PromptEvolutionTemplate.base_instruction + f""" 1. Rewrite `Input` to create a create a brand new prompt. 2. `Rewritten Input` should belong to the same domain as the `input` but be even more rare. 3. `Rewritten Input` should be concise, clear, and understandable by humans. 
@@ -344,7 +327,6 @@ def in_breadth_evolution(input): {input} Rewritten Input: """ - ) class ConversationalPromptEvolutionTemplate: @@ -354,9 +336,7 @@ class ConversationalPromptEvolutionTemplate: @staticmethod def reasoning_evolution(scenario): - return ( - ConversationalPromptEvolutionTemplate.base_instruction - + f""" + return ConversationalPromptEvolutionTemplate.base_instruction + f""" 1. Rewrite `Scenario` to force participants into multi-step conversational reasoning. 2. Add layered inferences or analytical leaps required in dialogue. 3. `Rewritten Scenario` must stay concise, human-readable, and remain a conversation setup. @@ -383,13 +363,10 @@ def reasoning_evolution(scenario): {scenario} Rewritten Scenario: """ - ) @staticmethod def concretizing_evolution(scenario): - return ( - ConversationalPromptEvolutionTemplate.base_instruction - + f""" + return ConversationalPromptEvolutionTemplate.base_instruction + f""" 1. Replace broad conversation setup with a **more specific, concrete** conversational scene. 2. Add real-world detail (location, constraint, specific topic). 3. Keep under **15 words**, concise, and still a dialogue setup. @@ -415,13 +392,10 @@ def concretizing_evolution(scenario): {scenario} Rewritten Scenario: """ - ) @staticmethod def constrained_evolution(scenario): - return ( - ConversationalPromptEvolutionTemplate.base_instruction - + f""" + return ConversationalPromptEvolutionTemplate.base_instruction + f""" 1. Add at least one new constraint shaping the conversation. 2. Constraint must significantly affect the dialogue. 3. Keep under **15 words**, concise, conversational. @@ -447,13 +421,10 @@ def constrained_evolution(scenario): {scenario} Rewritten Scenario: """ - ) @staticmethod def comparative_question_evolution(scenario): - return ( - ConversationalPromptEvolutionTemplate.base_instruction - + f""" + return ConversationalPromptEvolutionTemplate.base_instruction + f""" 1. 
Rewrite `Scenario` so the conversation centers on comparing two+ items. 2. Must highlight similarities/differences through dialogue. 3. Keep under **15 words**, concise, conversational. @@ -479,13 +450,10 @@ def comparative_question_evolution(scenario): {scenario} Rewritten Scenario: """ - ) @staticmethod def hypothetical_scenario_evolution(scenario): - return ( - ConversationalPromptEvolutionTemplate.base_instruction - + f""" + return ConversationalPromptEvolutionTemplate.base_instruction + f""" 1. Rewrite `Scenario` to introduce a hypothetical twist derived from the setup. 2. The hypothetical MUST drive the conversation. 3. Keep under **15 words**, concise, conversational. @@ -511,13 +479,10 @@ def hypothetical_scenario_evolution(scenario): {scenario} Rewritten Scenario: """ - ) @staticmethod def in_breadth_evolution(scenario): - return ( - ConversationalPromptEvolutionTemplate.base_instruction - + f""" + return ConversationalPromptEvolutionTemplate.base_instruction + f""" 1. Rewrite `Scenario` into a new conversation within the same domain. 2. The new conversation must explore a rarer, niche angle. 3. Keep under **15 words**, concise, conversational. @@ -543,4 +508,3 @@ def in_breadth_evolution(scenario): {scenario} Rewritten Scenario: """ - ) diff --git a/docs/content/docs/(custom)/metrics-llm-evals.mdx b/docs/content/docs/(custom)/metrics-llm-evals.mdx index 922557a09..f19541d5b 100644 --- a/docs/content/docs/(custom)/metrics-llm-evals.mdx +++ b/docs/content/docs/(custom)/metrics-llm-evals.mdx @@ -47,7 +47,7 @@ correctness_metric = GEval( ) ``` -There are **THREE** mandatory and **SEVEN** optional parameters required when instantiating an `GEval` class: +There are **THREE** mandatory and several optional parameters when instantiating a `GEval` class: - `name`: name of custom metric. - `criteria`: a description outlining the specific evaluation aspects for each test case.
@@ -59,6 +59,7 @@ There are **THREE** mandatory and **SEVEN** optional parameters required when in - [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`. - [Optional] `async_mode`: a boolean which when set to `True`, enables [concurrent execution within the `measure()` method.](/docs/metrics-introduction#measuring-metrics-in-async) Defaulted to `True`. - [Optional] `verbose_mode`: a boolean which when set to `True`, prints the intermediate steps used to calculate said metric to the console, as outlined in the [How Is It Calculated](#how-is-it-calculated) section. Defaulted to `False`. +- [Optional] `max_retrieval_context_tokens`: an integer token budget for `retrieval_context` when it is included in `evaluation_params`. When set, large retrieved chunks are compacted before they are added to the G-Eval prompt. Defaulted to `None`. - [Optional] `evaluation_template`: a class of type `GEvalTemplate`, which allows you to [override the default prompts](#customize-your-template) used to compute the `GEval` score. Defaulted to `deepeval`'s `GEvalTemplate`. :::danger @@ -118,6 +119,70 @@ correctness_metric = GEval( ) ``` +### Large Retrieval Contexts + +When you use `SingleTurnParams.RETRIEVAL_CONTEXT` in a custom G-Eval metric, the retrieved chunks become part of the judge prompt. If your retriever returns very large documents, this can push the prompt toward the model context limit and make the judge focus on irrelevant middle sections. 
+ +Use `max_retrieval_context_tokens` to keep the retrieval evidence bounded: + +```python +from deepeval.metrics import GEval +from deepeval.test_case import SingleTurnParams + +faithfulness = GEval( + name="RAG Faithfulness", + criteria="Check whether the actual output is supported by the retrieval context.", + evaluation_params=[ + SingleTurnParams.ACTUAL_OUTPUT, + SingleTurnParams.RETRIEVAL_CONTEXT, + ], + max_retrieval_context_tokens=4000, +) +``` + +When the retrieval context is over budget, `GEval` keeps the chunks with the strongest lexical overlap against the selected input, actual output, and expected output fields. It then preserves the start and end of each visible chunk, includes source labels for `RetrievedContextData`, and inserts an omission marker so the judge knows the context was compacted. + +You can also inspect the bounded prompt before running an evaluation. This is useful in CI or when tuning a RAG evaluator because it does not call the evaluation model: + +```python +from deepeval.metrics import GEval +from deepeval.test_case import LLMTestCase, SingleTurnParams + +metric = GEval( + name="RAG Faithfulness", + evaluation_steps=[ + "Check whether the actual output is supported by retrieval context.", + "Penalize unsupported claims.", + ], + evaluation_params=[ + SingleTurnParams.ACTUAL_OUTPUT, + SingleTurnParams.RETRIEVAL_CONTEXT, + ], + max_retrieval_context_tokens=4000, +) + +test_case = LLMTestCase( + input="Can customers get a refund?", + actual_output="Customers can request a refund within 30 days.", + retrieval_context=[...], +) + +report = metric.get_retrieval_context_budget_report(test_case) +coverage = metric.get_retrieval_context_evidence_coverage(test_case) +prompt = metric.preview_evaluation_prompt(test_case) + +print(report.original_tokens) +print(report.rendered_tokens) +print(report.compression_ratio) +print(coverage.coverage_ratio) +print(coverage.missing_terms) +print(prompt) +``` + +`get_retrieval_context_budget_report()` 
returns the estimated original tokens, rendered tokens, compression ratio, visible chunks, omitted chunks, per-chunk relevance scores, per-chunk source metadata, and an evidence coverage summary. The coverage summary compares terms from the selected input, actual output, and expected output fields against the rendered retrieval context, so you can catch cases where compaction removed evidence the judge still needs. + +You can call `get_retrieval_context_evidence_coverage()` directly when you only need the coverage ratio, covered terms, missing terms, and warning message. `preview_evaluation_prompt()` requires explicit `evaluation_steps` so it never triggers a hidden LLM call to generate steps. + ### Rubric You can provide a list of `Rubric`s through the `rubric` argument to confine your evaluation LLM to output in specific score ranges: @@ -495,4 +560,3 @@ metric.measure(...) }, ]} /> - diff --git a/manual_after_evals_iterator.py b/manual_after_evals_iterator.py index b3188b3de..23c053768 100644 --- a/manual_after_evals_iterator.py +++ b/manual_after_evals_iterator.py @@ -19,7 +19,6 @@ ) from deepeval.tracing.context import next_agent_span - RUN_ID = f"{Path(__file__).stem}-{uuid.uuid4().hex[:8]}" diff --git a/scripts/check_openai_model_capabilities.py b/scripts/check_openai_model_capabilities.py index a2bee11d8..6d76c777a 100644 --- a/scripts/check_openai_model_capabilities.py +++ b/scripts/check_openai_model_capabilities.py @@ -19,7 +19,6 @@ from deepeval.models.llms.constants import OPENAI_MODELS_DATA - DEFAULT_MODELS = ("gpt-5.4", "gpt-5.5") diff --git a/skills/deepeval/templates/metrics.py b/skills/deepeval/templates/metrics.py index 18acfdb33..632dda02e 100644 --- a/skills/deepeval/templates/metrics.py +++ b/skills/deepeval/templates/metrics.py @@ -5,7 +5,6 @@ TaskCompletionMetric, ) - # Keep metrics in one module so eval files stay focused on app execution. # Reuse existing project metrics and thresholds before adding new ones. 
SINGLE_TURN_TRACE_METRICS = [ diff --git a/skills/deepeval/templates/test_single_turn_no_tracing.py b/skills/deepeval/templates/test_single_turn_no_tracing.py index efbe357cb..2707d1f10 100644 --- a/skills/deepeval/templates/test_single_turn_no_tracing.py +++ b/skills/deepeval/templates/test_single_turn_no_tracing.py @@ -8,7 +8,6 @@ from metrics import SINGLE_TURN_NO_TRACING_METRICS - ai_app = import_module("ai_app") diff --git a/skills/deepeval/templates/test_single_turn_tracing.py b/skills/deepeval/templates/test_single_turn_tracing.py index d6002a386..fc07cad8c 100644 --- a/skills/deepeval/templates/test_single_turn_tracing.py +++ b/skills/deepeval/templates/test_single_turn_tracing.py @@ -7,7 +7,6 @@ from metrics import SINGLE_TURN_TRACE_METRICS - ai_app = import_module("ai_app") diff --git a/test_agentcore_agent.py b/test_agentcore_agent.py index 3c022e253..fd7cb7a82 100644 --- a/test_agentcore_agent.py +++ b/test_agentcore_agent.py @@ -34,7 +34,6 @@ from deepeval.metrics import AnswerRelevancyMetric from deepeval.tracing.context import next_agent_span - RUN_ID = f"{Path(__file__).stem}-{uuid.uuid4().hex[:8]}" diff --git a/test_pydantic_agent.py b/test_pydantic_agent.py index 46c50fbab..723ed43df 100644 --- a/test_pydantic_agent.py +++ b/test_pydantic_agent.py @@ -29,7 +29,6 @@ from deepeval.metrics import AnswerRelevancyMetric from deepeval.tracing.context import next_agent_span - RUN_ID = f"{Path(__file__).stem}-{uuid.uuid4().hex[:8]}" diff --git a/tests/test_core/test_evaluation/test_local_store.py b/tests/test_core/test_evaluation/test_local_store.py index 398b39379..649b96f0c 100644 --- a/tests/test_core/test_evaluation/test_local_store.py +++ b/tests/test_core/test_evaluation/test_local_store.py @@ -19,7 +19,6 @@ TestRunManager as _TestRunManager, ) - FILENAME_RE = re.compile(r"^test_run_\d{8}_\d{6}(?:_(\d+))?\.json$") diff --git a/tests/test_core/test_simulator/test_conversation_simulator_decision_graph.py 
b/tests/test_core/test_simulator/test_conversation_simulator_decision_graph.py index 03ed37c4d..f288b08d1 100644 --- a/tests/test_core/test_simulator/test_conversation_simulator_decision_graph.py +++ b/tests/test_core/test_simulator/test_conversation_simulator_decision_graph.py @@ -15,7 +15,6 @@ static_callback, ) - # --------------------------------------------------------------------------- # Builder / validation tests # --------------------------------------------------------------------------- diff --git a/tests/test_integrations/test_agentcore/test_evaluate_agent.py b/tests/test_integrations/test_agentcore/test_evaluate_agent.py index c4399a718..eb68f5ab8 100644 --- a/tests/test_integrations/test_agentcore/test_evaluate_agent.py +++ b/tests/test_integrations/test_agentcore/test_evaluate_agent.py @@ -35,7 +35,6 @@ init_evals_agentcore, ) - pytestmark = pytest.mark.skipif( not os.getenv("AWS_ACCESS_KEY_ID") or not os.getenv("OPENAI_API_KEY"), reason=( diff --git a/tests/test_integrations/test_agentcore/test_span_interceptor.py b/tests/test_integrations/test_agentcore/test_span_interceptor.py index 586205ba3..a6b68eea5 100644 --- a/tests/test_integrations/test_agentcore/test_span_interceptor.py +++ b/tests/test_integrations/test_agentcore/test_span_interceptor.py @@ -48,7 +48,6 @@ ) from deepeval.tracing.trace_context import trace - _span_id_counter = count(start=1) _trace_id_counter = count(start=1) diff --git a/tests/test_integrations/test_googleadk/apps/googleadk_eval_app.py b/tests/test_integrations/test_googleadk/apps/googleadk_eval_app.py index 2f1b5c221..a81463016 100644 --- a/tests/test_integrations/test_googleadk/apps/googleadk_eval_app.py +++ b/tests/test_integrations/test_googleadk/apps/googleadk_eval_app.py @@ -27,7 +27,6 @@ from deepeval.integrations.google_adk import instrument_google_adk from deepeval.tracing import update_current_span - _APP_NAME = "deepeval-googleadk-evals" diff --git 
a/tests/test_integrations/test_googleadk/apps/googleadk_multiple_tools_app.py b/tests/test_integrations/test_googleadk/apps/googleadk_multiple_tools_app.py index 01a1b328e..b5066e7f5 100644 --- a/tests/test_integrations/test_googleadk/apps/googleadk_multiple_tools_app.py +++ b/tests/test_integrations/test_googleadk/apps/googleadk_multiple_tools_app.py @@ -16,7 +16,6 @@ from deepeval.integrations.google_adk import instrument_google_adk - _APP_NAME = "deepeval-googleadk-multiple-tools" diff --git a/tests/test_integrations/test_googleadk/apps/googleadk_simple_app.py b/tests/test_integrations/test_googleadk/apps/googleadk_simple_app.py index 09fec5505..6b8d649fa 100644 --- a/tests/test_integrations/test_googleadk/apps/googleadk_simple_app.py +++ b/tests/test_integrations/test_googleadk/apps/googleadk_simple_app.py @@ -15,7 +15,6 @@ from deepeval.integrations.google_adk import instrument_google_adk - _APP_NAME = "deepeval-googleadk-simple" diff --git a/tests/test_integrations/test_googleadk/apps/googleadk_tool_app.py b/tests/test_integrations/test_googleadk/apps/googleadk_tool_app.py index aced7b8ad..f5355ebec 100644 --- a/tests/test_integrations/test_googleadk/apps/googleadk_tool_app.py +++ b/tests/test_integrations/test_googleadk/apps/googleadk_tool_app.py @@ -15,7 +15,6 @@ from deepeval.integrations.google_adk import instrument_google_adk - _APP_NAME = "deepeval-googleadk-tool" diff --git a/tests/test_integrations/test_googleadk/conftest.py b/tests/test_integrations/test_googleadk/conftest.py index e86518606..943497657 100644 --- a/tests/test_integrations/test_googleadk/conftest.py +++ b/tests/test_integrations/test_googleadk/conftest.py @@ -29,7 +29,6 @@ is_generate_mode, ) - _current_dir = os.path.dirname(os.path.abspath(__file__)) _schemas_dir = os.path.join(_current_dir, "schemas") diff --git a/tests/test_integrations/test_googleadk/test_async.py b/tests/test_integrations/test_googleadk/test_async.py index e32e8a425..046f623c4 100644 --- 
a/tests/test_integrations/test_googleadk/test_async.py +++ b/tests/test_integrations/test_googleadk/test_async.py @@ -37,7 +37,6 @@ ) from tests.test_integrations.test_googleadk.conftest import trace_test - pytestmark = pytest.mark.skipif( not os.getenv("GOOGLE_API_KEY"), reason="GOOGLE_API_KEY is required to run Google ADK tests against Gemini.", diff --git a/tests/test_integrations/test_googleadk/test_evaluate_agent.py b/tests/test_integrations/test_googleadk/test_evaluate_agent.py index 29471ab95..88e975ee9 100644 --- a/tests/test_integrations/test_googleadk/test_evaluate_agent.py +++ b/tests/test_integrations/test_googleadk/test_evaluate_agent.py @@ -35,7 +35,6 @@ init_evals_googleadk, ) - pytestmark = pytest.mark.skipif( not os.getenv("GOOGLE_API_KEY") or not os.getenv("OPENAI_API_KEY"), reason=( diff --git a/tests/test_integrations/test_googleadk/test_span_interceptor.py b/tests/test_integrations/test_googleadk/test_span_interceptor.py index 00440b230..79846a5e6 100644 --- a/tests/test_integrations/test_googleadk/test_span_interceptor.py +++ b/tests/test_integrations/test_googleadk/test_span_interceptor.py @@ -66,7 +66,6 @@ ) from deepeval.tracing.trace_context import trace - _span_id_counter = count(start=1) _trace_id_counter = count(start=1) diff --git a/tests/test_integrations/test_googleadk/test_sync.py b/tests/test_integrations/test_googleadk/test_sync.py index f27184227..85d6faee0 100644 --- a/tests/test_integrations/test_googleadk/test_sync.py +++ b/tests/test_integrations/test_googleadk/test_sync.py @@ -40,7 +40,6 @@ ) from tests.test_integrations.test_googleadk.conftest import trace_test - pytestmark = pytest.mark.skipif( not os.getenv("GOOGLE_API_KEY"), reason="GOOGLE_API_KEY is required to run Google ADK tests against Gemini.", diff --git a/tests/test_integrations/test_langchain/test_next_span.py b/tests/test_integrations/test_langchain/test_next_span.py index c0a16d9b5..4ed41fa94 100644 --- 
a/tests/test_integrations/test_langchain/test_next_span.py +++ b/tests/test_integrations/test_langchain/test_next_span.py @@ -41,7 +41,6 @@ ) from deepeval.tracing.types import LlmSpan, RetrieverSpan, ToolSpan - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- diff --git a/tests/test_integrations/test_langgraph/test_next_span.py b/tests/test_integrations/test_langgraph/test_next_span.py index b8240069b..d095b932e 100644 --- a/tests/test_integrations/test_langgraph/test_next_span.py +++ b/tests/test_integrations/test_langgraph/test_next_span.py @@ -30,7 +30,6 @@ ) from deepeval.tracing.types import LlmSpan, ToolSpan - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- diff --git a/tests/test_integrations/test_pydanticai/apps/pydanticai_isolation_app.py b/tests/test_integrations/test_pydanticai/apps/pydanticai_isolation_app.py index ce404e87f..018b94fbf 100644 --- a/tests/test_integrations/test_pydanticai/apps/pydanticai_isolation_app.py +++ b/tests/test_integrations/test_pydanticai/apps/pydanticai_isolation_app.py @@ -32,7 +32,6 @@ from deepeval.integrations.pydantic_ai import DeepEvalInstrumentationSettings from deepeval.tracing import update_current_span, update_current_trace - # Per-request ContextVar carrying request data the tool body reads back. # In each task / worker thread we ``set`` this BEFORE calling agent.run; # inside the tool we ``get`` it. 
The whole point is to confirm the value diff --git a/tests/test_integrations/test_pydanticai/test_span_interceptor.py b/tests/test_integrations/test_pydanticai/test_span_interceptor.py index 3b34e6c93..51c97db8a 100644 --- a/tests/test_integrations/test_pydanticai/test_span_interceptor.py +++ b/tests/test_integrations/test_pydanticai/test_span_interceptor.py @@ -44,7 +44,6 @@ ) from deepeval.tracing.trace_context import trace - _span_id_counter = count(start=1) _trace_id_counter = count(start=1) diff --git a/tests/test_integrations/test_strands/apps/strands_eval_app.py b/tests/test_integrations/test_strands/apps/strands_eval_app.py index 0de1e7f03..ecc71f391 100644 --- a/tests/test_integrations/test_strands/apps/strands_eval_app.py +++ b/tests/test_integrations/test_strands/apps/strands_eval_app.py @@ -24,7 +24,6 @@ from deepeval.integrations.strands import instrument_strands from deepeval.tracing import update_current_span - _DEFAULT_MODEL_ID = os.environ.get("STRANDS_TEST_MODEL", "gpt-4o-mini") diff --git a/tests/test_integrations/test_strands/apps/strands_multiple_tools_app.py b/tests/test_integrations/test_strands/apps/strands_multiple_tools_app.py index c3839e51a..433f41796 100644 --- a/tests/test_integrations/test_strands/apps/strands_multiple_tools_app.py +++ b/tests/test_integrations/test_strands/apps/strands_multiple_tools_app.py @@ -5,7 +5,6 @@ from deepeval.integrations.strands import instrument_strands - _DEFAULT_MODEL_ID = os.environ.get("STRANDS_TEST_MODEL", "gpt-4o-mini") diff --git a/tests/test_integrations/test_strands/apps/strands_simple_app.py b/tests/test_integrations/test_strands/apps/strands_simple_app.py index 9383e248c..a1e94d625 100644 --- a/tests/test_integrations/test_strands/apps/strands_simple_app.py +++ b/tests/test_integrations/test_strands/apps/strands_simple_app.py @@ -5,7 +5,6 @@ from deepeval.integrations.strands import instrument_strands - _DEFAULT_MODEL_ID = os.environ.get("STRANDS_TEST_MODEL", "gpt-4o-mini") diff --git 
a/tests/test_integrations/test_strands/apps/strands_tool_app.py b/tests/test_integrations/test_strands/apps/strands_tool_app.py index b26171598..c9d0d1c0d 100644 --- a/tests/test_integrations/test_strands/apps/strands_tool_app.py +++ b/tests/test_integrations/test_strands/apps/strands_tool_app.py @@ -5,7 +5,6 @@ from deepeval.integrations.strands import instrument_strands - _DEFAULT_MODEL_ID = os.environ.get("STRANDS_TEST_MODEL", "gpt-4o-mini") diff --git a/tests/test_integrations/test_strands/test_evaluate_agent.py b/tests/test_integrations/test_strands/test_evaluate_agent.py index e66b1b5de..445ce9a4b 100644 --- a/tests/test_integrations/test_strands/test_evaluate_agent.py +++ b/tests/test_integrations/test_strands/test_evaluate_agent.py @@ -35,7 +35,6 @@ init_evals_strands, ) - pytestmark = pytest.mark.skipif( not os.getenv("OPENAI_API_KEY"), reason=( diff --git a/tests/test_integrations/test_strands/test_span_interceptor.py b/tests/test_integrations/test_strands/test_span_interceptor.py index 040229e40..a711fe005 100644 --- a/tests/test_integrations/test_strands/test_span_interceptor.py +++ b/tests/test_integrations/test_strands/test_span_interceptor.py @@ -48,7 +48,6 @@ ) from deepeval.tracing.trace_context import trace - _span_id_counter = count(start=1) _trace_id_counter = count(start=1) diff --git a/tests/test_metrics/test_dag_serialization.py b/tests/test_metrics/test_dag_serialization.py index a343db5e1..477fcdc50 100644 --- a/tests/test_metrics/test_dag_serialization.py +++ b/tests/test_metrics/test_dag_serialization.py @@ -27,7 +27,6 @@ from deepeval.metrics.dag.utils import is_valid_dag_from_roots from deepeval.test_case import SingleTurnParams, MultiTurnParams - # ---------------------------------------------------------------------------- # Single-turn structural round-trips (no LLM dependency) # ---------------------------------------------------------------------------- diff --git a/tests/test_metrics/test_g_eval_prompt_budget.py 
b/tests/test_metrics/test_g_eval_prompt_budget.py new file mode 100644 index 000000000..e1a24915e --- /dev/null +++ b/tests/test_metrics/test_g_eval_prompt_budget.py @@ -0,0 +1,128 @@ +import pytest + +from deepeval.metrics import GEval +from deepeval.models import DeepEvalBaseLLM +from deepeval.test_case import ( + LLMTestCase, + RetrievedContextData, + SingleTurnParams, +) + + +class LocalEchoModel(DeepEvalBaseLLM): + def load_model(self): + return self + + def generate(self, *args, **kwargs) -> str: + return '{"score": 10, "reason": "ok"}' + + async def a_generate(self, *args, **kwargs) -> str: + return '{"score": 10, "reason": "ok"}' + + def get_model_name(self, *args, **kwargs) -> str: + return "local-echo" + + +def build_large_rag_case() -> LLMTestCase: + return LLMTestCase( + input="Can enterprise customers get a refund after onboarding?", + actual_output=( + "Enterprise customers can request a refund within 30 days, " + "but onboarding fees are non-refundable." + ), + retrieval_context=[ + RetrievedContextData( + source="handbook/refunds.md", + context=( + "Refunds are allowed for enterprise customers within " + "30 days. " + + ("Unrelated account-management details. " * 700) + + "Onboarding fees are non-refundable." + ), + ), + RetrievedContextData( + source="handbook/security.md", + context="Security review requirements. " * 650, + ), + RetrievedContextData( + source="handbook/support.md", + context="Support escalation playbook. " * 650, + ), + RetrievedContextData( + source="handbook/billing.md", + context="Billing ownership details. " * 650, + ), + RetrievedContextData( + source="handbook/sales.md", + context="Sales handoff details. " * 650, + ), + RetrievedContextData( + source="handbook/legal.md", + context="Legal escalation details. " * 650, + ), + RetrievedContextData( + source="handbook/procurement.md", + context="Procurement workflow details. 
" * 650, + ), + RetrievedContextData( + source="handbook/renewals.md", + context="Renewal workflow details. " * 650, + ), + ], + ) + + +def build_metric() -> GEval: + return GEval( + name="RAG Faithfulness", + evaluation_steps=[ + "Check whether the actual output is supported by retrieval context.", + "Penalize unsupported refund or onboarding claims.", + ], + evaluation_params=[ + SingleTurnParams.INPUT, + SingleTurnParams.ACTUAL_OUTPUT, + SingleTurnParams.RETRIEVAL_CONTEXT, + ], + max_retrieval_context_tokens=160, + model=LocalEchoModel(), + ) + + +def test_geval_preview_prompt_bounds_large_rag_context_without_model_call(): + test_case = build_large_rag_case() + metric = build_metric() + + prompt = metric.preview_evaluation_prompt(test_case) + report = metric.get_retrieval_context_budget_report(test_case) + coverage = metric.get_retrieval_context_evidence_coverage(test_case) + + assert report is not None + assert coverage is not None + assert report.original_tokens > 3000 + assert report.rendered_tokens < 260 + assert report.compression_ratio < 0.1 + assert coverage.coverage_ratio > 0 + assert "refund" in coverage.covered_terms + assert "handbook/refunds.md" in prompt + assert "Refunds are allowed" in prompt + assert "n-refundable" in prompt + assert "retrieval_context compacted for GEval" in prompt + assert report.omitted_chunks > 0 + assert "Legal escalation details" not in prompt + + +def test_geval_preview_prompt_requires_steps_to_avoid_hidden_llm_call(): + metric = GEval( + name="RAG Faithfulness", + criteria="Check whether the answer is grounded in retrieval context.", + evaluation_params=[ + SingleTurnParams.ACTUAL_OUTPUT, + SingleTurnParams.RETRIEVAL_CONTEXT, + ], + max_retrieval_context_tokens=160, + model=LocalEchoModel(), + ) + + with pytest.raises(ValueError, match="requires evaluation_steps"): + metric.preview_evaluation_prompt(build_large_rag_case()) diff --git a/tests/test_metrics/test_g_eval_utils.py b/tests/test_metrics/test_g_eval_utils.py index 
8ee48b0b2..7fe124433 100644 --- a/tests/test_metrics/test_g_eval_utils.py +++ b/tests/test_metrics/test_g_eval_utils.py @@ -7,6 +7,8 @@ construct_geval_upload_payload, construct_non_turns_test_case_string, construct_test_case_string, + format_retrieval_context_with_budget, + build_retrieval_context_budget_report, ) from deepeval.metrics.utils import ( check_conversational_test_case_params, @@ -16,6 +18,7 @@ from deepeval.test_case import ( ConversationalTestCase, LLMTestCase, + RetrievedContextData, SingleTurnParams, Turn, MultiTurnParams, @@ -55,6 +58,182 @@ def test_geval_accepts_metadata_and_tags(): assert payload["evaluationParams"] == ["metadata", "tags"] +def test_geval_retrieval_context_budget_compacts_large_chunks(): + large_context = ( + "refund policy starts here. " + + ("middle evidence that should be compacted. " * 200) + + "refund policy ends here." + ) + test_case = LLMTestCase( + input="What is the refund window?", + actual_output="30 days", + retrieval_context=[large_context], + ) + + text = construct_test_case_string( + [SingleTurnParams.RETRIEVAL_CONTEXT], + test_case, + max_retrieval_context_tokens=80, + ) + + assert "retrieval_context compacted for GEval" in text + assert "refund policy starts here" in text + assert "refund policy ends here" in text + assert "omitted" in text + assert len(text) < len(large_context) + + +def test_geval_retrieval_context_budget_keeps_source_labels(): + retrieval_context = [ + RetrievedContextData( + source="docs/refunds.md", + context="Refunds are available within 30 days. " * 120, + ), + "Warranty claims require a receipt. 
" * 120, + ] + + text = format_retrieval_context_with_budget( + retrieval_context, + max_retrieval_context_tokens=96, + ) + + assert "[1 source=docs/refunds.md]" in text + assert "[2]" in text + assert "Refunds are available" in text + assert "Warranty claims" in text + + +def test_geval_retrieval_context_budget_marks_omitted_chunks(): + retrieval_context = [ + f"retrieval chunk {index} " + ("evidence " * 120) for index in range(8) + ] + + text = format_retrieval_context_with_budget( + retrieval_context, + max_retrieval_context_tokens=96, + ) + + assert "retrieval chunk 0" in text + assert "omitted" in text + assert "retrieval chunks because" in text + + +def test_geval_retrieval_context_budget_report_quantifies_compaction(): + retrieval_context = [ + RetrievedContextData( + source="kb/refunds.md", + context=( + "Refund policy anchor. " + + ("irrelevant body " * 500) + + "Refund policy tail evidence." + ), + ), + RetrievedContextData( + source="kb/warranty.md", + context="Warranty claims need receipts. " * 400, + ), + "Shipping delays are excluded from refunds. " * 400, + ] + + report = build_retrieval_context_budget_report( + retrieval_context, + max_retrieval_context_tokens=120, + ) + + assert report.exceeded_budget is True + assert report.original_tokens > report.rendered_tokens + assert report.compression_ratio < 0.25 + assert report.total_chunks == 3 + assert report.visible_chunks >= 1 + assert any(chunk.source == "kb/refunds.md" for chunk in report.chunks) + assert "Refund policy anchor" in report.rendered_context + assert "tail evidence" in report.rendered_context + assert "source=kb/refunds.md" in report.rendered_context + assert report.evidence_coverage.coverage_ratio == 1.0 + + +def test_geval_retrieval_context_budget_prioritizes_relevant_chunks(): + retrieval_context = [ + RetrievedContextData( + source="kb/security.md", + context="SAML setup and audit logging configuration. 
" * 160, + ), + RetrievedContextData( + source="kb/sales.md", + context="Pipeline stages and account ownership notes. " * 160, + ), + RetrievedContextData( + source="kb/refunds.md", + context=( + "Enterprise refunds are available within 30 days after purchase. " + "Onboarding fees are non-refundable." + ), + ), + RetrievedContextData( + source="kb/procurement.md", + context="Vendor review and procurement workflow details. " * 160, + ), + ] + + report = build_retrieval_context_budget_report( + retrieval_context, + max_retrieval_context_tokens=64, + relevance_query=( + "Can enterprise customers get a refund after onboarding? " + "Enterprise customers can request a refund within 30 days, " + "but onboarding fees are non-refundable." + ), + ) + + assert "kb/refunds.md" in report.rendered_context + assert "Enterprise refunds are available" in report.rendered_context + assert "kb/procurement.md" not in report.rendered_context + refund_chunk = next( + chunk for chunk in report.chunks if chunk.source == "kb/refunds.md" + ) + assert refund_chunk.omitted is False + assert refund_chunk.relevance_score > 0 + assert report.omitted_chunks == 2 + assert "enterprise" in report.evidence_coverage.covered_terms + assert "refund" in report.evidence_coverage.covered_terms + assert report.evidence_coverage.missing_terms == [] + + +def test_geval_retrieval_context_budget_reports_missing_evidence_terms(): + retrieval_context = [ + RetrievedContextData( + source="kb/refunds.md", + context=( + "enterprise refund policy starts. " + + ("background notes " * 120) + + "nonrefundable onboarding fee." + ), + ) + ] + + report = build_retrieval_context_budget_report( + retrieval_context, + max_retrieval_context_tokens=18, + relevance_query=( + "Enterprise customers can request a refund, " + "but onboarding is nonrefundable." 
+ ), + ) + + assert report.evidence_coverage.coverage_ratio < 1.0 + assert "enterprise" in report.evidence_coverage.covered_terms + assert "nonrefundable" in report.evidence_coverage.missing_terms + assert report.evidence_coverage.warning is not None + + +def test_geval_retrieval_context_budget_rejects_invalid_budget(): + with pytest.raises(ValueError, match="greater than 0"): + format_retrieval_context_with_budget( + ["context"], + max_retrieval_context_tokens=0, + ) + + def test_geval_requires_metadata_when_selected(): test_case = LLMTestCase(input="input", tags=["tag"])