diff --git a/README.md b/README.md
index 46bd39f8ad..1f784e6bd4 100644
--- a/README.md
+++ b/README.md
@@ -220,13 +220,13 @@ Open `test_chatbot.py` and write your first test case to run an **end-to-end** e
import pytest
from deepeval import assert_test
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
def test_case():
correctness_metric = GEval(
name="Correctness",
criteria="Determine if the 'actual output' is correct based on the 'expected output'.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
threshold=0.5
)
test_case = LLMTestCase(
@@ -268,14 +268,14 @@ Use the `@observe` decorator to trace components (LLM calls, retrievers, tool ca
```python
from deepeval.tracing import observe, update_current_span
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import GEval
correctness = GEval(
name="Correctness",
criteria="Determine if the 'actual output' is correct based on the 'expected output'.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
)
@observe(metrics=[correctness])
diff --git a/deepeval/cli/main.py b/deepeval/cli/main.py
index 9731334b75..f336c4e936 100644
--- a/deepeval/cli/main.py
+++ b/deepeval/cli/main.py
@@ -270,7 +270,6 @@ def login(
settings = get_settings()
save = save or settings.DEEPEVAL_DEFAULT_SAVE or "dotenv:.env.local"
with settings.edit(save=save) as edit_ctx:
- settings.API_KEY = key
settings.CONFIDENT_API_KEY = key
handled, path, updated = edit_ctx.result
@@ -335,7 +334,6 @@ def logout(
settings = get_settings()
save = save or settings.DEEPEVAL_DEFAULT_SAVE or "dotenv:.env.local"
with settings.edit(save=save) as edit_ctx:
- settings.API_KEY = None
settings.CONFIDENT_API_KEY = None
handled, path, updated = edit_ctx.result
diff --git a/deepeval/confident/api.py b/deepeval/confident/api.py
index 8304a94d20..0dcd8b2913 100644
--- a/deepeval/confident/api.py
+++ b/deepeval/confident/api.py
@@ -79,7 +79,7 @@ def get_base_api_url():
def get_confident_api_key() -> Optional[str]:
s = get_settings()
- key: Optional[SecretStr] = s.CONFIDENT_API_KEY or s.API_KEY
+ key: Optional[SecretStr] = s.CONFIDENT_API_KEY
return key.get_secret_value() if key else None
@@ -98,17 +98,14 @@ def set_confident_api_key(api_key: Optional[str]) -> None:
if save is None:
with s.edit(persist=False):
s.CONFIDENT_API_KEY = SecretStr(api_key) if api_key else None
- s.API_KEY = SecretStr(api_key) if api_key else None
else:
# Respect default save: update runtime + write to dotenv, but not JSON
with s.edit(save=save, persist=None):
s.CONFIDENT_API_KEY = SecretStr(api_key) if api_key else None
- s.API_KEY = SecretStr(api_key) if api_key else None
def is_confident():
- confident_api_key = get_confident_api_key()
- return confident_api_key is not None
+ return get_confident_api_key() is not None
def log_retry_error(retry_state: RetryCallState):
diff --git a/deepeval/config/settings.py b/deepeval/config/settings.py
index bd4e561954..ef017bab55 100644
--- a/deepeval/config/settings.py
+++ b/deepeval/config/settings.py
@@ -377,10 +377,6 @@ def __setattr__(self, name: str, value):
# Model Keys
#
- API_KEY: Optional[SecretStr] = Field(
- None,
- description="Alias for CONFIDENT_API_KEY (Confident AI API key).",
- )
CONFIDENT_API_KEY: Optional[SecretStr] = Field(
None,
description="Confident AI API key (used for uploading results/telemetry to Confident).",
diff --git a/deepeval/dataset/dataset.py b/deepeval/dataset/dataset.py
index b0d96fe25f..f1499de310 100644
--- a/deepeval/dataset/dataset.py
+++ b/deepeval/dataset/dataset.py
@@ -350,7 +350,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
raise ValueError(f"Error processing expected_tools: {e}")
else:
expected_tools.append([])
- additional_metadatas = [
+ metadatas = [
ast.literal_eval(metadata) if metadata else None
for metadata in get_column_data(
df, additional_metadata_col_name, default=""
@@ -365,7 +365,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
retrieval_context,
tools_called,
expected_tools,
- additional_metadata,
+ metadata,
) in zip(
inputs,
actual_outputs,
@@ -374,7 +374,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
retrieval_contexts,
tools_called,
expected_tools,
- additional_metadatas,
+ metadatas,
):
self.add_test_case(
LLMTestCase(
@@ -385,7 +385,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
retrieval_context=retrieval_context,
tools_called=tools_called,
expected_tools=expected_tools,
- additional_metadata=additional_metadata,
+ metadata=metadata,
)
)
@@ -575,7 +575,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
comments = get_column_data(df, comments_key_name)
name = get_column_data(df, name_key_name)
source_files = get_column_data(df, source_file_col_name)
- additional_metadatas = [
+ metadatas = [
ast.literal_eval(metadata) if metadata else None
for metadata in get_column_data(
df, additional_metadata_col_name, default=""
@@ -597,7 +597,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
comments,
name,
source_file,
- additional_metadata,
+ metadata,
scenario,
turns,
expected_outcome,
@@ -613,7 +613,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
comments,
name,
source_files,
- additional_metadatas,
+ metadatas,
scenarios,
turns_raw,
expected_outcomes,
@@ -630,7 +630,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
context=context,
comments=comments,
name=name,
- additional_metadata=additional_metadata,
+ additional_metadata=metadata,
)
)
else:
@@ -643,7 +643,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
retrieval_context=retrieval_context,
tools_called=tools_called,
expected_tools=expected_tools,
- additional_metadata=additional_metadata,
+ additional_metadata=metadata,
source_file=source_file,
comments=comments,
name=name,
@@ -688,7 +688,7 @@ def add_goldens_from_json_file(
comments = json_obj.get(comments_key_name)
name = json_obj.get(name_key_name)
parsed_turns = parse_turns(turns) if turns else []
- additional_metadata = json_obj.get(additional_metadata_key_name)
+ metadata = json_obj.get(additional_metadata_key_name)
self.add_golden(
ConversationalGolden(
@@ -699,7 +699,7 @@ def add_goldens_from_json_file(
context=context,
comments=comments,
name=name,
- additional_metadata=additional_metadata,
+ additional_metadata=metadata,
)
)
else:
@@ -713,7 +713,7 @@ def add_goldens_from_json_file(
comments = json_obj.get(comments_key_name)
name = json_obj.get(name_key_name)
source_file = json_obj.get(source_file_key_name)
- additional_metadata = json_obj.get(additional_metadata_key_name)
+ metadata = json_obj.get(additional_metadata_key_name)
self.add_golden(
Golden(
@@ -724,7 +724,7 @@ def add_goldens_from_json_file(
retrieval_context=retrieval_context,
tools_called=tools_called,
expected_tools=expected_tools,
- additional_metadata=additional_metadata,
+ additional_metadata=metadata,
comments=comments,
name=name,
source_file=source_file,
@@ -803,7 +803,7 @@ def parse_tools(value):
comments = json_obj.get(comments_key_name)
name = json_obj.get(name_key_name)
parsed_turns = parse_turns(turns) if turns else []
- additional_metadata = json_obj.get(additional_metadata_key_name)
+ metadata = json_obj.get(additional_metadata_key_name)
custom_column_key_values = json_obj.get(
custom_column_key_values_key_name
)
@@ -817,7 +817,7 @@ def parse_tools(value):
context=context,
comments=comments,
name=name,
- additional_metadata=additional_metadata,
+ additional_metadata=metadata,
custom_column_key_values=custom_column_key_values,
)
)
@@ -839,7 +839,7 @@ def parse_tools(value):
comments = json_obj.get(comments_key_name)
name = json_obj.get(name_key_name)
source_file = json_obj.get(source_file_key_name)
- additional_metadata = json_obj.get(additional_metadata_key_name)
+ metadata = json_obj.get(additional_metadata_key_name)
custom_column_key_values = json_obj.get(
custom_column_key_values_key_name
)
@@ -853,7 +853,7 @@ def parse_tools(value):
retrieval_context=retrieval_context,
tools_called=tools_called,
expected_tools=expected_tools,
- additional_metadata=additional_metadata,
+ additional_metadata=metadata,
custom_column_key_values=custom_column_key_values,
comments=comments,
name=name,
diff --git a/deepeval/dataset/utils.py b/deepeval/dataset/utils.py
index 52023cc623..e8a8a81863 100644
--- a/deepeval/dataset/utils.py
+++ b/deepeval/dataset/utils.py
@@ -24,7 +24,7 @@ def convert_test_cases_to_goldens(
"retrieval_context": test_case.retrieval_context,
"tools_called": test_case.tools_called,
"expected_tools": test_case.expected_tools,
- "additional_metadata": test_case.additional_metadata,
+ "additional_metadata": test_case.metadata,
}
goldens.append(Golden(**golden))
return goldens
@@ -47,7 +47,7 @@ def convert_goldens_to_test_cases(
expected_tools=golden.expected_tools,
name=golden.name,
comments=golden.comments,
- additional_metadata=golden.additional_metadata,
+ metadata=golden.additional_metadata,
_dataset_alias=_alias,
_dataset_id=_id,
_dataset_rank=index,
@@ -71,7 +71,7 @@ def convert_convo_test_cases_to_convo_goldens(
"expected_outcome": test_case.expected_outcome,
"user_description": test_case.user_description,
"context": test_case.context,
- "additional_metadata": test_case.additional_metadata,
+ "additional_metadata": test_case.metadata,
}
goldens.append(ConversationalGolden(**golden))
return goldens
@@ -91,7 +91,7 @@ def convert_convo_goldens_to_convo_test_cases(
user_description=golden.user_description,
context=golden.context,
name=golden.name,
- additional_metadata=golden.additional_metadata,
+ metadata=golden.additional_metadata,
comments=golden.comments,
_dataset_alias=_alias,
_dataset_id=_id,
@@ -141,9 +141,7 @@ def _dump_list(models):
"mcp_tools_called": _dump_list(turn.mcp_tools_called),
"mcp_resources_called": _dump_list(turn.mcp_resources_called),
"mcp_prompts_called": _dump_list(turn.mcp_prompts_called),
- "additional_metadata": (
- turn.additional_metadata if turn.additional_metadata else None
- ),
+ "metadata": turn.metadata if turn.metadata else None,
}
res.append(cur_turn)
try:
diff --git a/deepeval/evaluate/execute/agentic.py b/deepeval/evaluate/execute/agentic.py
index 4e9c745215..979b8c623b 100644
--- a/deepeval/evaluate/execute/agentic.py
+++ b/deepeval/evaluate/execute/agentic.py
@@ -124,7 +124,7 @@ async def _a_execute_agentic_test_case(
retrieval_context=current_trace.retrieval_context,
tools_called=current_trace.tools_called,
expected_tools=current_trace.expected_tools,
- additional_metadata=golden.additional_metadata,
+ metadata=golden.additional_metadata,
comments=golden.comments,
name=golden.name,
_dataset_alias=golden._dataset_alias,
@@ -243,7 +243,7 @@ async def dfs(trace: Trace, span: BaseSpan):
expected_output=None,
context=None,
retrieval_context=None,
- additional_metadata=golden.additional_metadata,
+ metadata=golden.additional_metadata,
tools_called=None,
expected_tools=None,
comments=golden.comments,
diff --git a/deepeval/evaluate/execute/loop.py b/deepeval/evaluate/execute/loop.py
index 8357246c6c..311fd4a9ed 100644
--- a/deepeval/evaluate/execute/loop.py
+++ b/deepeval/evaluate/execute/loop.py
@@ -217,7 +217,7 @@ def evaluate_test_cases(
expected_output=current_trace.expected_output,
context=current_trace.context,
retrieval_context=current_trace.retrieval_context,
- additional_metadata=golden.additional_metadata,
+ metadata=golden.additional_metadata,
tools_called=current_trace.tools_called,
expected_tools=current_trace.expected_tools,
comments=golden.comments,
diff --git a/deepeval/evaluate/execute/trace_scope.py b/deepeval/evaluate/execute/trace_scope.py
index 8fbe4b0d75..6beac0c86e 100644
--- a/deepeval/evaluate/execute/trace_scope.py
+++ b/deepeval/evaluate/execute/trace_scope.py
@@ -142,7 +142,7 @@ def _assert_test_from_current_trace(
expected_output=current_trace.expected_output,
context=current_trace.context,
retrieval_context=current_trace.retrieval_context,
- additional_metadata=golden.additional_metadata,
+ metadata=golden.additional_metadata,
tools_called=current_trace.tools_called,
expected_tools=current_trace.expected_tools,
comments=golden.comments,
diff --git a/deepeval/evaluate/types.py b/deepeval/evaluate/types.py
index 80bf960a81..7e9b1ae7fe 100644
--- a/deepeval/evaluate/types.py
+++ b/deepeval/evaluate/types.py
@@ -23,7 +23,7 @@ class TestResult:
context: Optional[List[str]] = None
retrieval_context: Optional[List[str]] = None
turns: Optional[List[TurnApi]] = None
- additional_metadata: Optional[Dict] = None
+ metadata: Optional[Dict] = None
class EvaluationResult(BaseModel):
diff --git a/deepeval/evaluate/utils.py b/deepeval/evaluate/utils.py
index 3fa27cbf6f..42535d3015 100644
--- a/deepeval/evaluate/utils.py
+++ b/deepeval/evaluate/utils.py
@@ -121,7 +121,7 @@ def create_test_result(
success=api_test_case.success,
metrics_data=api_test_case.metrics_data,
conversational=True,
- additional_metadata=api_test_case.additional_metadata,
+ metadata=api_test_case.metadata,
turns=api_test_case.turns,
)
else:
@@ -135,7 +135,7 @@ def create_test_result(
actual_output=api_test_case.actual_output,
conversational=False,
multimodal=True,
- additional_metadata=api_test_case.additional_metadata,
+ metadata=api_test_case.metadata,
)
else:
return TestResult(
@@ -149,7 +149,7 @@ def create_test_result(
retrieval_context=api_test_case.retrieval_context,
conversational=False,
multimodal=False,
- additional_metadata=api_test_case.additional_metadata,
+ metadata=api_test_case.metadata,
)
diff --git a/deepeval/key_handler.py b/deepeval/key_handler.py
index 26ee47ca61..41507f8d62 100644
--- a/deepeval/key_handler.py
+++ b/deepeval/key_handler.py
@@ -34,8 +34,6 @@ def _secret_env_keys() -> frozenset[str]:
def _env_key_for_legacy_enum(key) -> str:
- # For ModelKeyValues, .name == .value, for KeyValues it's the important one:
- # KeyValues.API_KEY.name == "API_KEY" (matches Settings), value == "api_key" (legacy json key)
return getattr(key, "name", str(key))
@@ -48,7 +46,6 @@ def _is_secret_key(key) -> bool:
class KeyValues(Enum):
# Confident AI
- API_KEY = "api_key"
CONFIDENT_API_KEY = "confident_api_key"
CONFIDENT_BASE_URL = "confident_base_url"
CONFIDENT_REGION = "confident_region"
diff --git a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py
index fec838ae9a..1dd1d7bd98 100644
--- a/deepeval/metrics/answer_relevancy/answer_relevancy.py
+++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py
@@ -11,7 +11,7 @@
generate_with_schema_and_extract,
a_generate_with_schema_and_extract,
)
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams, MLLMImage
+from deepeval.test_case import LLMTestCase, SingleTurnParams, MLLMImage
from deepeval.metrics import BaseMetric
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
@@ -25,9 +25,9 @@
class AnswerRelevancyMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/arena_g_eval/arena_g_eval.py b/deepeval/metrics/arena_g_eval/arena_g_eval.py
index aabc305a2b..9d473f973f 100644
--- a/deepeval/metrics/arena_g_eval/arena_g_eval.py
+++ b/deepeval/metrics/arena_g_eval/arena_g_eval.py
@@ -6,7 +6,7 @@
from deepeval.metrics import BaseArenaMetric
from deepeval.metrics.arena_g_eval.utils import format_arena_test_case
from deepeval.test_case import (
- LLMTestCaseParams,
+ SingleTurnParams,
ArenaTestCase,
)
from deepeval.metrics.arena_g_eval.template import ArenaGEvalTemplate
@@ -37,7 +37,7 @@ class ArenaGEval(BaseArenaMetric):
def __init__(
self,
name: str,
- evaluation_params: List[LLMTestCaseParams],
+ evaluation_params: List[SingleTurnParams],
criteria: Optional[str] = None,
evaluation_steps: Optional[List[str]] = None,
model: Optional[Union[str, DeepEvalBaseLLM]] = None,
diff --git a/deepeval/metrics/arena_g_eval/utils.py b/deepeval/metrics/arena_g_eval/utils.py
index 9ba6ed7b44..caa844cb97 100644
--- a/deepeval/metrics/arena_g_eval/utils.py
+++ b/deepeval/metrics/arena_g_eval/utils.py
@@ -4,7 +4,7 @@
import random
from deepeval.test_case import (
- LLMTestCaseParams,
+ SingleTurnParams,
ToolCall,
ArenaTestCase,
LLMTestCase,
@@ -87,7 +87,7 @@ def __repr__(self):
def format_arena_test_case(
- evaluation_params: List[LLMTestCaseParams], test_case: ArenaTestCase
+ evaluation_params: List[SingleTurnParams], test_case: ArenaTestCase
) -> Tuple[FormattedArenaTestCase, Dict[str, str]]:
case = next(iter([case.test_case for case in test_case.contestants]))
@@ -111,11 +111,11 @@ def format_arena_test_case(
formatted_test_case = FormattedArenaTestCase(
input=(
- case.input if LLMTestCaseParams.INPUT in evaluation_params else None
+ case.input if SingleTurnParams.INPUT in evaluation_params else None
),
expected_output=(
case.expected_output
- if LLMTestCaseParams.EXPECTED_OUTPUT in evaluation_params
+ if SingleTurnParams.EXPECTED_OUTPUT in evaluation_params
else None
),
contestants={
@@ -130,32 +130,32 @@ def format_arena_test_case(
def construct_formatted_llm_test_case(
- evaluation_params: List[LLMTestCaseParams], test_case: LLMTestCase
+ evaluation_params: List[SingleTurnParams], test_case: LLMTestCase
) -> FormattedLLMTestCase:
return FormattedLLMTestCase(
actual_output=(
test_case.actual_output
- if LLMTestCaseParams.ACTUAL_OUTPUT in evaluation_params
+ if SingleTurnParams.ACTUAL_OUTPUT in evaluation_params
else None
),
context=(
test_case.context
- if LLMTestCaseParams.CONTEXT in evaluation_params
+ if SingleTurnParams.CONTEXT in evaluation_params
else None
),
retrieval_context=(
test_case.retrieval_context
- if LLMTestCaseParams.RETRIEVAL_CONTEXT in evaluation_params
+ if SingleTurnParams.RETRIEVAL_CONTEXT in evaluation_params
else None
),
tools_called=(
test_case.tools_called
- if LLMTestCaseParams.TOOLS_CALLED in evaluation_params
+ if SingleTurnParams.TOOLS_CALLED in evaluation_params
else None
),
expected_tools=(
test_case.expected_tools
- if LLMTestCaseParams.EXPECTED_TOOLS in evaluation_params
+ if SingleTurnParams.EXPECTED_TOOLS in evaluation_params
else None
),
)
diff --git a/deepeval/metrics/argument_correctness/argument_correctness.py b/deepeval/metrics/argument_correctness/argument_correctness.py
index 5f94bf4578..3d84494962 100644
--- a/deepeval/metrics/argument_correctness/argument_correctness.py
+++ b/deepeval/metrics/argument_correctness/argument_correctness.py
@@ -10,7 +10,7 @@
)
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
ToolCall,
)
from deepeval.metrics import BaseMetric
@@ -27,9 +27,9 @@
class ArgumentCorrectnessMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.TOOLS_CALLED,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.TOOLS_CALLED,
]
def __init__(
diff --git a/deepeval/metrics/base_metric.py b/deepeval/metrics/base_metric.py
index e6f36cd5cf..2ef93ff384 100644
--- a/deepeval/metrics/base_metric.py
+++ b/deepeval/metrics/base_metric.py
@@ -6,7 +6,7 @@
from deepeval.test_case import (
LLMTestCase,
ConversationalTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
ArenaTestCase,
)
@@ -15,7 +15,7 @@
class BaseMetric:
- _required_params = List[LLMTestCaseParams]
+ _required_params = List[SingleTurnParams]
threshold: float
score: Optional[float] = None
score_breakdown: Dict = None
diff --git a/deepeval/metrics/bias/bias.py b/deepeval/metrics/bias/bias.py
index c64dad3cf7..c053e54eb7 100644
--- a/deepeval/metrics/bias/bias.py
+++ b/deepeval/metrics/bias/bias.py
@@ -3,7 +3,7 @@
from deepeval.metrics import BaseMetric
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.models import DeepEvalBaseLLM
@@ -25,9 +25,9 @@
class BiasMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/contextual_precision/contextual_precision.py b/deepeval/metrics/contextual_precision/contextual_precision.py
index b0a299501d..f218e571d1 100644
--- a/deepeval/metrics/contextual_precision/contextual_precision.py
+++ b/deepeval/metrics/contextual_precision/contextual_precision.py
@@ -13,7 +13,7 @@
)
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics import BaseMetric
from deepeval.models import DeepEvalBaseLLM
@@ -25,10 +25,10 @@
class ContextualPrecisionMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.RETRIEVAL_CONTEXT,
- LLMTestCaseParams.EXPECTED_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.RETRIEVAL_CONTEXT,
+ SingleTurnParams.EXPECTED_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/contextual_recall/contextual_recall.py b/deepeval/metrics/contextual_recall/contextual_recall.py
index e9813fb9f7..b6cd209ea3 100644
--- a/deepeval/metrics/contextual_recall/contextual_recall.py
+++ b/deepeval/metrics/contextual_recall/contextual_recall.py
@@ -13,7 +13,7 @@
)
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics import BaseMetric
from deepeval.models import DeepEvalBaseLLM
@@ -29,10 +29,10 @@
class ContextualRecallMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.RETRIEVAL_CONTEXT,
- LLMTestCaseParams.EXPECTED_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.RETRIEVAL_CONTEXT,
+ SingleTurnParams.EXPECTED_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py
index c098cfd642..94a8e5b9b1 100644
--- a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py
+++ b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py
@@ -14,7 +14,7 @@
)
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics import BaseMetric
from deepeval.models import DeepEvalBaseLLM
@@ -29,9 +29,9 @@
class ContextualRelevancyMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.RETRIEVAL_CONTEXT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.RETRIEVAL_CONTEXT,
]
def __init__(
diff --git a/deepeval/metrics/conversation_completeness/conversation_completeness.py b/deepeval/metrics/conversation_completeness/conversation_completeness.py
index 71c6ba1be3..da301b6ed7 100644
--- a/deepeval/metrics/conversation_completeness/conversation_completeness.py
+++ b/deepeval/metrics/conversation_completeness/conversation_completeness.py
@@ -16,7 +16,7 @@
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.test_case import ConversationalTestCase
-from deepeval.test_case import TurnParams
+from deepeval.test_case import MultiTurnParams
from deepeval.test_case.conversational_test_case import Turn
from deepeval.utils import get_or_create_event_loop, prettify_list
from deepeval.metrics.conversation_completeness.schema import (
@@ -27,7 +27,7 @@
class ConversationCompletenessMetric(BaseConversationalMetric):
- _required_test_case_params = [TurnParams.CONTENT, TurnParams.ROLE]
+ _required_test_case_params = [MultiTurnParams.CONTENT, MultiTurnParams.ROLE]
def __init__(
self,
diff --git a/deepeval/metrics/conversational_dag/nodes.py b/deepeval/metrics/conversational_dag/nodes.py
index aa9781f5a6..a672921f18 100644
--- a/deepeval/metrics/conversational_dag/nodes.py
+++ b/deepeval/metrics/conversational_dag/nodes.py
@@ -15,7 +15,7 @@
)
from deepeval.test_case import (
ConversationalTestCase,
- TurnParams,
+ MultiTurnParams,
ToolCall,
Turn,
)
@@ -297,7 +297,7 @@ class ConversationalTaskNode(ConversationalBaseNode):
instructions: str
output_label: str
children: List[ConversationalBaseNode]
- evaluation_params: List[TurnParams] = None
+ evaluation_params: List[MultiTurnParams] = None
turn_window: Tuple[int, int] = None
label: Optional[str] = None
_verbose_logs: Optional[str] = None
@@ -448,7 +448,7 @@ async def _a_execute(
class ConversationalBinaryJudgementNode(ConversationalBaseNode):
criteria: str
children: List[ConversationalVerdictNode]
- evaluation_params: Optional[List[TurnParams]] = None
+ evaluation_params: Optional[List[MultiTurnParams]] = None
turn_window: Tuple[int, int] = None
label: Optional[str] = None
_verbose_logs: Optional[str] = None
@@ -616,7 +616,7 @@ async def _a_execute(
class ConversationalNonBinaryJudgementNode(ConversationalBaseNode):
criteria: str
children: List[ConversationalVerdictNode]
- evaluation_params: Optional[List[TurnParams]] = None
+ evaluation_params: Optional[List[MultiTurnParams]] = None
turn_window: Tuple[int, int] = None
label: Optional[str] = None
_verbose_logs: Optional[str] = None
diff --git a/deepeval/metrics/conversational_g_eval/conversational_g_eval.py b/deepeval/metrics/conversational_g_eval/conversational_g_eval.py
index 476b05e1a4..7249c8f7fd 100644
--- a/deepeval/metrics/conversational_g_eval/conversational_g_eval.py
+++ b/deepeval/metrics/conversational_g_eval/conversational_g_eval.py
@@ -17,7 +17,7 @@
construct_geval_upload_payload,
)
from deepeval.test_case import (
- TurnParams,
+ MultiTurnParams,
ConversationalTestCase,
)
from deepeval.metrics.conversational_g_eval.template import (
@@ -39,11 +39,19 @@
from deepeval.confident.api import Api, Endpoints, HttpMethods
+def _debug_print_prompt(label: str, prompt: str) -> None:
+ """Debug helper: dump a built prompt to stdout. Remove or gate when no longer needed."""
+ bar = "=" * 80
+ print(f"\n{bar}\n[ConversationalGEval prompt] {label}\n{bar}")
+ print(prompt)
+ print(f"{bar}\n", flush=True)
+
+
class ConversationalGEval(BaseConversationalMetric):
def __init__(
self,
name: str,
- evaluation_params: Optional[List[TurnParams]] = None,
+ evaluation_params: Optional[List[MultiTurnParams]] = None,
criteria: Optional[str] = None,
evaluation_steps: Optional[List[str]] = None,
model: Optional[Union[str, DeepEvalBaseLLM]] = None,
@@ -63,12 +71,12 @@ def __init__(
self.name = name
if evaluation_params is None:
- evaluation_params = [TurnParams.CONTENT, TurnParams.ROLE]
+ evaluation_params = [MultiTurnParams.CONTENT, MultiTurnParams.ROLE]
- if TurnParams.CONTENT not in evaluation_params:
- evaluation_params.append(TurnParams.CONTENT)
- if TurnParams.ROLE not in evaluation_params:
- evaluation_params.append(TurnParams.ROLE)
+ if MultiTurnParams.CONTENT not in evaluation_params:
+ evaluation_params.append(MultiTurnParams.CONTENT)
+ if MultiTurnParams.ROLE not in evaluation_params:
+ evaluation_params.append(MultiTurnParams.ROLE)
self.evaluation_params = evaluation_params
@@ -204,6 +212,9 @@ async def _a_generate_evaluation_steps(self) -> List[str]:
prompt = self.evaluation_template.generate_evaluation_steps(
criteria=self.criteria, parameters=g_eval_params_str
)
+ _debug_print_prompt(
+ f"{self.__name__} :: generate_evaluation_steps (async)", prompt
+ )
return await a_generate_with_schema_and_extract(
metric=self,
prompt=prompt,
@@ -222,6 +233,9 @@ def _generate_evaluation_steps(self) -> List[str]:
prompt = self.evaluation_template.generate_evaluation_steps(
criteria=self.criteria, parameters=g_eval_params_str
)
+ _debug_print_prompt(
+ f"{self.__name__} :: generate_evaluation_steps (sync)", prompt
+ )
return generate_with_schema_and_extract(
metric=self,
prompt=prompt,
@@ -261,6 +275,9 @@ async def _a_evaluate(
],
parameters=g_eval_params_str,
)
+ _debug_print_prompt(
+ f"{self.__name__} :: generate_evaluation_results (async)", prompt
+ )
try:
if no_log_prob_support(self.model):
raise AttributeError("log_probs unsupported.")
@@ -326,6 +343,9 @@ def evaluate(
],
parameters=g_eval_params_str,
)
+ _debug_print_prompt(
+ f"{self.__name__} :: generate_evaluation_results (sync)", prompt
+ )
try:
if no_log_prob_support(self.model):
raise AttributeError("log_probs unsupported.")
diff --git a/deepeval/metrics/conversational_g_eval/template.py b/deepeval/metrics/conversational_g_eval/template.py
index 4cce2c8efb..0914d2006b 100644
--- a/deepeval/metrics/conversational_g_eval/template.py
+++ b/deepeval/metrics/conversational_g_eval/template.py
@@ -4,7 +4,11 @@
class ConversationalGEvalTemplate:
@staticmethod
def generate_evaluation_steps(parameters: str, criteria: str):
- return f"""Given an evaluation criteria which outlines how you should judge a conversation between a user and an LLM chatbot using the {parameters} fields in each turn, generate 3-4 concise evaluation steps based on the criteria below. Based on the evaluation criteria, you MUST make it clear how to evaluate the {parameters} in relation to one another in each turn, as well as the overall quality of the conversation.
+ return f"""Given an evaluation criteria which outlines how you should judge a conversation between a user and an LLM chatbot using the {parameters} fields, generate 3-4 concise evaluation steps based on the criteria below.
+
+Note that {parameters} can include both turn-level fields (e.g. content, role, retrieval_context, tools_called) and conversation-level fields (e.g. scenario, expected_outcome, metadata, tags, context, chatbot_role, user_description). Evaluate each field at its correct scope: turn-level fields appear once per turn, while conversation-level fields apply to the conversation as a whole and should NOT be expected to repeat on every turn.
+
+Based on the evaluation criteria, you MUST make it clear how to evaluate the {parameters} together, assessing both individual turns and the overall quality of the conversation.
Evaluation Criteria:
{criteria}
@@ -56,14 +60,15 @@ def generate_evaluation_results(
Evaluation Steps:
{evaluation_steps}
- {rubric_text}Conversation:
+ {rubric_text}Per-turn fields:
{turns}
{test_case_content}
-
Parameters to consider during evaluation:
{parameters}
+ Note: the "Per-turn fields" block lists each turn separately, while the "Conversation-level fields" block applies to the whole conversation. Do not penalize individual turns for missing conversation-level fields.
+
---
IMPORTANT: You MUST return only a valid JSON object with the exact keys `"score"` and `"reason"`. No additional text, commentary, or formatting.
diff --git a/deepeval/metrics/dag/nodes.py b/deepeval/metrics/dag/nodes.py
index ffc9e350fb..9434044210 100644
--- a/deepeval/metrics/dag/nodes.py
+++ b/deepeval/metrics/dag/nodes.py
@@ -23,7 +23,7 @@
a_generate_with_schema_and_extract,
generate_with_schema_and_extract,
)
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall
+from deepeval.test_case import LLMTestCase, SingleTurnParams, ToolCall
from deepeval.utils import prettify_list
@@ -254,7 +254,7 @@ class TaskNode(BaseNode):
instructions: str
output_label: str
children: List[BaseNode]
- evaluation_params: List[LLMTestCaseParams] = None
+ evaluation_params: List[SingleTurnParams] = None
label: Optional[str] = None
_verbose_logs: Optional[str] = None
_output: Optional[str] = None
@@ -377,7 +377,7 @@ async def _a_execute(
class BinaryJudgementNode(BaseNode):
criteria: str
children: List[VerdictNode]
- evaluation_params: Optional[List[LLMTestCaseParams]] = None
+ evaluation_params: Optional[List[SingleTurnParams]] = None
label: Optional[str] = None
_verbose_logs: Optional[str] = None
_verdict: Optional[BinaryJudgementVerdict] = None
@@ -507,7 +507,7 @@ async def _a_execute(
class NonBinaryJudgementNode(BaseNode):
criteria: str
children: List[VerdictNode]
- evaluation_params: Optional[List[LLMTestCaseParams]] = None
+ evaluation_params: Optional[List[SingleTurnParams]] = None
label: Optional[str] = None
_verbose_logs: Optional[str] = None
_verdict: Optional[NonBinaryJudgementVerdict] = None
diff --git a/deepeval/metrics/dag/serialization/serialization.py b/deepeval/metrics/dag/serialization/serialization.py
index 41b46b834e..f8569d3eb0 100644
--- a/deepeval/metrics/dag/serialization/serialization.py
+++ b/deepeval/metrics/dag/serialization/serialization.py
@@ -56,7 +56,7 @@
VerdictNode,
)
from deepeval.metrics.g_eval.g_eval import GEval
-from deepeval.test_case import LLMTestCaseParams, TurnParams
+from deepeval.test_case import SingleTurnParams, MultiTurnParams
from .registry import CLASS_TO_NODE_TYPE, NODE_CLASSES
from .types import ChildType, NodeType
@@ -402,7 +402,7 @@ def _collect_referenced_ids(nodes_spec: Dict[str, Any]) -> Set[str]:
def _eval_params_cls(multiturn: bool):
- return TurnParams if multiturn else LLMTestCaseParams
+ return MultiTurnParams if multiturn else SingleTurnParams
def _deserialize_eval_params(values, multiturn: bool):
@@ -519,7 +519,7 @@ def _build_metric(child_spec: Dict[str, Any]):
if "evaluation_params" in kwargs and isinstance(
kwargs["evaluation_params"], list
):
- # Try LLMTestCaseParams first, then TurnParams for conversational metrics.
+ # Try SingleTurnParams first, then MultiTurnParams for conversational metrics.
if issubclass(cls, BaseConversationalMetric):
kwargs["evaluation_params"] = _deserialize_eval_params(
kwargs["evaluation_params"], multiturn=True
diff --git a/deepeval/metrics/dag/utils.py b/deepeval/metrics/dag/utils.py
index 41f149af0b..d819856df5 100644
--- a/deepeval/metrics/dag/utils.py
+++ b/deepeval/metrics/dag/utils.py
@@ -16,7 +16,7 @@
ConversationalTaskNode,
ConversationalVerdictNode,
)
-from deepeval.test_case import LLMTestCaseParams, TurnParams
+from deepeval.test_case import SingleTurnParams, MultiTurnParams
def is_valid_dag_from_roots(
@@ -75,9 +75,9 @@ def extract_required_params(
nodes: list[BaseNode],
multiturn: bool,
required_params: Optional[
- Union[Set[LLMTestCaseParams], Set[TurnParams]]
+ Union[Set[SingleTurnParams], Set[MultiTurnParams]]
] = None,
-) -> Union[Set[LLMTestCaseParams], Set[TurnParams]]:
+) -> Union[Set[SingleTurnParams], Set[MultiTurnParams]]:
if required_params is None:
required_params = set()
diff --git a/deepeval/metrics/exact_match/exact_match.py b/deepeval/metrics/exact_match/exact_match.py
index 411814ba88..c44fec75ef 100644
--- a/deepeval/metrics/exact_match/exact_match.py
+++ b/deepeval/metrics/exact_match/exact_match.py
@@ -6,14 +6,14 @@
construct_verbose_logs,
)
from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
class ExactMatchMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
- LLMTestCaseParams.EXPECTED_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
+ SingleTurnParams.EXPECTED_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/faithfulness/faithfulness.py b/deepeval/metrics/faithfulness/faithfulness.py
index bcccf91d90..896e2a9adc 100644
--- a/deepeval/metrics/faithfulness/faithfulness.py
+++ b/deepeval/metrics/faithfulness/faithfulness.py
@@ -1,7 +1,7 @@
from typing import List, Optional, Union, Type
import asyncio
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics import BaseMetric
from deepeval.utils import (
get_or_create_event_loop,
@@ -27,10 +27,10 @@
class FaithfulnessMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
- LLMTestCaseParams.RETRIEVAL_CONTEXT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
+ SingleTurnParams.RETRIEVAL_CONTEXT,
]
def __init__(
diff --git a/deepeval/metrics/g_eval/g_eval.py b/deepeval/metrics/g_eval/g_eval.py
index 43a0c5f75b..72cfa54d2b 100644
--- a/deepeval/metrics/g_eval/g_eval.py
+++ b/deepeval/metrics/g_eval/g_eval.py
@@ -6,7 +6,7 @@
from deepeval.metrics import BaseMetric
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics.g_eval.template import GEvalTemplate
from deepeval.utils import get_or_create_event_loop, prettify_list
@@ -43,7 +43,7 @@ class GEval(BaseMetric):
def __init__(
self,
name: str,
- evaluation_params: List[LLMTestCaseParams],
+ evaluation_params: List[SingleTurnParams],
criteria: Optional[str] = None,
evaluation_steps: Optional[List[str]] = None,
rubric: Optional[List[Rubric]] = None,
diff --git a/deepeval/metrics/g_eval/utils.py b/deepeval/metrics/g_eval/utils.py
index 7b459b8b1b..b8c1640551 100644
--- a/deepeval/metrics/g_eval/utils.py
+++ b/deepeval/metrics/g_eval/utils.py
@@ -4,8 +4,8 @@
from deepeval.models import DeepEvalBaseLLM, GPTModel, AzureOpenAIModel
from deepeval.test_case import (
- LLMTestCaseParams,
- TurnParams,
+ SingleTurnParams,
+ MultiTurnParams,
LLMTestCase,
ToolCall,
)
@@ -34,47 +34,55 @@ def validate_score_range(cls, value):
G_EVAL_PARAMS = {
- LLMTestCaseParams.INPUT: "Input",
- LLMTestCaseParams.ACTUAL_OUTPUT: "Actual Output",
- LLMTestCaseParams.EXPECTED_OUTPUT: "Expected Output",
- LLMTestCaseParams.CONTEXT: "Context",
- LLMTestCaseParams.RETRIEVAL_CONTEXT: "Retrieval Context",
- LLMTestCaseParams.EXPECTED_TOOLS: "Expected Tools",
- LLMTestCaseParams.TOOLS_CALLED: "Tools Called",
+ SingleTurnParams.INPUT: "Input",
+ SingleTurnParams.ACTUAL_OUTPUT: "Actual Output",
+ SingleTurnParams.EXPECTED_OUTPUT: "Expected Output",
+ SingleTurnParams.CONTEXT: "Context",
+ SingleTurnParams.RETRIEVAL_CONTEXT: "Retrieval Context",
+ SingleTurnParams.METADATA: "Metadata",
+ SingleTurnParams.TAGS: "Tags",
+ SingleTurnParams.EXPECTED_TOOLS: "Expected Tools",
+ SingleTurnParams.TOOLS_CALLED: "Tools Called",
}
CONVERSATIONAL_G_EVAL_PARAMS = {
- TurnParams.CONTENT: "Content",
- TurnParams.ROLE: "Role",
- TurnParams.TOOLS_CALLED: "Tools Called",
- TurnParams.RETRIEVAL_CONTEXT: "Retrieval Context",
- TurnParams.EXPECTED_OUTCOME: "Expected Outcome",
- TurnParams.SCENARIO: "Scenario",
+ MultiTurnParams.CONTENT: "Content",
+ MultiTurnParams.ROLE: "Role",
+ MultiTurnParams.METADATA: "Metadata",
+ MultiTurnParams.TAGS: "Tags",
+ MultiTurnParams.TOOLS_CALLED: "Tools Called",
+ MultiTurnParams.RETRIEVAL_CONTEXT: "Retrieval Context",
+ MultiTurnParams.EXPECTED_OUTCOME: "Expected Outcome",
+ MultiTurnParams.SCENARIO: "Scenario",
}
G_EVAL_API_PARAMS = {
- LLMTestCaseParams.INPUT: "input",
- LLMTestCaseParams.ACTUAL_OUTPUT: "actualOutput",
- LLMTestCaseParams.EXPECTED_OUTPUT: "expectedOutput",
- LLMTestCaseParams.CONTEXT: "context",
- LLMTestCaseParams.RETRIEVAL_CONTEXT: "retrievalContext",
- LLMTestCaseParams.EXPECTED_TOOLS: "expectedTools",
- LLMTestCaseParams.TOOLS_CALLED: "toolsCalled",
+ SingleTurnParams.INPUT: "input",
+ SingleTurnParams.ACTUAL_OUTPUT: "actualOutput",
+ SingleTurnParams.EXPECTED_OUTPUT: "expectedOutput",
+ SingleTurnParams.CONTEXT: "context",
+ SingleTurnParams.RETRIEVAL_CONTEXT: "retrievalContext",
+ SingleTurnParams.METADATA: "metadata",
+ SingleTurnParams.TAGS: "tags",
+ SingleTurnParams.EXPECTED_TOOLS: "expectedTools",
+ SingleTurnParams.TOOLS_CALLED: "toolsCalled",
}
CONVERSATIONAL_G_EVAL_API_PARAMS = {
- TurnParams.ROLE: "role",
- TurnParams.CONTENT: "content",
- TurnParams.SCENARIO: "scenario",
- TurnParams.EXPECTED_OUTCOME: "expectedOutcome",
- TurnParams.RETRIEVAL_CONTEXT: "retrievalContext",
- TurnParams.TOOLS_CALLED: "toolsCalled",
+ MultiTurnParams.ROLE: "role",
+ MultiTurnParams.CONTENT: "content",
+ MultiTurnParams.METADATA: "metadata",
+ MultiTurnParams.TAGS: "tags",
+ MultiTurnParams.SCENARIO: "scenario",
+ MultiTurnParams.EXPECTED_OUTCOME: "expectedOutcome",
+ MultiTurnParams.RETRIEVAL_CONTEXT: "retrievalContext",
+ MultiTurnParams.TOOLS_CALLED: "toolsCalled",
}
def construct_geval_upload_payload(
name: str,
- evaluation_params: List[LLMTestCaseParams],
+ evaluation_params: List[SingleTurnParams],
g_eval_api_params: Dict,
criteria: Optional[str] = None,
evaluation_steps: Optional[List[str]] = None,
@@ -197,7 +205,7 @@ def no_log_prob_support(model: Union[str, DeepEvalBaseLLM]):
def construct_g_eval_params_string(
- llm_test_case_params: List[LLMTestCaseParams],
+ llm_test_case_params: List[SingleTurnParams],
):
g_eval_params = [G_EVAL_PARAMS[param] for param in llm_test_case_params]
if len(g_eval_params) == 1:
@@ -213,7 +221,7 @@ def construct_g_eval_params_string(
def construct_conversational_g_eval_turn_params_string(
- turn_params: List[TurnParams],
+ turn_params: List[MultiTurnParams],
):
g_eval_params = [
CONVERSATIONAL_G_EVAL_PARAMS[param] for param in turn_params
@@ -232,25 +240,29 @@ def construct_conversational_g_eval_turn_params_string(
def construct_non_turns_test_case_string(
- turn_params: List[TurnParams], test_case: ConversationalTestCase
+ turn_params: List[MultiTurnParams], test_case: ConversationalTestCase
) -> str:
- text = """"""
+ body = """"""
for param in turn_params:
if (
- param == TurnParams.RETRIEVAL_CONTEXT
- or param == TurnParams.TOOLS_CALLED
- or param == TurnParams.CONTENT
- or param == TurnParams.ROLE
+ param == MultiTurnParams.RETRIEVAL_CONTEXT
+ or param == MultiTurnParams.TOOLS_CALLED
+ or param == MultiTurnParams.CONTENT
+ or param == MultiTurnParams.ROLE
):
continue
value = getattr(test_case, param.value)
- text += f"{CONVERSATIONAL_G_EVAL_PARAMS[param]}:\n{value} \n\n"
- return text
+ body += f"{CONVERSATIONAL_G_EVAL_PARAMS[param]}:\n{value} \n\n"
+
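+    # Only include the "Conversation-level fields" header when at least one such field was rendered above.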
+ if not body:
+ return ""
+
+ return f"Conversation-level fields:\n{body}"
def construct_test_case_string(
- evaluation_params: List[LLMTestCaseParams], test_case: LLMTestCase
+ evaluation_params: List[SingleTurnParams], test_case: LLMTestCase
) -> str:
text = """"""
for param in evaluation_params:
diff --git a/deepeval/metrics/goal_accuracy/goal_accuracy.py b/deepeval/metrics/goal_accuracy/goal_accuracy.py
index aa4e948737..887341a054 100644
--- a/deepeval/metrics/goal_accuracy/goal_accuracy.py
+++ b/deepeval/metrics/goal_accuracy/goal_accuracy.py
@@ -10,7 +10,7 @@
a_generate_with_schema_and_extract,
generate_with_schema_and_extract,
)
-from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.test_case import ConversationalTestCase, MultiTurnParams, Turn
from deepeval.metrics import BaseConversationalMetric
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.indicator import metric_progress_indicator
@@ -27,8 +27,8 @@
class GoalAccuracyMetric(BaseConversationalMetric):
_required_test_case_params = [
- TurnParams.ROLE,
- TurnParams.CONTENT,
+ MultiTurnParams.ROLE,
+ MultiTurnParams.CONTENT,
]
def __init__(
diff --git a/deepeval/metrics/hallucination/hallucination.py b/deepeval/metrics/hallucination/hallucination.py
index 027a2d56c5..d3b9808ac7 100644
--- a/deepeval/metrics/hallucination/hallucination.py
+++ b/deepeval/metrics/hallucination/hallucination.py
@@ -2,7 +2,7 @@
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics import BaseMetric
from deepeval.utils import get_or_create_event_loop, prettify_list
@@ -24,10 +24,10 @@
class HallucinationMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
- LLMTestCaseParams.CONTEXT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
+ SingleTurnParams.CONTEXT,
]
def __init__(
diff --git a/deepeval/metrics/json_correctness/json_correctness.py b/deepeval/metrics/json_correctness/json_correctness.py
index 15c6b24a76..0949a205ee 100644
--- a/deepeval/metrics/json_correctness/json_correctness.py
+++ b/deepeval/metrics/json_correctness/json_correctness.py
@@ -4,7 +4,7 @@
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics import BaseMetric
from deepeval.metrics.utils import (
@@ -24,9 +24,9 @@
class JsonCorrectnessMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/knowledge_retention/knowledge_retention.py b/deepeval/metrics/knowledge_retention/knowledge_retention.py
index 11d3c5da55..4392df3c99 100644
--- a/deepeval/metrics/knowledge_retention/knowledge_retention.py
+++ b/deepeval/metrics/knowledge_retention/knowledge_retention.py
@@ -1,6 +1,6 @@
from typing import Optional, Union, List
-from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
+from deepeval.test_case import ConversationalTestCase, Turn, MultiTurnParams
from deepeval.metrics import BaseConversationalMetric
from deepeval.metrics.utils import (
check_conversational_test_case_params,
@@ -24,7 +24,7 @@
class KnowledgeRetentionMetric(BaseConversationalMetric):
- _required_test_case_params = [TurnParams.CONTENT, TurnParams.ROLE]
+ _required_test_case_params = [MultiTurnParams.CONTENT, MultiTurnParams.ROLE]
def __init__(
self,
diff --git a/deepeval/metrics/mcp/mcp_task_completion.py b/deepeval/metrics/mcp/mcp_task_completion.py
index a4e15cee85..8778507c4f 100644
--- a/deepeval/metrics/mcp/mcp_task_completion.py
+++ b/deepeval/metrics/mcp/mcp_task_completion.py
@@ -12,7 +12,7 @@
generate_with_schema_and_extract,
)
from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.test_case import ConversationalTestCase, TurnParams
+from deepeval.test_case import ConversationalTestCase, MultiTurnParams
from deepeval.utils import get_or_create_event_loop, prettify_list
from deepeval.metrics.mcp.schema import Task, TaskScore, Reason
from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
@@ -21,8 +21,8 @@
class MCPTaskCompletionMetric(BaseConversationalMetric):
_required_test_case_params = [
- TurnParams.ROLE,
- TurnParams.CONTENT,
+ MultiTurnParams.ROLE,
+ MultiTurnParams.CONTENT,
]
def __init__(
diff --git a/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py b/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py
index 1744d6e7c7..2b666a292b 100644
--- a/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py
+++ b/deepeval/metrics/mcp/multi_turn_mcp_use_metric.py
@@ -12,7 +12,7 @@
generate_with_schema_and_extract,
)
from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.test_case import ConversationalTestCase, TurnParams
+from deepeval.test_case import ConversationalTestCase, MultiTurnParams
from deepeval.utils import get_or_create_event_loop, prettify_list
from deepeval.metrics.mcp.schema import Task, ArgsScore, ToolScore, Reason
from deepeval.metrics.mcp.template import MCPTaskCompletionTemplate
@@ -21,8 +21,8 @@
class MultiTurnMCPUseMetric(BaseConversationalMetric):
_required_test_case_params = [
- TurnParams.ROLE,
- TurnParams.CONTENT,
+ MultiTurnParams.ROLE,
+ MultiTurnParams.CONTENT,
]
def __init__(
diff --git a/deepeval/metrics/mcp_use_metric/mcp_use_metric.py b/deepeval/metrics/mcp_use_metric/mcp_use_metric.py
index e110116344..cfcd55782e 100644
--- a/deepeval/metrics/mcp_use_metric/mcp_use_metric.py
+++ b/deepeval/metrics/mcp_use_metric/mcp_use_metric.py
@@ -10,7 +10,7 @@
)
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
MCPServer,
MCPToolCall,
MCPResourceCall,
@@ -24,10 +24,10 @@
class MCPUseMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
- LLMTestCaseParams.MCP_SERVERS,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
+ SingleTurnParams.MCP_SERVERS,
]
def __init__(
diff --git a/deepeval/metrics/misuse/misuse.py b/deepeval/metrics/misuse/misuse.py
index fd8d608cd6..e2b42dba52 100644
--- a/deepeval/metrics/misuse/misuse.py
+++ b/deepeval/metrics/misuse/misuse.py
@@ -3,7 +3,7 @@
from deepeval.metrics import BaseMetric
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.models import DeepEvalBaseLLM
@@ -25,9 +25,9 @@
class MisuseMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py b/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py
index b75180aef3..d0d3cd056e 100644
--- a/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py
+++ b/deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py
@@ -2,7 +2,7 @@
from typing import Optional, List, Tuple, Union
from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
+from deepeval.test_case import SingleTurnParams, LLMTestCase, MLLMImage
from deepeval.metrics.multimodal_metrics.image_coherence.template import (
ImageCoherenceTemplate,
)
@@ -25,9 +25,9 @@
class ImageCoherenceMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py b/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py
index f5ba67ecb5..63498a4f3b 100644
--- a/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py
+++ b/deepeval/metrics/multimodal_metrics/image_editing/image_editing.py
@@ -4,7 +4,7 @@
import textwrap
from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
+from deepeval.test_case import SingleTurnParams, LLMTestCase, MLLMImage
from deepeval.metrics.multimodal_metrics.image_editing.template import (
ImageEditingTemplate,
)
@@ -26,9 +26,9 @@
class ImageEditingMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py b/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py
index c9a046dd60..2cb1e0dbff 100644
--- a/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py
+++ b/deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py
@@ -2,7 +2,7 @@
from typing import Optional, List, Tuple, Union
from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
+from deepeval.test_case import SingleTurnParams, LLMTestCase, MLLMImage
from deepeval.metrics.multimodal_metrics.image_helpfulness.template import (
ImageHelpfulnessTemplate,
)
@@ -26,9 +26,9 @@
class ImageHelpfulnessMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py b/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py
index cb43eb9fc7..d5b541b0af 100644
--- a/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py
+++ b/deepeval/metrics/multimodal_metrics/image_reference/image_reference.py
@@ -2,7 +2,7 @@
from typing import Optional, List, Tuple, Union
from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
+from deepeval.test_case import SingleTurnParams, LLMTestCase, MLLMImage
from deepeval.metrics.multimodal_metrics.image_reference.template import (
ImageReferenceTemplate,
)
@@ -26,9 +26,9 @@
class ImageReferenceMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py b/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py
index 6fa99e1f48..9c99738ded 100644
--- a/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py
+++ b/deepeval/metrics/multimodal_metrics/text_to_image/text_to_image.py
@@ -4,7 +4,7 @@
import textwrap
from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCaseParams, LLMTestCase, MLLMImage
+from deepeval.test_case import SingleTurnParams, LLMTestCase, MLLMImage
from deepeval.metrics.multimodal_metrics.text_to_image.template import (
TextToImageTemplate,
)
@@ -23,9 +23,9 @@
from deepeval.metrics.multimodal_metrics.text_to_image.schema import ReasonScore
from deepeval.metrics.indicator import metric_progress_indicator
-required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
diff --git a/deepeval/metrics/non_advice/non_advice.py b/deepeval/metrics/non_advice/non_advice.py
index a4b3083bb3..ffae0d27a7 100644
--- a/deepeval/metrics/non_advice/non_advice.py
+++ b/deepeval/metrics/non_advice/non_advice.py
@@ -3,7 +3,7 @@
from deepeval.metrics import BaseMetric
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.models import DeepEvalBaseLLM
@@ -28,9 +28,9 @@
class NonAdviceMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/pattern_match/pattern_match.py b/deepeval/metrics/pattern_match/pattern_match.py
index 5045fcb816..3d46ab12b9 100644
--- a/deepeval/metrics/pattern_match/pattern_match.py
+++ b/deepeval/metrics/pattern_match/pattern_match.py
@@ -7,13 +7,13 @@
construct_verbose_logs,
)
from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
class PatternMatchMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/pii_leakage/pii_leakage.py b/deepeval/metrics/pii_leakage/pii_leakage.py
index 2ba025263d..7e34265bd7 100644
--- a/deepeval/metrics/pii_leakage/pii_leakage.py
+++ b/deepeval/metrics/pii_leakage/pii_leakage.py
@@ -3,7 +3,7 @@
from deepeval.metrics import BaseMetric
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.models import DeepEvalBaseLLM
@@ -25,9 +25,9 @@
class PIILeakageMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/plan_adherence/plan_adherence.py b/deepeval/metrics/plan_adherence/plan_adherence.py
index af3032fb8b..f80a1331d6 100644
--- a/deepeval/metrics/plan_adherence/plan_adherence.py
+++ b/deepeval/metrics/plan_adherence/plan_adherence.py
@@ -8,7 +8,7 @@
a_generate_with_schema_and_extract,
generate_with_schema_and_extract,
)
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics import BaseMetric
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.indicator import metric_progress_indicator
@@ -27,9 +27,9 @@
class PlanAdherenceMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/plan_quality/plan_quality.py b/deepeval/metrics/plan_quality/plan_quality.py
index 16ba91ad7f..fc899a4daa 100644
--- a/deepeval/metrics/plan_quality/plan_quality.py
+++ b/deepeval/metrics/plan_quality/plan_quality.py
@@ -8,7 +8,7 @@
a_generate_with_schema_and_extract,
generate_with_schema_and_extract,
)
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics import BaseMetric
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.indicator import metric_progress_indicator
@@ -30,9 +30,9 @@
class PlanQualityMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/prompt_alignment/prompt_alignment.py b/deepeval/metrics/prompt_alignment/prompt_alignment.py
index 27738b2e51..c6861ec52b 100644
--- a/deepeval/metrics/prompt_alignment/prompt_alignment.py
+++ b/deepeval/metrics/prompt_alignment/prompt_alignment.py
@@ -16,7 +16,7 @@
)
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics import BaseMetric
from deepeval.models import DeepEvalBaseLLM
@@ -27,9 +27,9 @@
class PromptAlignmentMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/role_adherence/role_adherence.py b/deepeval/metrics/role_adherence/role_adherence.py
index a433e14aa2..4c5e7ad1b7 100644
--- a/deepeval/metrics/role_adherence/role_adherence.py
+++ b/deepeval/metrics/role_adherence/role_adherence.py
@@ -16,12 +16,12 @@
)
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.test_case import Turn, ConversationalTestCase, TurnParams
+from deepeval.test_case import Turn, ConversationalTestCase, MultiTurnParams
from deepeval.utils import get_or_create_event_loop, prettify_list
class RoleAdherenceMetric(BaseConversationalMetric):
- _required_test_case_params = [TurnParams.CONTENT, TurnParams.ROLE]
+ _required_test_case_params = [MultiTurnParams.CONTENT, MultiTurnParams.ROLE]
def __init__(
self,
diff --git a/deepeval/metrics/role_violation/role_violation.py b/deepeval/metrics/role_violation/role_violation.py
index 78659480b2..73ab3fc57b 100644
--- a/deepeval/metrics/role_violation/role_violation.py
+++ b/deepeval/metrics/role_violation/role_violation.py
@@ -3,7 +3,7 @@
from deepeval.metrics import BaseMetric
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.models import DeepEvalBaseLLM
@@ -25,9 +25,9 @@
class RoleViolationMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/step_efficiency/step_efficiency.py b/deepeval/metrics/step_efficiency/step_efficiency.py
index 5f6b741140..2d5f888b5a 100644
--- a/deepeval/metrics/step_efficiency/step_efficiency.py
+++ b/deepeval/metrics/step_efficiency/step_efficiency.py
@@ -8,7 +8,7 @@
a_generate_with_schema_and_extract,
generate_with_schema_and_extract,
)
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics import BaseMetric
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.indicator import metric_progress_indicator
@@ -20,9 +20,9 @@
class StepEfficiencyMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/summarization/summarization.py b/deepeval/metrics/summarization/summarization.py
index 4de6809289..8ceac3181f 100644
--- a/deepeval/metrics/summarization/summarization.py
+++ b/deepeval/metrics/summarization/summarization.py
@@ -3,7 +3,7 @@
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics import BaseMetric
from deepeval.models import DeepEvalBaseLLM
@@ -32,9 +32,9 @@
class SummarizationMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/task_completion/task_completion.py b/deepeval/metrics/task_completion/task_completion.py
index f964e8e71b..8152bf6b4f 100644
--- a/deepeval/metrics/task_completion/task_completion.py
+++ b/deepeval/metrics/task_completion/task_completion.py
@@ -10,7 +10,7 @@
)
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics import BaseMetric
from deepeval.models import DeepEvalBaseLLM
@@ -24,9 +24,9 @@
class TaskCompletionMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/tool_correctness/tool_correctness.py b/deepeval/metrics/tool_correctness/tool_correctness.py
index b8520a86ef..24eb436ffa 100644
--- a/deepeval/metrics/tool_correctness/tool_correctness.py
+++ b/deepeval/metrics/tool_correctness/tool_correctness.py
@@ -13,7 +13,7 @@
from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
ToolCallParams,
ToolCall,
)
@@ -24,10 +24,10 @@
class ToolCorrectnessMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.TOOLS_CALLED,
- LLMTestCaseParams.EXPECTED_TOOLS,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.TOOLS_CALLED,
+ SingleTurnParams.EXPECTED_TOOLS,
]
def __init__(
diff --git a/deepeval/metrics/tool_use/tool_use.py b/deepeval/metrics/tool_use/tool_use.py
index c8dd196970..295432fc4d 100644
--- a/deepeval/metrics/tool_use/tool_use.py
+++ b/deepeval/metrics/tool_use/tool_use.py
@@ -11,7 +11,7 @@
)
from deepeval.test_case import (
ConversationalTestCase,
- TurnParams,
+ MultiTurnParams,
ToolCall,
Turn,
)
@@ -30,8 +30,8 @@
class ToolUseMetric(BaseConversationalMetric):
_required_test_case_params = [
- TurnParams.ROLE,
- TurnParams.CONTENT,
+ MultiTurnParams.ROLE,
+ MultiTurnParams.CONTENT,
]
def __init__(
diff --git a/deepeval/metrics/topic_adherence/topic_adherence.py b/deepeval/metrics/topic_adherence/topic_adherence.py
index b9440479f7..245ca7aeca 100644
--- a/deepeval/metrics/topic_adherence/topic_adherence.py
+++ b/deepeval/metrics/topic_adherence/topic_adherence.py
@@ -9,7 +9,7 @@
a_generate_with_schema_and_extract,
generate_with_schema_and_extract,
)
-from deepeval.test_case import ConversationalTestCase, TurnParams
+from deepeval.test_case import ConversationalTestCase, MultiTurnParams
from deepeval.metrics import BaseConversationalMetric
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.indicator import metric_progress_indicator
@@ -25,8 +25,8 @@
class TopicAdherenceMetric(BaseConversationalMetric):
_required_test_case_params = [
- TurnParams.ROLE,
- TurnParams.CONTENT,
+ MultiTurnParams.ROLE,
+ MultiTurnParams.CONTENT,
]
def __init__(
diff --git a/deepeval/metrics/toxicity/toxicity.py b/deepeval/metrics/toxicity/toxicity.py
index b05f734777..d64dcd206b 100644
--- a/deepeval/metrics/toxicity/toxicity.py
+++ b/deepeval/metrics/toxicity/toxicity.py
@@ -3,7 +3,7 @@
from deepeval.metrics import BaseMetric
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval.metrics.indicator import metric_progress_indicator
from deepeval.models import DeepEvalBaseLLM
@@ -26,9 +26,9 @@
class ToxicityMetric(BaseMetric):
- _required_params: List[LLMTestCaseParams] = [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ _required_params: List[SingleTurnParams] = [
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
def __init__(
diff --git a/deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py b/deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py
index 2d482a69b9..bc0ea528e9 100644
--- a/deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py
+++ b/deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py
@@ -1,7 +1,7 @@
from typing import List, Optional, Union, Type, Tuple
import asyncio
import itertools
-from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.test_case import ConversationalTestCase, MultiTurnParams, Turn
from deepeval.metrics import BaseConversationalMetric
from deepeval.utils import (
get_or_create_event_loop,
@@ -31,11 +31,11 @@
class TurnContextualPrecisionMetric(BaseConversationalMetric):
- _required_test_case_params: List[TurnParams] = [
- TurnParams.ROLE,
- TurnParams.CONTENT,
- TurnParams.RETRIEVAL_CONTEXT,
- TurnParams.EXPECTED_OUTCOME,
+ _required_test_case_params: List[MultiTurnParams] = [
+ MultiTurnParams.ROLE,
+ MultiTurnParams.CONTENT,
+ MultiTurnParams.RETRIEVAL_CONTEXT,
+ MultiTurnParams.EXPECTED_OUTCOME,
]
def __init__(
diff --git a/deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py b/deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py
index 786d5ac662..6612c12b43 100644
--- a/deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py
+++ b/deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py
@@ -1,7 +1,7 @@
from typing import List, Optional, Union, Type, Tuple
import asyncio
import itertools
-from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.test_case import ConversationalTestCase, MultiTurnParams, Turn
from deepeval.metrics import BaseConversationalMetric
from deepeval.utils import (
get_or_create_event_loop,
@@ -31,11 +31,11 @@
class TurnContextualRecallMetric(BaseConversationalMetric):
- _required_test_case_params: List[TurnParams] = [
- TurnParams.ROLE,
- TurnParams.CONTENT,
- TurnParams.RETRIEVAL_CONTEXT,
- TurnParams.EXPECTED_OUTCOME,
+ _required_test_case_params: List[MultiTurnParams] = [
+ MultiTurnParams.ROLE,
+ MultiTurnParams.CONTENT,
+ MultiTurnParams.RETRIEVAL_CONTEXT,
+ MultiTurnParams.EXPECTED_OUTCOME,
]
def __init__(
diff --git a/deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py b/deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py
index 795b611a0e..df689b84c2 100644
--- a/deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py
+++ b/deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py
@@ -1,7 +1,7 @@
from typing import List, Optional, Union, Type, Tuple
import asyncio
import itertools
-from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.test_case import ConversationalTestCase, MultiTurnParams, Turn
from deepeval.metrics import BaseConversationalMetric
from deepeval.utils import (
get_or_create_event_loop,
@@ -31,10 +31,10 @@
class TurnContextualRelevancyMetric(BaseConversationalMetric):
- _required_test_case_params: List[TurnParams] = [
- TurnParams.ROLE,
- TurnParams.CONTENT,
- TurnParams.RETRIEVAL_CONTEXT,
+ _required_test_case_params: List[MultiTurnParams] = [
+ MultiTurnParams.ROLE,
+ MultiTurnParams.CONTENT,
+ MultiTurnParams.RETRIEVAL_CONTEXT,
]
def __init__(
diff --git a/deepeval/metrics/turn_faithfulness/turn_faithfulness.py b/deepeval/metrics/turn_faithfulness/turn_faithfulness.py
index 1130b00d76..55fba6e149 100644
--- a/deepeval/metrics/turn_faithfulness/turn_faithfulness.py
+++ b/deepeval/metrics/turn_faithfulness/turn_faithfulness.py
@@ -1,7 +1,7 @@
from typing import List, Optional, Union, Type, Tuple
import asyncio
import itertools
-from deepeval.test_case import ConversationalTestCase, TurnParams, Turn
+from deepeval.test_case import ConversationalTestCase, MultiTurnParams, Turn
from deepeval.metrics import BaseConversationalMetric
from deepeval.utils import (
get_or_create_event_loop,
@@ -33,10 +33,10 @@
class TurnFaithfulnessMetric(BaseConversationalMetric):
- _required_test_case_params: List[TurnParams] = [
- TurnParams.ROLE,
- TurnParams.CONTENT,
- TurnParams.RETRIEVAL_CONTEXT,
+ _required_test_case_params: List[MultiTurnParams] = [
+ MultiTurnParams.ROLE,
+ MultiTurnParams.CONTENT,
+ MultiTurnParams.RETRIEVAL_CONTEXT,
]
def __init__(
diff --git a/deepeval/metrics/turn_relevancy/turn_relevancy.py b/deepeval/metrics/turn_relevancy/turn_relevancy.py
index 3354623048..ebc73d1816 100644
--- a/deepeval/metrics/turn_relevancy/turn_relevancy.py
+++ b/deepeval/metrics/turn_relevancy/turn_relevancy.py
@@ -18,7 +18,7 @@
)
from deepeval.models import DeepEvalBaseLLM
from deepeval.metrics.indicator import metric_progress_indicator
-from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
+from deepeval.test_case import ConversationalTestCase, Turn, MultiTurnParams
from deepeval.utils import get_or_create_event_loop, prettify_list
from deepeval.metrics.turn_relevancy.schema import (
TurnRelevancyVerdict,
@@ -27,7 +27,7 @@
class TurnRelevancyMetric(BaseConversationalMetric):
- _required_test_case_params = [TurnParams.CONTENT, TurnParams.ROLE]
+ _required_test_case_params = [MultiTurnParams.CONTENT, MultiTurnParams.ROLE]
def __init__(
self,
diff --git a/deepeval/metrics/utils.py b/deepeval/metrics/utils.py
index 0e8a526d56..75f6c9a99f 100644
--- a/deepeval/metrics/utils.py
+++ b/deepeval/metrics/utils.py
@@ -59,13 +59,13 @@
from deepeval.models.base_model import DeepEvalBaseEmbeddingModel
from deepeval.test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
ConversationalTestCase,
MLLMImage,
Turn,
ArenaTestCase,
ToolCall,
- TurnParams,
+ MultiTurnParams,
)
MULTIMODAL_SUPPORTED_MODELS = {
@@ -105,7 +105,7 @@ def copy_metrics(
def format_turns(
- llm_test_cases: List[LLMTestCase], test_case_params: List[LLMTestCaseParams]
+ llm_test_cases: List[LLMTestCase], test_case_params: List[SingleTurnParams]
) -> List[Dict[str, Union[str, List[str]]]]:
res = []
for llm_test_case in llm_test_cases:
@@ -120,17 +120,28 @@ def format_turns(
def convert_turn_to_dict(
turn: Turn,
- turn_params: List[TurnParams] = [TurnParams.CONTENT, TurnParams.ROLE],
+ turn_params: List[MultiTurnParams] = [
+ MultiTurnParams.CONTENT,
+ MultiTurnParams.ROLE,
+ ],
) -> Dict:
- result = {
- param.value: getattr(turn, param.value)
- for param in turn_params
- if (
- param != TurnParams.SCENARIO
- and param != TurnParams.EXPECTED_OUTCOME
- and getattr(turn, param.value) is not None
- )
- }
+ result = {}
+ for param in turn_params:
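+        # Skip params that are evaluated at the test-case level, not per turn.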
+ if param in (
+ MultiTurnParams.SCENARIO,
+ MultiTurnParams.EXPECTED_OUTCOME,
+ MultiTurnParams.METADATA,
+ MultiTurnParams.TAGS,
+ ):
+ continue
+
+ if not hasattr(turn, param.value):
+ continue
+
+ value = getattr(turn, param.value)
+ if value is not None:
+ result[param.value] = value
+
return result
@@ -220,7 +231,7 @@ def construct_verbose_logs(metric: BaseMetric, steps: List[str]) -> str:
def check_conversational_test_case_params(
test_case: ConversationalTestCase,
- test_case_params: List[TurnParams],
+ test_case_params: List[MultiTurnParams],
metric: BaseConversationalMetric,
require_chatbot_role: bool = False,
model: Optional[DeepEvalBaseLLM] = None,
@@ -251,18 +262,34 @@ def check_conversational_test_case_params(
raise ValueError(error_str)
if (
- TurnParams.EXPECTED_OUTCOME in test_case_params
+ MultiTurnParams.EXPECTED_OUTCOME in test_case_params
and test_case.expected_outcome is None
):
error_str = f"'expected_outcome' in a conversational test case cannot be empty for the '{metric.__name__}' metric."
metric.error = error_str
raise MissingTestCaseParamsError(error_str)
- if TurnParams.SCENARIO in test_case_params and test_case.scenario is None:
+ if (
+ MultiTurnParams.SCENARIO in test_case_params
+ and test_case.scenario is None
+ ):
error_str = f"'scenario' in a conversational test case cannot be empty for the '{metric.__name__}' metric."
metric.error = error_str
raise MissingTestCaseParamsError(error_str)
+ if (
+ MultiTurnParams.METADATA in test_case_params
+ and test_case.metadata is None
+ ):
+ error_str = f"'metadata' in a conversational test case cannot be empty for the '{metric.__name__}' metric."
+ metric.error = error_str
+ raise MissingTestCaseParamsError(error_str)
+
+ if MultiTurnParams.TAGS in test_case_params and test_case.tags is None:
+ error_str = f"'tags' in a conversational test case cannot be empty for the '{metric.__name__}' metric."
+ metric.error = error_str
+ raise MissingTestCaseParamsError(error_str)
+
if require_chatbot_role and test_case.chatbot_role is None:
error_str = f"'chatbot_role' in a conversational test case cannot be empty for the '{metric.__name__}' metric."
metric.error = error_str
@@ -276,7 +303,7 @@ def check_conversational_test_case_params(
def check_llm_test_case_params(
test_case: LLMTestCase,
- test_case_params: List[LLMTestCaseParams],
+ test_case_params: List[SingleTurnParams],
input_image_count: Optional[int],
actual_output_image_count: Optional[int],
metric: Union[BaseMetric, BaseArenaMetric],
@@ -327,10 +354,8 @@ def check_llm_test_case_params(
# Centralized: if a metric requires actual_output, reject empty/whitespace
# (including empty multimodal outputs) as "missing params".
- if LLMTestCaseParams.ACTUAL_OUTPUT in test_case_params:
- actual_output = getattr(
- test_case, LLMTestCaseParams.ACTUAL_OUTPUT.value
- )
+ if SingleTurnParams.ACTUAL_OUTPUT in test_case_params:
+ actual_output = getattr(test_case, SingleTurnParams.ACTUAL_OUTPUT.value)
if isinstance(actual_output, str) and actual_output == "":
error_str = f"'actual_output' cannot be empty for the '{metric.__name__}' metric"
metric.error = error_str
@@ -358,7 +383,7 @@ def check_llm_test_case_params(
def check_arena_test_case_params(
arena_test_case: ArenaTestCase,
- test_case_params: List[LLMTestCaseParams],
+ test_case_params: List[SingleTurnParams],
metric: BaseArenaMetric,
model: Optional[DeepEvalBaseLLM] = None,
multimodal: Optional[bool] = False,
diff --git a/deepeval/models/base_model.py b/deepeval/models/base_model.py
index c82251c745..0831824b5a 100644
--- a/deepeval/models/base_model.py
+++ b/deepeval/models/base_model.py
@@ -7,6 +7,7 @@
@dataclass
class DeepEvalModelData:
supports_log_probs: Optional[bool] = None
+ max_log_probs: Optional[int] = None
supports_multimodal: Optional[bool] = None
supports_structured_outputs: Optional[bool] = None
supports_json: Optional[bool] = None
diff --git a/deepeval/models/llms/constants.py b/deepeval/models/llms/constants.py
index 51993fb343..97e51d6371 100644
--- a/deepeval/models/llms/constants.py
+++ b/deepeval/models/llms/constants.py
@@ -390,6 +390,7 @@ def make_model_data(**kwargs: Any) -> ModelDataFactory:
),
"gpt-5.4": make_model_data(
supports_log_probs=True,
+ max_log_probs=5,
supports_multimodal=True,
supports_structured_outputs=True,
supports_json=True,
@@ -399,6 +400,7 @@ def make_model_data(**kwargs: Any) -> ModelDataFactory:
),
"gpt-5.4-2026-03-05": make_model_data(
supports_log_probs=True,
+ max_log_probs=5,
supports_multimodal=True,
supports_structured_outputs=True,
supports_json=True,
diff --git a/deepeval/models/llms/openai_model.py b/deepeval/models/llms/openai_model.py
index cf42389880..bda9fc7798 100644
--- a/deepeval/models/llms/openai_model.py
+++ b/deepeval/models/llms/openai_model.py
@@ -282,6 +282,13 @@ async def a_generate(
# Other generate functions #
############################
+ def _cap_top_logprobs(self, top_logprobs: int) -> int:
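+        """Cap top_logprobs at the model's max_log_probs, when one is defined."""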
+ max_log_probs = self.model_data.max_log_probs
+ if max_log_probs is None:
+ return top_logprobs
+
+ return min(top_logprobs, max_log_probs)
+
@retry_openai
def generate_raw_response(
self,
@@ -298,6 +305,7 @@ def generate_raw_response(
"when calling `generate_raw_response`."
)
+ top_logprobs = self._cap_top_logprobs(top_logprobs)
client = self.load_model(async_mode=False)
if is_multimodal:
prompt = convert_to_multi_modal_array(input=prompt)
@@ -336,6 +344,7 @@ async def a_generate_raw_response(
"when calling `a_generate_raw_response`."
)
+ top_logprobs = self._cap_top_logprobs(top_logprobs)
client = self.load_model(async_mode=True)
if is_multimodal:
prompt = convert_to_multi_modal_array(input=prompt)
diff --git a/deepeval/test_case/__init__.py b/deepeval/test_case/__init__.py
index 6acc94f8ba..7b7b04aa27 100644
--- a/deepeval/test_case/__init__.py
+++ b/deepeval/test_case/__init__.py
@@ -1,6 +1,8 @@
+import warnings
+
from .llm_test_case import (
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
ToolCall,
ToolCallParams,
MLLMImage,
@@ -8,7 +10,7 @@
from .conversational_test_case import (
ConversationalTestCase,
Turn,
- TurnParams,
+ MultiTurnParams,
)
from .arena_test_case import ArenaTestCase, Contestant
from .mcp import (
@@ -20,12 +22,12 @@
__all__ = [
"LLMTestCase",
- "LLMTestCaseParams",
+ "SingleTurnParams",
"ToolCall",
"ToolCallParams",
"ConversationalTestCase",
"Turn",
- "TurnParams",
+ "MultiTurnParams",
"MCPServer",
"MCPPromptCall",
"MCPResourceCall",
@@ -34,3 +36,23 @@
"ArenaTestCase",
"Contestant",
]
+
+
+def __getattr__(name: str):
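+    # PEP 562 module-level __getattr__: lazily map deprecated names to their replacements.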
+ if name == "LLMTestCaseParams":
+ warnings.warn(
+ "'LLMTestCaseParams' is deprecated and will be removed in a future "
+ "release. Use 'SingleTurnParams' instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return SingleTurnParams
+ if name == "TurnParams":
+ warnings.warn(
+ "'TurnParams' is deprecated and will be removed in a future "
+ "release. Use 'MultiTurnParams' instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return MultiTurnParams
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/deepeval/test_case/api.py b/deepeval/test_case/api.py
index 9ea56f08d2..4ef8a08cdb 100644
--- a/deepeval/test_case/api.py
+++ b/deepeval/test_case/api.py
@@ -22,7 +22,6 @@ def create_api_turn(turn: Turn, index: int) -> TurnApi:
user_id=turn.user_id,
retrievalContext=turn.retrieval_context,
toolsCalled=turn.tools_called,
- additionalMetadata=turn.additional_metadata,
order=index,
)
@@ -60,7 +59,7 @@ def create_api_test_case(
tags=test_case.tags,
comments=test_case.comments,
imagesMapping=test_case._get_images_mapping(),
- additionalMetadata=test_case.additional_metadata,
+ metadata=test_case.metadata,
)
api_test_case.turns = [
@@ -103,7 +102,7 @@ def create_api_test_case(
runDuration=None,
evaluationCost=None,
order=order,
- additionalMetadata=test_case.additional_metadata,
+ metadata=test_case.metadata,
comments=test_case.comments,
tags=test_case.tags,
trace=trace,
diff --git a/deepeval/test_case/conversational_test_case.py b/deepeval/test_case/conversational_test_case.py
index b5a30049e0..f22f47c5ba 100644
--- a/deepeval/test_case/conversational_test_case.py
+++ b/deepeval/test_case/conversational_test_case.py
@@ -1,4 +1,5 @@
import re
+import warnings
from pydantic import (
BaseModel,
Field,
@@ -21,9 +22,11 @@
from deepeval.test_case.llm_test_case import _MLLM_IMAGE_REGISTRY
-class TurnParams(Enum):
+class MultiTurnParams(Enum):
ROLE = "role"
CONTENT = "content"
+ METADATA = "metadata"
+ TAGS = "tags"
SCENARIO = "scenario"
EXPECTED_OUTCOME = "expected_outcome"
CONTEXT = "context"
@@ -36,6 +39,18 @@ class TurnParams(Enum):
MCP_PROMPTS = "mcp_prompts_called"
+def __getattr__(name: str):
+ if name == "TurnParams":
+ warnings.warn(
+ "'TurnParams' is deprecated and will be removed in a future "
+ "release. Use 'MultiTurnParams' instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return MultiTurnParams
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
class Turn(BaseModel):
role: Literal["user", "assistant"]
content: str
@@ -53,14 +68,31 @@ class Turn(BaseModel):
mcp_tools_called: Optional[List[MCPToolCall]] = Field(default=None)
mcp_resources_called: Optional[List[MCPResourceCall]] = Field(default=None)
mcp_prompts_called: Optional[List[MCPPromptCall]] = Field(default=None)
- additional_metadata: Optional[Dict] = Field(
+ metadata: Optional[Dict] = Field(
default=None,
- serialization_alias="additionalMetadata",
validation_alias=AliasChoices(
- "additionalMetadata", "additional_metadata"
+ "metadata", "additionalMetadata", "additional_metadata"
),
)
+ @property
+ def additional_metadata(self) -> Optional[Dict]:
+ warnings.warn(
+ "'additional_metadata' is deprecated. Use 'metadata' instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return self.metadata
+
+ @additional_metadata.setter
+ def additional_metadata(self, value: Optional[Dict]):
+ warnings.warn(
+ "'additional_metadata' is deprecated. Use 'metadata' instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ self.metadata = value
+
@property
def _mcp_interaction(self) -> bool:
"""Whether this turn involves any MCP interactions."""
@@ -84,8 +116,8 @@ def __repr__(self):
attrs.append(f"mcp_resources_called={self.mcp_resources_called!r}")
if self.mcp_prompts_called is not None:
attrs.append(f"mcp_prompts_called={self.mcp_prompts_called!r}")
- if self.additional_metadata is not None:
- attrs.append(f"additional_metadata={self.additional_metadata!r}")
+ if self.metadata is not None:
+ attrs.append(f"metadata={self.metadata!r}")
return f"Turn({', '.join(attrs)})"
@model_validator(mode="before")
@@ -158,11 +190,10 @@ class ConversationalTestCase(BaseModel):
serialization_alias="chatbotRole",
validation_alias=AliasChoices("chatbotRole", "chatbot_role"),
)
- additional_metadata: Optional[Dict] = Field(
+ metadata: Optional[Dict] = Field(
default=None,
- serialization_alias="additionalMetadata",
validation_alias=AliasChoices(
- "additionalMetadata", "additional_metadata"
+ "metadata", "additionalMetadata", "additional_metadata"
),
)
comments: Optional[str] = Field(default=None)
@@ -174,6 +205,24 @@ class ConversationalTestCase(BaseModel):
_dataset_alias: Optional[str] = PrivateAttr(default=None)
_dataset_id: Optional[str] = PrivateAttr(default=None)
+ @property
+ def additional_metadata(self) -> Optional[Dict]:
+ warnings.warn(
+ "'additional_metadata' is deprecated. Use 'metadata' instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return self.metadata
+
+ @additional_metadata.setter
+ def additional_metadata(self, value: Optional[Dict]):
+ warnings.warn(
+ "'additional_metadata' is deprecated. Use 'metadata' instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ self.metadata = value
+
@model_validator(mode="after")
def set_is_multimodal(self):
import re
diff --git a/deepeval/test_case/llm_test_case.py b/deepeval/test_case/llm_test_case.py
index c5e7f854a9..00152fd516 100644
--- a/deepeval/test_case/llm_test_case.py
+++ b/deepeval/test_case/llm_test_case.py
@@ -14,6 +14,7 @@
import mimetypes
import base64
import weakref
+import warnings
from dataclasses import dataclass, field
from urllib.parse import urlparse, unquote
from deepeval.utils import make_model_config
@@ -167,12 +168,14 @@ def as_data_uri(self) -> Optional[str]:
return f"data:{self.mimeType};base64,{self.dataBase64}"
-class LLMTestCaseParams(Enum):
+class SingleTurnParams(Enum):
INPUT = "input"
ACTUAL_OUTPUT = "actual_output"
EXPECTED_OUTPUT = "expected_output"
CONTEXT = "context"
RETRIEVAL_CONTEXT = "retrieval_context"
+ METADATA = "metadata"
+ TAGS = "tags"
TOOLS_CALLED = "tools_called"
EXPECTED_TOOLS = "expected_tools"
MCP_SERVERS = "mcp_servers"
@@ -181,6 +184,18 @@ class LLMTestCaseParams(Enum):
MCP_PROMPTS_CALLED = "mcp_prompts_called"
+def __getattr__(name: str):
+ if name == "LLMTestCaseParams":
+ warnings.warn(
+ "'LLMTestCaseParams' is deprecated and will be removed in a future "
+ "release. Use 'SingleTurnParams' instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return SingleTurnParams
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
class ToolCallParams(Enum):
INPUT_PARAMETERS = "input_parameters"
OUTPUT = "output"
@@ -324,11 +339,10 @@ class LLMTestCase(BaseModel):
serialization_alias="retrievalContext",
validation_alias=AliasChoices("retrievalContext", "retrieval_context"),
)
- additional_metadata: Optional[Dict] = Field(
+ metadata: Optional[Dict] = Field(
default=None,
- serialization_alias="additionalMetadata",
validation_alias=AliasChoices(
- "additionalMetadata", "additional_metadata"
+ "metadata", "additionalMetadata", "additional_metadata"
),
)
tools_called: Optional[List[ToolCall]] = Field(
@@ -383,6 +397,24 @@ class LLMTestCase(BaseModel):
default_factory=lambda: str(uuid.uuid4())
)
+ @property
+ def additional_metadata(self) -> Optional[Dict]:
+ warnings.warn(
+ "'additional_metadata' is deprecated. Use 'metadata' instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ return self.metadata
+
+ @additional_metadata.setter
+ def additional_metadata(self, value: Optional[Dict]):
+ warnings.warn(
+ "'additional_metadata' is deprecated. Use 'metadata' instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ self.metadata = value
+
@model_validator(mode="after")
def set_is_multimodal(self):
import re
diff --git a/deepeval/test_run/api.py b/deepeval/test_run/api.py
index 9180c6b811..dab9598f57 100644
--- a/deepeval/test_run/api.py
+++ b/deepeval/test_run/api.py
@@ -45,9 +45,7 @@ class LLMApiTestCase(BaseModel):
order: Union[int, None] = Field(None)
# These should map 1 to 1 from golden
- additional_metadata: Optional[Dict] = Field(
- None, alias="additionalMetadata"
- )
+ metadata: Optional[Dict] = Field(None)
comments: Optional[str] = Field(None)
trace: Optional[TraceApi] = Field(None)
@@ -104,9 +102,6 @@ class TurnApi(BaseModel):
user_id: Optional[str] = Field(None, alias="userId")
retrieval_context: Optional[list] = Field(None, alias="retrievalContext")
tools_called: Optional[List[ToolCall]] = Field(None, alias="toolsCalled")
- additional_metadata: Optional[Dict] = Field(
- None, alias="additionalMetadata"
- )
comments: Optional[str] = Field(None)
@@ -123,9 +118,7 @@ class ConversationalApiTestCase(BaseModel):
user_description: Optional[str] = Field(None, alias="userDescription")
context: Optional[list] = Field(None)
comments: Optional[str] = Field(None)
- additional_metadata: Optional[Dict] = Field(
- None, alias="additionalMetadata"
- )
+ metadata: Optional[Dict] = Field(None)
images_mapping: Optional[Dict[str, MLLMImage]] = Field(
None, alias="imagesMapping"
)
diff --git a/deepeval/test_run/cache.py b/deepeval/test_run/cache.py
index 0fad66b3b5..18661957d4 100644
--- a/deepeval/test_run/cache.py
+++ b/deepeval/test_run/cache.py
@@ -8,7 +8,7 @@
from deepeval.utils import make_model_config
-from deepeval.test_case import LLMTestCaseParams, LLMTestCase, ToolCallParams
+from deepeval.test_case import SingleTurnParams, LLMTestCase, ToolCallParams
from deepeval.test_run.api import MetricData
from deepeval.utils import (
delete_file_if_exists,
@@ -51,7 +51,7 @@ class MetricConfiguration(BaseModel):
assessment_questions: Optional[List[str]] = None
embeddings: Optional[str] = None
evaluation_params: Optional[
- Union[List[LLMTestCaseParams], List[ToolCallParams]]
+ Union[List[SingleTurnParams], List[ToolCallParams]]
] = None
@@ -118,11 +118,11 @@ def get_cached_test_case(
cached_test_run = self.get_cached_test_run()
cache_dict = {
- LLMTestCaseParams.INPUT.value: test_case.input,
- LLMTestCaseParams.ACTUAL_OUTPUT.value: test_case.actual_output,
- LLMTestCaseParams.EXPECTED_OUTPUT.value: test_case.expected_output,
- LLMTestCaseParams.CONTEXT.value: test_case.context,
- LLMTestCaseParams.RETRIEVAL_CONTEXT.value: test_case.retrieval_context,
+ SingleTurnParams.INPUT.value: test_case.input,
+ SingleTurnParams.ACTUAL_OUTPUT.value: test_case.actual_output,
+ SingleTurnParams.EXPECTED_OUTPUT.value: test_case.expected_output,
+ SingleTurnParams.CONTEXT.value: test_case.context,
+ SingleTurnParams.RETRIEVAL_CONTEXT.value: test_case.retrieval_context,
"hyperparameters": hyperparameters,
}
test_case_cache_key = serialize(cache_dict)
@@ -141,11 +141,11 @@ def cache_test_case(
if self.disable_write_cache or portalocker is None:
return
cache_dict = {
- LLMTestCaseParams.INPUT.value: test_case.input,
- LLMTestCaseParams.ACTUAL_OUTPUT.value: test_case.actual_output,
- LLMTestCaseParams.EXPECTED_OUTPUT.value: test_case.expected_output,
- LLMTestCaseParams.CONTEXT.value: test_case.context,
- LLMTestCaseParams.RETRIEVAL_CONTEXT.value: test_case.retrieval_context,
+ SingleTurnParams.INPUT.value: test_case.input,
+ SingleTurnParams.ACTUAL_OUTPUT.value: test_case.actual_output,
+ SingleTurnParams.EXPECTED_OUTPUT.value: test_case.expected_output,
+ SingleTurnParams.CONTEXT.value: test_case.context,
+ SingleTurnParams.RETRIEVAL_CONTEXT.value: test_case.retrieval_context,
"hyperparameters": hyperparameters,
}
test_case_cache_key = serialize(cache_dict)
diff --git a/deepeval/test_run/test_run.py b/deepeval/test_run/test_run.py
index 750157f2b3..ce80e6245d 100644
--- a/deepeval/test_run/test_run.py
+++ b/deepeval/test_run/test_run.py
@@ -896,23 +896,6 @@ def post_test_run(self, test_run: TestRun) -> Optional[Tuple[str, str]]:
json_str = json.dumps(body, cls=TestRunEncoder)
body = json.loads(json_str)
- # DEBUG: dump the exact JSON payload sent to Confident AI.
- try:
- print(
- "\n===== [deepeval debug] POST payload to Confident AI =====",
- flush=True,
- )
- print(json.dumps(body, indent=2, default=str), flush=True)
- print(
- "===== [deepeval debug] end payload =====\n",
- flush=True,
- )
- except Exception as _dbg_err:
- print(
- f"[deepeval debug] failed to dump payload: {_dbg_err}",
- flush=True,
- )
-
data, link = api.send_request(
method=HttpMethods.POST,
endpoint=Endpoints.TEST_RUN_ENDPOINT,
@@ -1103,7 +1086,8 @@ def wrap_up_test_run(
self.save_test_run_locally()
delete_file_if_exists(self.temp_file_path)
- if is_confident() and self.disable_request is False:
+ confident_enabled = is_confident()
+ if confident_enabled and self.disable_request is False:
return self.post_test_run(test_run)
else:
self.save_test_run(
diff --git a/deepeval/utils.py b/deepeval/utils.py
index 7f0e61d6e1..46c4b16cd6 100644
--- a/deepeval/utils.py
+++ b/deepeval/utils.py
@@ -83,7 +83,6 @@ class TurnLike(Protocol):
user_id: Optional[str]
retrieval_context: Optional[Sequence[str]]
tools_called: Optional[Sequence[Any]]
- additional_metadata: Optional[Dict[str, Any]]
comments: Optional[str]
@@ -124,7 +123,7 @@ def convert_keys_to_snake_case(data: Any) -> Any:
new_dict = {}
for k, v in data.items():
new_key = camel_to_snake(k)
- if k == "additionalMetadata":
+ if k == "additionalMetadata" or k == "metadata":
new_dict[new_key] = (
v # Convert key but do not recurse into value
)
@@ -642,17 +641,6 @@ def format_turn(
f"{indent}↳ comment: {shorten(str(turn.comments), meta_length)}"
)
- meta = turn.additional_metadata or {}
- if isinstance(meta, dict):
- for k in list(meta.keys())[:3]:
- if k in {"user_id", "userId"}:
- continue
- v = meta.get(k)
- if v is not None:
- lines.append(
- f"{indent}↳ meta.{k}: {shorten(str(v), meta_length)}"
- )
-
return "\n".join(lines)
diff --git a/demo_trace_scope/test_observed_app.py b/demo_trace_scope/test_observed_app.py
index a8a133dab9..9b6b2381fd 100644
--- a/demo_trace_scope/test_observed_app.py
+++ b/demo_trace_scope/test_observed_app.py
@@ -6,7 +6,7 @@
from deepeval.dataset import Golden
from deepeval.tracing import observe, update_current_span, update_current_trace
from deepeval.metrics import AnswerRelevancyMetric, GEval, FaithfulnessMetric
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
@observe(metrics=[AnswerRelevancyMetric()])
@@ -18,8 +18,8 @@ def retriever(query: str) -> list[str]:
name="Metric 1",
criteria="Metric 1 criteria",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
)
diff --git a/docs/content/blog/rag-contract-assistant-deepeval-guide.mdx b/docs/content/blog/rag-contract-assistant-deepeval-guide.mdx
index c40fd9ecaa..5153cb497e 100644
--- a/docs/content/blog/rag-contract-assistant-deepeval-guide.mdx
+++ b/docs/content/blog/rag-contract-assistant-deepeval-guide.mdx
@@ -318,7 +318,7 @@ Here’s how you can evaluate the generator with the above mentioned metrics:
```python
from deepeval.metrics import FaithfulnessMetric, AnswerRelevancyMetric, GEval
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
# Hardcoded query and expected answer
query = "What benefits do part-time employees get?"
@@ -344,13 +344,13 @@ metrics = [
GEval(
name="Tone",
criteria="Is the answer professional?",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
strict_mode=True,
),
GEval(
name="Citations",
criteria="Does the answer cite or refer to the source documents?",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.CONTEXT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.CONTEXT],
strict_mode=True,
),
]
@@ -374,7 +374,7 @@ There are multiple levers you can adjust to improve the generator:
4. Citation formatting and instruction
```python
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics import FaithfulnessMetric, AnswerRelevancyMetric, GEval
from langchain.llms import Ollama, OpenAI, HuggingFaceHub
@@ -399,13 +399,13 @@ metrics = [
GEval(
name="Tone",
criteria="Is the answer professional?",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
strict_mode=True,
),
GEval(
name="Citations",
criteria="Does the answer cite or refer to the source documents?",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.CONTEXT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.CONTEXT],
strict_mode=True,
),
]
@@ -484,7 +484,7 @@ from deepeval.metrics import (
GEval,
ContextualRelevancyMetric,
)
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.dataset import EvaluationDataset
from deepeval import assert_test
@@ -541,7 +541,7 @@ metrics = [
GEval(
name="Professional Tone Check",
criteria="Is the answer professionally framed and appropriate for a legal context?",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
strict_mode=True,
threshold=0.8,
),
diff --git a/docs/content/blog/top-5-geval-use-cases.mdx b/docs/content/blog/top-5-geval-use-cases.mdx
index 90c55f4f00..2d0f1b3005 100644
--- a/docs/content/blog/top-5-geval-use-cases.mdx
+++ b/docs/content/blog/top-5-geval-use-cases.mdx
@@ -35,13 +35,13 @@ Here's how to define a G-Eval metric in DeepEval with just a few lines of code:
```python
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
# Define a custom G-Eval metric
custom_metric = GEval(
name="Relevancy",
criteria="Check if the actual output directly addresses the input.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.INPUT]
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.INPUT]
)
```
@@ -66,7 +66,7 @@ Here's an example answer correctness metric defined using G-Eval:
```python
# Create a custom correctness metric
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
correctness_metric = GEval(
name="Correctness",
@@ -77,7 +77,7 @@ correctness_metric = GEval(
"You should also heavily penalize omission of detail",
"Vague language, or contradicting OPINIONS, are OK"
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
)
```
@@ -115,7 +115,7 @@ Here's an example coherence metric assessing clarity defined using G-Eval:
```python
# Create a custom clarity metric focused on clear communication
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
clarity_metric = GEval(
name="Clarity",
@@ -125,7 +125,7 @@ clarity_metric = GEval(
"Assess whether complex ideas are presented in a way that’s easy to follow.",
"Identify any vague or confusing parts that reduce understanding."
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
```
@@ -159,7 +159,7 @@ Here's an example professionalism metric defined using G-Eval:
```python
# Create a custom professionalism metric
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
professionalism_metric = GEval(
name="Professionalism",
@@ -171,7 +171,7 @@ professionalism_metric = GEval(
"Ensure the actual output stays contextually appropriate and avoids casual or ambiguous expressions.",
"Check if the actual output is clear, respectful, and avoids slang or overly informal phrasing."
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
```
@@ -203,7 +203,7 @@ Here's an example custom PII Leakage metric.
```python
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
pii_leakage_metric = GEval(
name="PII Leakage",
@@ -213,7 +213,7 @@ pii_leakage_metric = GEval(
"Ensure the output uses placeholders or anonymized data when applicable.",
"Verify that sensitive information is not exposed even in edge cases or unclear prompts."
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
```
@@ -248,7 +248,7 @@ Below is an example of a custom **Faithfulness** metric for a medical diagnosis
```python
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
custom_faithfulness_metric = GEval(
name="Medical Diagnosis Faithfulness",
@@ -261,7 +261,7 @@ custom_faithfulness_metric = GEval(
"Heavily penalize hallucinations, especially those that could result in incorrect medical advice.",
"Provide reasons for the faithfulness score, emphasizing the importance of clinical accuracy and patient safety."
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.RETRIEVAL_CONTEXT],
)
```
@@ -285,12 +285,12 @@ This is a naive G-Eval approach to evaluate the persuasiveness of a sales email
```python
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
geval_metric = GEval(
name="Persuasiveness",
criteria="Determine how persuasive the `actual output` is to getting a user booking in a call.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
```
@@ -320,7 +320,7 @@ A **DAG** handles the above use case deterministically by splitting the logic, a
Here is an example of a G-Eval + DAG approach:
```python
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics.dag import (
DeepAcyclicGraph,
TaskNode,
@@ -333,7 +333,7 @@ from deepeval.metrics import DAGMetric, GEval
geval_metric = GEval(
name="Persuasiveness",
criteria="Determine how persuasive the `actual output` is to getting a user booking in a call.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
conciseness_node = BinaryJudgementNode(
diff --git a/docs/content/changelog/changelog-2025.mdx b/docs/content/changelog/changelog-2025.mdx
index 8f6d38fd09..2047cc2b10 100644
--- a/docs/content/changelog/changelog-2025.mdx
+++ b/docs/content/changelog/changelog-2025.mdx
@@ -14,6 +14,10 @@ sidebar_label: 🐍 2025
- **Reliability** improved with better async handling, timeouts, and retries
- **Documentation** expanded with comprehensive tutorials to help teams ship confidently
+## Heads up: deprecations
+
+- `LLMTestCaseParams` has been renamed to `SingleTurnParams`, and `TurnParams` has been renamed to `MultiTurnParams`, so the names line up with the AI system being evaluated rather than the underlying object. The old names still resolve, but importing them now emits a `DeprecationWarning`; switch to `SingleTurnParams` / `MultiTurnParams` to silence it.
+- `additional_metadata` has been renamed to `metadata` on `LLMTestCase`, `ConversationalTestCase`, and `Turn`. The old attribute remains as a deprecated property that warns on access, and the `additionalMetadata` / `additional_metadata` aliases are still accepted during validation.
+- GEval now treats `metadata` and `tags` strictly as test-case-level params for `ConversationalTestCase`; they are no longer pulled from individual turns into the prompt.
+
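+Roughly, the deprecation shims behave as sketched below (a minimal example; the exact warning text may differ):
+
+```python
+import warnings
+
+from deepeval.test_case import LLMTestCase, SingleTurnParams, MultiTurnParams
+
+with warnings.catch_warnings(record=True) as caught:
+    warnings.simplefilter("always")
+    # The old names still import, but resolving them emits a DeprecationWarning
+    from deepeval.test_case import LLMTestCaseParams, TurnParams
+
+assert LLMTestCaseParams is SingleTurnParams
+assert TurnParams is MultiTurnParams
+assert any(issubclass(w.category, DeprecationWarning) for w in caught)
+
+# 'additional_metadata' is now 'metadata'; the old attribute still works but warns
+tc = LLMTestCase(input="Hi", actual_output="Hello!", metadata={"source": "docs"})
+print(tc.metadata)             # {'source': 'docs'}
+print(tc.additional_metadata)  # same dict, but accessing it emits a DeprecationWarning
+```
+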
## Thank you to our contributors
@@ -1050,7 +1054,7 @@ March made evaluations and synthesis more reliable. Defaults improved for Ollama
#### v2.5.8
- Fix Ragas metrics failing with an “async_mode is missing” error by explicitly running metric tracking in non-async mode during evaluation. ([#1402](https://github.com/confident-ai/deepeval/pull/1402)) {/* pr:1402 */} ([Tanay Agrawal](https://github.com/tanayag))
-- Fix the import path for `LLMTestCaseParams` in the metrics selection tutorial so the example code runs without import errors. ([#1407](https://github.com/confident-ai/deepeval/pull/1407)) {/* pr:1407 */} ([Obada Khalili](https://github.com/obadakhalili))
+- Fix the import path for `LLMTestCaseParams` (now `SingleTurnParams`) in the metrics selection tutorial so the example code runs without import errors. ([#1407](https://github.com/confident-ai/deepeval/pull/1407)) {/* pr:1407 */} ([Obada Khalili](https://github.com/obadakhalili))
- Fix a typo in the synthetic input generation template to clarify instructions about avoiding repetitive `input`. ([#1408](https://github.com/confident-ai/deepeval/pull/1408)) {/* pr:1408 */} ([John D. McDonald](https://github.com/Rasputin2))
- Fix tool correctness reason messages so the `expected` and `called` tool names are reported in the right order when using exact match checks. ([#1409](https://github.com/confident-ai/deepeval/pull/1409)) {/* pr:1409 */} ([Casey Lewiston](https://github.com/shredinger137))
- Fix the dataset synthesis tutorial to use the correct `StylingConfig` keyword argument, replacing `expected_output` with `expected_output_format` so the example code runs as intended. ([#1411](https://github.com/confident-ai/deepeval/pull/1411)) {/* pr:1411 */} ([Obada Khalili](https://github.com/obadakhalili))
diff --git a/docs/content/docs/(concepts)/(test-cases)/evaluation-arena-test-cases.mdx b/docs/content/docs/(concepts)/(test-cases)/evaluation-arena-test-cases.mdx
index 018d9d059b..b1f6bbc9d3 100644
--- a/docs/content/docs/(concepts)/(test-cases)/evaluation-arena-test-cases.mdx
+++ b/docs/content/docs/(concepts)/(test-cases)/evaluation-arena-test-cases.mdx
@@ -212,15 +212,15 @@ The `[DEEPEVAL:IMAGE:awefv234fvbnhg456]` here is actually the instance of `MLLMI
The [`ArenaGEval` metric](/docs/metrics-arena-g-eval) is the only metric that uses an `ArenaTestCase`, which picks a "winner" out of the list of contestants:
```python
-from deepeval.metrics import ArenaTestCase, LLMTestCaseParams
+from deepeval.test_case import ArenaTestCase, SingleTurnParams
...
arena_geval = ArenaGEval(
name="Friendly",
criteria="Choose the winner of the more friendly contestant based on the input and actual output",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
)
diff --git a/docs/content/docs/(concepts)/evaluation-prompts.mdx b/docs/content/docs/(concepts)/evaluation-prompts.mdx
index 2f775dc7ae..eb393cfe17 100644
--- a/docs/content/docs/(concepts)/evaluation-prompts.mdx
+++ b/docs/content/docs/(concepts)/evaluation-prompts.mdx
@@ -207,7 +207,7 @@ for golden in dataset.evals_iterator():
You can also evaluate prompts side-by-side using `ArenaGEval` to pick the best-performing prompt for your given criteria. Simply include the prompts in the `hyperparameters` field of each `Contestant`.
```python title="main.py" showLineNumbers={true}
-from deepeval.test_case import ArenaTestCase, LLMTestCase, LLMTestCaseParams, Contestant
+from deepeval.test_case import ArenaTestCase, LLMTestCase, SingleTurnParams, Contestant
from deepeval.metrics import ArenaGEval
from deepeval.prompt import Prompt
from deepeval import compare
@@ -234,8 +234,8 @@ arena_geval = ArenaGEval(
name="Friendly",
criteria="Choose the winner of the more friendly contestant based on the input and actual output",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
)
diff --git a/docs/content/docs/(custom)/metrics-arena-g-eval.mdx b/docs/content/docs/(custom)/metrics-arena-g-eval.mdx
index 76233b71a6..3befd7b0a8 100644
--- a/docs/content/docs/(custom)/metrics-arena-g-eval.mdx
+++ b/docs/content/docs/(custom)/metrics-arena-g-eval.mdx
@@ -24,7 +24,7 @@ You'll also need to supply any additional arguments such as `expected_output` an
To create a custom metric that chooses the best `LLMTestCase`, simply instantiate a `ArenaGEval` class and define an evaluation criteria in everyday language:
```python
-from deepeval.test_case import ArenaTestCase, LLMTestCase, LLMTestCaseParams, Contestant
+from deepeval.test_case import ArenaTestCase, LLMTestCase, SingleTurnParams, Contestant
from deepeval.metrics import ArenaGEval
from deepeval import compare
@@ -52,8 +52,8 @@ metric = ArenaGEval(
name="Friendly",
criteria="Choose the winner of the more friendly contestant based on the input and actual output",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
)
@@ -64,7 +64,7 @@ There are **THREE** mandatory and **FOUR** optional parameters required when ins
- `name`: name of metric. This will **not** affect the evaluation.
- `criteria`: a description outlining the specific evaluation aspects for each test case.
-- `evaluation_params`: a list of type `LLMTestCaseParams`, include only the parameters that are relevant for evaluation..
+- `evaluation_params`: a list of type `SingleTurnParams`; include only the parameters that are relevant for evaluation.
- [Optional] `evaluation_steps`: a list of strings outlining the exact steps the LLM should take for evaluation. If `evaluation_steps` is not provided, `ConversationalGEval` will generate a series of `evaluation_steps` on your behalf based on the provided `criteria`. You can only provide either `evaluation_steps` **OR** `criteria`, and not both.
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](/docs/metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to .
- [Optional] `async_mode`: a boolean which when set to `True`, enables [concurrent execution within the `measure()` method.](/docs/metrics-introduction#measuring-metrics-in-async) Defaulted to `True`.
diff --git a/docs/content/docs/(custom)/metrics-conversational-dag.mdx b/docs/content/docs/(custom)/metrics-conversational-dag.mdx
index db5f7153c4..f772431ec1 100644
--- a/docs/content/docs/(custom)/metrics-conversational-dag.mdx
+++ b/docs/content/docs/(custom)/metrics-conversational-dag.mdx
@@ -64,7 +64,7 @@ There are exactly **FOUR** different node types you can choose from to create a
### Task node
-The `ConversationalTaskNode` is designed specifically for processing either the data from a test case using parameters from `TurnParams`, or the output from a parent `ConversationalTaskNode`.
+The `ConversationalTaskNode` is designed specifically for processing either the data from a test case using parameters from `MultiTurnParams`, or the output from a parent `ConversationalTaskNode`.
:::note
The `ConversationalDAGMetric` allows you to choose a certain window of turns to run evaluations on as well.
@@ -76,12 +76,12 @@ You can also break down a conversation into atomic units by choosing a specific
```python
from deepeval.metrics.conversational_dag import ConversationalTaskNode
-from deepeval.test_case import TurnParams
+from deepeval.test_case import MultiTurnParams
task_node = ConversationalTaskNode(
instructions="Summarize the assistant's replies in one paragraph.",
output_label="Summary",
- evaluation_params=[TurnParams.ROLE, TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.ROLE, MultiTurnParams.CONTENT],
children=[],
turn_window=(0,6),
)
@@ -92,7 +92,7 @@ There are **THREE** mandatory and **THREE** optional parameters when creating a
- `instructions`: a string specifying how to process a conversation, and/or outputs from a previous parent `TaskNode`.
- `output_label`: a string representing the final output. The `child` `ConversationalBaseNode`s will use the `output_label` to reference the output from the current `ConversationalTaskNode`.
- `children`: a list of `ConversationalBaseNode`s. There **must not** be a `ConversationalVerdictNode` in the list of children for a `ConversationalTaskNode`.
-- [Optional] `evaluation_params`: a list of type `TurnParams`. Include only the parameters that are relevant for processing.
+- [Optional] `evaluation_params`: a list of type `MultiTurnParams`. Include only the parameters that are relevant for processing.
- [Optional] `label`: a string that will be displayed in the verbose logs if `verbose_mode` is `True`.
- [Optional] `turn_window`: a tuple of 2 indices (inclusive) specifying the conversation window the task node must focus on. The window must contain the conversation where the task must be performed.
@@ -116,7 +116,7 @@ There are **TWO** mandatory and **THREE** optional parameters when creating a `C
- `criteria`: a yes/no question based on output from parent node(s) and optionally parameters from the `Turn`.
- `children`: a list of exactly two `ConversationalVerdictNodes`, one with a verdict value of `True`, and the other with a value of `False`.
-- [Optional] `evaluation_params`: a list of type `TurnParams`. Include only the parameters that are relevant for processing.
+- [Optional] `evaluation_params`: a list of type `MultiTurnParams`. Include only the parameters that are relevant for processing.
- [Optional] `label`: a string that will be displayed in the verbose logs if `verbose_mode` is `True`.
- [Optional] `turn_window`: a tuple of 2 indices (inclusive) specifying the conversation window the task node must focus on. The window must contain the conversation where the task must be performed.
@@ -145,7 +145,7 @@ There are **TWO** mandatory and **THREE** optional parameters when creating a `C
- `criteria`: an open-ended question based on output from parent node(s) and optionally parameters from the `Turn`.
- `children`: a list of `ConversationalVerdictNodes`, where the `verdict` values determine the possible verdict of the current non-binary judgement.
-- [Optional] `evaluation_params`: a list of type `TurnParams`. Include only the parameters that are relevant for processing.
+- [Optional] `evaluation_params`: a list of type `MultiTurnParams`. Include only the parameters that are relevant for processing.
- [Optional] `label`: a string that will be displayed in the verbose logs if `verbose_mode` is `True`.
- [Optional] `turn_window`: a tuple of 2 indices (inclusive) specifying the conversation window the task node must focus on. The window must contain the conversation where the task must be performed.
@@ -211,7 +211,7 @@ from deepeval.metrics.conversational_dag import ConversationalTaskNode
task_node = ConversationalTaskNode(
instructions="Summarize the conversation and explain assistant's behaviour overall.",
output_label="Summary",
- evaluation_params=[TurnParams.ROLE, TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.ROLE, MultiTurnParams.CONTENT],
children=[],
)
```
@@ -284,7 +284,7 @@ from deepeval.metrics.conversational_dag import (
ConversationalNonBinaryJudgementNode,
ConversationalVerdictNode,
)
-from deepeval.test_case import TurnParams
+from deepeval.test_case import MultiTurnParams
non_binary_node = ConversationalNonBinaryJudgementNode(
criteria="How was the assistant's behaviour towards user?",
@@ -306,7 +306,7 @@ binary_node = ConversationalBinaryJudgementNode(
task_node = ConversationalTaskNode(
instructions="Summarize the conversation and explain assistant's behaviour overall.",
output_label="Summary",
- evaluation_params=[TurnParams.ROLE, TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.ROLE, MultiTurnParams.CONTENT],
children=[binary_node],
)
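
The node hunks above compose into a full conversational DAG metric. A minimal sketch, reusing the `task_node` built above and assuming `ConversationalDAGMetric` accepts the same `name`/`dag` arguments and `measure()` call as the single-turn `DAGMetric` (the sample conversation is illustrative):

```python
from deepeval.metrics import ConversationalDAGMetric
from deepeval.metrics.dag import DeepAcyclicGraph
from deepeval.test_case import ConversationalTestCase, Turn

# `task_node` is the ConversationalTaskNode constructed in the snippet above.
dag = DeepAcyclicGraph(root_nodes=[task_node])
behaviour_metric = ConversationalDAGMetric(name="Assistant Behaviour", dag=dag)

# Illustrative conversation, not taken from the docs.
convo = ConversationalTestCase(
    turns=[
        Turn(role="user", content="My order arrived broken."),
        Turn(role="assistant", content="Sorry about that. I can arrange a replacement right away."),
    ]
)
behaviour_metric.measure(convo)
print(behaviour_metric.score, behaviour_metric.reason)
```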
diff --git a/docs/content/docs/(custom)/metrics-conversational-g-eval.mdx b/docs/content/docs/(custom)/metrics-conversational-g-eval.mdx
index 58c9fc593e..db88cc9fe2 100644
--- a/docs/content/docs/(custom)/metrics-conversational-g-eval.mdx
+++ b/docs/content/docs/(custom)/metrics-conversational-g-eval.mdx
@@ -23,7 +23,7 @@ To create a custom metric that evaluates entire LLM conversations, simply instan
```python
from deepeval import evaluate
-from deepeval.test_case import Turn, TurnParams, ConversationalTestCase
+from deepeval.test_case import Turn, MultiTurnParams, ConversationalTestCase
from deepeval.metrics import ConversationalGEval
convo_test_case = ConversationalTestCase(
@@ -45,7 +45,7 @@ There are **THREE** mandatory and **SIX** optional parameters required when inst
- `name`: name of metric. This will **not** affect the evaluation.
- `criteria`: a description outlining the specific evaluation aspects for each test case.
-- [Optional] `evaluation_params`: a list of type `TurnParams`, include only the parameters that are relevant for evaluation. Defaulted to `[TurnParams.CONTENT]`.
+- [Optional] `evaluation_params`: a list of type `MultiTurnParams`. Include only the parameters that are relevant for evaluation. Defaulted to `[MultiTurnParams.CONTENT]`.
- [Optional] `evaluation_steps`: a list of strings outlining the exact steps the LLM should take for evaluation. If `evaluation_steps` is not provided, `ConversationalGEval` will generate a series of `evaluation_steps` on your behalf based on the provided `criteria`. You can only provide either `evaluation_steps` **OR** `criteria`, and not both.
- [Optional] `threshold`: the passing threshold, defaulted to 0.5.
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](/docs/metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to .
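
As a quick check of the renamed enum, a minimal sketch of scoring a conversation with `ConversationalGEval`; usage mirrors the snippet above and the sample turns are illustrative:

```python
from deepeval.metrics import ConversationalGEval
from deepeval.test_case import ConversationalTestCase, Turn, MultiTurnParams

helpfulness = ConversationalGEval(
    name="Helpfulness",
    criteria="Determine whether the assistant's replies actually resolve the user's requests.",
    evaluation_params=[MultiTurnParams.ROLE, MultiTurnParams.CONTENT],
)

convo_test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="Can I change my delivery address?"),
        Turn(role="assistant", content="Yes, send me the new address and I'll update the order."),
    ]
)
helpfulness.measure(convo_test_case)
print(helpfulness.score, helpfulness.reason)
```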
diff --git a/docs/content/docs/(custom)/metrics-dag.mdx b/docs/content/docs/(custom)/metrics-dag.mdx
index f91a714a9d..8ae587e55e 100644
--- a/docs/content/docs/(custom)/metrics-dag.mdx
+++ b/docs/content/docs/(custom)/metrics-dag.mdx
@@ -28,7 +28,7 @@ If you were to do this using `GEval`, your `evaluation_steps` might look somethi
Which in turn looks something like this in code:
```python
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
from deepeval.metrics import GEval
metric = GEval(
@@ -38,7 +38,7 @@ metric = GEval(
"If the `actual_output` has all the complete headings but are in the wrong order, penalize it.",
"If the summary has all the correct headings and they are in the right order, give it a perfect score."
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT]
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT]
)
```
@@ -148,7 +148,7 @@ Some might be skeptical if this complexity is necessary but in reality, you'll q
Here's how this decision tree would look in code:
```python
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
from deepeval.metrics.dag import (
DeepAcyclicGraph,
TaskNode,
@@ -176,7 +176,7 @@ correct_headings_node = BinaryJudgementNode(
extract_headings_node = TaskNode(
instructions="Extract all headings in `actual_output`",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
output_label="Summary headings",
children=[correct_headings_node, correct_order_node],
)
@@ -240,13 +240,13 @@ The `TaskNode` is designed specifically for processing data such as parameters f
```python
from typing import Optional, List
from deepeval.metrics.dag import BaseNode
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
class TaskNode(BaseNode):
instructions: str
output_label: str
children: List[BaseNode]
- evaluation_params: Optional[List[LLMTestCaseParams]] = None
+ evaluation_params: Optional[List[SingleTurnParams]] = None
label: Optional[str] = None
```
@@ -255,7 +255,7 @@ There are **THREE** mandatory and **TWO** optional parameter when creating a `Ta
- `instructions`: a string specifying how to process parameters of an `LLMTestCase`, and/or outputs from a previous parent `TaskNode`.
- `output_label`: a string representing the final output. The `children` `BaseNode`s will use the `output_label` to reference the output from the current `TaskNode`.
- `children`: a list of `BaseNode`s. There **must not** be a `VerdictNode` in the list of children.
-- [Optional] `evaluation_params`: a list of type `LLMTestCaseParams`. Include only the parameters that are relevant for processing.
+- [Optional] `evaluation_params`: a list of type `SingleTurnParams`. Include only the parameters that are relevant for processing.
- [Optional] `label`: a string that will be displayed in the verbose logs if `verbose_mode` is `True`.
:::info
@@ -269,12 +269,12 @@ The `BinaryJudgementNode` determines whether the verdict is `True` or `False` ba
```python
from typing import Optional, List
from deepeval.metrics.dag import BaseNode, VerdictNode
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
class BinaryJudgementNode(BaseNode):
criteria: str
children: List[VerdictNode]
- evaluation_params: Optional[List[LLMTestCaseParams]] = None
+ evaluation_params: Optional[List[SingleTurnParams]] = None
label: Optional[str] = None
```
@@ -282,7 +282,7 @@ There are **TWO** mandatory and **TWO** optional parameter when creating a `Bina
- `criteria`: a yes/no question based on output from parent node(s) and optionally parameters from the `LLMTestCase`. You **DON'T HAVE TO TELL IT** to output `True` or `False`.
- `children`: a list of exactly two `VerdictNode`s, one with a `verdict` value of `True`, and the other with a value of `False`.
-- [Optional] `evaluation_params`: a list of type `LLMTestCaseParams`. Include only the parameters that are relevant for evaluation.
+- [Optional] `evaluation_params`: a list of type `SingleTurnParams`. Include only the parameters that are relevant for evaluation.
- [Optional] `label`: a string that will be displayed in the verbose logs if `verbose_mode` is `True`.
:::tip
@@ -298,12 +298,12 @@ The `NonBinaryJudgementNode` determines what the verdict is based on the given `
```python
from typing import Optional, List
from deepeval.metrics.dag import BaseNode, VerdictNode
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
class NonBinaryJudgementNode(BaseNode):
criteria: str
children: List[VerdictNode]
- evaluation_params: Optional[List[LLMTestCaseParams]] = None
+ evaluation_params: Optional[List[SingleTurnParams]] = None
label: Optional[str] = None
```
@@ -311,7 +311,7 @@ There are **TWO** mandatory and **TWO** optional parameter when creating a `NonB
- `criteria`: an open-ended question based on output from parent node(s) and optionally parameters from the `LLMTestCase`. You **DON'T HAVE TO TELL IT** what to output.
- `children`: a list of `VerdictNode`s, where the `verdict` values determine the possible verdict of the current `NonBinaryJudgementNode`.
-- [Optional] `evaluation_params`: a list of type `LLMTestCaseParams`. Include only the parameters that are relevant for evaluation.
+- [Optional] `evaluation_params`: a list of type `SingleTurnParams`. Include only the parameters that are relevant for evaluation.
- [Optional] `label`: a string that will be displayed in the verbose logs if `verbose_mode` is `True`.
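
Taken together, the node types above plug into a `DAGMetric`. A minimal sketch using the renamed `SingleTurnParams`; the `DeepAcyclicGraph`/`DAGMetric` wiring and the verdict scores follow the DAG examples elsewhere in this diff and are assumptions, not part of this hunk:

```python
from deepeval.metrics import DAGMetric
from deepeval.metrics.dag import (
    DeepAcyclicGraph,
    TaskNode,
    BinaryJudgementNode,
    VerdictNode,
)
from deepeval.test_case import LLMTestCase, SingleTurnParams

correct_headings_node = BinaryJudgementNode(
    criteria="Does the extracted list contain every required heading?",
    children=[
        VerdictNode(verdict=False, score=0),
        VerdictNode(verdict=True, score=10),
    ],
)

extract_headings_node = TaskNode(
    instructions="Extract all headings in `actual_output`",
    evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
    output_label="Summary headings",
    children=[correct_headings_node],
)

dag = DeepAcyclicGraph(root_nodes=[extract_headings_node])
format_metric = DAGMetric(name="Format Correctness", dag=dag)

# Illustrative test case.
format_metric.measure(LLMTestCase(input="Summarize the doc.", actual_output="## Intro\n..."))
print(format_metric.score)
```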
### `VerdictNode`
diff --git a/docs/content/docs/(custom)/metrics-llm-evals.mdx b/docs/content/docs/(custom)/metrics-llm-evals.mdx
index 00e5fca26c..2fc0c27643 100644
--- a/docs/content/docs/(custom)/metrics-llm-evals.mdx
+++ b/docs/content/docs/(custom)/metrics-llm-evals.mdx
@@ -31,7 +31,7 @@ To create a custom metric that uses LLMs for evaluation, simply instantiate an `
```python
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
correctness_metric = GEval(
name="Correctness",
@@ -42,7 +42,7 @@ correctness_metric = GEval(
"You should also heavily penalize omission of detail",
"Vague language, or contradicting OPINIONS, are OK"
],
- evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.INPUT, SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
)
```
@@ -50,7 +50,7 @@ There are **THREE** mandatory and **SEVEN** optional parameters required when in
- `name`: name of custom metric.
- `criteria`: a description outlining the specific evaluation aspects for each test case.
-- `evaluation_params`: a list of type `LLMTestCaseParams`. Include only the parameters that are relevant for evaluation.
+- `evaluation_params`: a list of type `SingleTurnParams`. Include only the parameters that are relevant for evaluation.
- [Optional] `evaluation_steps`: a list of strings outlining the exact steps the LLM should take for evaluation. If `evaluation_steps` is not provided, `GEval` will generate a series of `evaluation_steps` on your behalf based on the provided `criteria`.
- [Optional] `rubric`: a list of `Rubric`s that allows you to [confine the range](/docs/metrics-llm-evals#rubric) of the final metric score.
- [Optional] `threshold`: the passing threshold, defaulted to 0.5.
@@ -112,7 +112,7 @@ correctness_metric = GEval(
"You should also heavily penalize omission of detail",
"Vague language, or contradicting OPINIONS, are OK"
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
)
```
@@ -127,7 +127,7 @@ from deepeval.metrics.g_eval import Rubric
correctness_metric = GEval(
name="Correctness",
criteria="Determine whether the actual output is factually correct based on the expected output.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
rubric=[
Rubric(score_range=(0,2), expected_outcome="Factually incorrect."),
Rubric(score_range=(3,6), expected_outcome="Mostly correct."),
@@ -249,7 +249,7 @@ Answer correctness is the most used G-Eval metric of all and usually involves co
```python
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
correctness = GEval(
name="Correctness",
@@ -258,7 +258,7 @@ correctness = GEval(
"You should also heavily penalize omission of detail",
"Vague language, or contradicting OPINIONS, are OK"
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
)
```
@@ -270,7 +270,7 @@ Coherence is usually a referenceless metric that covers several criteria such as
```python
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
clarity = GEval(
name="Clarity",
@@ -280,7 +280,7 @@ clarity = GEval(
"Assess whether complex ideas are presented in a way that's easy to follow.",
"Identify any vague or confusing parts that reduce understanding."
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
```
@@ -292,7 +292,7 @@ Tonality is similar to coherence in the sense that it is also a referenceless me
```python
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
professionalism = GEval(
name="Professionalism",
@@ -302,7 +302,7 @@ professionalism = GEval(
"Ensure the actual output stays contextually appropriate and avoids casual or ambiguous expressions.",
"Check if the actual output is clear, respectful, and avoids slang or overly informal phrasing."
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
```
@@ -314,7 +314,7 @@ Safety evaluates whether your LLM's `actual_output` aligns with whatever ethical
```python
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
pii_leakage = GEval(
name="PII Leakage",
@@ -324,7 +324,7 @@ pii_leakage = GEval(
"Ensure the output uses placeholders or anonymized data when applicable.",
"Verify that sensitive information is not exposed even in edge cases or unclear prompts."
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
```
@@ -336,7 +336,7 @@ Although `deepeval` already offer RAG metrics such as the `AnswerRelevancyMetric
```python
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
medical_faithfulness = GEval(
name="Medical Faithfulness",
@@ -347,7 +347,7 @@ medical_faithfulness = GEval(
"Heavily penalize hallucinations, especially those that could result in incorrect medical advice.",
"Provide reasons for the faithfulness score, emphasizing the importance of clinical accuracy and patient safety."
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.RETRIEVAL_CONTEXT],
)
```
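
Because the rubric hunk above shows only the metric definition, here is a minimal end-to-end sketch; the third rubric band and the sample test case are illustrative additions, not from the docs:

```python
from deepeval.metrics import GEval
from deepeval.metrics.g_eval import Rubric
from deepeval.test_case import LLMTestCase, SingleTurnParams

correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
    rubric=[
        Rubric(score_range=(0, 2), expected_outcome="Factually incorrect."),
        Rubric(score_range=(3, 6), expected_outcome="Mostly correct."),
        Rubric(score_range=(7, 10), expected_outcome="Correct and complete."),  # assumed band
    ],
)

test_case = LLMTestCase(
    input="When did the French Revolution begin?",
    actual_output="It began in 1789.",
    expected_output="1789",
)
correctness_metric.measure(test_case)
print(correctness_metric.score, correctness_metric.reason)
```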
diff --git a/docs/content/docs/(use-cases)/getting-started-llm-arena.mdx b/docs/content/docs/(use-cases)/getting-started-llm-arena.mdx
index 38a34e9dea..e34a1a40b7 100644
--- a/docs/content/docs/(use-cases)/getting-started-llm-arena.mdx
+++ b/docs/content/docs/(use-cases)/getting-started-llm-arena.mdx
@@ -198,14 +198,14 @@ The [`ArenaGEval`](https://deepeval.com/docs/metrics-arena-g-eval) metric is the
```python
from deepeval.metrics import ArenaGEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
arena_geval = ArenaGEval(
name="Friendly",
criteria="Choose the winner of the more friendly contestant based on the input and actual output",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
)
```
@@ -227,7 +227,7 @@ Now that you have created an arena with contestants and defined a metric, you ca
You can run arena evals by using the `compare()` function.
```python {3,11} title="main.py"
-from deepeval.test_case import ArenaTestCase, LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import ArenaTestCase, LLMTestCase, SingleTurnParams
from deepeval.metrics import ArenaGEval
from deepeval import compare
diff --git a/docs/content/docs/(use-cases)/getting-started-rag.mdx b/docs/content/docs/(use-cases)/getting-started-rag.mdx
index 5066079270..cba141c2f0 100644
--- a/docs/content/docs/(use-cases)/getting-started-rag.mdx
+++ b/docs/content/docs/(use-cases)/getting-started-rag.mdx
@@ -595,7 +595,7 @@ Define a multi-turn RAG metric to evaluate your chatbot system:
```python
from deepeval.metrics import TurnRelevancy, TurnFaithfulness
-from deepeval.test_case import TurnParams
+from deepeval.test_case import MultiTurnParams
turn_faithfulness = TurnFaithfulness()
turn_relevancy = TurnRelevancy()
diff --git a/docs/content/docs/getting-started.mdx b/docs/content/docs/getting-started.mdx
index 4805605f55..f176c95686 100644
--- a/docs/content/docs/getting-started.mdx
+++ b/docs/content/docs/getting-started.mdx
@@ -74,14 +74,14 @@ Run `touch test_example.py` in your terminal and paste in the following code:
```python title="test_example.py"
from deepeval import assert_test
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics import GEval
def test_correctness():
correctness_metric = GEval(
name="Correctness",
criteria="Determine if the 'actual output' is correct based on the 'expected output'.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
threshold=0.5
)
test_case = LLMTestCase(
diff --git a/docs/content/docs/metrics-introduction.mdx b/docs/content/docs/metrics-introduction.mdx
index dbd5a6ed16..7c5ec53ff3 100644
--- a/docs/content/docs/metrics-introduction.mdx
+++ b/docs/content/docs/metrics-introduction.mdx
@@ -155,14 +155,14 @@ Apart from the variety of metrics offered, `deepeval`'s metrics are a step up to
```python
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics import GEval
test_case = LLMTestCase(input="...", actual_output="...", expected_output="...")
correctness = GEval(
name="Correctness",
criteria="Correctness - determine if the actual output is correct according to the expected output.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
strict_mode=True
)
@@ -174,14 +174,14 @@ print(correctness.score, correctness.reason)
```python
-from deepeval.test_case import Turn, TurnParams, ConversationalTestCase
+from deepeval.test_case import Turn, MultiTurnParams, ConversationalTestCase
from deepeval.metrics import ConversationalGEval
convo_test_case = ConversationalTestCase(turns=[Turn(role="...", content="..."), Turn(role="...", content="...")])
professionalism_metric = ConversationalGEval(
name="Professionalism",
criteria="Determine whether the assistant has acted professionally based on the content."
- evaluation_params=[TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.CONTENT],
strict_mode=True
)
diff --git a/docs/content/guides/guides-ai-agent-evaluation.mdx b/docs/content/guides/guides-ai-agent-evaluation.mdx
index 488387e360..0a53723000 100644
--- a/docs/content/guides/guides-ai-agent-evaluation.mdx
+++ b/docs/content/guides/guides-ai-agent-evaluation.mdx
@@ -440,13 +440,13 @@ Define your custom metric locally using the `GEval` class:
```python
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
# Define a custom metric for your specific use case
reasoning_clarity = GEval(
name="Reasoning Clarity",
criteria="Evaluate how clearly the agent explains its reasoning and decision-making process before taking actions.",
- evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.INPUT, SingleTurnParams.ACTUAL_OUTPUT],
)
```
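
A short usage sketch for the metric above; the agent test case is a hypothetical example:

```python
from deepeval.test_case import LLMTestCase

agent_test_case = LLMTestCase(
    input="Book me the cheapest flight to Berlin next Friday.",
    actual_output=(
        "I'll first search Friday flights to Berlin, sort them by price, "
        "and then book the cheapest refundable option."
    ),
)
reasoning_clarity.measure(agent_test_case)
print(reasoning_clarity.score, reasoning_clarity.reason)
```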
diff --git a/docs/content/guides/guides-answer-correctness-metric.mdx b/docs/content/guides/guides-answer-correctness-metric.mdx
index c2dd9ac5b5..c232d27527 100644
--- a/docs/content/guides/guides-answer-correctness-metric.mdx
+++ b/docs/content/guides/guides-answer-correctness-metric.mdx
@@ -38,13 +38,13 @@ G-Eval is most effective when employing a model from the **GPT-4 model family**
### 2. Select your evaluation parameters
-G-Eval allows you to select parameters that are relevant for evaluation by providing a list of `LLMTestCaseParams`, which includes:
+G-Eval allows you to select parameters that are relevant for evaluation by providing a list of `SingleTurnParams`, which includes:
-- `LLMTestCaseParams.INPUT`
-- `LLMTestCaseParams.ACTUAL_OUTPUT`
-- `LLMTestCaseParams.EXPECTED_OUTPUT`
-- `LLMTestCaseParams.CONTEXT`
-- `LLMTestCaseParams.RETRIEVAL_CONTEXT`
+- `SingleTurnParams.INPUT`
+- `SingleTurnParams.ACTUAL_OUTPUT`
+- `SingleTurnParams.EXPECTED_OUTPUT`
+- `SingleTurnParams.CONTEXT`
+- `SingleTurnParams.RETRIEVAL_CONTEXT`
`ACTUAL_OUTPUT` should **always** be included in your `evaluation_params`, as this is what every Correctness metric will be directly evaluating. As mentioned earlier, Correctness is determined by how well the actual output aligns with the ground truth, which is typically more variable. The ground truth is best represented by `EXPECTED_OUTPUT`, where the expected output serves as the **ideal reference** for the actual output, with an exact match earning a score of 1.
@@ -54,8 +54,8 @@ correctness_metric = GEval(
name="Correctness",
model="gpt-4.1",
evaluation_params=[
- LLMTestCaseParams.EXPECTED_OUTPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT],
+ SingleTurnParams.EXPECTED_OUTPUT,
+ SingleTurnParams.ACTUAL_OUTPUT],
...
)
```
@@ -68,8 +68,8 @@ correctness_metric = GEval(
name="Correctness",
model="gpt-4.1",
evaluation_params=[
- LLMTestCaseParams.CONTEXT,
- LLMTestCaseParams.ACTUAL_OUTPUT],
+ SingleTurnParams.CONTEXT,
+ SingleTurnParams.ACTUAL_OUTPUT],
...
)
```
@@ -86,8 +86,8 @@ correctness_metric = GEval(
name="Correctness",
model="gpt-4.1",
evaluation_params=[
- LLMTestCaseParams.CONTEXT,
- LLMTestCaseParams.ACTUAL_OUTPUT],
+ SingleTurnParams.CONTEXT,
+ SingleTurnParams.ACTUAL_OUTPUT],
evaluation_steps=[
"Determine whether the actual output is factually correct based on the expected output."
],
@@ -101,8 +101,8 @@ correctness_metric = GEval(
name="Correctness",
model="gpt-4.1",
evaluation_params=[
- LLMTestCaseParams.CONTEXT,
- LLMTestCaseParams.ACTUAL_OUTPUT],
+ SingleTurnParams.CONTEXT,
+ SingleTurnParams.ACTUAL_OUTPUT],
evaluation_steps=[
'Compare the actual output directly with the expected output to verify factual accuracy.',
'Check if all elements mentioned in the expected output are present and correctly represented in the actual output.',
@@ -118,8 +118,8 @@ correctness_metric = GEval(
name="Correctness",
model="gpt-4.1",
evaluation_params=[
- LLMTestCaseParams.CONTEXT,
- LLMTestCaseParams.ACTUAL_OUTPUT],
+ SingleTurnParams.CONTEXT,
+ SingleTurnParams.ACTUAL_OUTPUT],
evaluation_steps=[
"Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
"You should also lightly penalize omission of detail, and focus on the main idea",
@@ -181,8 +181,8 @@ correctness_metric = GEval(
name="Correctness",
model="gpt-4.1",
evaluation_params=[
- LLMTestCaseParams.CONTEXT,
- LLMTestCaseParams.ACTUAL_OUTPUT],
+ SingleTurnParams.CONTEXT,
+ SingleTurnParams.ACTUAL_OUTPUT],
evaluation_steps=[
"Check whether the facts in 'actual output' contradict any facts in 'expected output'",
"Lightly penalize omissions of detail, focusing on the main idea",
diff --git a/docs/content/guides/guides-multi-turn-evaluation-metrics.mdx b/docs/content/guides/guides-multi-turn-evaluation-metrics.mdx
index 0e1ab685b0..86073cce52 100644
--- a/docs/content/guides/guides-multi-turn-evaluation-metrics.mdx
+++ b/docs/content/guides/guides-multi-turn-evaluation-metrics.mdx
@@ -378,7 +378,7 @@ The `ConversationalDAGMetric` lets you build **deterministic decision trees** fo
```python
from deepeval import evaluate
-from deepeval.test_case import Turn, ConversationalTestCase, TurnParams
+from deepeval.test_case import Turn, ConversationalTestCase, MultiTurnParams
from deepeval.metrics import ConversationalDAGMetric
from deepeval.metrics.dag import DeepAcyclicGraph
from deepeval.metrics.conversational_dag import (
@@ -408,7 +408,7 @@ binary_node = ConversationalBinaryJudgementNode(
task_node = ConversationalTaskNode(
instructions="Summarize the conversation and explain assistant's behaviour overall.",
output_label="Summary",
- evaluation_params=[TurnParams.ROLE, TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.ROLE, MultiTurnParams.CONTENT],
children=[binary_node],
)
diff --git a/docs/content/guides/guides-rag-evaluation.mdx b/docs/content/guides/guides-rag-evaluation.mdx
index 9a3d696d2b..495634a6f8 100644
--- a/docs/content/guides/guides-rag-evaluation.mdx
+++ b/docs/content/guides/guides-rag-evaluation.mdx
@@ -244,13 +244,13 @@ Here is where you can take advantage of `deepeval`'s `GEval` metric, capable of
```python
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
...
dark_humor = GEval(
name="Dark Humor",
criteria="Determine how funny the dark humor in the actual output is",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
dark_humor.measure(test_case)
diff --git a/docs/content/guides/guides-tracing-rag.mdx b/docs/content/guides/guides-tracing-rag.mdx
index 2d4eaa255b..90f2cf75d7 100644
--- a/docs/content/guides/guides-tracing-rag.mdx
+++ b/docs/content/guides/guides-tracing-rag.mdx
@@ -313,12 +313,12 @@ Now call your function using the `evals_iterator` of `EvaluationDataset` with me
```python
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
correctness_metric = GEval(
name="Correctness",
criteria="Determine whether the actual output is factually correct based on the expected output.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
threshold=0.7,
)
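
Since the hunk above only defines the metric, here is a sketch of the calling pattern the surrounding text describes; the `evals_iterator`/`observe` usage mirrors other snippets in this diff, and `rag_pipeline` plus the golden are placeholder names:

```python
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.tracing import observe, update_current_span
from deepeval.test_case import LLMTestCase

@observe(metrics=[correctness_metric])
def rag_pipeline(query: str) -> str:
    answer = "..."  # replace with your retriever + LLM call
    update_current_span(
        test_case=LLMTestCase(input=query, actual_output=answer, expected_output="...")
    )
    return answer

dataset = EvaluationDataset(goldens=[Golden(input="How tall is Mount Everest?")])
for golden in dataset.evals_iterator():
    rag_pipeline(golden.input)
```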
diff --git a/docs/content/integrations/frameworks/huggingface.mdx b/docs/content/integrations/frameworks/huggingface.mdx
index bd187116bc..cc1096f880 100644
--- a/docs/content/integrations/frameworks/huggingface.mdx
+++ b/docs/content/integrations/frameworks/huggingface.mdx
@@ -98,7 +98,7 @@ trainer = Trainer(
Use `deepeval` to define an `EvaluationDataset` and the metrics you want to evaluate your LLM on:
```python
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import GEval
@@ -109,7 +109,7 @@ dataset = EvaluationDataset(goldens=[first_golden, second_golden])
coherence_metric = GEval(
name="Coherence",
criteria="Coherence - determine if the actual output is coherent with the input.",
- evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.INPUT, SingleTurnParams.ACTUAL_OUTPUT],
)
```
diff --git a/docs/content/tutorials/rag-qa-agent/evals-in-prod.mdx b/docs/content/tutorials/rag-qa-agent/evals-in-prod.mdx
index c6e31c9ac8..392b3bacd0 100644
--- a/docs/content/tutorials/rag-qa-agent/evals-in-prod.mdx
+++ b/docs/content/tutorials/rag-qa-agent/evals-in-prod.mdx
@@ -25,7 +25,7 @@ from deepeval.metrics import (
)
from deepeval.dataset import EvaluationDataset
from deepeval.tracing import observe, update_current_span
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
import tempfile
class RAGAgent:
diff --git a/docs/content/tutorials/rag-qa-agent/evaluation.mdx b/docs/content/tutorials/rag-qa-agent/evaluation.mdx
index a082d15abb..4fc8be8667 100644
--- a/docs/content/tutorials/rag-qa-agent/evaluation.mdx
+++ b/docs/content/tutorials/rag-qa-agent/evaluation.mdx
@@ -184,13 +184,13 @@ from deepeval.metrics import GEval
answer_correctness = GEval(
name="Answer Correctness",
criteria="Evaluate if the actual output's 'answer' property is correct and complete from the input and retrieved context. If the answer is not correct or complete, reduce score."
- evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT]
+ evaluation_params=[SingleTurnParams.INPUT, SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.RETRIEVAL_CONTEXT]
)
citation_accuracy = GEval(
name="Citation Accuracy",
criteria="Check if the citations in the actual output are correct and relevant based on input and retrieved context. If they're not correct, reduce score."
- evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT]
+ evaluation_params=[SingleTurnParams.INPUT, SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.RETRIEVAL_CONTEXT]
)
```
diff --git a/docs/content/tutorials/rag-qa-agent/improvement.mdx b/docs/content/tutorials/rag-qa-agent/improvement.mdx
index 95098c82ee..d36e8ea1e5 100644
--- a/docs/content/tutorials/rag-qa-agent/improvement.mdx
+++ b/docs/content/tutorials/rag-qa-agent/improvement.mdx
@@ -73,7 +73,7 @@ We will iterate on different retriever hyperparameters like chunk size, embeddin
```python
from deepeval.dataset import EvaluationDataset
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics import (
ContextualRelevancyMetric,
ContextualRecallMetric,
@@ -222,7 +222,7 @@ This is a more elaborate and clear prompt template that was updated by taking th
```python
from deepeval.dataset import EvaluationDataset
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics import GEval
from langchain.llms import Ollama, OpenAI, HuggingFaceHub
from qa_agent import RAGAgent
diff --git a/docs/content/tutorials/summarization-agent/evals-in-prod.mdx b/docs/content/tutorials/summarization-agent/evals-in-prod.mdx
index 67462ad44c..deecbfbbf1 100644
--- a/docs/content/tutorials/summarization-agent/evals-in-prod.mdx
+++ b/docs/content/tutorials/summarization-agent/evals-in-prod.mdx
@@ -21,7 +21,7 @@ from openai import OpenAI
from dotenv import load_dotenv
from deepeval.metrics import GEval
from deepeval.tracing import observe, update_current_span
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
load_dotenv()
diff --git a/docs/content/tutorials/summarization-agent/evaluation.mdx b/docs/content/tutorials/summarization-agent/evaluation.mdx
index cd1042b64f..6ad02d9a75 100644
--- a/docs/content/tutorials/summarization-agent/evaluation.mdx
+++ b/docs/content/tutorials/summarization-agent/evaluation.mdx
@@ -112,7 +112,7 @@ We will now call our summarization agent on the dataset `input`s and create our
Here's how we can pull our dataset and create test cases:
```python {1-2,6,13-20}
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.dataset import EvaluationDataset
from meeting_summarizer import MeetingSummarizer # import your summarizer here
@@ -166,7 +166,7 @@ summary_concision = GEval(
# Write your criteria here
criteria="Assess whether the summary is concise and focused only on the essential points of the meeting? It should avoid repetition, irrelevant details, and unnecessary elaboration.",
threshold=0.9,
- evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
+ evaluation_params=[SingleTurnParams.INPUT, SingleTurnParams.ACTUAL_OUTPUT]
)
```
@@ -182,7 +182,7 @@ action_item_check = GEval(
# Write your criteria here
criteria="Are the action items accurate, complete, and clearly reflect the key tasks or follow-ups mentioned in the meeting?",
threshold=0.9,
- evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
+ evaluation_params=[SingleTurnParams.INPUT, SingleTurnParams.ACTUAL_OUTPUT]
)
```
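
To run both metrics over the generated test cases, a minimal sketch; it assumes `dataset` holds the test cases built earlier and that `EvaluationDataset` exposes them via `test_cases`:

```python
from deepeval import evaluate

# `summary_concision` and `action_item_check` are the metrics defined above.
evaluate(
    test_cases=dataset.test_cases,
    metrics=[summary_concision, action_item_check],
)
```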
diff --git a/docs/content/tutorials/summarization-agent/improvement.mdx b/docs/content/tutorials/summarization-agent/improvement.mdx
index 3a6d8602a2..aaed63bc59 100644
--- a/docs/content/tutorials/summarization-agent/improvement.mdx
+++ b/docs/content/tutorials/summarization-agent/improvement.mdx
@@ -136,7 +136,7 @@ These are more elaborate and clear system prompts that are updated by taking the
We can pull a dataset and use that dataset to iterate over our hyperparameters to initialize our summarization agent with different configurations to produce different test cases. Here's how we can do that:
```python
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import GEval
from deepeval import evaluate
diff --git a/docs/content/tutorials/tutorial-setup.mdx b/docs/content/tutorials/tutorial-setup.mdx
index 14f705aa0f..d0e183bc6f 100644
--- a/docs/content/tutorials/tutorial-setup.mdx
+++ b/docs/content/tutorials/tutorial-setup.mdx
@@ -26,13 +26,13 @@ Your test file must be named with a `test_` prefix (like `test_app.py`) for Deep
```python title="test_app.py"
from deepeval import evaluate
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics import GEval
correctness_metric = GEval(
name="Correctness",
criteria="Determine if the 'actual output' is correct based on the 'expected output'.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
threshold=0.5
)
diff --git a/docs/public/llms-full.txt b/docs/public/llms-full.txt
index 71ea046de8..b8ad010db2 100644
--- a/docs/public/llms-full.txt
+++ b/docs/public/llms-full.txt
@@ -2127,7 +2127,7 @@ If you were to do this using `GEval`, your `evaluation_steps` might look somethi
Which in turn looks something like this in code:
```codeBlockLines_e6Vv
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
from deepeval.metrics import GEval
metric = GEval(
@@ -2137,7 +2137,7 @@ metric = GEval(
"If the `actual_output` has all the complete headings but are in the wrong order, penalize it.",\
"If the summary has all the correct headings and they are in the right order, give it a perfect score."\
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT]
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT]
)
```
@@ -2178,7 +2178,7 @@ Some might skeptical if this complexity is necessary but in reality, you'll quic
Here's how this decision tree would look in code:
```codeBlockLines_e6Vv
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
from deepeval.metrics.dag import (
DeepAcyclicGraph,
TaskNode,
@@ -2206,7 +2206,7 @@ correct_headings_node = BinaryJudgementNode(
extract_headings_node = TaskNode(
instructions="Extract all headings in `actual_output`",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
output_label="Summary headings",
children=[correct_headings_node, correct_order_node],
)
@@ -2273,13 +2273,13 @@ The `TaskNode` is designed specifically for processing data such as parameters f
```codeBlockLines_e6Vv
from typing import Optional, List
from deepeval.metrics.dag import BaseNode
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
class TaskNode(BaseNode):
instructions: str
output_label: str
children: List[BaseNode]
- evaluation_params: Optional[List[LLMTestCaseParams]] = None
+ evaluation_params: Optional[List[SingleTurnParams]] = None
label: Optional[str] = None
```
@@ -2289,7 +2289,7 @@ There are **THREE** mandatory and **TWO** optional parameter when creating a `Ta
- `instructions`: a string specifying how to process parameters of an `LLMTestCase`, and/or outputs from a previous parent `TaskNode`.
- `output_label`: a string representing the final output. The `children` `BaseNode` s will use the `output_label` to reference the output from the current `TaskNode`.
- `children`: a list of `BaseNode` s. There **must not** be a `VerdictNode` in the list of children.
-- \[Optional\] `evaluation_params`: a list of type `LLMTestCaseParams`. Include only the parameters that are relevant for processing.
+- \[Optional\] `evaluation_params`: a list of type `SingleTurnParams`. Include only the parameters that are relevant for processing.
- \[Optional\] `label`: a string that will be displayed in the verbose logs if `verbose_mode` is `True`.
info
@@ -2303,12 +2303,12 @@ The `BinaryJudgementNode` determines whether the verdict is `True` or `False` ba
```codeBlockLines_e6Vv
from typing import Optional, List
from deepeval.metrics.dag import BaseNode, VerdictNode
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
class BinaryJudgementNode(BaseNode):
criteria: str
children: List[VerdictNode]
- evaluation_params: Optional[List[LLMTestCaseParams]] = None
+ evaluation_params: Optional[List[SingleTurnParams]] = None
label: Optional[str] = None
```
@@ -2317,7 +2317,7 @@ There are **TWO** mandatory and **TWO** optional parameter when creating a `Bina
- `criteria`: a yes/no question based on output from parent node(s) and optionally parameters from the `LLMTestCase`. You **DON'T HAVE TO TELL IT** to output `True` or `False`.
- `children`: a list of exactly two `VerdictNode` s, one with a `verdict` value of `True`, and the other with a value of `False`.
-- \[Optional\] `evaluation_params`: a list of type `LLMTestCaseParams`. Include only the parameters that are relevant for evaluation.
+- \[Optional\] `evaluation_params`: a list of type `SingleTurnParams`. Include only the parameters that are relevant for evaluation.
- \[Optional\] `label`: a string that will be displayed in the verbose logs if `verbose_mode` is `True`.
tip
@@ -2333,12 +2333,12 @@ The `NonBinaryJudgementNode` determines what the verdict is based on the given `
```codeBlockLines_e6Vv
from typing import Optional, List
from deepeval.metrics.dag import BaseNode, VerdictNode
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
class NonBinaryJudgementNode(BaseNode):
criteria: str
children: List[VerdictNode]
- evaluation_params: Optional[List[LLMTestCaseParams]] = None
+ evaluation_params: Optional[List[SingleTurnParams]] = None
label: Optional[str] = None
```
@@ -2347,7 +2347,7 @@ There are **TWO** mandatory and **TWO** optional parameter when creating a `NonB
- `criteria`: an open-ended question based on output from parent node(s) and optionally parameters from the `LLMTestCase`. You **DON'T HAVE TO TELL IT** what to output.
- `children`: a list of `VerdictNode` s, where the `verdict` values determine the possible verdict of the current `NonBinaryJudgementNode`.
-- \[Optional\] `evaluation_params`: a list of type `LLMTestCaseParams`. Include only the parameters that are relevant for evaluation.
+- \[Optional\] `evaluation_params`: a list of type `SingleTurnParams`. Include only the parameters that are relevant for evaluation.
- \[Optional\] `label`: a string that will be displayed in the verbose logs if `verbose_mode` is `True`.
### `VerdictNode` [](https://deepeval.com/docs/metrics-dag\#verdictnode "Direct link to verdictnode")
@@ -2461,14 +2461,14 @@ test\_example.py
```codeBlockLines_e6Vv
from deepeval import assert_test
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics import GEval
def test_correctness():
correctness_metric = GEval(
name="Correctness",
criteria="Determine if the 'actual output' is correct based on the 'expected output'.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
threshold=0.5
)
test_case = LLMTestCase(
@@ -2546,7 +2546,7 @@ from deepeval.dataset import Golden
from deepeval.metrics import GEval
from deepeval import evaluate
-correctness = GEval(name="Correctness", criteria="Determine if the 'actual output' is correct based on the 'expected output'.", evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT])
+correctness = GEval(name="Correctness", criteria="Determine if the 'actual output' is correct based on the 'expected output'.", evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT])
@observe(metrics=[correctness])
def inner_component():
@@ -2618,14 +2618,14 @@ All default metrics are evaluated using LLMs, and you can use **ANY** LLM of you
`deepeval` provides G-Eval, a state-of-the-art LLM evaluation framework for anyone to create a custom LLM-evaluated metric using natural language. Here's an example:
```codeBlockLines_e6Vv
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics import GEval
test_case = LLMTestCase(input="...", actual_output="...", expected_output="...")
correctness = GEval(
name="Correctness",
criteria="Correctness - determine if the actual output is correct according to the expected output.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
strict_mode=True
)
@@ -2673,7 +2673,7 @@ correct_headings_node = BinaryJudgementNode(
extract_headings_node = TaskNode(
instructions="Extract all headings in `actual_output`",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
output_label="Summary headings",
children=[correct_headings_node, correct_order_node],
)
@@ -5675,7 +5675,7 @@ contextual_relevancy_metric = ContextualRelevancyMetric()
Next, we'll define our custom G-Eval metric for professionalism. This involves specifying the name of the metric, the evaluation criteria, and the parameters to evaluate. In this case, we're only assessing the LLM's `actual_output`.
```codeBlockLines_e6Vv
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
from deepeval.metrics import GEval
# Define criteria for evaluating professionalism
@@ -5686,7 +5686,7 @@ clear, respectful, and maintaining an empathetic tone consistent with medical in
professionalism_metric = GEval(
name="Professionalism",
criteria=criteria,
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT]
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT]
)
```
@@ -7548,7 +7548,7 @@ To create a custom metric that uses LLMs for evaluation, simply instantiate an `
```codeBlockLines_e6Vv
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
correctness_metric = GEval(
name="Correctness",
@@ -7559,7 +7559,7 @@ correctness_metric = GEval(
"You should also heavily penalize omission of detail",\
"Vague language, or contradicting OPINIONS, are OK"\
],
- evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.INPUT, SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
)
```
@@ -7568,7 +7568,7 @@ There are **THREE** mandatory and **SEVEN** optional parameters required when in
- `name`: name of custom metric.
- `criteria`: a description outlining the specific evaluation aspects for each test case.
-- `evaluation_params`: a list of type `LLMTestCaseParams`. Include only the parameters that are relevant for evaluation.
+- `evaluation_params`: a list of type `SingleTurnParams`. Include only the parameters that are relevant for evaluation.
- \[Optional\] `evaluation_steps`: a list of strings outlining the exact steps the LLM should take for evaluation. If `evaluation_steps` is not provided, `GEval` will generate a series of `evaluation_steps` on your behalf based on the provided `criteria`.
- \[Optional\] `rubric`: a list of `Rubric` s that allows you to [confine the range](https://deepeval.com/docs/metrics-llm-evals#rubric) of the final metric score.
- \[Optional\] `threshold`: the passing threshold, defaulted to 0.5.
@@ -7619,7 +7619,7 @@ correctness_metric = GEval(
"You should also heavily penalize omission of detail",\
"Vague language, or contradicting OPINIONS, are OK"\
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
)
```
@@ -7635,7 +7635,7 @@ from deepeval.metrics.g_eval import Rubric
correctness_metric = GEval(
name="Correctness",
criteria="Determine whether the actual output is factually correct based on the expected output.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
rubric=[\
Rubric(score_range=(0,2), expected_outcome="Factually incorrect."),\
Rubric(score_range=(3,6), expected_outcome="Mostly correct."),\
@@ -7772,7 +7772,7 @@ Answer correctness is the most used G-Eval metric of all and usually involves co
```codeBlockLines_e6Vv
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
correctness = GEval(
name="Correctness",
@@ -7781,7 +7781,7 @@ correctness = GEval(
"You should also heavily penalize omission of detail",\
"Vague language, or contradicting OPINIONS, are OK"\
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
)
```
@@ -7794,7 +7794,7 @@ Coherence is usually a referenceless metric that covers several criteria such as
```codeBlockLines_e6Vv
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
clarity = GEval(
name="Clarity",
@@ -7804,7 +7804,7 @@ clarity = GEval(
"Assess whether complex ideas are presented in a way that's easy to follow.",\
"Identify any vague or confusing parts that reduce understanding."\
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
```
@@ -7817,7 +7817,7 @@ Tonality is similar to coherence in the sense that it is also a referenceless me
```codeBlockLines_e6Vv
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
professionalism = GEval(
name="Professionalism",
@@ -7827,7 +7827,7 @@ professionalism = GEval(
"Ensure the actual output stays contextually appropriate and avoids casual or ambiguous expressions.",\
"Check if the actual output is clear, respectful, and avoids slang or overly informal phrasing."\
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
```
@@ -7840,7 +7840,7 @@ Safety evaluates whether your LLM's `actual_output` aligns with whatever ethical
```codeBlockLines_e6Vv
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
pii_leakage = GEval(
name="PII Leakage",
@@ -7850,7 +7850,7 @@ pii_leakage = GEval(
"Ensure the output uses placeholders or anonymized data when applicable.",\
"Verify that sensitive information is not exposed even in edge cases or unclear prompts."\
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
```
@@ -7863,7 +7863,7 @@ Although `deepeval` already offer RAG metrics such as the `AnswerRelevancyMetric
```codeBlockLines_e6Vv
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
medical_faithfulness = GEval(
name="Medical Faithfulness",
@@ -7874,7 +7874,7 @@ medical_faithfulness = GEval(
"Heavily penalize hallucinations, especially those that could result in incorrect medical advice.",\
"Provide reasons for the faithfulness score, emphasizing the importance of clinical accuracy and patient safety."\
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.RETRIEVAL_CONTEXT],
)
```
@@ -16443,13 +16443,13 @@ Here's how to define a G-Eval metric in DeepEval with just a few lines of code:
```codeBlockLines_e6Vv
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
# Define a custom G-Eval metric
custom_metric = GEval(
name="Relevancy",
criteria="Check if the actual output directly addresses the input.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.INPUT]
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.INPUT]
)
```
@@ -16475,7 +16475,7 @@ Here's an example answer correctness metric defined using G-Eval:
```codeBlockLines_e6Vv
# Create a custom correctness metric
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
correctness_metric = GEval(
name="Correctness",
@@ -16486,7 +16486,7 @@ correctness_metric = GEval(
"You should also heavily penalize omission of detail",\
"Vague language, or contradicting OPINIONS, are OK"\
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.EXPECTED_OUTPUT],
)
```
@@ -16525,7 +16525,7 @@ Here's a an example coherence metric assessing clarify defined using G-Eval:
```codeBlockLines_e6Vv
# Create a custom clarity metric focused on clear communication
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
clarity_metric = GEval(
name="Clarity",
@@ -16535,7 +16535,7 @@ clarity_metric = GEval(
"Assess whether complex ideas are presented in a way that’s easy to follow.",\
"Identify any vague or confusing parts that reduce understanding."\
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
```
@@ -16570,7 +16570,7 @@ Here's an example professionalism metric defined using G-Eval:
```codeBlockLines_e6Vv
# Create a custom professionalism metric
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
professionalism_metric = GEval(
name="Professionalism",
@@ -16582,7 +16582,7 @@ professionalism_metric = GEval(
"Ensure the actual output stays contextually appropriate and avoids casual or ambiguous expressions.",\
"Check if the actual output is clear, respectful, and avoids slang or overly informal phrasing."\
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
```
@@ -16615,7 +16615,7 @@ Here's an example custom PII Leakage metric.
```codeBlockLines_e6Vv
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
pii_leakage_metric = GEval(
name="PII Leakage",
@@ -16625,7 +16625,7 @@ pii_leakage_metric = GEval(
"Ensure the output uses placeholders or anonymized data when applicable.",\
"Verify that sensitive information is not exposed even in edge cases or unclear prompts."\
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
```
@@ -16661,7 +16661,7 @@ Below is an example of a custom **Faithfulness** metric for a medical diagnosis
```codeBlockLines_e6Vv
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
custom_faithfulness_metric = GEval(
name="Medical Diagnosis Faithfulness",
@@ -16674,7 +16674,7 @@ custom_faithfulness_metric = GEval(
"Heavily penalize hallucinations, especially those that could result in incorrect medical advice.",\
"Provide reasons for the faithfulness score, emphasizing the importance of clinical accuracy and patient safety."\
],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.RETRIEVAL_CONTEXT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT, SingleTurnParams.RETRIEVAL_CONTEXT],
)
```
@@ -16699,12 +16699,12 @@ This is a naive G-Eval approach to evaluate the persuasiveness of a sales email
```codeBlockLines_e6Vv
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
geval_metric = GEval(
name="Persuasiveness",
criteria="Determine how persuasive the `actual output` is to getting a user booking in a call.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
```
@@ -16737,7 +16737,7 @@ A **DAG** handles the above use case deterministically by splitting the logic, a
Here is an example of a G-Eval + DAG approach:
```codeBlockLines_e6Vv
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics.dag import (
DeepAcyclicGraph,
TaskNode,
@@ -16750,7 +16750,7 @@ from deepeval.metrics import DAGMetric, GEval
geval_metric = GEval(
name="Persuasiveness",
criteria="Determine how persuasive the `actual output` is to getting a user booking in a call.",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
conciseness_node = BinaryJudgementNode(
@@ -21640,13 +21640,13 @@ Here is where you can take advantage of `deepeval`'s `GEval` metric, capable of
```codeBlockLines_e6Vv
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
...
dark_humor = GEval(
name="Dark Humor",
criteria="Determine how funny the dark humor in the actual output is",
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
dark_humor.measure(test_case)
diff --git a/examples/dag-examples/conversational_dag.ipynb b/examples/dag-examples/conversational_dag.ipynb
index 37611c5b7b..eece76d3bd 100644
--- a/examples/dag-examples/conversational_dag.ipynb
+++ b/examples/dag-examples/conversational_dag.ipynb
@@ -215,11 +215,11 @@
" ConversationalNonBinaryJudgementNode,\n",
" ConversationalVerdictNode,\n",
")\n",
- "from deepeval.test_case import TurnParams\n",
+ "from deepeval.test_case import MultiTurnParams\n",
"\n",
"non_binary_node = ConversationalNonBinaryJudgementNode(\n",
" criteria=\"How was the assistant's behaviour towards user?\",\n",
- " evaluation_params=[TurnParams.ROLE, TurnParams.CONTENT],\n",
+ " evaluation_params=[MultiTurnParams.ROLE, MultiTurnParams.CONTENT],\n",
" children=[\n",
" ConversationalVerdictNode(verdict=\"Rude\", score=0),\n",
" ConversationalVerdictNode(verdict=\"Neutral\", score=5),\n",
@@ -238,7 +238,7 @@
"task_node = ConversationalTaskNode(\n",
" instructions=\"Summarize the conversation and explain assiatant's behaviour overall.\",\n",
" output_label=\"Summary\",\n",
- " evaluation_params=[TurnParams.ROLE, TurnParams.CONTENT],\n",
+ " evaluation_params=[MultiTurnParams.ROLE, MultiTurnParams.CONTENT],\n",
" children=[binary_node],\n",
")\n",
"\n",
diff --git a/examples/getting_started/test_example.py b/examples/getting_started/test_example.py
index 31f3b537f9..f7dedb16be 100644
--- a/examples/getting_started/test_example.py
+++ b/examples/getting_started/test_example.py
@@ -2,7 +2,7 @@
import deepeval
from deepeval import assert_test
from deepeval.dataset import EvaluationDataset
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.metrics import AnswerRelevancyMetric, GEval
# To run this file: deepeval test run .py
@@ -26,8 +26,8 @@ def test_everything(test_case: LLMTestCase):
name="Correctness",
criteria="Correctness - determine if the actual output is correct according to the expected output.",
evaluation_params=[
- LLMTestCaseParams.ACTUAL_OUTPUT,
- LLMTestCaseParams.EXPECTED_OUTPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
+ SingleTurnParams.EXPECTED_OUTPUT,
],
strict_mode=True,
)
diff --git a/simulate_conversations_example.py b/simulate_conversations_example.py
index 18550700f7..093593136c 100644
--- a/simulate_conversations_example.py
+++ b/simulate_conversations_example.py
@@ -1,65 +1,41 @@
-import json
-
-from deepeval.dataset import ConversationalGolden
-from deepeval.simulator import (
- ConversationSimulator,
- ConversationSimulatorTemplate,
+from deepeval.metrics import GEval, ConversationalGEval
+from deepeval.test_case import (
+ SingleTurnParams,
+ LLMTestCase,
+ ConversationalTestCase,
+ Turn,
+ MultiTurnParams,
)
-from deepeval.simulator.controller import end, proceed
-from deepeval.test_case import Turn
-
-
-conversation_goldens = [
- ConversationalGolden(
- scenario="A customer wants to return a damaged laptop purchased last week.",
- expected_outcome="The customer understands the return process and receives the next step.",
- user_description="A frustrated but cooperative customer.",
- ),
- ConversationalGolden(
- scenario="A user wants to upgrade from a free plan to a team plan.",
- expected_outcome="The user knows which plan to choose and how to complete the upgrade.",
- user_description="A startup founder comparing pricing options.",
- ),
- ConversationalGolden(
- scenario="A patient wants to reschedule an appointment because of a work conflict.",
- expected_outcome="The patient gets a suitable new appointment time.",
- user_description="A busy patient who prefers concise answers.",
- ),
- ConversationalGolden(
- scenario="A traveler needs help changing a flight after a weather delay.",
- expected_outcome="The traveler understands available rebooking options.",
- user_description="An anxious traveler stuck at the airport.",
- ),
- ConversationalGolden(
- scenario="A developer is debugging a failed API authentication request.",
- expected_outcome="The developer identifies the likely authentication issue and next debugging step.",
- user_description="A technical user who can understand API terminology.",
- ),
-]
-
-
-async def model_callback(input: str, turns: list[Turn], thread_id: str) -> Turn:
- return Turn(
- role="assistant",
- content=f"I can help with that. You said: {input}",
- )
-
-
-def controller(simulated_user_turns: int):
- if simulated_user_turns >= 1:
- return end(reason="Stopped after two simulated user turns.")
- return proceed()
+from deepeval import evaluate
+metric = GEval(
+ name="G-Eval",
+ criteria="Determine whether the metadata has a source, if yes score favorably, if no score unfavorably.",
+ evaluation_params=[SingleTurnParams.METADATA],
+)
-simulator = ConversationSimulator(
- model_callback=model_callback,
- controller=controller,
+metric2 = ConversationalGEval(
+ name="Conversational G-Eval",
+ criteria="Determine whether the metadata has a source, if yes score favorably, if no score unfavorably.",
+ evaluation_params=[MultiTurnParams.METADATA],
)
-conversational_test_cases = simulator.simulate(
- conversational_goldens=conversation_goldens,
- max_user_simulations=5,
+
+test_case = LLMTestCase(
+ input="What is the capital of France?",
+ actual_output="Paris",
+ expected_output="Paris",
+ metadata={"source": "wikipedia"},
+ tags=["geography"],
)
+test_case2 = ConversationalTestCase(
+ turns=[
+ Turn(role="user", content="What is the capital of France?"),
+ Turn(role="assistant", content="Paris"),
+ ],
+ scenario="User asks about the capital of France",
+ expected_outcome="Assistant provides the capital of France",
+ metadata={"source": "wikipedia"},
+)
-for test_case in conversational_test_cases:
- print(test_case.turns)
+evaluate([test_case2], [metric2])
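Note that the rewritten example also builds a single-turn `metric`/`test_case` pair that the final `evaluate` call leaves unused; if the intent is to exercise `SingleTurnParams.METADATA` as well, the extra call would presumably mirror the conversational one:

```python
# Sketch: evaluate the single-turn case against the single-turn G-Eval metric
# defined earlier in this example.
evaluate([test_case], [metric])
```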
diff --git a/tests/test_confident/test_compare.py b/tests/test_confident/test_compare.py
index 4837a02573..88b5551b5f 100644
--- a/tests/test_confident/test_compare.py
+++ b/tests/test_confident/test_compare.py
@@ -1,7 +1,7 @@
from deepeval.test_case import (
ArenaTestCase,
LLMTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
Contestant,
)
from deepeval.metrics import ArenaGEval
@@ -16,8 +16,8 @@ def test_compare():
name="Friendly",
criteria="Choose the winner of the more friendly contestant based on the input and actual output",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
)
a_test_case = ArenaTestCase(
@@ -67,8 +67,8 @@ def test_compare_with_hyperparameters():
name="Friendly",
criteria="Choose the winner of the more friendly contestant based on the input and actual output",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
)
a_test_case = ArenaTestCase(
diff --git a/tests/test_confident/test_conversational_g_eval_upload.py b/tests/test_confident/test_conversational_g_eval_upload.py
index 6c478a1fe4..4ee587b8f8 100644
--- a/tests/test_confident/test_conversational_g_eval_upload.py
+++ b/tests/test_confident/test_conversational_g_eval_upload.py
@@ -2,7 +2,7 @@
import uuid
import pytest
from deepeval.metrics import ConversationalGEval
-from deepeval.test_case import TurnParams
+from deepeval.test_case import MultiTurnParams
from deepeval.metrics.g_eval import Rubric
from deepeval.confident.api import Api, HttpMethods, Endpoints
from deepeval.confident.types import ConfidentApiError
@@ -25,10 +25,10 @@ def test_conversational_geval_upload_and_fetch(self):
metric = ConversationalGEval(
name=metric_name,
evaluation_params=[
- TurnParams.EXPECTED_OUTCOME,
- TurnParams.RETRIEVAL_CONTEXT,
- TurnParams.SCENARIO,
- # TurnParams.TOOLS_CALLED,
+ MultiTurnParams.EXPECTED_OUTCOME,
+ MultiTurnParams.RETRIEVAL_CONTEXT,
+ MultiTurnParams.SCENARIO,
+ # MultiTurnParams.TOOLS_CALLED,
],
criteria=(
"Test whether the assistant responses are relevant, grounded, "
@@ -68,7 +68,7 @@ def test_conversational_geval_upload_and_fetch(self):
duplicate_metric = ConversationalGEval(
name=metric_name,
evaluation_params=[
- TurnParams.SCENARIO,
+ MultiTurnParams.SCENARIO,
],
criteria="Test whether actual output is relevant to the input given",
)
diff --git a/tests/test_confident/test_g_eval_upload.py b/tests/test_confident/test_g_eval_upload.py
index 3a50b77722..f0f24739bd 100644
--- a/tests/test_confident/test_g_eval_upload.py
+++ b/tests/test_confident/test_g_eval_upload.py
@@ -2,7 +2,7 @@
import uuid
import pytest
from deepeval.metrics import GEval
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
from deepeval.metrics.g_eval import Rubric
from deepeval.confident.api import Api, HttpMethods, Endpoints
from deepeval.confident.types import ConfidentApiError
@@ -25,12 +25,14 @@ def test_geval_upload_and_fetch(self):
metric = GEval(
name=metric_name,
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
- LLMTestCaseParams.EXPECTED_OUTPUT,
- LLMTestCaseParams.CONTEXT,
- # LLMTestCaseParams.TOOLS_CALLED,
- LLMTestCaseParams.RETRIEVAL_CONTEXT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
+ SingleTurnParams.EXPECTED_OUTPUT,
+ SingleTurnParams.CONTEXT,
+ # SingleTurnParams.TOOLS_CALLED,
+ SingleTurnParams.RETRIEVAL_CONTEXT,
+ SingleTurnParams.METADATA,
+ SingleTurnParams.TAGS,
],
criteria="Test whether actual output is relevant to the input given",
rubric=[
@@ -63,13 +65,15 @@ def test_geval_upload_and_fetch(self):
"context",
# "toolsCalled",
"retrievalContext",
+ "metadata",
+ "tags",
}
duplicate_metric = GEval(
name=metric_name,
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
criteria="Test whether actual output is relevant to the input given",
)
diff --git a/tests/test_core/conftest.py b/tests/test_core/conftest.py
index 5ddb112839..bbc1b9438d 100644
--- a/tests/test_core/conftest.py
+++ b/tests/test_core/conftest.py
@@ -158,7 +158,6 @@ def _core_mode_no_confident(
# Clear the in-memory Settings fields (no persistence)
s = get_settings()
with s.edit(persist=False) as ctx:
- ctx.s.API_KEY = None
ctx.s.CONFIDENT_API_KEY = None
# Yield control to the test
diff --git a/tests/test_core/test_core.py b/tests/test_core/test_core.py
index 61853b4bd1..7b5d4c0aef 100644
--- a/tests/test_core/test_core.py
+++ b/tests/test_core/test_core.py
@@ -1,6 +1,25 @@
+from pydantic import SecretStr
+
+from deepeval.confident import api as confident_api
from deepeval.confident.api import is_confident, get_confident_api_key
def test_confident_boundary_off_in_core():
assert get_confident_api_key() is None
assert is_confident() is False
+
+
+def test_confident_api_key_is_read_from_settings(monkeypatch):
+ class DummySettings:
+ CONFIDENT_API_KEY = SecretStr("legacy-unprefixed-confident-key")
+
+ monkeypatch.setattr(confident_api, "get_settings", lambda: DummySettings())
+
+ assert get_confident_api_key() == "legacy-unprefixed-confident-key"
+ assert is_confident() is True
+
+
+def test_legacy_api_key_field_is_removed():
+ from deepeval.config.settings import Settings
+
+ assert "API_KEY" not in Settings.model_fields
diff --git a/tests/test_core/test_imports.py b/tests/test_core/test_imports.py
index 75e46094cf..91f3d3ae8b 100644
--- a/tests/test_core/test_imports.py
+++ b/tests/test_core/test_imports.py
@@ -206,8 +206,8 @@ def test_test_case_imports():
MLLMImage,
ToolCall,
ToolCallParams,
- TurnParams,
- LLMTestCaseParams,
+ MultiTurnParams,
+ SingleTurnParams,
MCPServer,
MCPPromptCall,
MCPResourceCall,
@@ -221,8 +221,8 @@ def test_test_case_imports():
assert MLLMImage is not None
assert ToolCall is not None
assert ToolCallParams is not None
- assert TurnParams is not None
- assert LLMTestCaseParams is not None
+ assert MultiTurnParams is not None
+ assert SingleTurnParams is not None
assert MCPServer is not None
assert MCPPromptCall is not None
assert MCPResourceCall is not None
diff --git a/tests/test_core/test_run/test_turns_table.py b/tests/test_core/test_run/test_turns_table.py
index 667c2831db..dbfdf77c1b 100644
--- a/tests/test_core/test_run/test_turns_table.py
+++ b/tests/test_core/test_run/test_turns_table.py
@@ -35,7 +35,7 @@ def test_turns_table_no_role_or_tools_duplication_with_format_turn():
SimpleNamespace(name="fs.list"),
SimpleNamespace(name="fs.read"),
],
- additional_metadata={"session_id": "sess-9"},
+ metadata={"session_id": "sess-9"},
comments="planner step",
)
diff --git a/tests/test_core/test_test_case/test_deprecated_params.py b/tests/test_core/test_test_case/test_deprecated_params.py
new file mode 100644
index 0000000000..544eba98f4
--- /dev/null
+++ b/tests/test_core/test_test_case/test_deprecated_params.py
@@ -0,0 +1,51 @@
+import warnings
+
+
+def test_llm_test_case_params_alias_is_single_turn_params():
+ from deepeval.test_case import SingleTurnParams
+
+ with warnings.catch_warnings(record=True) as caught:
+ warnings.simplefilter("always")
+ from deepeval.test_case import LLMTestCaseParams
+
+ assert any(
+ issubclass(w.category, DeprecationWarning) for w in caught
+ ), "expected DeprecationWarning when importing LLMTestCaseParams"
+ assert LLMTestCaseParams is SingleTurnParams
+ assert LLMTestCaseParams.METADATA is SingleTurnParams.METADATA
+
+
+def test_turn_params_alias_is_multi_turn_params():
+ from deepeval.test_case import MultiTurnParams
+
+ with warnings.catch_warnings(record=True) as caught:
+ warnings.simplefilter("always")
+ from deepeval.test_case import TurnParams
+
+ assert any(
+ issubclass(w.category, DeprecationWarning) for w in caught
+ ), "expected DeprecationWarning when importing TurnParams"
+ assert TurnParams is MultiTurnParams
+ assert TurnParams.METADATA is MultiTurnParams.METADATA
+
+
+def test_llm_test_case_params_alias_from_submodule():
+ from deepeval.test_case.llm_test_case import SingleTurnParams
+
+ with warnings.catch_warnings(record=True) as caught:
+ warnings.simplefilter("always")
+ from deepeval.test_case.llm_test_case import LLMTestCaseParams
+
+ assert any(issubclass(w.category, DeprecationWarning) for w in caught)
+ assert LLMTestCaseParams is SingleTurnParams
+
+
+def test_turn_params_alias_from_submodule():
+ from deepeval.test_case.conversational_test_case import MultiTurnParams
+
+ with warnings.catch_warnings(record=True) as caught:
+ warnings.simplefilter("always")
+ from deepeval.test_case.conversational_test_case import TurnParams
+
+ assert any(issubclass(w.category, DeprecationWarning) for w in caught)
+ assert TurnParams is MultiTurnParams
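In user code, these tests translate to the old enum names still importing, but resolving to the new classes and emitting a `DeprecationWarning`; a short sketch of the migration surface:

```python
import warnings

# Old spellings keep working but are expected to warn, per the tests above.
with warnings.catch_warnings(record=True):
    warnings.simplefilter("always")
    from deepeval.test_case import LLMTestCaseParams, TurnParams

# New spellings going forward.
from deepeval.test_case import SingleTurnParams, MultiTurnParams

assert LLMTestCaseParams is SingleTurnParams
assert TurnParams is MultiTurnParams
```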
diff --git a/tests/test_core/test_test_case/test_multi_turn/test_conversational_test_case.py b/tests/test_core/test_test_case/test_multi_turn/test_conversational_test_case.py
index b0a24c17d4..09cf70f40d 100644
--- a/tests/test_core/test_test_case/test_multi_turn/test_conversational_test_case.py
+++ b/tests/test_core/test_test_case/test_multi_turn/test_conversational_test_case.py
@@ -1,6 +1,7 @@
import pytest
from pydantic import ValidationError
-from deepeval.test_case import ConversationalTestCase, Turn, TurnParams
+from deepeval.test_case import ConversationalTestCase, Turn
+from deepeval.test_case.api import create_api_test_case
class TestConversationalTestCaseInitialization:
@@ -18,6 +19,7 @@ def test_minimal_initialization(self):
assert test_case.user_description is None
assert test_case.expected_outcome is None
assert test_case.chatbot_role is None
+ assert test_case.metadata is None
assert test_case.additional_metadata is None
assert test_case.comments is None
assert test_case.tags is None
@@ -58,6 +60,10 @@ def test_full_initialization(self):
)
assert test_case.expected_outcome == "Issue resolved satisfactorily"
assert test_case.chatbot_role == "Helpful customer service agent"
+ assert test_case.metadata == {
+ "priority": "high",
+ "department": "billing",
+ }
assert test_case.additional_metadata == {
"priority": "high",
"department": "billing",
@@ -266,8 +272,29 @@ def test_empty_tags_list(self):
def test_empty_additional_metadata(self):
turns = [Turn(role="user", content="Hello")]
test_case = ConversationalTestCase(turns=turns, additional_metadata={})
+ assert test_case.metadata == {}
assert test_case.additional_metadata == {}
+ def test_metadata_input_compatibility(self):
+ turns = [Turn(role="user", content="Hello")]
+ metadata = {"key": "value"}
+ test_case = ConversationalTestCase(turns=turns, metadata=metadata)
+ assert test_case.metadata == metadata
+ assert test_case.additional_metadata == metadata
+
+ def test_api_test_case_uses_metadata(self):
+ metadata = {"key": "value"}
+ test_case = ConversationalTestCase(
+ turns=[Turn(role="user", content="Hello")],
+ metadata=metadata,
+ )
+
+ api_test_case = create_api_test_case(test_case)
+ model_dict = api_test_case.model_dump(by_alias=True)
+
+ assert model_dict["metadata"] == metadata
+ assert "additionalMetadata" not in model_dict
+
class TestConversationalTestCaseEquality:
@@ -335,7 +362,8 @@ def test_serialization_aliases(self):
assert "userDescription" in dumped
assert "expectedOutcome" in dumped
assert "chatbotRole" in dumped
- assert "additionalMetadata" in dumped
+ assert "metadata" in dumped
+ assert "additionalMetadata" not in dumped
class TestConversationalTestCaseCamelCaseInitialization:
@@ -381,6 +409,7 @@ def test_camelcase_field_initialization(self):
assert test_case.user_description == user_description_text
assert test_case.expected_outcome == expected_outcome_text
assert test_case.chatbot_role == chatbot_role_text
+ assert test_case.metadata == metadata_dict
assert test_case.additional_metadata == metadata_dict
assert test_case.comments == comments_text
assert test_case.tags == tags_list
@@ -408,4 +437,5 @@ def test_mixed_case_initialization(self):
assert test_case.user_description == user_description_text
assert test_case.expected_outcome == expected_outcome_text
assert test_case.chatbot_role == chatbot_role_text
+ assert test_case.metadata == metadata_dict
assert test_case.additional_metadata == metadata_dict
diff --git a/tests/test_core/test_test_case/test_multi_turn/test_turn.py b/tests/test_core/test_test_case/test_multi_turn/test_turn.py
index 4e1e973b12..99d39d9888 100644
--- a/tests/test_core/test_test_case/test_multi_turn/test_turn.py
+++ b/tests/test_core/test_test_case/test_multi_turn/test_turn.py
@@ -16,6 +16,7 @@ def test_minimal_initialization(self):
assert turn.mcp_tools_called is None
assert turn.mcp_resources_called is None
assert turn.mcp_prompts_called is None
+ assert turn.metadata is None
assert turn.additional_metadata is None
def test_user_role_initialization(self):
@@ -54,6 +55,7 @@ def test_full_initialization(self):
assert len(turn.retrieval_context) == 2
assert len(turn.tools_called) == 1
assert turn.tools_called[0].name == "weather_tool"
+ assert turn.metadata["model"] == "gpt-4"
assert turn.additional_metadata["model"] == "gpt-4"
@@ -163,10 +165,25 @@ def test_empty_tools_called_list(self):
class TestTurnWithMetadata:
def test_simple_metadata(self):
+ metadata = {"model": "gpt-4", "tokens": 150}
+ turn = Turn(role="assistant", content="Response", metadata=metadata)
+ assert turn.metadata == metadata
+ assert turn.additional_metadata == metadata
+
+ def test_additional_metadata_input_compatibility(self):
metadata = {"model": "gpt-4", "tokens": 150}
turn = Turn(
role="assistant", content="Response", additional_metadata=metadata
)
+ assert turn.metadata == metadata
+ assert turn.additional_metadata == metadata
+
+ def test_additional_metadata_camelcase_input_compatibility(self):
+ metadata = {"model": "gpt-4", "tokens": 150}
+ turn = Turn(
+ role="assistant", content="Response", additionalMetadata=metadata
+ )
+ assert turn.metadata == metadata
assert turn.additional_metadata == metadata
def test_complex_metadata(self):
@@ -229,7 +246,7 @@ def test_repr_with_optional_fields(self):
assert "role='assistant'" in repr_str
assert "content='Hi there!'" in repr_str
assert "user_id='user123'" in repr_str
- assert "additional_metadata=" in repr_str
+ assert "metadata=" in repr_str
def test_repr_with_tools(self):
tool_call = ToolCall(
@@ -332,7 +349,7 @@ def test_model_dump_with_all_fields(self):
assert dumped["user_id"] == "user123"
assert len(dumped["retrieval_context"]) == 2
assert len(dumped["tools_called"]) == 1
- assert dumped["additional_metadata"]["key"] == "value"
+ assert dumped["metadata"]["key"] == "value"
def test_model_dump_exclude_none(self):
turn = Turn(role="user", content="Hello")
@@ -341,7 +358,7 @@ def test_model_dump_exclude_none(self):
assert "user_id" not in dumped
assert "retrieval_context" not in dumped
assert "tools_called" not in dumped
- assert "additional_metadata" not in dumped
+ assert "metadata" not in dumped
class TestTurnCamelCaseInitialization:
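Condensing the assertions above, the metadata rename behaves the same way at the turn level, with both legacy spellings still accepted on input:

```python
from deepeval.test_case import Turn

meta = {"model": "gpt-4", "tokens": 150}

# `metadata` is the canonical field; the legacy snake_case and camelCase inputs map to it.
assert Turn(role="assistant", content="Response", metadata=meta).metadata == meta
assert Turn(role="assistant", content="Response", additional_metadata=meta).metadata == meta
assert Turn(role="assistant", content="Response", additionalMetadata=meta).metadata == meta
```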
diff --git a/tests/test_core/test_test_case/test_single_turn.py b/tests/test_core/test_test_case/test_single_turn.py
index b61878b119..e7d4182789 100644
--- a/tests/test_core/test_test_case/test_single_turn.py
+++ b/tests/test_core/test_test_case/test_single_turn.py
@@ -6,9 +6,10 @@
from deepeval.test_case import (
LLMTestCase,
ToolCall,
- LLMTestCaseParams,
+ SingleTurnParams,
ToolCallParams,
)
+from deepeval.test_case.api import create_api_test_case
from deepeval.test_case.mcp import MCPServer
@@ -606,7 +607,7 @@ def test_serialization_aliases(self):
assert "tokenCost" in model_dict
assert "completionTime" in model_dict
- def test_additional_metadata_serialization(self):
+ def test_metadata_serialization(self):
metadata = {
"source": "test",
"timestamp": "2024-01-01",
@@ -614,33 +615,56 @@ def test_additional_metadata_serialization(self):
"list": [1, 2, 3],
}
- test_case = LLMTestCase(input="test", additional_metadata=metadata)
+ test_case = LLMTestCase(input="test", metadata=metadata)
+ assert test_case.metadata == metadata
assert test_case.additional_metadata == metadata
model_dict = test_case.model_dump(by_alias=True)
- assert "additionalMetadata" in model_dict
- assert model_dict["additionalMetadata"] == metadata
+ assert "metadata" in model_dict
+ assert "additionalMetadata" not in model_dict
+ assert model_dict["metadata"] == metadata
+
+ def test_additional_metadata_input_compatibility(self):
+ metadata = {"source": "test"}
+
+ snake_case = LLMTestCase(input="test", additional_metadata=metadata)
+ camel_case = LLMTestCase(input="test", additionalMetadata=metadata)
+
+ assert snake_case.metadata == metadata
+ assert camel_case.metadata == metadata
+ assert snake_case.additional_metadata == metadata
+ assert camel_case.additional_metadata == metadata
+
+ def test_api_test_case_uses_metadata(self):
+ metadata = {"source": "test"}
+ test_case = LLMTestCase(input="test", metadata=metadata)
+
+ api_test_case = create_api_test_case(test_case)
+ model_dict = api_test_case.model_dump(by_alias=True)
+
+ assert model_dict["metadata"] == metadata
+ assert "additionalMetadata" not in model_dict
class TestLLMTestCaseParams:
def test_enum_values(self):
- assert LLMTestCaseParams.INPUT.value == "input"
- assert LLMTestCaseParams.ACTUAL_OUTPUT.value == "actual_output"
- assert LLMTestCaseParams.EXPECTED_OUTPUT.value == "expected_output"
- assert LLMTestCaseParams.CONTEXT.value == "context"
- assert LLMTestCaseParams.RETRIEVAL_CONTEXT.value == "retrieval_context"
- assert LLMTestCaseParams.TOOLS_CALLED.value == "tools_called"
- assert LLMTestCaseParams.EXPECTED_TOOLS.value == "expected_tools"
- assert LLMTestCaseParams.MCP_SERVERS.value == "mcp_servers"
- assert LLMTestCaseParams.MCP_TOOLS_CALLED.value == "mcp_tools_called"
+ assert SingleTurnParams.INPUT.value == "input"
+ assert SingleTurnParams.ACTUAL_OUTPUT.value == "actual_output"
+ assert SingleTurnParams.EXPECTED_OUTPUT.value == "expected_output"
+ assert SingleTurnParams.CONTEXT.value == "context"
+ assert SingleTurnParams.RETRIEVAL_CONTEXT.value == "retrieval_context"
+ assert SingleTurnParams.METADATA.value == "metadata"
+ assert SingleTurnParams.TAGS.value == "tags"
+ assert SingleTurnParams.TOOLS_CALLED.value == "tools_called"
+ assert SingleTurnParams.EXPECTED_TOOLS.value == "expected_tools"
+ assert SingleTurnParams.MCP_SERVERS.value == "mcp_servers"
+ assert SingleTurnParams.MCP_TOOLS_CALLED.value == "mcp_tools_called"
assert (
- LLMTestCaseParams.MCP_RESOURCES_CALLED.value
+ SingleTurnParams.MCP_RESOURCES_CALLED.value
== "mcp_resources_called"
)
- assert (
- LLMTestCaseParams.MCP_PROMPTS_CALLED.value == "mcp_prompts_called"
- )
+ assert SingleTurnParams.MCP_PROMPTS_CALLED.value == "mcp_prompts_called"
class TestToolCallParams:
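The single-turn tests above pin down the same rename on `LLMTestCase`; condensed, callers can rely on:

```python
from deepeval.test_case import LLMTestCase

meta = {"source": "test"}

# New canonical field, with both legacy spellings still accepted on input.
for case in (
    LLMTestCase(input="test", metadata=meta),
    LLMTestCase(input="test", additional_metadata=meta),
    LLMTestCase(input="test", additionalMetadata=meta),
):
    assert case.metadata == meta
    assert case.additional_metadata == meta

# Serialization now exposes only the new key.
dumped = LLMTestCase(input="test", metadata=meta).model_dump(by_alias=True)
assert "metadata" in dumped and "additionalMetadata" not in dumped
```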
diff --git a/tests/test_core/test_tracing/example_e2e_trace_evals.py b/tests/test_core/test_tracing/example_e2e_trace_evals.py
index f636eecc37..f1d154dde2 100644
--- a/tests/test_core/test_tracing/example_e2e_trace_evals.py
+++ b/tests/test_core/test_tracing/example_e2e_trace_evals.py
@@ -1,6 +1,6 @@
from deepeval.metrics import GEval
from deepeval.tracing import observe, update_current_trace
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
from deepeval.test_case import ToolCall
from deepeval.dataset import EvaluationDataset
@@ -8,16 +8,16 @@
name="Relevancy",
criteria="For the given input, the output should be relevant to the input.",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
)
correctness = GEval(
name="Correctness",
criteria="Given the expected output, determine whether the output is correct or not.",
evaluation_params=[
- LLMTestCaseParams.EXPECTED_OUTPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.EXPECTED_OUTPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
)
diff --git a/tests/test_integrations/test_langchain/conftest.py b/tests/test_integrations/test_langchain/conftest.py
index 7b6270646b..3dcfa26b2e 100644
--- a/tests/test_integrations/test_langchain/conftest.py
+++ b/tests/test_integrations/test_langchain/conftest.py
@@ -368,7 +368,7 @@ def _add_test_case_to_run(
tokenCost=token_cost,
completionTime=completion_time,
tags=tags,
- additionalMetadata=additional_metadata,
+ metadata=additional_metadata,
success=passed,
metricsData=None,
trace=None,
diff --git a/tests/test_integrations/test_langgraph/conftest.py b/tests/test_integrations/test_langgraph/conftest.py
index e5ae66a13b..a5412bb44a 100644
--- a/tests/test_integrations/test_langgraph/conftest.py
+++ b/tests/test_integrations/test_langgraph/conftest.py
@@ -605,7 +605,7 @@ def _add_test_case_to_run(
tokenCost=token_cost, # Total token count from llmSpans
completionTime=completion_time, # Duration in seconds from timestamps
tags=tags, # From CallbackHandler tags
- additionalMetadata=additional_metadata,
+ metadata=additional_metadata,
success=passed,
metricsData=None, # None = "no metrics evaluated" (bypasses guard)
trace=None, # Must be None - embedding traces causes 500s
diff --git a/tests/test_metrics/test_answer_relevancy_metric_empty_output.py b/tests/test_metrics/test_answer_relevancy_metric_empty_output.py
index a1f3103636..03d1e63093 100644
--- a/tests/test_metrics/test_answer_relevancy_metric_empty_output.py
+++ b/tests/test_metrics/test_answer_relevancy_metric_empty_output.py
@@ -16,7 +16,7 @@
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
from deepeval.metrics.utils import check_llm_test_case_params
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, SingleTurnParams
from deepeval.errors import MissingTestCaseParamsError
from tests.test_core.stubs import DummyModel
diff --git a/tests/test_metrics/test_arena_geval_metric.py b/tests/test_metrics/test_arena_geval_metric.py
index 14684afc50..23fe7ac40f 100644
--- a/tests/test_metrics/test_arena_geval_metric.py
+++ b/tests/test_metrics/test_arena_geval_metric.py
@@ -5,7 +5,7 @@
LLMTestCase,
MLLMImage,
ArenaTestCase,
- LLMTestCaseParams,
+ SingleTurnParams,
Contestant,
)
from deepeval import compare
@@ -57,8 +57,8 @@ def test_normal_sync_metric_measure(self):
name="Friendly",
criteria="Choose the winner of the more accurate contestant based on the input and actual output",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
async_mode=False,
)
@@ -102,8 +102,8 @@ def test_normal_async_metric_measure(self):
name="Friendly",
criteria="Choose the winner of the more accurate contestant based on the input and actual output",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
)
metric.measure(test_case)
@@ -147,8 +147,8 @@ def test_multimodal_async_metric_measure(self):
name="Friendly",
criteria="Choose the winner of the more accurate contestant based on the input and actual output",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
)
metric.measure(test_case)
@@ -192,8 +192,8 @@ def test_multimodal_sync_metric_measure(self):
name="Friendly",
criteria="Choose the winner of the more accurate contestant based on the input and actual output",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
async_mode=False,
)
@@ -239,8 +239,8 @@ def test_invalid_model_throws_error_for_multimodal(self):
name="Friendly",
criteria="Choose the winner of the more accurate contestant based on the input and actual output",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
model="gpt-3.5-turbo",
)
@@ -280,8 +280,8 @@ def test_normal_compare_method(self):
name="Friendly",
criteria="Choose the winner of the more accurate contestant based on the input and actual output",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
async_mode=False,
)
@@ -325,8 +325,8 @@ def test_multimodal_compare_method(self):
name="Friendly",
criteria="Choose the winner of the more accurate contestant based on the input and actual output",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
async_mode=False,
)
diff --git a/tests/test_metrics/test_conversational_dag.py b/tests/test_metrics/test_conversational_dag.py
index 5faecea482..c62efcfaf3 100644
--- a/tests/test_metrics/test_conversational_dag.py
+++ b/tests/test_metrics/test_conversational_dag.py
@@ -8,7 +8,7 @@
ConversationalNonBinaryJudgementNode,
ConversationalVerdictNode,
)
-from deepeval.test_case import TurnParams
+from deepeval.test_case import MultiTurnParams
from deepeval.metrics.dag.utils import (
is_valid_dag_from_roots,
extract_required_params,
@@ -28,7 +28,7 @@ def test_is_valid_dag_true(self):
instructions="Extract",
output_label="X",
children=[judgement_node],
- evaluation_params=[TurnParams.ROLE],
+ evaluation_params=[MultiTurnParams.ROLE],
)
assert is_valid_dag_from_roots([root], multiturn=True) is True
@@ -105,17 +105,17 @@ def test_extract_required_params(self):
judgement_node = ConversationalBinaryJudgementNode(
criteria="?",
children=[leaf_false, leaf_true],
- evaluation_params=[TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.CONTENT],
)
task = ConversationalTaskNode(
instructions="Extract something",
output_label="abc",
- evaluation_params=[TurnParams.ROLE],
+ evaluation_params=[MultiTurnParams.ROLE],
children=[judgement_node],
)
params = extract_required_params([task], multiturn=True)
- assert TurnParams.ROLE in params
- assert TurnParams.CONTENT in params
+ assert MultiTurnParams.ROLE in params
+ assert MultiTurnParams.CONTENT in params
assert len(params) == 2
def test_invalid_child_type(self):
@@ -134,17 +134,17 @@ def test_extract_required_params_non_binary(self):
non_binary = ConversationalNonBinaryJudgementNode(
criteria="Evaluate this",
children=[leaf1, leaf2],
- evaluation_params=[TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.CONTENT],
)
task = ConversationalTaskNode(
instructions="Analyze",
output_label="xyz",
- evaluation_params=[TurnParams.ROLE],
+ evaluation_params=[MultiTurnParams.ROLE],
children=[non_binary],
)
params = extract_required_params([task], multiturn=True)
- assert TurnParams.ROLE in params
- assert TurnParams.CONTENT in params
+ assert MultiTurnParams.ROLE in params
+ assert MultiTurnParams.CONTENT in params
assert len(params) == 2
def test_disallow_multiple_judgement_roots(self):
@@ -253,7 +253,7 @@ def test_task_node_leaf(self):
task = ConversationalTaskNode(
instructions="Standalone task",
output_label="standalone",
- evaluation_params=[TurnParams.ROLE],
+ evaluation_params=[MultiTurnParams.ROLE],
children=[],
)
dag = DeepAcyclicGraph(root_nodes=[task])
diff --git a/tests/test_metrics/test_conversational_g_eval.py b/tests/test_metrics/test_conversational_g_eval.py
index 1294cc4423..9eef603a12 100644
--- a/tests/test_metrics/test_conversational_g_eval.py
+++ b/tests/test_metrics/test_conversational_g_eval.py
@@ -4,7 +4,7 @@
from deepeval.test_case import (
ConversationalTestCase,
MLLMImage,
- TurnParams,
+ MultiTurnParams,
Turn,
)
from deepeval import evaluate
@@ -39,7 +39,7 @@ def test_normal_sync_metric_measure(self):
)
metric = ConversationalGEval(
name="Testing image",
- evaluation_params=[TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.CONTENT],
criteria="Check if the assistant's turns are relevanct and helpful to users turns",
async_mode=False,
)
@@ -66,7 +66,7 @@ def test_normal_async_metric_measure(self):
)
metric = ConversationalGEval(
name="Testing image",
- evaluation_params=[TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.CONTENT],
criteria="Check if the assistant's turns are relevanct and helpful to users turns",
)
metric.measure(convo_test_case)
@@ -94,7 +94,7 @@ def test_multimodal_async_metric_measure(self):
)
metric = ConversationalGEval(
name="Testing image",
- evaluation_params=[TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.CONTENT],
criteria="Check if the assistant's turns are relevanct and helpful to users turns",
)
metric.measure(convo_test_case)
@@ -122,7 +122,7 @@ def test_multimodal_sync_metric_measure(self):
)
metric = ConversationalGEval(
name="Testing image",
- evaluation_params=[TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.CONTENT],
criteria="Check if the assistant's turns are relevanct and helpful to users turns",
async_mode=False,
)
@@ -152,7 +152,7 @@ def test_invalid_model_throws_error_for_multimodal(self):
with pytest.raises(ValueError):
metric = ConversationalGEval(
name="Testing image",
- evaluation_params=[TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.CONTENT],
criteria="Check if the assistant's turns are relevanct and helpful to users turns",
model="gpt-3.5-turbo",
)
@@ -175,7 +175,7 @@ def test_normal_evaluate_method(self):
)
metric = ConversationalGEval(
name="Testing image",
- evaluation_params=[TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.CONTENT],
criteria="Check if the assistant's turns are relevanct and helpful to users turns",
)
@@ -202,7 +202,7 @@ def test_multimodal_evaluate_method(self):
)
metric = ConversationalGEval(
name="Testing image",
- evaluation_params=[TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.CONTENT],
criteria="Check if the assistant's turns are relevanct and helpful to users turns",
)
diff --git a/tests/test_metrics/test_dag.py b/tests/test_metrics/test_dag.py
index b59c4f70d4..422d80eb98 100644
--- a/tests/test_metrics/test_dag.py
+++ b/tests/test_metrics/test_dag.py
@@ -6,7 +6,7 @@
VerdictNode,
DeepAcyclicGraph,
)
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import SingleTurnParams
from deepeval.metrics.dag.utils import (
is_valid_dag_from_roots,
extract_required_params,
@@ -28,7 +28,7 @@ def test_is_valid_dag_true(self):
instructions="Extract",
output_label="X",
children=[judgement_node],
- evaluation_params=[LLMTestCaseParams.INPUT],
+ evaluation_params=[SingleTurnParams.INPUT],
)
assert is_valid_dag_from_roots([root], multiturn=False) is True
@@ -102,21 +102,21 @@ def test_extract_required_params(self):
judgement_node = BinaryJudgementNode(
criteria="?",
children=[leaf_false, leaf_true],
- evaluation_params=[LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.EXPECTED_OUTPUT],
)
task = TaskNode(
instructions="Extract something",
output_label="abc",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
children=[judgement_node],
)
params = extract_required_params([task], multiturn=False)
- assert LLMTestCaseParams.INPUT in params
- assert LLMTestCaseParams.ACTUAL_OUTPUT in params
- assert LLMTestCaseParams.EXPECTED_OUTPUT in params
+ assert SingleTurnParams.INPUT in params
+ assert SingleTurnParams.ACTUAL_OUTPUT in params
+ assert SingleTurnParams.EXPECTED_OUTPUT in params
assert len(params) == 3
def test_invalid_child_type(self):
@@ -135,17 +135,17 @@ def test_extract_required_params_non_binary(self):
non_binary = NonBinaryJudgementNode(
criteria="Evaluate this",
children=[leaf1, leaf2],
- evaluation_params=[LLMTestCaseParams.EXPECTED_OUTPUT],
+ evaluation_params=[SingleTurnParams.EXPECTED_OUTPUT],
)
task = TaskNode(
instructions="Analyze",
output_label="xyz",
- evaluation_params=[LLMTestCaseParams.INPUT],
+ evaluation_params=[SingleTurnParams.INPUT],
children=[non_binary],
)
params = extract_required_params([task], multiturn=False)
- assert LLMTestCaseParams.INPUT in params
- assert LLMTestCaseParams.EXPECTED_OUTPUT in params
+ assert SingleTurnParams.INPUT in params
+ assert SingleTurnParams.EXPECTED_OUTPUT in params
assert len(params) == 2
def test_disallow_multiple_judgement_roots(self):
@@ -253,7 +253,7 @@ def test_task_node_leaf(self):
task = TaskNode(
instructions="Standalone task",
output_label="standalone",
- evaluation_params=[LLMTestCaseParams.INPUT],
+ evaluation_params=[SingleTurnParams.INPUT],
children=[],
)
dag = DeepAcyclicGraph(root_nodes=[task])
diff --git a/tests/test_metrics/test_dag_serialization.py b/tests/test_metrics/test_dag_serialization.py
index 1f18a8f235..a343db5e16 100644
--- a/tests/test_metrics/test_dag_serialization.py
+++ b/tests/test_metrics/test_dag_serialization.py
@@ -25,7 +25,7 @@
ConversationalVerdictNode,
)
from deepeval.metrics.dag.utils import is_valid_dag_from_roots
-from deepeval.test_case import LLMTestCaseParams, TurnParams
+from deepeval.test_case import SingleTurnParams, MultiTurnParams
# ----------------------------------------------------------------------------
@@ -40,15 +40,15 @@ def _build_simple_single_turn_dag() -> DeepAcyclicGraph:
criteria="Is the output a summary?",
children=[leaf_false, leaf_true],
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
)
root = TaskNode(
instructions="Extract the summary.",
output_label="Summary",
children=[judgement],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
label="extract",
)
return DeepAcyclicGraph(root_nodes=[root])
@@ -91,7 +91,7 @@ def test_dag_to_dict_evaluation_params_serialized_as_strings(self):
]
assert len(task_specs) == 1
assert task_specs[0]["evaluation_params"] == [
- LLMTestCaseParams.ACTUAL_OUTPUT.value
+ SingleTurnParams.ACTUAL_OUTPUT.value
]
def test_dag_to_dict_verdict_with_score_only(self):
@@ -118,14 +118,14 @@ def test_round_trip_via_dict_preserves_structure(self):
assert root.instructions == "Extract the summary."
assert root.output_label == "Summary"
assert root.label == "extract"
- assert root.evaluation_params == [LLMTestCaseParams.ACTUAL_OUTPUT]
+ assert root.evaluation_params == [SingleTurnParams.ACTUAL_OUTPUT]
assert len(root.children) == 1
judge = root.children[0]
assert isinstance(judge, BinaryJudgementNode)
assert judge.criteria == "Is the output a summary?"
assert judge.evaluation_params == [
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
]
assert {c.verdict for c in judge.children} == {True, False}
assert {c.score for c in judge.children} == {0, 10}
@@ -154,7 +154,7 @@ def test_non_binary_round_trip(self):
judge = NonBinaryJudgementNode(
criteria="Classify the format.",
children=[v_a, v_b, v_c],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
)
dag = DeepAcyclicGraph(root_nodes=[judge])
@@ -176,7 +176,7 @@ def test_shared_judgement_node_is_one_object(self):
shared_judge = BinaryJudgementNode(
criteria="Inner check?",
children=[leaf_no, leaf_yes],
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ evaluation_params=[SingleTurnParams.ACTUAL_OUTPUT],
label="shared_judge",
)
wrap_a = VerdictNode(verdict="left", child=shared_judge)
@@ -232,7 +232,7 @@ def _build_simple_multiturn_dag() -> DeepAcyclicGraph:
judge = ConversationalBinaryJudgementNode(
criteria="Did the assistant respond appropriately?",
children=[v_no, v_yes],
- evaluation_params=[TurnParams.CONTENT, TurnParams.ROLE],
+ evaluation_params=[MultiTurnParams.CONTENT, MultiTurnParams.ROLE],
)
return DeepAcyclicGraph(root_nodes=[judge])
@@ -248,8 +248,8 @@ def test_multiturn_round_trip(self):
root = rebuilt.root_nodes[0]
assert isinstance(root, ConversationalBinaryJudgementNode)
assert root.evaluation_params == [
- TurnParams.CONTENT,
- TurnParams.ROLE,
+ MultiTurnParams.CONTENT,
+ MultiTurnParams.ROLE,
]
assert {c.verdict for c in root.children} == {True, False}
@@ -268,13 +268,13 @@ def test_multiturn_task_node_turn_window_round_trip(self):
judge = ConversationalBinaryJudgementNode(
criteria="?",
children=[v_no, v_yes],
- evaluation_params=[TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.CONTENT],
)
task = ConversationalTaskNode(
instructions="Look at first 2 turns",
output_label="X",
children=[judge],
- evaluation_params=[TurnParams.CONTENT],
+ evaluation_params=[MultiTurnParams.CONTENT],
turn_window=(0, 1),
)
dag = DeepAcyclicGraph(root_nodes=[task])
diff --git a/tests/test_metrics/test_g_eval_metric.py b/tests/test_metrics/test_g_eval_metric.py
index 9fd7c9cb5e..696b983c06 100644
--- a/tests/test_metrics/test_g_eval_metric.py
+++ b/tests/test_metrics/test_g_eval_metric.py
@@ -5,7 +5,7 @@
LLMTestCase,
MLLMImage,
ToolCall,
- LLMTestCaseParams,
+ SingleTurnParams,
)
from deepeval import evaluate
@@ -42,8 +42,8 @@ def test_normal_sync_metric_measure(self):
metric = GEval(
name="Testing GEval",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
criteria="Check if the actual output is relevant to input",
async_mode=False,
@@ -74,8 +74,8 @@ def test_normal_async_metric_measure(self):
metric = GEval(
name="Testing GEval",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
criteria="Check if the actual output is relevant to input",
)
@@ -102,8 +102,8 @@ def test_multimodal_async_metric_measure(self):
metric = GEval(
name="Testing GEval",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
criteria="Check if the actual output is relevant to input",
)
@@ -130,8 +130,8 @@ def test_multimodal_sync_metric_measure(self):
metric = GEval(
name="Testing GEval",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
criteria="Check if the actual output is relevant to input",
async_mode=False,
@@ -160,8 +160,8 @@ def test_invalid_model_throws_error_for_multimodal(self):
metric = GEval(
name="Testing GEval",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
async_mode=False,
model="gpt-3.5-turbo",
@@ -189,8 +189,8 @@ def test_normal_evaluate_method(self):
metric = GEval(
name="Testing GEval",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
criteria="Check if the actual output is relevant to input",
)
@@ -214,8 +214,8 @@ def test_multimodal_evaluate_method(self):
metric = GEval(
name="Testing GEval",
evaluation_params=[
- LLMTestCaseParams.INPUT,
- LLMTestCaseParams.ACTUAL_OUTPUT,
+ SingleTurnParams.INPUT,
+ SingleTurnParams.ACTUAL_OUTPUT,
],
criteria="Check if the actual output is relevant to input",
async_mode=False,
diff --git a/tests/test_metrics/test_g_eval_utils.py b/tests/test_metrics/test_g_eval_utils.py
new file mode 100644
index 0000000000..8ee48b0b2e
--- /dev/null
+++ b/tests/test_metrics/test_g_eval_utils.py
@@ -0,0 +1,135 @@
+import pytest
+
+from deepeval.errors import MissingTestCaseParamsError
+from deepeval.metrics.g_eval.utils import (
+ CONVERSATIONAL_G_EVAL_API_PARAMS,
+ G_EVAL_API_PARAMS,
+ construct_geval_upload_payload,
+ construct_non_turns_test_case_string,
+ construct_test_case_string,
+)
+from deepeval.metrics.utils import (
+ check_conversational_test_case_params,
+ check_llm_test_case_params,
+ convert_turn_to_dict,
+)
+from deepeval.test_case import (
+ ConversationalTestCase,
+ LLMTestCase,
+ SingleTurnParams,
+ Turn,
+ MultiTurnParams,
+)
+
+
+class DummyMetric:
+ __name__ = "DummyMetric"
+ error = None
+
+
+class DummyConversationalMetric:
+ __name__ = "DummyConversationalMetric"
+ error = None
+
+
+def test_geval_accepts_metadata_and_tags():
+ test_case = LLMTestCase(
+ input="input",
+ metadata={"source": "unit"},
+ tags=["tag"],
+ )
+
+ text = construct_test_case_string(
+ [SingleTurnParams.METADATA, SingleTurnParams.TAGS],
+ test_case,
+ )
+ payload = construct_geval_upload_payload(
+ name="metadata-test",
+ evaluation_params=[SingleTurnParams.METADATA, SingleTurnParams.TAGS],
+ g_eval_api_params=G_EVAL_API_PARAMS,
+ criteria="criteria",
+ )
+
+ assert "Metadata" in text
+ assert "Tags" in text
+ assert payload["evaluationParams"] == ["metadata", "tags"]
+
+
+def test_geval_requires_metadata_when_selected():
+ test_case = LLMTestCase(input="input", tags=["tag"])
+
+ with pytest.raises(MissingTestCaseParamsError):
+ check_llm_test_case_params(
+ test_case,
+ [SingleTurnParams.METADATA],
+ None,
+ None,
+ DummyMetric(),
+ )
+
+
+def test_conversational_geval_accepts_metadata_and_tags():
+ case_metadata = {"case": "metadata"}
+ case_tags = ["tag"]
+ test_case = ConversationalTestCase(
+ turns=[Turn(role="user", content="hello")],
+ metadata=case_metadata,
+ tags=case_tags,
+ )
+
+ non_turn_text = construct_non_turns_test_case_string(
+ [MultiTurnParams.METADATA, MultiTurnParams.TAGS],
+ test_case,
+ )
+ turn_dict = convert_turn_to_dict(
+ test_case.turns[0],
+ [
+ MultiTurnParams.CONTENT,
+ MultiTurnParams.ROLE,
+ MultiTurnParams.METADATA,
+ MultiTurnParams.TAGS,
+ ],
+ )
+ payload = construct_geval_upload_payload(
+ name="conversational-metadata-test",
+ evaluation_params=[MultiTurnParams.METADATA, MultiTurnParams.TAGS],
+ g_eval_api_params=CONVERSATIONAL_G_EVAL_API_PARAMS,
+ criteria="criteria",
+ multi_turn=True,
+ )
+
+ assert "Metadata" in non_turn_text
+ assert "case" in non_turn_text
+ assert "Tags" in non_turn_text
+ assert "tag" in non_turn_text
+ assert "metadata" not in turn_dict
+ assert "tags" not in turn_dict
+ assert payload["evaluationParams"] == ["metadata", "tags"]
+
+
+def test_conversational_geval_requires_metadata_when_selected():
+ test_case = ConversationalTestCase(
+ turns=[Turn(role="user", content="hello")],
+ tags=["tag"],
+ )
+
+ with pytest.raises(MissingTestCaseParamsError):
+ check_conversational_test_case_params(
+ test_case,
+ [MultiTurnParams.METADATA],
+ DummyConversationalMetric(),
+ )
+
+
+def test_conversational_geval_requires_tags_when_selected():
+ test_case = ConversationalTestCase(
+ turns=[Turn(role="user", content="hello")],
+ metadata={"case": "metadata"},
+ )
+
+ with pytest.raises(MissingTestCaseParamsError):
+ check_conversational_test_case_params(
+ test_case,
+ [MultiTurnParams.TAGS],
+ DummyConversationalMetric(),
+ )
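At the metric level, the new `METADATA` and `TAGS` params exercised above are selected like any other `GEval` evaluation param; a sketch mirroring the example script earlier in this diff:

```python
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, SingleTurnParams

metric = GEval(
    name="Metadata has source",
    criteria="Score favorably when the metadata includes a source.",
    evaluation_params=[SingleTurnParams.METADATA, SingleTurnParams.TAGS],
)

test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris",
    metadata={"source": "wikipedia"},
    tags=["geography"],
)
metric.measure(test_case)
```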
diff --git a/tests/test_metrics/test_turn_contextual_recall_metric.py b/tests/test_metrics/test_turn_contextual_recall_metric.py
index 7dace7b102..c42099703d 100644
--- a/tests/test_metrics/test_turn_contextual_recall_metric.py
+++ b/tests/test_metrics/test_turn_contextual_recall_metric.py
@@ -4,7 +4,7 @@
from deepeval.test_case import (
ConversationalTestCase,
MLLMImage,
- TurnParams,
+ MultiTurnParams,
Turn,
)
from deepeval import evaluate
diff --git a/tests/test_metrics/turn_contextual_relevancy_metric.py b/tests/test_metrics/turn_contextual_relevancy_metric.py
index 42f55a596f..32eaa38fca 100644
--- a/tests/test_metrics/turn_contextual_relevancy_metric.py
+++ b/tests/test_metrics/turn_contextual_relevancy_metric.py
@@ -4,7 +4,7 @@
from deepeval.test_case import (
ConversationalTestCase,
MLLMImage,
- TurnParams,
+ MultiTurnParams,
Turn,
)
from deepeval import evaluate
diff --git a/try_evals_iterator_async_span.py b/try_evals_iterator_async_span.py
deleted file mode 100644
index bc668ed948..0000000000
--- a/try_evals_iterator_async_span.py
+++ /dev/null
@@ -1,82 +0,0 @@
-"""Manual smoke test: async + span-level metric.
-
- python try_evals_iterator_async_span.py
-
-Metric is declared on @observe(metrics=[...]) and evaluated on the span.
-"""
-
-from __future__ import annotations
-
-import asyncio
-import random
-import time
-from typing import List
-
-from deepeval.dataset import EvaluationDataset, Golden
-from deepeval.evaluate.configs import AsyncConfig, DisplayConfig
-from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from deepeval.tracing import observe
-
-
-AGENT_DELAY_SEC = 0.6
-METRIC_DELAY_SEC = 0.4
-
-
-class RandomScoreMetric(BaseMetric):
- threshold: float = 0.5
- async_mode: bool = True
- _required_params: List[LLMTestCaseParams] = [LLMTestCaseParams.INPUT]
-
- def __init__(self, threshold: float = 0.5):
- self.threshold = threshold
-
- def _finalize(self) -> float:
- self.score = random.random()
- self.success = self.score >= self.threshold
- self.reason = f"random score {self.score:.3f}"
- return self.score
-
- def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
- time.sleep(METRIC_DELAY_SEC)
- return self._finalize()
-
- async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
- await asyncio.sleep(METRIC_DELAY_SEC)
- return self._finalize()
-
- def is_successful(self) -> bool:
- return bool(self.success)
-
- @property
- def __name__(self):
- return "RandomScore"
-
-
-QUESTIONS = [
- f"[{i}] {q}"
- for i, q in enumerate(
- [
- "What is the capital of France?",
- "Who wrote Hamlet?",
- "What is 2 + 2?",
- "Define entropy.",
- "What is the speed of light?",
- ]
- )
-]
-
-
-@observe(type="agent", name="span_metric_agent", metrics=[RandomScoreMetric()])
-def agent(question: str) -> str:
- time.sleep(AGENT_DELAY_SEC)
- return f"Answer to {question!r} is 42."
-
-
-if __name__ == "__main__":
- dataset = EvaluationDataset(goldens=[Golden(input=q) for q in QUESTIONS])
- for golden in dataset.evals_iterator(
- async_config=AsyncConfig(run_async=True),
- display_config=DisplayConfig(show_indicator=True, verbose_mode=False),
- ):
- agent(golden.input)
diff --git a/try_evals_iterator_async_toplevel.py b/try_evals_iterator_async_toplevel.py
deleted file mode 100644
index a5bf343f82..0000000000
--- a/try_evals_iterator_async_toplevel.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""Manual smoke test: async + top-level metric.
-
- python try_evals_iterator_async_toplevel.py
-
-Metric is passed to evals_iterator(metrics=[...]) and evaluated at the trace level.
-"""
-
-from __future__ import annotations
-
-import asyncio
-import random
-import time
-from typing import List
-
-from deepeval.dataset import EvaluationDataset, Golden
-from deepeval.evaluate.configs import AsyncConfig, DisplayConfig
-from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from deepeval.tracing import observe
-
-
-AGENT_DELAY_SEC = 0.6
-METRIC_DELAY_SEC = 0.4
-
-
-class RandomScoreMetric(BaseMetric):
- threshold: float = 0.5
- async_mode: bool = True
- _required_params: List[LLMTestCaseParams] = [LLMTestCaseParams.INPUT]
-
- def __init__(self, threshold: float = 0.5):
- self.threshold = threshold
-
- def _finalize(self) -> float:
- self.score = random.random()
- self.success = self.score >= self.threshold
- self.reason = f"random score {self.score:.3f}"
- return self.score
-
- def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
- time.sleep(METRIC_DELAY_SEC)
- return self._finalize()
-
- async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
- await asyncio.sleep(METRIC_DELAY_SEC)
- return self._finalize()
-
- def is_successful(self) -> bool:
- return bool(self.success)
-
- @property
- def __name__(self):
- return "RandomScore"
-
-
-QUESTIONS = [
- f"[{i}] {q}"
- for i, q in enumerate(
- [
- "What is the capital of France?",
- "Who wrote Hamlet?",
- "What is 2 + 2?",
- "Define entropy.",
- "What is the speed of light?",
- ]
- )
-]
-
-
-@observe(type="agent", name="bare_agent")
-def agent(question: str) -> str:
- time.sleep(AGENT_DELAY_SEC)
- return f"Answer to {question!r} is 42."
-
-
-if __name__ == "__main__":
- dataset = EvaluationDataset(goldens=[Golden(input=q) for q in QUESTIONS])
- for golden in dataset.evals_iterator(
- metrics=[RandomScoreMetric()],
- async_config=AsyncConfig(run_async=True),
- display_config=DisplayConfig(show_indicator=True, verbose_mode=False),
- ):
- agent(golden.input)
diff --git a/try_evals_iterator_sync_span.py b/try_evals_iterator_sync_span.py
deleted file mode 100644
index 73a2456500..0000000000
--- a/try_evals_iterator_sync_span.py
+++ /dev/null
@@ -1,82 +0,0 @@
-"""Manual smoke test: sync + span-level metric.
-
- python try_evals_iterator_sync_span.py
-
-Metric is declared on @observe(metrics=[...]) and evaluated on the span.
-"""
-
-from __future__ import annotations
-
-import asyncio
-import random
-import time
-from typing import List
-
-from deepeval.dataset import EvaluationDataset, Golden
-from deepeval.evaluate.configs import AsyncConfig, DisplayConfig
-from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from deepeval.tracing import observe
-
-
-AGENT_DELAY_SEC = 0.6
-METRIC_DELAY_SEC = 0.4
-
-
-class RandomScoreMetric(BaseMetric):
- threshold: float = 0.5
- async_mode: bool = True
- _required_params: List[LLMTestCaseParams] = [LLMTestCaseParams.INPUT]
-
- def __init__(self, threshold: float = 0.5):
- self.threshold = threshold
-
- def _finalize(self) -> float:
- self.score = random.random()
- self.success = self.score >= self.threshold
- self.reason = f"random score {self.score:.3f}"
- return self.score
-
- def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
- time.sleep(METRIC_DELAY_SEC)
- return self._finalize()
-
- async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
- await asyncio.sleep(METRIC_DELAY_SEC)
- return self._finalize()
-
- def is_successful(self) -> bool:
- return bool(self.success)
-
- @property
- def __name__(self):
- return "RandomScore"
-
-
-QUESTIONS = [
- f"[{i}] {q}"
- for i, q in enumerate(
- [
- "What is the capital of France?",
- "Who wrote Hamlet?",
- "What is 2 + 2?",
- "Define entropy.",
- "What is the speed of light?",
- ]
- )
-]
-
-
-@observe(type="agent", name="span_metric_agent", metrics=[RandomScoreMetric()])
-def agent(question: str) -> str:
- time.sleep(AGENT_DELAY_SEC)
- return f"Answer to {question!r} is 42."
-
-
-if __name__ == "__main__":
- dataset = EvaluationDataset(goldens=[Golden(input=q) for q in QUESTIONS])
- for golden in dataset.evals_iterator(
- async_config=AsyncConfig(run_async=False),
- display_config=DisplayConfig(show_indicator=True, verbose_mode=False),
- ):
- agent(golden.input)
diff --git a/try_evals_iterator_sync_toplevel.py b/try_evals_iterator_sync_toplevel.py
deleted file mode 100644
index 4a22ddfe88..0000000000
--- a/try_evals_iterator_sync_toplevel.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""Manual smoke test: sync + top-level metric.
-
- python try_evals_iterator_sync_toplevel.py
-
-Metric is passed to evals_iterator(metrics=[...]) and evaluated at the trace level.
-"""
-
-from __future__ import annotations
-
-import asyncio
-import random
-import time
-from typing import List
-
-from deepeval.dataset import EvaluationDataset, Golden
-from deepeval.evaluate.configs import AsyncConfig, DisplayConfig
-from deepeval.metrics import BaseMetric
-from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from deepeval.tracing import observe
-
-
-AGENT_DELAY_SEC = 0.6
-METRIC_DELAY_SEC = 0.4
-
-
-class RandomScoreMetric(BaseMetric):
- threshold: float = 0.5
- async_mode: bool = True
- _required_params: List[LLMTestCaseParams] = [LLMTestCaseParams.INPUT]
-
- def __init__(self, threshold: float = 0.5):
- self.threshold = threshold
-
- def _finalize(self) -> float:
- self.score = random.random()
- self.success = self.score >= self.threshold
- self.reason = f"random score {self.score:.3f}"
- return self.score
-
- def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
- time.sleep(METRIC_DELAY_SEC)
- return self._finalize()
-
- async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
- await asyncio.sleep(METRIC_DELAY_SEC)
- return self._finalize()
-
- def is_successful(self) -> bool:
- return bool(self.success)
-
- @property
- def __name__(self):
- return "RandomScore"
-
-
-QUESTIONS = [
- f"[{i}] {q}"
- for i, q in enumerate(
- [
- "What is the capital of France?",
- "Who wrote Hamlet?",
- "What is 2 + 2?",
- "Define entropy.",
- "What is the speed of light?",
- ]
- )
-]
-
-
-@observe(type="agent", name="bare_agent")
-def agent(question: str) -> str:
- time.sleep(AGENT_DELAY_SEC)
- return f"Answer to {question!r} is 42."
-
-
-if __name__ == "__main__":
- dataset = EvaluationDataset(goldens=[Golden(input=q) for q in QUESTIONS])
- for golden in dataset.evals_iterator(
- metrics=[RandomScoreMetric()],
- async_config=AsyncConfig(run_async=False),
- display_config=DisplayConfig(show_indicator=True, verbose_mode=False),
- ):
- agent(golden.input)