Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/evidently/descriptors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
from .generated_descriptors import OOVWordsPercentage
from .generated_descriptors import OpenAI
from .generated_descriptors import PIILLMEval
from .generated_descriptors import QualityLLMEval
from .generated_descriptors import SemanticSimilarity
from .generated_descriptors import SentenceCount
from .generated_descriptors import Sentiment
Expand Down Expand Up @@ -88,6 +89,7 @@
"CompletenessLLMEval",
"Contains",
"ContainsLink",
"QualityLLMEval",
"ContextQualityLLMEval",
"ContextRelevance",
"CorrectnessLLMEval",
Expand Down
41 changes: 41 additions & 0 deletions src/evidently/descriptors/generated_descriptors.py
Original file line number Diff line number Diff line change
Expand Up @@ -841,6 +841,47 @@ def CompletenessLLMEval(
return FeatureDescriptor(feature=feature, alias=alias, tests=tests)


def QualityLLMEval(
    column_name: str,
    provider: str = "openai",
    model: str = "gpt-4o-mini",
    additional_columns: Optional[Dict[str, str]] = None,
    include_category: Optional[bool] = None,
    include_score: Optional[bool] = None,
    include_reasoning: Optional[bool] = None,
    uncertainty: Optional[Uncertainty] = None,
    alias: Optional[str] = None,
    tests: Optional[List[Union["DescriptorTest", "GenericTest"]]] = None,
):
    """Evaluate reference-free text quality with an LLM judge.

    Args:
        * `column_name`: Name of the text column to evaluate.
        * `provider`: LLM provider name (e.g., "openai", "anthropic").
        * `model`: Model name to use (e.g., "gpt-4o-mini").
        * `additional_columns`: Optional mapping of prompt variables to column names.
        * `include_category`: Whether to include category in output.
        * `include_score`: Whether to include score in output.
        * `include_reasoning`: Whether to include reasoning in output.
        * `uncertainty`: Optional uncertainty handling strategy.
        * `alias`: Optional alias for the descriptor.
        * `tests`: Optional list of tests to apply.
    """
    # Lazy import keeps the legacy module out of this package's import path
    # until the descriptor is actually constructed.
    from evidently.legacy.descriptors.llm_judges import QualityLLMEval as QualityLLMEvalV1

    judge = QualityLLMEvalV1(
        provider=provider,
        model=model,
        additional_columns=additional_columns,
        include_category=include_category,
        include_score=include_score,
        include_reasoning=include_reasoning,
        uncertainty=uncertainty,
        display_name=alias,
    )
    return FeatureDescriptor(feature=judge.feature(column_name), alias=alias, tests=tests)


def ContextQualityLLMEval(
column_name: str,
question: str,
Expand Down
2 changes: 2 additions & 0 deletions src/evidently/legacy/descriptors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from .llm_judges import LLMEval
from .llm_judges import NegativityLLMEval
from .llm_judges import PIILLMEval
from .llm_judges import QualityLLMEval
from .llm_judges import ToxicityLLMEval
from .non_letter_character_percentage_descriptor import NonLetterCharacterPercentage
from .oov_words_percentage_descriptor import OOV
Expand Down Expand Up @@ -52,6 +53,7 @@
"NegativityLLMEval",
"PIILLMEval",
"DeclineLLMEval",
"QualityLLMEval",
"ContextQualityLLMEval",
"BiasLLMEval",
"ToxicityLLMEval",
Expand Down
5 changes: 5 additions & 0 deletions src/evidently/legacy/descriptors/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@
"evidently.legacy.descriptors.llm_judges.BinaryClassificationLLMEval",
"evidently:descriptor:BinaryClassificationLLMEval",
)
# Register the serialization alias for QualityLLMEval so persisted configs
# can resolve the "evidently:descriptor:" id back to the implementing class.
register_type_alias(
    FeatureDescriptor,
    "evidently.legacy.descriptors.llm_judges.QualityLLMEval",
    "evidently:descriptor:QualityLLMEval",
)
register_type_alias(
FeatureDescriptor,
"evidently.legacy.descriptors.llm_judges.ContextQualityLLMEval",
Expand Down
22 changes: 22 additions & 0 deletions src/evidently/legacy/descriptors/llm_judges.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,28 @@ class Config:
model = "gpt-4o-mini"


class QualityLLMEval(BinaryClassificationLLMEval):
    """Reference-free binary LLM judge: classifies text as high quality (HQ)
    or low quality (LQ), with reasoning included in the output."""

    class Config:
        type_alias = "evidently:descriptor:QualityLLMEval"

    name: ClassVar = "Quality"
    template: ClassVar = BinaryClassificationPromptTemplate(
        # Fix: each criteria line was wrapped in literal '"' characters
        # (copy-paste artifact); triple-quoting already delimits the string,
        # so those quotes leaked verbatim into the rendered prompt.
        criteria=textwrap.dedent(
            """
            A LQ indicates that the post is of very low quality, semantically meaningless, and contains broken-off or repetitive text.
            A HQ indicates the post is of very high quality, addressing a complex topic with advanced vocabulary, phrasing, and style.
            """
        ).strip(),
        target_category="HQ",
        non_target_category="LQ",
        uncertainty=Uncertainty.UNKNOWN,
        include_reasoning=True,
        pre_messages=[LLMMessage.system("You are a judge which evaluates text.")],
    )
    provider = "openai"
    model = "gpt-4o-mini"


class ContextQualityLLMEval(BinaryClassificationLLMEval):
class Config:
type_alias = "evidently:descriptor:ContextQualityLLMEval"
Expand Down