Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/evidently/descriptors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from .generated_descriptors import BERTScore
from .generated_descriptors import BiasLLMEval
from .generated_descriptors import BinaryClassificationLLMEval
from .generated_descriptors import CoherenceLLMEval
from .generated_descriptors import CompletenessLLMEval
from .generated_descriptors import ContainsLink
from .generated_descriptors import ContextQualityLLMEval
Expand All @@ -46,6 +47,7 @@
from .generated_descriptors import EndsWith
from .generated_descriptors import ExactMatch
from .generated_descriptors import FaithfulnessLLMEval
from .generated_descriptors import FluencyLLMEval
from .generated_descriptors import HuggingFace
from .generated_descriptors import HuggingFaceToxicity
from .generated_descriptors import IsValidJSON
Expand Down Expand Up @@ -84,6 +86,7 @@
"BeginsWith",
"BiasLLMEval",
"BinaryClassificationLLMEval",
"CoherenceLLMEval",
"ColumnTest",
"CompletenessLLMEval",
"Contains",
Expand All @@ -99,6 +102,7 @@
"ExactMatch",
"ExcludesWords",
"FaithfulnessLLMEval",
"FluencyLLMEval",
"HuggingFace",
"HuggingFaceToxicity",
"IncludesWords",
Expand Down
88 changes: 88 additions & 0 deletions src/evidently/descriptors/generated_descriptors.py
Original file line number Diff line number Diff line change
Expand Up @@ -797,6 +797,50 @@ def BinaryClassificationLLMEval(
return FeatureDescriptor(feature=feature, alias=alias, tests=tests)


def CoherenceLLMEval(
    column_name: str,
    provider: str = "openai",
    model: str = "gpt-4o-mini",
    additional_columns: Optional[Dict[str, str]] = None,
    include_category: Optional[bool] = None,
    include_score: Optional[bool] = None,
    include_reasoning: Optional[bool] = None,
    uncertainty: Optional[Uncertainty] = None,
    alias: Optional[str] = None,
    tests: Optional[List[Union["DescriptorTest", "GenericTest"]]] = None,
):
    """LLM-based, reference-free check of text coherence and organization.

    Judges whether the text in `column_name` is logically consistent and
    well-structured; no ground-truth or reference column is required.

    Args:
    * `column_name`: Name of the text column to evaluate.
    * `provider`: LLM provider name (e.g., "openai", "anthropic").
    * `model`: Model name to use (e.g., "gpt-4o-mini").
    * `additional_columns`: Optional mapping of prompt variables to column names.
    * `include_category`: Whether to include category in output.
    * `include_score`: Whether to include score in output.
    * `include_reasoning`: Whether to include reasoning in output.
    * `uncertainty`: Optional uncertainty handling strategy.
    * `alias`: Optional alias for the descriptor.
    * `tests`: Optional list of tests to apply.
    """
    # Imported lazily to avoid a module-level dependency on the legacy package.
    from evidently.legacy.descriptors.llm_judges import CoherenceLLMEval as CoherenceLLMEvalV1

    legacy_judge = CoherenceLLMEvalV1(
        provider=provider,
        model=model,
        additional_columns=additional_columns,
        include_category=include_category,
        include_score=include_score,
        include_reasoning=include_reasoning,
        uncertainty=uncertainty,
        display_name=alias,
    )
    return FeatureDescriptor(
        feature=legacy_judge.feature(column_name),
        alias=alias,
        tests=tests,
    )


def CompletenessLLMEval(
column_name: str,
context: str,
Expand Down Expand Up @@ -1014,6 +1058,50 @@ def FaithfulnessLLMEval(
return FeatureDescriptor(feature=feature, alias=alias, tests=tests)


def FluencyLLMEval(
    column_name: str,
    provider: str = "openai",
    model: str = "gpt-4o-mini",
    additional_columns: Optional[Dict[str, str]] = None,
    include_category: Optional[bool] = None,
    include_score: Optional[bool] = None,
    include_reasoning: Optional[bool] = None,
    uncertainty: Optional[Uncertainty] = None,
    alias: Optional[str] = None,
    tests: Optional[List[Union["DescriptorTest", "GenericTest"]]] = None,
):
    """LLM-based, reference-free check of text fluency.

    Judges whether the text in `column_name` is grammatically correct,
    naturally written, and easy to read; no reference output is required.

    Args:
    * `column_name`: Name of the text column to evaluate.
    * `provider`: LLM provider name (e.g., "openai", "anthropic").
    * `model`: Model name to use (e.g., "gpt-4o-mini").
    * `additional_columns`: Optional mapping of prompt variables to column names.
    * `include_category`: Whether to include category in output.
    * `include_score`: Whether to include score in output.
    * `include_reasoning`: Whether to include reasoning in output.
    * `uncertainty`: Optional uncertainty handling strategy.
    * `alias`: Optional alias for the descriptor.
    * `tests`: Optional list of tests to apply.
    """
    # Imported lazily to avoid a module-level dependency on the legacy package.
    from evidently.legacy.descriptors.llm_judges import FluencyLLMEval as FluencyLLMEvalV1

    legacy_judge = FluencyLLMEvalV1(
        provider=provider,
        model=model,
        additional_columns=additional_columns,
        include_category=include_category,
        include_score=include_score,
        include_reasoning=include_reasoning,
        uncertainty=uncertainty,
        display_name=alias,
    )
    return FeatureDescriptor(
        feature=legacy_judge.feature(column_name),
        alias=alias,
        tests=tests,
    )


def LLMEval(
column_name: str,
provider: str,
Expand Down
62 changes: 62 additions & 0 deletions src/evidently/legacy/descriptors/llm_judges.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,68 @@ def get_input_columns(self, column_name: str) -> Dict[str, str]:
return input_columns


class FluencyLLMEval(BinaryClassificationLLMEval):
    """Reference-free LLM judge that labels text as FLUENT or NOT_FLUENT.

    Evaluates language quality only (grammar, phrasing, readability); the
    criteria explicitly disregard factual accuracy of the content.
    """

    class Config:
        # Registry alias for this descriptor — presumably used for
        # (de)serialization; follows the pattern of sibling eval classes.
        type_alias = "evidently:descriptor:FluencyLLMEval"

    # Display name for the evaluation.
    name: ClassVar = "Fluency"
    # Binary prompt template: criteria text below is sent to the LLM verbatim.
    template: ClassVar = BinaryClassificationPromptTemplate(
        criteria=textwrap.dedent(
            """
            A "FLUENT" response is written in clear, natural, grammatically correct language that reads easily and smoothly.
            It uses proper sentence structure, appropriate vocabulary, and flows naturally without awkward phrasing, excessive repetition,
            or confusing constructions.

            A "NOT_FLUENT" response contains significant grammatical errors, broken or incomplete sentences, highly unnatural phrasing,
            or is otherwise difficult to read and understand due to language quality issues — regardless of the accuracy of its content.
            """  # noqa: E501
        ).strip(),
        target_category="FLUENT",
        non_target_category="NOT_FLUENT",
        # Ambiguous cases resolve to UNKNOWN instead of forcing a label.
        uncertainty=Uncertainty.UNKNOWN,
        include_reasoning=True,
        pre_messages=[
            LLMMessage.system(
                "You are an impartial expert evaluator. You will be given a text. "
                "Your task is to evaluate the fluency of the text.",
            )
        ],
    )
    # Default provider/model; overridable at construction time.
    provider = "openai"
    model = "gpt-4o-mini"


class CoherenceLLMEval(BinaryClassificationLLMEval):
    """Reference-free LLM judge that labels text as COHERENT or INCOHERENT.

    Evaluates logical organization and consistency; the criteria explicitly
    allow grammatically correct text to still be judged incoherent.
    """

    class Config:
        # Registry alias for this descriptor — presumably used for
        # (de)serialization; follows the pattern of sibling eval classes.
        type_alias = "evidently:descriptor:CoherenceLLMEval"

    # Display name for the evaluation.
    name: ClassVar = "Coherence"
    # Binary prompt template: criteria text below is sent to the LLM verbatim.
    template: ClassVar = BinaryClassificationPromptTemplate(
        criteria=textwrap.dedent(
            """
            A "COHERENT" response presents ideas in a logically organized, consistent, and easy-to-follow manner.
            Its arguments or statements flow naturally from one to the next, and the overall structure makes sense.
            It does not contradict itself and stays on topic.

            An "INCOHERENT" response is one that is difficult to follow due to logical inconsistencies, abrupt topic changes,
            self-contradictions, or a disorganized structure — even if individual sentences are grammatically correct.
            """  # noqa: E501
        ).strip(),
        target_category="COHERENT",
        non_target_category="INCOHERENT",
        # Ambiguous cases resolve to UNKNOWN instead of forcing a label.
        uncertainty=Uncertainty.UNKNOWN,
        include_reasoning=True,
        pre_messages=[
            LLMMessage.system(
                "You are an impartial expert evaluator. You will be given a text. "
                "Your task is to evaluate the logical coherence and organization of the text.",
            )
        ],
    )
    # Default provider/model; overridable at construction time.
    provider = "openai"
    model = "gpt-4o-mini"


class MulticlassClassificationLLMEval(BaseLLMEval):
class Config:
type_alias = "evidently:descriptor:MulticlassClassificationLLMEval"
Expand Down
2 changes: 1 addition & 1 deletion src/evidently/llm/rag/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,4 +162,4 @@ def find_relevant_chunks(self, question: str, n_results: int = 3) -> List[Chunk]
n_results = min(n_results, len(self.chunks))
_, indexes = self.index.search(np.array([query_emb]), n_results)
relevant_chunks = [self.chunks[i] for i in indexes.reshape(-1)]
return relevant_chunks
return relevant_chunks
42 changes: 42 additions & 0 deletions test_output.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
============================= test session starts =============================
platform win32 -- Python 3.11.14, pytest-7.4.4, pluggy-1.6.0 -- C:\Users\mosta\anaconda3\envs\evidently\python.exe
cachedir: .pytest_cache
rootdir: C:\Users\mosta\Documents\GitHub\evidently
configfile: pyproject.toml
plugins: anyio-4.12.1, Faker-40.4.0, asyncio-0.23.7, mock-3.14.0
asyncio: mode=Mode.STRICT
collecting ... collected 10 items

tests/features/test_llm_judge.py::test_parse_response[template0-results0] PASSED [ 10%]
tests/features/test_llm_judge.py::test_parse_response[template1-results1] PASSED [ 20%]
tests/features/test_llm_judge.py::test_parse_response[template2-results2] PASSED [ 30%]
tests/features/test_llm_judge.py::test_parse_response[template3-results3] PASSED [ 40%]
tests/features/test_llm_judge.py::test_llm_judge PASSED [ 50%]
tests/features/test_llm_judge.py::test_multicol_llm_judge PASSED [ 60%]
tests/features/test_llm_judge.py::test_run_snapshot_with_llm_judge PASSED [ 70%]
tests/features/test_llm_judge.py::test_fluency_llm_eval PASSED [ 80%]
tests/features/test_llm_judge.py::test_coherence_llm_eval PASSED [ 90%]
tests/features/test_llm_judge.py::test_reference_free_evals_importable PASSED [100%]

============================== warnings summary ===============================
src\evidently\legacy\tests\utils.py:183
src\evidently\legacy\tests\utils.py:183
C:\Users\mosta\Documents\GitHub\evidently\src\evidently\legacy\tests\utils.py:183: DeprecationWarning: numpy.core is deprecated and has been renamed to numpy._core. The numpy._core namespace contains private NumPy internals and its use is discouraged, as NumPy internals can change without warning in any release. In practice, most real-world usage of numpy.core is to access functionality in the public NumPy API. If that is the case, use the public NumPy API. If not, you are using NumPy internals. If you would still like to access an internal attribute, use numpy._core.numeric.
np.core.numeric.ScalarType = np.core.numeric.ScalarType + (ApproxValue, ApproxValueNoDict) # type: ignore[attr-defined]

tests\conftest.py:12
tests\conftest.py:12
C:\Users\mosta\Documents\GitHub\evidently\tests\conftest.py:12: DeprecationWarning: numpy.core is deprecated and has been renamed to numpy._core. The numpy._core namespace contains private NumPy internals and its use is discouraged, as NumPy internals can change without warning in any release. In practice, most real-world usage of numpy.core is to access functionality in the public NumPy API. If that is the case, use the public NumPy API. If not, you are using NumPy internals. If you would still like to access an internal attribute, use numpy._core.numeric.
np.core.numeric.ScalarType = np.core.numeric.ScalarType + (ApproxValue,) # type: ignore[attr-defined]

tests/features/test_llm_judge.py::test_llm_judge
tests\features\test_llm_judge.py:103: PytestWarning: The test <Function test_llm_judge> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.
@pytest.mark.asyncio

tests/features/test_llm_judge.py::test_multicol_llm_judge
tests\features\test_llm_judge.py:119: PytestWarning: The test <Function test_multicol_llm_judge> is marked with '@pytest.mark.asyncio' but it is not an async function. Please remove the asyncio mark. If the test is not marked explicitly, check for global marks applied via 'pytestmark'.
@pytest.mark.asyncio

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
======================= 10 passed, 6 warnings in 0.09s ========================

53 changes: 53 additions & 0 deletions tests/features/test_llm_judge.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,56 @@ def test_run_snapshot_with_llm_judge():
}
]
}


def test_fluency_llm_eval():
    """FluencyLLMEval should run without a reference column and produce a 'category' column."""
    from evidently.legacy.descriptors.llm_judges import FluencyLLMEval as FluencyLLMEvalV1

    # Override the default template so the mock provider's echo behavior
    # maps cleanly onto the two categories.
    template = BinaryClassificationPromptTemplate(
        target_category="FLUENT",
        non_target_category="NOT_FLUENT",
    )
    judge_feature = FluencyLLMEvalV1(provider="mock", model="", template=template).feature("text")

    frame = pd.DataFrame({"text": ["FLUENT", "NOT_FLUENT"]})
    definition = DataDefinition(columns={}, reference_present=False)
    generated = judge_feature.generate_features(frame, definition, Options())

    # MockLLMWrapper echoes first character of the input text captured by the regex
    assert "category" in generated.columns
    assert len(generated) == 2


def test_coherence_llm_eval():
    """CoherenceLLMEval should run without a reference column and produce a 'category' column."""
    from evidently.legacy.descriptors.llm_judges import CoherenceLLMEval as CoherenceLLMEvalV1

    # Override the default template so the mock provider's echo behavior
    # maps cleanly onto the two categories.
    template = BinaryClassificationPromptTemplate(
        target_category="COHERENT",
        non_target_category="INCOHERENT",
    )
    judge_feature = CoherenceLLMEvalV1(provider="mock", model="", template=template).feature("text")

    frame = pd.DataFrame({"text": ["COHERENT", "INCOHERENT"]})
    definition = DataDefinition(columns={}, reference_present=False)
    generated = judge_feature.generate_features(frame, definition, Options())

    # MockLLMWrapper echoes first character of the input text captured by the regex
    assert "category" in generated.columns
    assert len(generated) == 2


def test_reference_free_evals_importable():
    """Both new descriptors should be importable from the public evidently.descriptors module."""
    import evidently.descriptors as public_api

    # Equivalent to importing the names directly: missing exports fail here.
    assert hasattr(public_api, "CoherenceLLMEval")
    assert hasattr(public_api, "FluencyLLMEval")