Add validator module #61

Merged: 27 commits, merged on Mar 27, 2025
Changes shown below are from 22 commits.

Commits (27)
1bc7370 add cleanlab-tlm as a dependency in pyproject.toml (elisno, Mar 20, 2025)
2529ae6 Add response validation functionality using TrustworthyRAG (elisno, Mar 20, 2025)
722d287 alt_answer -> expert_answer (elisno, Mar 21, 2025)
6f64a12 address comments (elisno, Mar 21, 2025)
a2c0ea5 have is_bad_response function take the BadResponseThreshold object in… (elisno, Mar 21, 2025)
b8a1e97 Enhance Validator with flexible thresholds and improved error handling (elisno, Mar 22, 2025)
db5fe24 move BadResponseThresholds (elisno, Mar 22, 2025)
29e231a add prompt and form_prompt (elisno, Mar 24, 2025)
a741e15 fix formatting and type hints (elisno, Mar 24, 2025)
380b1ef update docstrings (elisno, Mar 24, 2025)
4f40e3d Add unit tests for Validator and BadResponseThresholds (elisno, Mar 25, 2025)
02b16e0 include type hints and fix formatting (elisno, Mar 25, 2025)
873f552 set "expert_answer" as first key (elisno, Mar 25, 2025)
b471371 clean up imports, type hints and docs (elisno, Mar 25, 2025)
be4745c Update pyproject.toml (elisno, Mar 26, 2025)
54e866b Update response_validation.py docstring to indicate module deprecatio… (elisno, Mar 26, 2025)
c632625 make remediate method private (elisno, Mar 26, 2025)
d422bcf update docstrings (elisno, Mar 26, 2025)
2ae9b0f Update types/response_validation.py docstring to indicate module depr… (elisno, Mar 27, 2025)
7322026 formatting (elisno, Mar 27, 2025)
8089c17 update changelog (elisno, Mar 27, 2025)
f8aeb52 clarify detect v validate further (jwmueller, Mar 27, 2025)
0f602e3 add prompt and format_prompt to docstrings (elisno, Mar 27, 2025)
76ca4c3 formatting (elisno, Mar 27, 2025)
3e4d8bb deprecated (elisno, Mar 27, 2025)
ac8762f Update CHANGELOG for version 1.0.5 (elisno, Mar 27, 2025)
84799a5 Swap order of classes (elisno, Mar 27, 2025)
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

- Add `Validator` API
- Deprecate `response_validation.py` module.

## [1.0.4] - 2025-03-14

- Pass analytics metadata in headers for all Codex API requests.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -25,6 +25,7 @@ classifiers = [
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"cleanlab-tlm~=1.0.12",
"codex-sdk==0.1.0a12",
"pydantic>=2.0.0, <3",
]
3 changes: 2 additions & 1 deletion src/cleanlab_codex/__init__.py
@@ -2,5 +2,6 @@
from cleanlab_codex.client import Client
from cleanlab_codex.codex_tool import CodexTool
from cleanlab_codex.project import Project
from cleanlab_codex.validator import Validator

__all__ = ["Client", "CodexTool", "Project"]
__all__ = ["Client", "CodexTool", "Project", "Validator"]
53 changes: 53 additions & 0 deletions src/cleanlab_codex/internal/validator.py
@@ -0,0 +1,53 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, Optional, Sequence, cast

from cleanlab_tlm.utils.rag import Eval, TrustworthyRAGScore, get_default_evals

from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore

if TYPE_CHECKING:
    from cleanlab_codex.validator import BadResponseThresholds


"""Evaluation metrics (excluding trustworthiness) that are used to determine if a response is bad."""
DEFAULT_EVAL_METRICS = ["response_helpfulness"]


def get_default_evaluations() -> list[Eval]:
    """Get the default evaluations for the TrustworthyRAG.

    Note:
        This excludes trustworthiness, which is automatically computed by TrustworthyRAG.
    """
    return [evaluation for evaluation in get_default_evals() if evaluation.name in DEFAULT_EVAL_METRICS]


def get_default_trustworthyrag_config() -> dict[str, Any]:
    """Get the default configuration for the TrustworthyRAG."""
    return {
        "options": {
            "log": ["explanation"],
        },
    }


def update_scores_based_on_thresholds(
    scores: TrustworthyRAGScore | Sequence[TrustworthyRAGScore], thresholds: BadResponseThresholds
) -> ThresholdedTrustworthyRAGScore:
    """Adds an `is_bad` flag to the score dictionaries based on the thresholds."""

    # Helper function to check if a score is bad
    def is_bad(score: Optional[float], threshold: float) -> bool:
        return score is not None and score < threshold

    if isinstance(scores, Sequence):
        raise NotImplementedError("Batching is not supported yet.")

    thresholded_scores = {}
    for eval_name, score_dict in scores.items():
        thresholded_scores[eval_name] = {
            **score_dict,
            "is_bad": is_bad(score_dict["score"], thresholds.get_threshold(eval_name)),
        }
    return cast(ThresholdedTrustworthyRAGScore, thresholded_scores)
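To make the flagging behavior concrete, here is a minimal sketch of how `update_scores_based_on_thresholds` could be applied to a single (non-batched) score dictionary; the score values and thresholds below are illustrative assumptions, not outputs from a real TrustworthyRAG call:

```python
from cleanlab_codex.internal.validator import update_scores_based_on_thresholds
from cleanlab_codex.validator import BadResponseThresholds

# Illustrative scores in the shape returned by TrustworthyRAG.score() (values are made up).
scores = {
    "trustworthiness": {"score": 0.92, "log": {"explanation": "..."}},
    "response_helpfulness": {"score": 0.35},
}
thresholds = BadResponseThresholds(trustworthiness=0.5, response_helpfulness=0.5)

flagged = update_scores_based_on_thresholds(scores=scores, thresholds=thresholds)
# flagged["trustworthiness"]["is_bad"] is False (0.92 >= 0.5)
# flagged["response_helpfulness"]["is_bad"] is True (0.35 < 0.5)
```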
2 changes: 2 additions & 0 deletions src/cleanlab_codex/response_validation.py
@@ -1,4 +1,6 @@
"""
This module is now superseded by the [Validator API](/codex/api/validator/).

Validation functions for evaluating LLM responses and determining if they should be replaced with Codex-generated alternatives.
"""

5 changes: 4 additions & 1 deletion src/cleanlab_codex/types/response_validation.py
@@ -1,4 +1,7 @@
"""Types for response validation."""
"""
This module is now superseded by the [Validator API](/codex/api/validator/).

Types for response validation."""

from abc import ABC, abstractmethod
from collections import OrderedDict
35 changes: 35 additions & 0 deletions src/cleanlab_codex/types/validator.py
@@ -0,0 +1,35 @@
from cleanlab_tlm.utils.rag import EvalMetric


class ThresholdedEvalMetric(EvalMetric):
    is_bad: bool


ThresholdedEvalMetric.__doc__ = f"""
{EvalMetric.__doc__}

is_bad: bool
    Whether the score is below a certain threshold.
"""


class ThresholdedTrustworthyRAGScore(dict[str, ThresholdedEvalMetric]):
    """Object returned by `Validator.detect` containing evaluation scores from [TrustworthyRAGScore](/tlm/api/python/utils.rag/#class-trustworthyragscore)
    along with a boolean flag, `is_bad`, indicating whether the score is below the threshold.

    Example:
        ```python
        {
            "trustworthiness": {
                "score": 0.92,
                "log": {"explanation": "Did not find a reason to doubt trustworthiness."},
                "is_bad": False
            },
            "response_helpfulness": {
                "score": 0.35,
                "is_bad": True
            },
            ...
        }
        ```
    """
237 changes: 237 additions & 0 deletions src/cleanlab_codex/validator.py
@@ -0,0 +1,237 @@
"""
Detect and remediate bad responses in RAG applications, by integrating Codex as-a-Backup.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Callable, Optional, cast

from cleanlab_tlm import TrustworthyRAG
from pydantic import BaseModel, Field, field_validator

from cleanlab_codex.internal.validator import (
get_default_evaluations,
get_default_trustworthyrag_config,
)
from cleanlab_codex.internal.validator import update_scores_based_on_thresholds as _update_scores_based_on_thresholds
from cleanlab_codex.project import Project

if TYPE_CHECKING:
from cleanlab_codex.types.validator import ThresholdedTrustworthyRAGScore


class BadResponseThresholds(BaseModel):
"""Config for determining if a response is bad.
Each key is an evaluation metric and the value is a threshold such that a response is considered bad whenever the corresponding evaluation score falls below the threshold.

Default Thresholds:
- trustworthiness: 0.5
- response_helpfulness: 0.5
- Any custom eval: 0.5 (if not explicitly specified in bad_response_thresholds)
"""

trustworthiness: float = Field(
description="Threshold for trustworthiness.",
default=0.5,
ge=0.0,
le=1.0,
)
response_helpfulness: float = Field(
description="Threshold for response helpfulness.",
default=0.5,
ge=0.0,
le=1.0,
)

@property
def default_threshold(self) -> float:
"""The default threshold to use when an evaluation metric's threshold is not specified. This threshold is set to 0.5."""
return 0.5

def get_threshold(self, eval_name: str) -> float:
"""Get threshold for an eval, if it exists.

For fields defined in the model, returns their value (which may be the field's default).
For custom evals not defined in the model, returns the default threshold value (see `default_threshold`).
"""

# For fields defined in the model, use their value (which may be the field's default)
if eval_name in self.model_fields:
return cast(float, getattr(self, eval_name))

# For custom evals, use the default threshold
return getattr(self, eval_name, self.default_threshold)

@field_validator("*")
@classmethod
def validate_threshold(cls, v: Any) -> float:
"""Validate that all fields (including dynamic ones) are floats between 0 and 1."""
if not isinstance(v, (int, float)):
error_msg = f"Threshold must be a number, got {type(v)}"
raise TypeError(error_msg)
if not 0 <= float(v) <= 1:
error_msg = f"Threshold must be between 0 and 1, got {v}"
raise ValueError(error_msg)
return float(v)

model_config = {
"extra": "allow" # Allow additional fields for custom eval thresholds
}


class Validator:
def __init__(
self,
codex_access_key: str,
tlm_api_key: Optional[str] = None,
trustworthy_rag_config: Optional[dict[str, Any]] = None,
bad_response_thresholds: Optional[dict[str, float]] = None,
):
"""Real-time detection and remediation of bad responses in RAG applications, powered by Cleanlab's TrustworthyRAG and Codex.

This object combines Cleanlab's TrustworthyRAG evaluation scores with configurable thresholds to detect potentially bad responses
in your RAG application. When a bad response is detected, this Validator automatically attempts to remediate by retrieving an expert-provided
answer from the Codex Project you've integrated with your RAG app. If no expert answer is available,
the corresponding query is logged in the Codex Project for SMEs to answer.

For production, use the `validate()` method which provides a complete validation workflow including both detection and remediation.
A `detect()` method is separately available for you to test/tune detection configurations like score thresholds and TrustworthyRAG settings
without triggering any Codex lookups that otherwise could affect the state of the corresponding Codex Project.

Args:
codex_access_key (str): The [access key](/codex/web_tutorials/create_project/#access-keys) for a Codex project. Used to retrieve expert-provided answers
when bad responses are detected, or otherwise log the corresponding queries for SMEs to answer.

tlm_api_key (str, optional): API key for accessing [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag). If not provided, this must be specified
in `trustworthy_rag_config`.

trustworthy_rag_config (dict[str, Any], optional): Optional initialization arguments for [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag),
which is used to detect response issues. If not provided, a default configuration will be used.
By default, this Validator uses the same default configurations as [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag), except:
- Explanations are returned in logs for better debugging
- Only the `response_helpfulness` eval is run

bad_response_thresholds (dict[str, float], optional): Detection score thresholds used to flag whether
a response is bad or not. Each key corresponds to an Eval from [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag),
and the value indicates a threshold (between 0 and 1) below which Eval scores are treated as detected issues. A response
is flagged as bad if any issues are detected. If not provided, default thresholds will be used. See
[`BadResponseThresholds`](/codex/api/python/validator/#class-badresponsethresholds) for more details.

Raises:
ValueError: If both tlm_api_key and api_key in trustworthy_rag_config are provided.
ValueError: If bad_response_thresholds contains thresholds for non-existent evaluation metrics.
TypeError: If any threshold value is not a number.
ValueError: If any threshold value is not between 0 and 1.
"""
trustworthy_rag_config = trustworthy_rag_config or get_default_trustworthyrag_config()
if tlm_api_key is not None and "api_key" in trustworthy_rag_config:
error_msg = "Cannot specify both tlm_api_key and api_key in trustworthy_rag_config"
raise ValueError(error_msg)
if tlm_api_key is not None:
trustworthy_rag_config["api_key"] = tlm_api_key

self._project: Project = Project.from_access_key(access_key=codex_access_key)

trustworthy_rag_config.setdefault("evals", get_default_evaluations())
self._tlm_rag = TrustworthyRAG(**trustworthy_rag_config)

# Validate that all the necessary thresholds are present in the TrustworthyRAG.
_evals = [e.name for e in self._tlm_rag.get_evals()] + ["trustworthiness"]

self._bad_response_thresholds = BadResponseThresholds.model_validate(bad_response_thresholds or {})

_threshold_keys = self._bad_response_thresholds.model_dump().keys()

# Check if there are any thresholds without corresponding evals (this is an error)
_extra_thresholds = set(_threshold_keys) - set(_evals)
if _extra_thresholds:
error_msg = f"Found thresholds for non-existent evaluation metrics: {_extra_thresholds}"
raise ValueError(error_msg)

def validate(
self,
query: str,
context: str,
response: str,
prompt: Optional[str] = None,
form_prompt: Optional[Callable[[str, str], str]] = None,
) -> dict[str, Any]:
"""Evaluate whether the AI-generated response is bad, and if so, request an alternate expert answer.
If no expert answer is available, this query is still logged for SMEs to answer.

Args:
query (str): The user query that was used to generate the response.
context (str): The context that was retrieved from the RAG Knowledge Base and used to generate the response.
response (str): A reponse from your LLM/RAG system.

Returns:
dict[str, Any]: A dictionary containing:
- 'expert_answer': Alternate SME-provided answer from Codex if the response was flagged as bad and an answer was found in the Codex Project, or None otherwise.
- 'is_bad_response': True if the response is flagged as potentially bad, False otherwise. When True, a Codex lookup is performed, which logs this query into the Codex Project for SMEs to answer.
- Additional keys from a [`ThresholdedTrustworthyRAGScore`](/cleanlab_codex/types/validator/#class-thresholdedtrustworthyragscore) dictionary: each corresponds to a [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) evaluation metric, and points to the score for this evaluation as well as a boolean `is_bad` flagging whether the score falls below the corresponding threshold.
"""
scores, is_bad_response = self.detect(query, context, response, prompt, form_prompt)
expert_answer = None
if is_bad_response:
expert_answer = self._remediate(query)

return {
"expert_answer": expert_answer,
"is_bad_response": is_bad_response,
**scores,
}

def detect(

Review comments on detect():

Member: If a user just wants to detect bad responses, should they use TrustworthyRAG or Validator.detect? How is a user supposed to understand how these two relate to each other?

Member: The idea (which the docstring should be updated to reflect) was that Validator is just a version of TrustworthyRAG with different default evals & predetermined thresholds.

The practical impact of those thresholds is that they determine when we look things up in Codex (what is logged in the Project for SMEs to answer, what gets answered by Codex instead of the RAG app). But that impact is primarily realized in Validator.validate(), not in Validator.detect().

So we could make detect() a private method? It's essentially just another version of the .validate() method that is not hooked up to any Codex project (e.g. for testing detection configurations out without impacting the Codex project via logging).

Member: That solution sounds fine to me: making it private, and updating the instructions to indicate that detect -> TrustworthyRAG, detect + remediate -> Validator.

jwmueller (Member), Mar 26, 2025: sgtm. @elisno can you also add an optional flag to Validator.validate(), which allows users to run the detection for testing purposes but without interacting with Codex in any way? (No querying Codex at all, to ensure testing runs aren't polluting the Codex Project.)

This flag could be something like: testing_mode = False by default (try to think of a better name).

Member (author): On second thought, we should keep the detect() method public for threshold-tuning and testing purposes (without affecting Codex). I've updated the docstring to reflect this.

No need for another optional flag in Validator.validate().

Member: Also include a screenshot of the tutorial where you show that it's clearly explained when to use validate() vs. detect().

Member: I pushed more docstring changes to clearly distinguish these, so review those.

Member (author): > Also include a screenshot of the tutorial where you show that it's clearly explained when to use validate() vs. detect().

https://github.com/cleanlab/cleanlab-studio-docs/pull/868#issuecomment-2756947611

jwmueller (Member), Mar 27, 2025: That screenshot does not explain the main reason to use detect(), which is to test/tune detection configurations like the evaluation score thresholds and TrustworthyRAG settings.

        self,
        query: str,
        context: str,
        response: str,
        prompt: Optional[str] = None,
        form_prompt: Optional[Callable[[str, str], str]] = None,
    ) -> tuple[ThresholdedTrustworthyRAGScore, bool]:
        """Score response quality using TrustworthyRAG and flag bad responses based on configured thresholds.

        Note:
            Use this method instead of `validate()` to test/tune detection configurations like score thresholds and TrustworthyRAG settings.
            This `detect()` method will not affect your Codex Project, whereas `validate()` will log queries whose response was detected as bad into the Codex Project and is thus only suitable for production, not testing.
            Both this method and `validate()` rely on the same detection logic, so you can use this method to first optimize detections and then switch to using `validate()`.

        Args:
            query (str): The user query that was used to generate the response.
            context (str): The context that was retrieved from the RAG Knowledge Base and used to generate the response.
            response (str): A response from your LLM/RAG system.

        Returns:
            tuple[ThresholdedTrustworthyRAGScore, bool]: A tuple containing:
                - ThresholdedTrustworthyRAGScore: Quality scores for different evaluation metrics like trustworthiness
                  and response helpfulness. Each metric has a score between 0 and 1. It also has a boolean flag, `is_bad`, indicating whether the score is below the given threshold.
                - bool: True if the response is determined to be bad based on the evaluation scores
                  and configured thresholds, False otherwise.
        """
        scores = self._tlm_rag.score(
            response=response,
            query=query,
            context=context,
            prompt=prompt,
            form_prompt=form_prompt,
        )

        thresholded_scores = _update_scores_based_on_thresholds(
            scores=scores,
            thresholds=self._bad_response_thresholds,
        )

        is_bad_response = any(score_dict["is_bad"] for score_dict in thresholded_scores.values())
        return thresholded_scores, is_bad_response

    def _remediate(self, query: str) -> str | None:
        """Request an SME-provided answer for this query, if one is available in Codex.

        Args:
            query (str): The user's original query to get an SME-provided answer for.

        Returns:
            str | None: The SME-provided answer from Codex, or None if no answer could be found in the Codex Project.
        """
        codex_answer, _ = self._project.query(question=query)
        return codex_answer
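For reviewers, a minimal usage sketch of the API added in this file, based on the docstrings above; the access keys, query, context, response, and threshold values are placeholders, not values from this PR:

```python
from cleanlab_codex import Validator

validator = Validator(
    codex_access_key="YOUR-CODEX-ACCESS-KEY",  # placeholder
    tlm_api_key="YOUR-TLM-API-KEY",            # placeholder
    bad_response_thresholds={"trustworthiness": 0.6},  # optional override of the 0.5 default
)

query = "When did the last earnings call happen?"
context = "...retrieved passages from your knowledge base..."
response = "The last earnings call was held on March 5."

# Tune detection offline: no Codex lookups, nothing is logged to the Codex Project.
scores, is_bad = validator.detect(query=query, context=context, response=response)

# In production: detect and, if the response is flagged, remediate via an expert answer from Codex.
result = validator.validate(query=query, context=context, response=response)
if result["is_bad_response"] and result["expert_answer"] is not None:
    response = result["expert_answer"]
```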
29 changes: 29 additions & 0 deletions tests/internal/test_validator.py
@@ -0,0 +1,29 @@
from typing import cast

from cleanlab_tlm.utils.rag import TrustworthyRAGScore

from cleanlab_codex.internal.validator import get_default_evaluations
from cleanlab_codex.validator import BadResponseThresholds


def make_scores(trustworthiness: float, response_helpfulness: float) -> TrustworthyRAGScore:
    scores = {
        "trustworthiness": {
            "score": trustworthiness,
        },
        "response_helpfulness": {
            "score": response_helpfulness,
        },
    }
    return cast(TrustworthyRAGScore, scores)


def make_is_bad_response_config(trustworthiness: float, response_helpfulness: float) -> BadResponseThresholds:
    return BadResponseThresholds(
        trustworthiness=trustworthiness,
        response_helpfulness=response_helpfulness,
    )


def test_get_default_evaluations() -> None:
    assert {evaluation.name for evaluation in get_default_evaluations()} == {"response_helpfulness"}
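A further test along these lines could exercise the threshold fallback documented in `BadResponseThresholds.get_threshold`; this is a sketch, not part of this PR, and the custom eval name is made up:

```python
def test_get_threshold_defaults_for_custom_eval() -> None:
    thresholds = BadResponseThresholds(trustworthiness=0.7)
    assert thresholds.get_threshold("trustworthiness") == 0.7
    assert thresholds.get_threshold("response_helpfulness") == 0.5  # field default
    assert thresholds.get_threshold("some_custom_eval") == 0.5  # default_threshold fallback
```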