
Commit 4e92233

Authored Apr 14, 2025
add async version of validate method (#74)
1 parent cf95c63 · commit 4e92233

File tree

4 files changed: +91, -4 lines changed


CHANGELOG.md

Lines changed: 6 additions & 1 deletion
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.0.10] - 2025-04-15
+
+- Add async support to `Validator` API.
+
 ## [1.0.9] - 2025-04-10
 
 - Refactor threshold validation in the `Validator` class to only check user-provided metrics.
@@ -51,7 +55,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Initial release of the `cleanlab-codex` client library.
 
-[Unreleased]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.9...HEAD
+[Unreleased]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.10...HEAD
+[1.0.10]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.9...v1.0.10
 [1.0.9]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.8...v1.0.9
 [1.0.8]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.7...v1.0.8
 [1.0.7]: https://github.com/cleanlab/cleanlab-codex/compare/v1.0.6...v1.0.7

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
 dependencies = [
-  "cleanlab-tlm~=1.0.12",
+  "cleanlab-tlm~=1.0.18",
   "codex-sdk==0.1.0-alpha.14",
   "pydantic>=2.0.0, <3",
 ]
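
For reference (not part of this commit), `~=1.0.18` is a PEP 440 compatible-release specifier: it accepts 1.0.18 and any later 1.0.x release, but not 1.1.0. A small sketch using the `packaging` library illustrates the accepted range:

```python
# Illustrative sketch only: shows which versions satisfy the "~=1.0.18"
# constraint from the dependency bump above (PEP 440 compatible release).
from packaging.specifiers import SpecifierSet

compatible = SpecifierSet("~=1.0.18")

for version in ["1.0.12", "1.0.18", "1.0.25", "1.1.0"]:
    print(version, version in compatible)
    # Prints: 1.0.12 False, 1.0.18 True, 1.0.25 True, 1.1.0 False
```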

src/cleanlab_codex/__about__.py

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: MIT
-__version__ = "1.0.9"
+__version__ = "1.0.10"

src/cleanlab_codex/validator.py

Lines changed: 83 additions & 1 deletion
@@ -13,7 +13,9 @@
     get_default_evaluations,
     get_default_trustworthyrag_config,
 )
-from cleanlab_codex.internal.validator import update_scores_based_on_thresholds as _update_scores_based_on_thresholds
+from cleanlab_codex.internal.validator import (
+    update_scores_based_on_thresholds as _update_scores_based_on_thresholds,
+)
 from cleanlab_codex.project import Project
 
 if TYPE_CHECKING:
@@ -131,6 +133,41 @@ def validate(
             **scores,
         }
 
+    async def validate_async(
+        self,
+        query: str,
+        context: str,
+        response: str,
+        prompt: Optional[str] = None,
+        form_prompt: Optional[Callable[[str, str], str]] = None,
+    ) -> dict[str, Any]:
+        """Evaluate whether the AI-generated response is bad, and if so, request an alternate expert answer.
+        If no expert answer is available, this query is still logged for SMEs to answer.
+
+        Args:
+            query (str): The user query that was used to generate the response.
+            context (str): The context that was retrieved from the RAG Knowledge Base and used to generate the response.
+            response (str): A response from your LLM/RAG system.
+            prompt (str, optional): Optional prompt representing the actual inputs (combining query, context, and system instructions into one string) to the LLM that generated the response.
+            form_prompt (Callable[[str, str], str], optional): Optional function to format the prompt based on query and context. Cannot be provided together with prompt; provide one or the other. This function should take query and context as parameters and return a formatted prompt string. If not provided, a default prompt formatter will be used. To include a system prompt or any other special instructions for your LLM, incorporate them directly in your custom form_prompt() function definition.
+
+        Returns:
+            dict[str, Any]: A dictionary containing:
+                - 'expert_answer': Alternate SME-provided answer from Codex if the response was flagged as bad and an answer was found in the Codex Project, or None otherwise.
+                - 'is_bad_response': True if the response is flagged as potentially bad, False otherwise. When True, a Codex lookup is performed, which logs this query into the Codex Project for SMEs to answer.
+                - Additional keys from a [`ThresholdedTrustworthyRAGScore`](/codex/api/python/types.validator/#class-thresholdedtrustworthyragscore) dictionary: each corresponds to a [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) evaluation metric, and points to the score for this evaluation as well as a boolean `is_bad` flagging whether the score falls below the corresponding threshold.
+        """
+        scores, is_bad_response = await self.detect_async(query, context, response, prompt, form_prompt)
+        expert_answer = None
+        if is_bad_response:
+            expert_answer = self._remediate(query)
+
+        return {
+            "expert_answer": expert_answer,
+            "is_bad_response": is_bad_response,
+            **scores,
+        }
+
     def detect(
         self,
         query: str,
@@ -176,6 +213,51 @@ def detect(
         is_bad_response = any(score_dict["is_bad"] for score_dict in thresholded_scores.values())
         return thresholded_scores, is_bad_response
 
+    async def detect_async(
+        self,
+        query: str,
+        context: str,
+        response: str,
+        prompt: Optional[str] = None,
+        form_prompt: Optional[Callable[[str, str], str]] = None,
+    ) -> tuple[ThresholdedTrustworthyRAGScore, bool]:
+        """Score response quality using TrustworthyRAG and flag bad responses based on configured thresholds.
+
+        Note:
+            Use this method instead of `validate()` to test/tune detection configurations like score thresholds and TrustworthyRAG settings.
+            This method will not affect your Codex Project, whereas `validate()` will log queries whose response was detected as bad into the Codex Project and is thus only suitable for production, not testing.
+            Both this method and `validate()` rely on the same detection logic, so you can use this method to first optimize detections and then switch to using `validate()`.
+
+        Args:
+            query (str): The user query that was used to generate the response.
+            context (str): The context that was retrieved from the RAG Knowledge Base and used to generate the response.
+            response (str): A response from your LLM/RAG system.
+            prompt (str, optional): Optional prompt representing the actual inputs (combining query, context, and system instructions into one string) to the LLM that generated the response.
+            form_prompt (Callable[[str, str], str], optional): Optional function to format the prompt based on query and context. Cannot be provided together with prompt; provide one or the other. This function should take query and context as parameters and return a formatted prompt string. If not provided, a default prompt formatter will be used. To include a system prompt or any other special instructions for your LLM, incorporate them directly in your custom form_prompt() function definition.
+
+        Returns:
+            tuple[ThresholdedTrustworthyRAGScore, bool]: A tuple containing:
+                - ThresholdedTrustworthyRAGScore: Quality scores for different evaluation metrics like trustworthiness
+                  and response helpfulness. Each metric has a score between 0 and 1, along with a boolean flag, `is_bad`, indicating whether the score falls below the corresponding threshold.
+                - bool: True if the response is determined to be bad based on the evaluation scores
+                  and configured thresholds, False otherwise.
+        """
+        scores = await self._tlm_rag.score_async(
+            response=response,
+            query=query,
+            context=context,
+            prompt=prompt,
+            form_prompt=form_prompt,
+        )
+
+        thresholded_scores = _update_scores_based_on_thresholds(
+            scores=scores,
+            thresholds=self._bad_response_thresholds,
+        )
+
+        is_bad_response = any(score_dict["is_bad"] for score_dict in thresholded_scores.values())
+        return thresholded_scores, is_bad_response
+
     def _remediate(self, query: str) -> str | None:
         """Request a SME-provided answer for this query, if one is available in Codex.
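
To show how the two new methods might be called, here is a minimal usage sketch (not from the repository; the `codex_access_key` constructor argument and the example strings are assumptions):

```python
# Minimal async usage sketch. The Validator constructor argument shown here
# (codex_access_key) and the example inputs are assumptions, not from this commit.
import asyncio

from cleanlab_codex import Validator


async def main() -> None:
    validator = Validator(codex_access_key="<your-access-key>")

    # validate_async: scores the response and, if it is flagged as bad,
    # queries the Codex Project for an SME-provided expert answer.
    result = await validator.validate_async(
        query="What is the return window?",
        context="Items can be returned within 30 days of delivery.",
        response="You can return items within 30 days.",
    )
    print(result["is_bad_response"], result["expert_answer"])

    # detect_async: scoring only; it does not log anything to the Codex Project,
    # so it is suited to tuning thresholds during development.
    scores, is_bad = await validator.detect_async(
        query="What is the return window?",
        context="Items can be returned within 30 days of delivery.",
        response="You can return items within 30 days.",
    )
    print(is_bad, scores)


asyncio.run(main())
```

Both methods mirror their synchronous counterparts, so an existing `validate()`/`detect()` call site can switch to the async versions simply by awaiting them inside an event loop.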

0 commit comments

Comments
 (0)
Please sign in to comment.