|
13 | 13 | get_default_evaluations,
|
14 | 14 | get_default_trustworthyrag_config,
|
15 | 15 | )
|
16 | | -from cleanlab_codex.internal.validator import update_scores_based_on_thresholds as _update_scores_based_on_thresholds |
| 16 | +from cleanlab_codex.internal.validator import ( |
| 17 | + update_scores_based_on_thresholds as _update_scores_based_on_thresholds, |
| 18 | +) |
17 | 19 | from cleanlab_codex.project import Project
|
18 | 20 |
|
19 | 21 | if TYPE_CHECKING:
|
@@ -131,6 +133,41 @@ def validate(
|
131 | 133 | **scores,
|
132 | 134 | }
|
133 | 135 |
|
| 136 | + async def validate_async( |
| 137 | + self, |
| 138 | + query: str, |
| 139 | + context: str, |
| 140 | + response: str, |
| 141 | + prompt: Optional[str] = None, |
| 142 | + form_prompt: Optional[Callable[[str, str], str]] = None, |
| 143 | + ) -> dict[str, Any]: |
| 144 | + """Evaluate whether the AI-generated response is bad, and if so, request an alternate expert answer. |
| 145 | + If no expert answer is available, this query is still logged for SMEs to answer. |
| 146 | + |
| 147 | + Args: |
| 148 | + query (str): The user query that was used to generate the response. |
| 149 | + context (str): The context that was retrieved from the RAG Knowledge Base and used to generate the response. |
| 150 | + response (str): A response from your LLM/RAG system. |
| 151 | + prompt (str, optional): Optional prompt representing the actual inputs (combining query, context, and system instructions into one string) to the LLM that generated the response. |
| 152 | + form_prompt (Callable[[str, str], str], optional): Optional function to format the prompt based on query and context. Cannot be provided together with prompt; provide one or the other. This function should take query and context as parameters and return a formatted prompt string. If not provided, a default prompt formatter will be used. To include a system prompt or any other special instructions for your LLM, incorporate them directly in your custom form_prompt() function definition. |
| 153 | + |
| 154 | + Returns: |
| 155 | + dict[str, Any]: A dictionary containing: |
| 156 | + - 'expert_answer': Alternate SME-provided answer from Codex if the response was flagged as bad and an answer was found in the Codex Project, or None otherwise. |
| 157 | + - 'is_bad_response': True if the response is flagged as potentially bad, False otherwise. When True, a Codex lookup is performed, which logs this query into the Codex Project for SMEs to answer. |
| 158 | + - Additional keys from a [`ThresholdedTrustworthyRAGScore`](/codex/api/python/types.validator/#class-thresholdedtrustworthyragscore) dictionary: each corresponds to a [TrustworthyRAG](/tlm/api/python/utils.rag/#class-trustworthyrag) evaluation metric, and points to the score for this evaluation as well as a boolean `is_bad` flagging whether the score falls below the corresponding threshold. |
| 159 | + """ |
| 160 | + scores, is_bad_response = await self.detect_async(query, context, response, prompt, form_prompt) |
| 161 | + expert_answer = None |
| 162 | + if is_bad_response: |
| 163 | + expert_answer = self._remediate(query) |
| 164 | + |
| 165 | + return { |
| 166 | + "expert_answer": expert_answer, |
| 167 | + "is_bad_response": is_bad_response, |
| 168 | + **scores, |
| 169 | + } |
| 170 | + |
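
A minimal usage sketch of the new `validate_async` method (not part of the diff itself). It assumes a `Validator` constructed with a Codex project access key; the `codex_access_key` parameter name, the placeholder key, and the example query/context/response are illustrative assumptions rather than details from this change:

import asyncio

from cleanlab_codex.validator import Validator

# Assumed constructor argument; substitute your own Codex project's access key.
validator = Validator(codex_access_key="<project-access-key>")

async def main() -> None:
    result = await validator.validate_async(
        query="What is the return policy?",
        context="Returns are accepted within 30 days of purchase.",
        response="You can return items within 30 days.",
    )
    # `is_bad_response` flags a low-quality response; `expert_answer` holds a
    # SME-provided answer from the Codex Project when one exists, else None.
    if result["is_bad_response"] and result["expert_answer"] is not None:
        print(result["expert_answer"])

asyncio.run(main())
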
134 | 171 | def detect(
|
135 | 172 | self,
|
136 | 173 | query: str,
|
@@ -176,6 +213,51 @@ def detect(
|
176 | 213 | is_bad_response = any(score_dict["is_bad"] for score_dict in thresholded_scores.values())
|
177 | 214 | return thresholded_scores, is_bad_response
|
178 | 215 |
|
| 216 | + async def detect_async( |
| 217 | + self, |
| 218 | + query: str, |
| 219 | + context: str, |
| 220 | + response: str, |
| 221 | + prompt: Optional[str] = None, |
| 222 | + form_prompt: Optional[Callable[[str, str], str]] = None, |
| 223 | + ) -> tuple[ThresholdedTrustworthyRAGScore, bool]: |
| 224 | + """Score response quality using TrustworthyRAG and flag bad responses based on configured thresholds. |
| 225 | + |
| 226 | + Note: |
| 227 | + Use this method instead of `validate_async()` to test/tune detection configurations like score thresholds and TrustworthyRAG settings. |
| 228 | + This `detect_async()` method will not affect your Codex Project, whereas `validate_async()` will log queries whose response was detected as bad into the Codex Project and is thus only suitable for production, not testing. |
| 229 | + Both this method and `validate_async()` rely on the same detection logic, so you can use this method to first optimize detections and then switch to using `validate_async()`. |
| 230 | + |
| 231 | + Args: |
| 232 | + query (str): The user query that was used to generate the response. |
| 233 | + context (str): The context that was retrieved from the RAG Knowledge Base and used to generate the response. |
| 234 | + response (str): A response from your LLM/RAG system. |
| 235 | + prompt (str, optional): Optional prompt representing the actual inputs (combining query, context, and system instructions into one string) to the LLM that generated the response. |
| 236 | + form_prompt (Callable[[str, str], str], optional): Optional function to format the prompt based on query and context. Cannot be provided together with prompt; provide one or the other. This function should take query and context as parameters and return a formatted prompt string. If not provided, a default prompt formatter will be used. To include a system prompt or any other special instructions for your LLM, incorporate them directly in your custom form_prompt() function definition. |
| 237 | + |
| 238 | + Returns: |
| 239 | + tuple[ThresholdedTrustworthyRAGScore, bool]: A tuple containing: |
| 240 | + - ThresholdedTrustworthyRAGScore: Quality scores for different evaluation metrics like trustworthiness |
| 241 | + and response helpfulness. Each metric has a score between 0 and 1, along with a boolean flag, `is_bad`, indicating whether the score falls below the corresponding threshold. |
| 242 | + - bool: True if the response is determined to be bad based on the evaluation scores |
| 243 | + and configured thresholds, False otherwise. |
| 244 | + """ |
| 245 | + scores = await self._tlm_rag.score_async( |
| 246 | + response=response, |
| 247 | + query=query, |
| 248 | + context=context, |
| 249 | + prompt=prompt, |
| 250 | + form_prompt=form_prompt, |
| 251 | + ) |
| 252 | + |
| 253 | + thresholded_scores = _update_scores_based_on_thresholds( |
| 254 | + scores=scores, |
| 255 | + thresholds=self._bad_response_thresholds, |
| 256 | + ) |
| 257 | + |
| 258 | + is_bad_response = any(score_dict["is_bad"] for score_dict in thresholded_scores.values()) |
| 259 | + return thresholded_scores, is_bad_response |
| 260 | + |
179 | 261 | def _remediate(self, query: str) -> str | None:
|
180 | 262 | """Request a SME-provided answer for this query, if one is available in Codex.
|
181 | 263 |
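
A similar sketch of the new `detect_async` method, which scores a response without logging anything to the Codex Project and is therefore suited to tuning detection settings offline. The `bad_response_thresholds` constructor argument, the threshold values, and the example inputs are assumptions for illustration; the per-metric `score`/`is_bad` keys follow the ThresholdedTrustworthyRAGScore structure described in the docstring:

import asyncio

from cleanlab_codex.validator import Validator

async def tune_detection() -> None:
    # Assumed constructor arguments; adjust the access key and thresholds for your project.
    validator = Validator(
        codex_access_key="<project-access-key>",
        bad_response_thresholds={"trustworthiness": 0.85, "response_helpfulness": 0.9},
    )
    scores, is_bad_response = await validator.detect_async(
        query="What is the return policy?",
        context="Returns are accepted within 30 days of purchase.",
        response="All sales are final.",
    )
    # Each metric pairs a 0-1 score with an `is_bad` flag derived from the configured thresholds.
    for metric, score_dict in scores.items():
        print(metric, score_dict["score"], score_dict["is_bad"])
    print("flagged as bad:", is_bad_response)

asyncio.run(tune_detection())
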
|
|