Address all current pylint and mypy warnings #40103

Open
wants to merge 12 commits into main
@@ -3,7 +3,7 @@
# ---------------------------------------------------------

import math
from typing import List, Callable, Any
from typing import List, Callable, Any, Sequence

from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget

@@ -32,7 +32,7 @@ def list_mean(lst: List[float]) -> float:
return list_sum(lst) / len(lst)


def list_mean_nan_safe(lst: List[float]) -> float:
def list_mean_nan_safe(lst: Sequence[float]) -> float:
"""Given a list of floats, remove all nan or None values, then calculate the mean of the remaining values.

:param lst: A list of floats.
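The signature change above widens `list_mean_nan_safe` from `List[float]` to `Sequence[float]`, so read-only array-likes (such as the numeric columns this PR later produces with `pd.to_numeric`) type-check without a spurious mypy error. The snippet below is a minimal, hypothetical stand-in written only to illustrate the behavior the docstring describes; `nan_safe_mean` is not the SDK helper.

```python
import math
from typing import Sequence


def nan_safe_mean(values: Sequence[float]) -> float:
    """Illustrative stand-in: drop NaN/None entries, then average what remains."""
    kept = [v for v in values if v is not None and not math.isnan(v)]
    if not kept:
        # The real helper raises EvaluationException here; a plain ValueError
        # keeps this sketch self-contained.
        raise ValueError("All values are NaN/None; nothing to average.")
    return sum(kept) / len(kept)


# Accepting Sequence[float] (read-only) instead of List[float] (mutable) means a
# tuple, a list, or a cast pandas Series can all be passed without complaint.
print(nan_safe_mean([1.0, math.nan, 3.0]))  # 2.0
```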
@@ -44,6 +44,7 @@

INFERENCE_OF_SENSITIVE_ATTRIBUTES = "inference_sensitive_attributes"


def get_formatted_template(data: dict, annotation_task: str) -> str:
"""Given the task and input data, produce a formatted string that will serve as the main
payload for the RAI service. Requires specific per-task logic.
@@ -66,16 +67,13 @@ def get_formatted_template(data: dict, annotation_task: str) -> str:
}
return json.dumps(as_dict)
if annotation_task == Tasks.CODE_VULNERABILITY:
as_dict = {
"context": data.get("query", ""),
"completion": data.get("response", "")
}
as_dict = {"context": data.get("query", ""), "completion": data.get("response", "")}
return json.dumps(as_dict)
if annotation_task == Tasks.UNGROUNDED_ATTRIBUTES:
as_dict = {
"query": data.get("query", ""),
"response": data.get("response", ""),
"context": data.get("context", "")
"context": data.get("context", ""),
}
return json.dumps(as_dict)
as_dict = {
@@ -267,6 +265,7 @@ async def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCre
sleep_time = RAIService.SLEEP_TIME**request_count
await asyncio.sleep(sleep_time)


def parse_response( # pylint: disable=too-many-branches,too-many-statements
batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
) -> Dict[str, Union[str, float]]:
@@ -295,10 +294,13 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
result = {}
if not batch_response or len(batch_response[0]) == 0:
return {}
if metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES and INFERENCE_OF_SENSITIVE_ATTRIBUTES in batch_response[0]:
batch_response[0] = {
EvaluationMetrics.UNGROUNDED_ATTRIBUTES: batch_response[0][INFERENCE_OF_SENSITIVE_ATTRIBUTES]
}
if (
metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES
and INFERENCE_OF_SENSITIVE_ATTRIBUTES in batch_response[0]
):
batch_response[0] = {
EvaluationMetrics.UNGROUNDED_ATTRIBUTES: batch_response[0][INFERENCE_OF_SENSITIVE_ATTRIBUTES]
}
if metric_name == EvaluationMetrics.PROTECTED_MATERIAL and metric_name not in batch_response[0]:
pm_metric_names = {"artwork", "fictional_characters", "logos_and_brands"}
for pm_metric_name in pm_metric_names:
@@ -334,7 +336,7 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
result[metric_display_name + "_information_gathering"] = (
parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
)
if metric_name == EvaluationMetrics.CODE_VULNERABILITY or metric_name == EvaluationMetrics.UNGROUNDED_ATTRIBUTES:
if metric_name in (EvaluationMetrics.CODE_VULNERABILITY, EvaluationMetrics.UNGROUNDED_ATTRIBUTES):
# Add all attributes under the details.
details = {}
for key, value in parsed_response.items():
@@ -345,7 +347,7 @@ def parse_response(  # pylint: disable=too-many-branches,too-many-statements
return _parse_content_harm_response(batch_response, metric_name, metric_display_name)


def _parse_content_harm_response(
def _parse_content_harm_response( # pylint: disable=too-many-branches,too-many-statements
batch_response: List[Dict], metric_name: str, metric_display_name: Optional[str] = None
) -> Dict[str, Union[str, float]]:
"""Parse the annotation response from Responsible AI service for a content harm evaluation.
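Two of the changes in this file are worth a second look: the per-task payload dictionaries are collapsed onto single lines (Black-style), and the chained `metric_name == ... or metric_name == ...` comparison becomes a tuple membership test, which is what pylint's `consider-using-in` check asks for. The sketch below restates those two patterns in isolation; the constant names and the `wants_details` helper are invented for illustration and are not part of the SDK.

```python
import json

# Hypothetical metric-name constants, standing in for EvaluationMetrics members.
CODE_VULNERABILITY = "code_vulnerability"
UNGROUNDED_ATTRIBUTES = "ungrounded_attributes"


def build_code_vuln_payload(data: dict) -> str:
    """Mirror of the per-task payload shaping shown above: map query/response
    onto the context/completion fields and serialize to JSON."""
    as_dict = {"context": data.get("query", ""), "completion": data.get("response", "")}
    return json.dumps(as_dict)


def wants_details(metric_name: str) -> bool:
    # Equivalent to `metric_name == CODE_VULNERABILITY or metric_name == UNGROUNDED_ATTRIBUTES`,
    # but written as a membership test, as pylint's consider-using-in suggests.
    return metric_name in (CODE_VULNERABILITY, UNGROUNDED_ATTRIBUTES)


print(build_code_vuln_payload({"query": "def f(x):", "response": "    return x + 1"}))
print(wants_details("code_vulnerability"))  # True
```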
@@ -6,7 +6,7 @@
import logging
import os
import re
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, TypeVar, Union
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, TypedDict, TypeVar, Union, cast

import pandas as pd
from promptflow._sdk._constants import LINE_NUMBER
@@ -161,44 +161,48 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
metric_name = col.split(".")[1]
if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
label_cols.append(col)
if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
details_cols = col

label_df = df[label_cols]
defect_rates = {}
for col in label_df.columns:
defect_rate_name = col.replace("_label", "_defect_rate")
col_with_boolean_values = pd.to_numeric(label_df[col], errors="coerce")
col_with_boolean_values = cast(Sequence[float], pd.to_numeric(label_df[col], errors="coerce"))
try:
defect_rates[defect_rate_name] = round(list_mean_nan_safe(col_with_boolean_values), 2)
            except EvaluationException:  # the only exception that can be raised here is for all-NaN values
msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
LOGGER.warning(msg)

if details_cols:
details_df = df[details_cols]
detail_defect_rates = {}
detail_defect_rates: Dict[str, float] = {}

for key, value in details_df.items():
_process_rows(value, detail_defect_rates)

for key, value in detail_defect_rates.items():
col_with_boolean_values = pd.to_numeric(value, errors="coerce")
col_with_boolean_values = cast(Sequence[float], pd.to_numeric(value, errors="coerce"))
try:
defect_rates[f"{details_cols}.{key}_defect_rate"] = round(list_mean_nan_safe(col_with_boolean_values), 2)
defect_rates[f"{details_cols}.{key}_defect_rate"] = round(
list_mean_nan_safe(col_with_boolean_values), 2
)
                except EvaluationException:  # the only exception that can be raised here is for all-NaN values
msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
LOGGER.warning(msg)

return label_cols, defect_rates


def _process_rows(row, detail_defect_rates):
for key, value in row.items():
if key not in detail_defect_rates:
detail_defect_rates[key] = []
detail_defect_rates[key].append(value)
return detail_defect_rates


def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
"""Aggregate metrics from the evaluation results.
On top of naively calculating the mean of most metrics, this function also identifies certain columns
@@ -478,7 +482,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
def _apply_target_to_data(
target: Callable,
data: Union[str, os.PathLike],
batch_client: TClient,
batch_client: ProxyClient,
initial_data: pd.DataFrame,
evaluation_name: Optional[str] = None,
**kwargs,
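The `cast(Sequence[float], pd.to_numeric(...))` calls above exist purely to satisfy mypy: `pd.to_numeric` is annotated loosely, while `list_mean_nan_safe` now expects a `Sequence[float]`. `cast` has no runtime effect, so the Series flows through unchanged. Here is a self-contained sketch of the same pattern, using a hypothetical `defect_rate` helper rather than the SDK's aggregation code.

```python
from typing import Sequence, cast

import pandas as pd


def defect_rate(label_col: pd.Series) -> float:
    """Hypothetical helper: coerce labels to numeric, ignore NaN, report the mean."""
    # cast() only changes what the type checker sees; pd.to_numeric still
    # returns a Series at runtime, which iterates like a Sequence[float].
    numeric = cast(Sequence[float], pd.to_numeric(label_col, errors="coerce"))
    kept = [v for v in numeric if pd.notna(v)]
    return round(sum(kept) / len(kept), 2) if kept else float("nan")


labels = pd.Series([True, False, True, None])
print(defect_rate(labels))  # 0.67
```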
@@ -61,7 +61,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, float]:
"bleu_score": score,
}

@overload # type: ignore
@overload
def __call__(self, *, response: str, ground_truth: str):
"""
Evaluate the BLEU score between the response and the ground truth.
@@ -73,20 +73,33 @@ def __call__(self, *, response: str, ground_truth: str):
:return: The BLEU score.
:rtype: Dict[str, float]
"""
...

@overload
def __call__(
self,
*args,
**kwargs,
):
"""
Evaluate the BLEU score between the response and the ground truth.

:param args: The arguments to pass to the evaluation function.
:type args: Any
:rtype: Dict[str, float]"""
...

@override
def __call__( # pylint: disable=docstring-missing-param
def __call__(
self,
*args,
**kwargs,
):
"""
Evaluate the BLEU score between the response and the ground truth.

:keyword response: The response to be evaluated.
:paramtype response: str
:keyword ground_truth: The ground truth to be compared against.
:paramtype ground_truth: str
:param args: The arguments to pass to the evaluation function.
:type args: Any
:return: The BLEU score.
:rtype: Dict[str, float]
"""
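The BLEU evaluator gains an explicit catch-all `*args, **kwargs` overload, which lets the first overload drop its `# type: ignore` and lets the `@override` implementation document only `*args`. Below is a stripped-down, hypothetical evaluator showing the same layout; it omits the SDK base class, so `@override` is left out and only the overload mechanics are demonstrated.

```python
from typing import Any, Dict

from typing_extensions import overload


class EchoEvaluator:
    """Hypothetical evaluator illustrating the overload layout used in this PR."""

    @overload
    def __call__(self, *, response: str, ground_truth: str) -> Dict[str, float]:
        """Documented keyword-only form: callers see this signature in tooling."""
        ...

    @overload
    def __call__(self, *args: Any, **kwargs: Any) -> Dict[str, float]:
        """Catch-all form, mirroring the flexible signature the base class exposes."""
        ...

    def __call__(self, *args: Any, **kwargs: Any) -> Dict[str, float]:
        # The single implementation behind both overloads; real evaluators
        # dispatch to their scoring logic here.
        response = kwargs.get("response", "")
        ground_truth = kwargs.get("ground_truth", "")
        return {"match": float(response == ground_truth)}


print(EchoEvaluator()(response="hi", ground_truth="hi"))  # {'match': 1.0}
```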
@@ -1,22 +1,24 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing_extensions import overload, override
from typing import Dict, Union
from typing_extensions import overload, override

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase


# cspell:ignore ssrf, vuln
@experimental
class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
"""
Evaluates code vulnerability for a given query and response for a single-turn evaluation only,
where query represents the user query or code before the completion, and response represents the code recommended by the assistant.
Evaluates code vulnerability for a given query and response for a single-turn evaluation only,
where query represents the user query or code before the completion, and response represents the
code recommended by the assistant.

The code vulnerability evaluation checks for vulnerabilities in the following coding languages:

- Python
- Java
- C++
@@ -26,7 +28,7 @@ class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
- SQL

The code vulnerability evaluation identifies the following vulnerabilities:

- path-injection
- sql-injection
- code-injection
@@ -85,13 +87,13 @@ def __init__(
credential=credential,
)

@overload
@overload # type: ignore[override]
def __call__(
self,
*,
query: str,
response: str,
) -> Dict[str, Union[str, float]]:
) -> Dict[str, Union[str, float]]:
"""Evaluate a given query/response pair for code vulnerability

:keyword query: The query to be evaluated.
@@ -101,20 +103,29 @@ def __call__(
:return: The code vulnerability label.
:rtype: Dict[str, Union[str, bool]]
"""
...

@overload
def __call__(self, *args, **kwargs):
"""Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.

:param Any args: The arguments to pass to the evaluator.
:return: The code vulnerability label.
:rtype: Dict[str, Union[str, bool]]
"""
...

@override
def __call__( # pylint: disable=docstring-missing-param
def __call__(
self,
*args,
**kwargs,
):
"""Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.
"""Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.

:keyword query: The query to be evaluated.
:paramtype query: Optional[str]
:keyword response: The response to be evaluated.
:paramtype response: Optional[str]
:param Any args: The arguments to pass to the evaluator.
:return: The code vulnerability label.
:rtype: Dict[str, Union[str, bool]]
"""

return super().__call__(*args, **kwargs)
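One small but deliberate detail above: the suppression is written as `# type: ignore[override]` rather than a bare `# type: ignore`. Scoping the ignore to a single error code keeps mypy honest about everything else on that line. A stand-alone illustration follows; the class names are invented for this sketch.

```python
from typing import List


class Base:
    def values(self) -> List[int]:
        return [1, 2, 3]


class Narrowed(Base):
    # A bare "# type: ignore" would hide *every* error on this line; the
    # bracketed form silences only the override incompatibility mypy reports
    # for the changed return type, so unrelated mistakes still surface.
    def values(self) -> List[str]:  # type: ignore[override]
        return ["a", "b", "c"]


print(Narrowed().values())
```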
@@ -2,7 +2,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import os
from typing import Dict, Union, List
from typing import Dict, List, Optional, Union

from typing_extensions import overload, override

@@ -67,7 +67,7 @@ def __call__(
:rtype: Dict[str, float]
"""

@overload
@overload # type: ignore[override]
def __call__(
self,
*,
@@ -82,9 +82,27 @@ def __call__(
:return: The coherence score.
:rtype: Dict[str, Union[float, Dict[str, List[float]]]]
"""
...

@overload
def __call__(
self,
*,
query: str,
response: Optional[str] = None,
):
"""
Evaluate coherence for a query and response.

:keyword str query: The query to be evaluated.
:keyword Optional[str] response: The response to be evaluated.
        :return: The coherence score.
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
"""
...

@override
def __call__( # pylint: disable=docstring-missing-param
def __call__(
self,
*args,
**kwargs,
@@ -93,14 +111,7 @@ def __call__(  # pylint: disable=docstring-missing-param
or a conversation for a potentially multi-turn evaluation. If the conversation has more than one pair of
turns, the evaluator will aggregate the results of each turn.

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: Optional[str]
:keyword conversation: The conversation to evaluate. Expected to contain a list of conversation turns under the
key "messages". Conversation turns are expected
to be dictionaries with keys "content" and "role".
:paramtype conversation: Optional[~azure.ai.evaluation.Conversation]
:param Any args: The arguments to evaluate.
        :return: The coherence score.
:rtype: Union[Dict[str, float], Dict[str, Union[float, Dict[str, List[float]]]]]
"""